clone

Build error

App Files Files Community

nikkmitra committed on Oct 9, 2024

Commit

f4d5ad2

verified ·

1 Parent(s): 0826a4e

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -7

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import tempfile
 from pymongo import MongoClient
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download
 # Load environment variables
 load_dotenv()
@@ -56,16 +57,35 @@ def check_voice_files():
     else:
         return "**All voice files are present.** 🎉"
-# New function to split text into chunks of 100 tokens
-def split_text_into_chunks(text, max_tokens=100):
     """
     Splits the input text into chunks with a maximum of `max_tokens` tokens each.
     Inserts a newline after each chunk.
     """
-    words = text.split()
     chunks = []
-    for i in range(0, len(words), max_tokens):
-        chunk = ' '.join(words[i:i + max_tokens])
         chunks.append(chunk)
     return '\n'.join(chunks)
@@ -73,7 +93,7 @@ def split_text_into_chunks(text, max_tokens=100):
 def tts_generate(text, voice, language):
     # Check for Hindi language and split text if necessary
     if language == "hi":
-        text = split_text_into_chunks(text, max_tokens=100)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
         temp_audio_path = temp_audio.name
@@ -101,7 +121,7 @@ def tts_generate(text, voice, language):
 def clone_voice(text, audio_file, language):
     # Check for Hindi language and split text if necessary
     if language == "hi":
-        text = split_text_into_chunks(text, max_tokens=100)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
         temp_audio_path = temp_audio.name

 from pymongo import MongoClient
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer
 # Load environment variables
 load_dotenv()
     else:
         return "**All voice files are present.** 🎉"
+# Initialize Hindi tokenizer
+def load_hindi_tokenizer():
+    """
+    Loads a pre-trained Hindi tokenizer from Hugging Face.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=True)
+    return tokenizer
+hindi_tokenizer = load_hindi_tokenizer()
+# New function to split text into chunks of 100 tokens using the Hindi tokenizer
+def split_text_into_chunks(text, max_tokens=100, language="en"):
     """
     Splits the input text into chunks with a maximum of `max_tokens` tokens each.
     Inserts a newline after each chunk.
+    Uses a specialized tokenizer for Hindi language.
     """
+    if language == "hi":
+        tokens = hindi_tokenizer.tokenize(text)
+    else:
+        tokens = text.split()  # Fallback to simple splitting for other languages
     chunks = []
+    for i in range(0, len(tokens), max_tokens):
+        if language == "hi":
+            # Convert tokens back to string for Hindi
+            chunk = hindi_tokenizer.convert_tokens_to_string(tokens[i:i + max_tokens])
+        else:
+            chunk = ' '.join(tokens[i:i + max_tokens])
         chunks.append(chunk)
     return '\n'.join(chunks)
 def tts_generate(text, voice, language):
     # Check for Hindi language and split text if necessary
     if language == "hi":
+        text = split_text_into_chunks(text, max_tokens=100, language=language)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
         temp_audio_path = temp_audio.name
 def clone_voice(text, audio_file, language):
     # Check for Hindi language and split text if necessary
     if language == "hi":
+        text = split_text_into_chunks(text, max_tokens=100, language=language)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
         temp_audio_path = temp_audio.name