nikkmitra committed on
Commit
f4d5ad2
·
verified ·
1 Parent(s): 0826a4e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -7
app.py CHANGED
@@ -7,6 +7,7 @@ import tempfile
7
  from pymongo import MongoClient
8
  from dotenv import load_dotenv
9
  from huggingface_hub import hf_hub_download
 
10
 
11
  # Load environment variables
12
  load_dotenv()
@@ -56,16 +57,35 @@ def check_voice_files():
56
  else:
57
  return "**All voice files are present.** 🎉"
58
 
59
- # New function to split text into chunks of 100 tokens
60
- def split_text_into_chunks(text, max_tokens=100):
 
 
 
 
 
 
 
 
 
 
61
  """
62
  Splits the input text into chunks with a maximum of `max_tokens` tokens each.
63
  Inserts a newline after each chunk.
 
64
  """
65
- words = text.split()
 
 
 
 
66
  chunks = []
67
- for i in range(0, len(words), max_tokens):
68
- chunk = ' '.join(words[i:i + max_tokens])
 
 
 
 
69
  chunks.append(chunk)
70
  return '\n'.join(chunks)
71
 
@@ -73,7 +93,7 @@ def split_text_into_chunks(text, max_tokens=100):
73
  def tts_generate(text, voice, language):
74
  # Check for Hindi language and split text if necessary
75
  if language == "hi":
76
- text = split_text_into_chunks(text, max_tokens=100)
77
 
78
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
79
  temp_audio_path = temp_audio.name
@@ -101,7 +121,7 @@ def tts_generate(text, voice, language):
101
  def clone_voice(text, audio_file, language):
102
  # Check for Hindi language and split text if necessary
103
  if language == "hi":
104
- text = split_text_into_chunks(text, max_tokens=100)
105
 
106
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
107
  temp_audio_path = temp_audio.name
 
7
  from pymongo import MongoClient
8
  from dotenv import load_dotenv
9
  from huggingface_hub import hf_hub_download
10
+ from transformers import AutoTokenizer
11
 
12
  # Load environment variables
13
  load_dotenv()
 
57
  else:
58
  return "**All voice files are present.** 🎉"
59
 
60
# Initialize Hindi tokenizer
def load_hindi_tokenizer():
    """
    Loads a pre-trained Hindi tokenizer from Hugging Face.

    Returns:
        The tokenizer for the "ai4bharat/indic-bert" checkpoint, using the
        fast implementation when one is available.
    """
    # keep_accents=True matters for Indic scripts: with the default (False)
    # the tokenizer strips combining marks during normalization, which removes
    # Devanagari vowel signs (matras) and corrupts any Hindi text that is
    # tokenized and then converted back into a string.
    tokenizer = AutoTokenizer.from_pretrained(
        "ai4bharat/indic-bert",
        use_fast=True,
        keep_accents=True,
    )
    return tokenizer


# Loaded once at import time; split_text_into_chunks reads this module-level
# instance instead of reloading the tokenizer on every call.
hindi_tokenizer = load_hindi_tokenizer()
69
+
70
# Split text into newline-separated chunks of at most `max_tokens` tokens,
# counting tokens with the Hindi tokenizer when language == "hi".
def split_text_into_chunks(text, max_tokens=100, language="en"):
    """
    Splits the input text into chunks with a maximum of `max_tokens` tokens each.
    Inserts a newline between consecutive chunks.
    Uses a specialized tokenizer for Hindi language.

    Chunks are always assembled from whole whitespace-separated words of the
    original text. For Hindi the tokenizer is used only to *count* tokens per
    word; the text itself is never rebuilt from tokens, so no characters are
    altered and no word is cut in the middle by a chunk boundary.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk. Must be positive.
        language: Language code. "hi" counts sub-word tokens with
            `hindi_tokenizer`; any other value counts one token per
            whitespace-separated word.

    Returns:
        The chunks joined by newline characters. Empty or whitespace-only
        input yields an empty string.

    Raises:
        ValueError: If `max_tokens` is not positive.
    """
    if max_tokens <= 0:
        # A zero step would crash range(); a negative one would silently
        # drop the whole text. Reject both explicitly.
        raise ValueError("max_tokens must be a positive integer")

    words = text.split()

    if language != "hi":
        # Fallback to simple splitting for other languages: one word, one token.
        return '\n'.join(
            ' '.join(words[start:start + max_tokens])
            for start in range(0, len(words), max_tokens)
        )

    chunks = []
    current_words = []
    current_tokens = 0
    for word in words:
        # Count at least one token per word so every word consumes budget
        # even if the tokenizer returns nothing for it.
        word_tokens = max(1, len(hindi_tokenizer.tokenize(word)))
        # Close the current chunk when this word would overflow it. A single
        # word longer than the budget still gets a chunk of its own rather
        # than being split mid-word.
        if current_words and current_tokens + word_tokens > max_tokens:
            chunks.append(' '.join(current_words))
            current_words = []
            current_tokens = 0
        current_words.append(word)
        current_tokens += word_tokens

    if current_words:
        chunks.append(' '.join(current_words))
    return '\n'.join(chunks)
91
 
 
93
  def tts_generate(text, voice, language):
94
  # Check for Hindi language and split text if necessary
95
  if language == "hi":
96
+ text = split_text_into_chunks(text, max_tokens=100, language=language)
97
 
98
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
99
  temp_audio_path = temp_audio.name
 
121
  def clone_voice(text, audio_file, language):
122
  # Check for Hindi language and split text if necessary
123
  if language == "hi":
124
+ text = split_text_into_chunks(text, max_tokens=100, language=language)
125
 
126
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
127
  temp_audio_path = temp_audio.name