WARNING:phonemizer:words count mismatch on 100.0% of the lines (1/1) Truncated to 510 tokens
#76
by
Mentrasss
- opened
I have been struggling with this issue: I don't know how to split the text into smaller pieces without losing context. If anyone can give me a hand with reading a .txt file and breaking it into small chunks of text, that would be great!
thanks in advance
This is my solution. It works quite well.
def read(self, text, read_in_chunks=True):
    """Synthesize `text` to speech and play it through the sound device.

    When `read_in_chunks` is True, the text is first broken into chunks
    (see `split_text_into_chunks`) and each chunk is synthesized
    separately, then the snippets are concatenated into one waveform.
    Otherwise the whole text is passed to `generate` in a single call.

    NOTE(review): assumes `generate(...)` returns an iterable of float
    samples plus phoneme output — confirm against the model wrapper.
    """
    if read_in_chunks:
        samples = []
        for piece in self.split_text_into_chunks(text=text):
            # Clean each chunk in this exact order: trailing newlines,
            # then leading dashes, spaces, and newlines.
            cleaned = piece.rstrip("\n").lstrip("-").lstrip(" ").lstrip("\n")
            if len(cleaned) < 2:
                # Too short to synthesize anything meaningful.
                continue
            snippet, _ = generate(self.model, cleaned, self.voicepack,
                                  lang=self.voice_name[0])
            samples.extend(snippet)
        # Single contiguous float32 buffer for playback.
        audio = np.array(samples, dtype=np.float32)
    else:
        audio, _ = generate(self.model, text, self.voicepack,
                            lang=self.voice_name[0])
    sd.play(audio, SAMPLE_RATE)
def split_text_into_chunks(self, text, max_length=200, min_length=100):
    """Split `text` into chunks of at most `max_length` characters.

    Steps:
      1. Collapse runs of newlines and split into lines.
      2. Lines short enough become chunks directly; longer lines are
         split into sentences (via `split_line_by_sentences`) and the
         sentences are packed greedily up to `max_length`.
      3. Chunks shorter than `min_length` are merged with the next
         chunk when the merged result still fits in `max_length`.

    Returns a list of non-empty chunk strings.
    """
    # 1) Collapse consecutive newlines into a single newline.
    text = re.sub(r'\n+', '\n', text).strip()
    # 2) Split by single newlines.
    lines = text.split('\n')
    chunks = []
    for line in lines:
        line = line.strip()
        if not line:
            continue  # skip empty lines after stripping
        if len(line) <= max_length:
            # Within limit: the whole line is one chunk.
            chunks.append(line)
        else:
            # Otherwise, split this line into sentences and pack them
            # greedily into sub-chunks without exceeding max_length.
            sentences = self.split_line_by_sentences(line)
            current_chunk = ""
            for sentence in sentences:
                if not current_chunk:
                    # Start a new chunk.
                    current_chunk = sentence
                elif len(current_chunk) + 1 + len(sentence) <= max_length:
                    # Fits with a single-space separator.
                    current_chunk += " " + sentence
                else:
                    # Adding would overflow: finalize and start anew.
                    chunks.append(current_chunk)
                    current_chunk = sentence
            # Leftover partial chunk.
            if current_chunk:
                chunks.append(current_chunk)
    # 3) Merge chunks smaller than min_length with the next chunk
    #    when the combined length still fits in max_length.
    merged_chunks = []
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        if (merged_chunks
                and len(merged_chunks[-1]) < min_length
                and len(merged_chunks[-1]) + 1 + len(chunk) <= max_length):
            # Last chunk is too small and merging stays within limit.
            merged_chunks[-1] += " " + chunk
        else:
            merged_chunks.append(chunk)
    # BUG FIX: previously returned `chunks`, which silently discarded
    # the entire min_length merge pass above.
    return merged_chunks
# Helper function: split a line into "sentences" by periods that are not part of a decimal number.
def split_line_by_sentences(self, line):
    """Split `line` into sentences at periods not adjacent to digits.

    The period stays attached to the sentence it ends; decimal numbers
    like "3.14" are never split. Empty pieces are dropped and each
    sentence is stripped of surrounding whitespace.
    """
    # BUG FIX: the previous implementation inserted a literal
    # "SPLIT_HERE" sentinel and split on it, which broke on any input
    # that already contained that substring. A zero-width split at the
    # position just after a period achieves the same rule directly:
    #   (?<=\.)   position follows a '.'
    #   (?<!\d\.) that '.' is not preceded by a digit
    #   (?!\d)    and is not followed by a digit
    pieces = re.split(r'(?<=\.)(?<!\d\.)(?!\d)', line)
    return [p.strip() for p in pieces if p.strip()]