
WARNING:phonemizer:words count mismatch on 100.0% of the lines (1/1) Truncated to 510 tokens

#76
by Mentrasss - opened

I'm having a lot of trouble with this issue, and I don't know how to split my input into smaller pieces without losing context. If anyone can give me a hand with reading .txt files and breaking them into small chunks of text, that would be great!

Thanks in advance.

This is my solution. It works quite well.

import re

import numpy as np
import sounddevice as sd

# generate() and SAMPLE_RATE are provided by the Kokoro model setup
# elsewhere in the class (Kokoro outputs 24 kHz audio).

def read(self, text, read_in_chunks=True):
    if read_in_chunks:
        audio = []
        for chunk in self.split_text_into_chunks(text=text):
            # Trim trailing newlines plus leading dashes/spaces/newlines.
            chunk = chunk.rstrip("\n").lstrip("- \n")
            if len(chunk) < 2:
                continue  # too short to synthesize
            snippet, _ = generate(self.model, chunk, self.voicepack, lang=self.voice_name[0])
            audio.extend(snippet)
        audio = np.array(audio, dtype=np.float32)  # join snippets into one array
    else:
        audio, _ = generate(self.model, text, self.voicepack, lang=self.voice_name[0])

    sd.play(audio, SAMPLE_RATE)
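
# Note: sd.play() returns immediately (playback is asynchronous); call
# sd.wait() afterwards if read() should block until the audio finishes.
# Accumulating samples in a Python list and converting once with
# np.array() also avoids reallocating a NumPy array on every chunk.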


def split_text_into_chunks(self, text, max_length=200, min_length=100):
    # 1) Collapse consecutive newlines into a single newline.
    text = re.sub(r'\n+', '\n', text).strip()

    # 2) Split by single newlines.
    lines = text.split('\n')

    chunks = []
    for line in lines:
        line = line.strip()
        if not line:
            continue  # skip empty lines after stripping

        if len(line) <= max_length:
            # If within limit, just add this line as a chunk.
            chunks.append(line)
        else:
            # Otherwise, split this line by "sentences".
            sentences = self.split_line_by_sentences(line)

            # 3) Accumulate sentences into sub-chunks without exceeding max_length.
            current_chunk = ""
            for sentence in sentences:
                if not current_chunk:
                    # start a new chunk
                    current_chunk = sentence
                else:
                    # try to add this sentence to the current chunk
                    # (plus 1 char for a space, if you prefer a separator)
                    if len(current_chunk) + 1 + len(sentence) <= max_length:
                        current_chunk += " " + sentence
                    else:
                        # if adding the sentence would exceed max_length,
                        # finalize the current chunk and start a new one
                        chunks.append(current_chunk)
                        current_chunk = sentence

            # leftover
            if current_chunk:
                chunks.append(current_chunk)

    # 4) Merge chunks that are smaller than min_length with the next chunk if possible.
    merged_chunks = []
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        if not merged_chunks:
            # If there's nothing yet, just add the first chunk.
            merged_chunks.append(chunk)
        else:
            # Check if the last chunk is too small
            if len(merged_chunks[-1]) < min_length:
                # Try to merge the last chunk with the current one
                if len(merged_chunks[-1]) + 1 + len(chunk) <= max_length:
                    # Merge them
                    merged_chunks[-1] += " " + chunk
                else:
                    # If merging would exceed max_length, we cannot merge
                    merged_chunks.append(chunk)
            else:
                # If the last chunk is okay in length, just add the new chunk
                merged_chunks.append(chunk)
    return merged_chunks

# Helper function: split a line into "sentences" by periods that are not part of a decimal number.
def split_line_by_sentences(self, line):
    # Insert a marker after every sentence-ending period; the lookarounds
    # keep decimal numbers like "3.14" intact.
    marked = re.sub(r'(?<!\d)\.(?!\d)', '.SPLIT_HERE', line)

    # Split on that marker, and then strip each piece.
    raw_sentences = marked.split('SPLIT_HERE')
    sentences = [s.strip() for s in raw_sentences if s.strip()]
    return sentences
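
To tie this back to the original question: the warning appears because the whole text is sent to generate() in one call and gets truncated to 510 tokens, so reading the file and feeding it through the chunked read() above avoids it. Here is a minimal sketch of how the pieces could fit together. The TextReader class, the book.txt file name, and the model/voicepack paths are hypothetical; the build_model()/generate() helpers follow the Kokoro repo's usual setup, so adapt them to your own.

import sounddevice as sd
import torch
from models import build_model  # helper script from the Kokoro repo
from kokoro import generate     # used inside read() above

SAMPLE_RATE = 24_000  # Kokoro outputs 24 kHz audio

class TextReader:
    def __init__(self, voice_name='af', device=None):
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.voice_name = voice_name
        self.model = build_model('kokoro-v0_19.pth', device)
        self.voicepack = torch.load(f'voices/{voice_name}.pt',
                                    weights_only=True).to(device)

    # read(), split_text_into_chunks(), and split_line_by_sentences()
    # from the snippets above go here.

if __name__ == '__main__':
    reader = TextReader()
    with open('book.txt', encoding='utf-8') as f:  # hypothetical input file
        reader.read(f.read())  # chunked by default
    sd.wait()  # block until playback finishes

With max_length=200 characters per chunk, each generate() call should stay well under the 510-token limit that triggers the truncation warning.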
