"""Gradio demo that turns text into speech with the YarnGPT2 model."""

import os
import re
import json
import random

import torch
import inflect
import uroman as ur
import numpy as np
import torchaudio
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from outetts.wav_tokenizer.decoder import WavTokenizer
from yarngpt.audiotokenizer import AudioTokenizerV2

# Initialize paths and models
tokenizer_path = "saheedniyi/YarnGPT2"
wav_tokenizer_config_path = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
wav_tokenizer_model_path = "wavtokenizer_large_speech_320_24k.ckpt"

# Sample rate (Hz) passed to torchaudio.save for the generated waveform.
SAMPLE_RATE = 24000
# Upper bound handed to model.generate as max_length.
MAX_LENGTH = 4000
# File the generated audio is written to unless the caller overrides it.
DEFAULT_OUTPUT_PATH = "output.wav"

# Initialize the audio tokenizer
audio_tokenizer = AudioTokenizerV2(
    tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
)

# Load the model and move it to the same device the audio tokenizer uses
model = AutoModelForCausalLM.from_pretrained(
    tokenizer_path, torch_dtype="auto"
).to(audio_tokenizer.device)


# Function to generate speech
def generate_speech(
    text,
    language,
    speaker_name,
    temperature=0.1,
    repetition_penalty=1.1,
    output_path=DEFAULT_OUTPUT_PATH,
):
    """Generate speech for ``text`` and save it as an audio file.

    Args:
        text: Text to synthesise.
        language: Language name forwarded to the prompt builder as ``lang``.
        speaker_name: Speaker name forwarded to the prompt builder.
        temperature: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        output_path: Where the audio file is written. Defaults to
            ``"output.wav"`` in the current working directory.

    Returns:
        The path the audio was written to (``output_path``).
    """
    # Create prompt
    prompt = audio_tokenizer.create_prompt(
        text, lang=language, speaker_name=speaker_name
    )

    # Tokenize prompt
    input_ids = audio_tokenizer.tokenize_prompt(prompt)

    # Generate output
    # NOTE(review): in transformers, `temperature` normally only has an effect
    # when sampling is enabled -- confirm the model's generation config sets
    # do_sample, otherwise the Temperature slider may be a no-op.
    output = model.generate(
        input_ids=input_ids,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=MAX_LENGTH,
    )

    # Get audio codes and convert to audio
    codes = audio_tokenizer.get_codes(output)
    audio = audio_tokenizer.get_audio(codes)

    # Save audio to file
    torchaudio.save(output_path, audio, sample_rate=SAMPLE_RATE)
    return output_path


# Create Gradio interface
def tts_interface(text, language, speaker_name, temperature, repetition_penalty):
    """Gradio callback: synthesise ``text`` and return the audio file path.

    Failures are raised as ``gr.Error`` rather than returned as a string,
    because the output component is ``gr.Audio(type="filepath")`` and would
    otherwise treat the error message as a file path.

    Raises:
        gr.Error: If ``text`` is empty/whitespace or speech generation fails.
    """
    # Reject empty input up front instead of sending it through the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    try:
        return generate_speech(
            text, language, speaker_name, temperature, repetition_penalty
        )
    except Exception as e:
        # UI boundary: surface any failure to the user, keeping the cause.
        raise gr.Error(f"Speech generation failed: {e}") from e


# Define available languages and speakers
languages = ["english", "igbo", "yoruba", "hausa", "pidgin"]
speakers = [
    "idera",
    "enitan",
    "abeo",
    "eniola",
    "kachi",
    "aisha",
    "amara",
    "bello",
    "chidi",
]

# Create the Gradio interface
demo = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(label="Text to convert to speech", lines=5),
        gr.Dropdown(languages, label="Language", value="english"),
        gr.Dropdown(speakers, label="Speaker", value="idera"),
        gr.Slider(0.1, 1.0, value=0.1, label="Temperature"),
        gr.Slider(1.0, 2.0, value=1.1, label="Repetition Penalty"),
    ],
    outputs=gr.Audio(type="filepath"),
    title="YarnGPT Text-to-Speech",
    description="Convert text to speech using YarnGPT model for various African languages",
)

# Launch the app
if __name__ == "__main__":
    demo.launch()