# NOTE: file-viewer residue (file size, per-line commit hashes, line-number gutter) was removed here; it was not part of the program.
import os
import gradio as gr
import torch
import torchaudio
import uroman
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from outetts.wav_tokenizer.decoder import WavTokenizer
# Import the YarnGPT AudioTokenizer
# Assuming the git repository is cloned in the same directory
from yarngpt.audiotokenizer import AudioTokenizerV2
# Constants and paths
MODEL_PATH = "saheedniyi/YarnGPT2b"
WAV_TOKENIZER_CONFIG_PATH = "wavtokenizer_config.yaml"
WAV_TOKENIZER_MODEL_PATH = "wavtokenizer_model.ckpt"
WAV_TOKENIZER_CONFIG_URL = (
    "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/"
    "resolve/main/"
    "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
)
WAV_TOKENIZER_MODEL_URL = (
    "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/"
    "resolve/main/wavtokenizer_large_speech_320_24k.ckpt"
)
YARNGPT_REPO_URL = "https://github.com/saheedniyi02/yarngpt.git"
YARNGPT_REPO_DIR = "yarngpt"


def _download_if_missing(url, destination):
    """Fetch *url* into *destination* with wget unless it is already present.

    The transfer is written to a ``.part`` file that is renamed onto
    *destination* only after wget exits with status 0, so a failed or
    interrupted download never leaves a truncated file that a later start
    would mistake for a complete one. Failures are reported with a printed
    warning instead of an exception, keeping start-up best-effort.
    """
    if os.path.isfile(destination) and os.path.getsize(destination) > 0:
        return
    partial_path = f"{destination}.part"
    status = os.system(f'wget -O "{partial_path}" "{url}"')
    if status == 0:
        os.replace(partial_path, destination)
        return
    # wget -O creates the output file even when the transfer fails.
    if os.path.exists(partial_path):
        os.remove(partial_path)
    print(f"Warning: download of {url} failed (exit status {status}).")


def _clone_if_missing(repo_url, target_dir):
    """Clone *repo_url* into *target_dir* unless it already has content."""
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        return
    status = os.system(f'git clone "{repo_url}" "{target_dir}"')
    if status != 0:
        print(f"Warning: git clone of {repo_url} failed (exit status {status}).")


# Download the model files at startup (skipped when already on disk)
_download_if_missing(WAV_TOKENIZER_CONFIG_URL, WAV_TOKENIZER_CONFIG_PATH)
_download_if_missing(WAV_TOKENIZER_MODEL_URL, WAV_TOKENIZER_MODEL_PATH)
# NOTE(review): the `from yarngpt.audiotokenizer import ...` statement in the
# import section of this file executes before this clone does, so on a fresh
# checkout the package must already be importable by other means - confirm.
_clone_if_missing(YARNGPT_REPO_URL, YARNGPT_REPO_DIR)
# Initialize the model and tokenizer
def initialize_model():
    """Build the audio tokenizer and the causal LM and return them.

    Returns:
        A ``(model, audio_tokenizer)`` tuple. The model is loaded from
        ``MODEL_PATH`` and moved to the device reported by the tokenizer.
    """
    tokenizer = AudioTokenizerV2(
        MODEL_PATH, WAV_TOKENIZER_MODEL_PATH, WAV_TOKENIZER_CONFIG_PATH
    )
    language_model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH, torch_dtype="auto"
    )
    # Keep the model on the same device the tokenizer uses.
    language_model = language_model.to(tokenizer.device)
    return language_model, tokenizer
# Load the model and audio tokenizer once, when the module is executed
model, audio_tokenizer = initialize_model()
# Speaker names and languages offered by the UI dropdowns
VOICES = [
    "idera",
    "jude",
    "kemi",
    "tunde",
    "funmi",
]
LANGUAGES = [
    "english",
    "yoruba",
    "igbo",
    "hausa",
    "pidgin",
]
# Function to generate speech
def generate_speech(text, language, voice, temperature=0.1, rep_penalty=1.1):
    """Synthesize speech for *text* and return ``(audio_path, status_message)``.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input is
            rejected without touching the model.
        language: Language name forwarded to the tokenizer's prompt builder.
        voice: Speaker name forwarded to the tokenizer's prompt builder.
        temperature: Passed through to ``model.generate``.
        rep_penalty: Passed to ``model.generate`` as ``repetition_penalty``.

    Returns:
        A tuple ``(path, message)``. ``path`` is the WAV file written for this
        request, or ``None`` when the input was rejected or generation failed;
        ``message`` is a human-readable status string for the UI.
    """
    if not text or not str(text).strip():
        return None, "Please enter some text to convert to speech."

    # Imported here so the function stays self-contained.
    import tempfile

    try:
        # Create prompt
        prompt = audio_tokenizer.create_prompt(text, lang=language, speaker_name=voice)
        # Tokenize prompt
        input_ids = audio_tokenizer.tokenize_prompt(prompt)
        # Generate output
        # NOTE(review): `do_sample` is not passed; in Hugging Face transformers
        # `temperature` only takes effect in sampling modes - confirm whether
        # the model's generation config enables sampling.
        output = model.generate(
            input_ids=input_ids,
            temperature=temperature,
            repetition_penalty=rep_penalty,
            max_length=4000,
        )
        # Convert to audio
        codes = audio_tokenizer.get_codes(output)
        audio = audio_tokenizer.get_audio(codes)
        # Write each result to its own file: a single shared "output.wav"
        # would be overwritten when two requests run at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            temp_audio_path = handle.name
        torchaudio.save(temp_audio_path, audio, sample_rate=24000)
        return temp_audio_path, f"Successfully generated speech for: {text[:50]}..."
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of raising.
        return None, f"Error generating speech: {e}"
# Build the Gradio UI
# NOTE(review): the nesting below was reconstructed from a copy of this file
# that had lost its indentation; confirm that the "About" section belongs
# below the tab and not inside it.
_APP_TITLE = "YarnGPT - Nigerian Accented Text-to-Speech"

with gr.Blocks(title=_APP_TITLE) as demo:
    # Page header
    gr.Markdown(f"# {_APP_TITLE}")
    gr.Markdown("Generate speech with Nigerian accents using YarnGPT model.")

    with gr.Tab("Basic TTS"):
        with gr.Row():
            # Left column: text, language/voice selection and decoding controls
            with gr.Column():
                text_input = gr.Textbox(
                    lines=5,
                    label="Text to convert to speech",
                    placeholder="Enter text here...",
                )
                language = gr.Dropdown(
                    choices=LANGUAGES, value="english", label="Language"
                )
                voice = gr.Dropdown(choices=VOICES, value="idera", label="Voice")
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.0, step=0.1, value=0.1,
                    label="Temperature",
                )
                rep_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, step=0.1, value=1.1,
                    label="Repetition Penalty",
                )
                generate_btn = gr.Button("Generate Speech")
            # Right column: generated audio and a status line
            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech")
                status_output = gr.Textbox(label="Status")

        # Wire the button to the synthesis function
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, language, voice, temperature, rep_penalty],
            outputs=[audio_output, status_output],
        )

    gr.Markdown("""
## About YarnGPT
YarnGPT is a text-to-speech model with Nigerian accents. It supports multiple languages and voices.
### Credits
- Model by [saheedniyi](https://huggingface.co/saheedniyi/YarnGPT2b)
- [Original Repository](https://github.com/saheedniyi02/yarngpt)
""")
# Launch the app (the stray trailing "|" after the call was a syntax error)
demo.launch()