Spaces:

okewunmi
/

tts

Running

tts

File size: 2,777 Bytes

b6fd3a8
 
 
78b0078
d4a2e16
78b0078
b6fd3a8
d4a2e16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6fd3a8
 
 
d4a2e16
b6fd3a8

import os
import re
import subprocess
import tempfile

import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCausalLM

from yarngpt_utils import AudioTokenizer

# Download model files if they don't exist
def download_if_not_exists(url, filename):
    """Download ``url`` to ``filename`` with ``wget`` unless the file already exists.

    The data is first written to ``<filename>.part`` and only renamed to
    ``filename`` after ``wget`` exits successfully, so a failed or interrupted
    download never leaves behind a file that would be mistaken for a complete
    one on the next start.

    Args:
        url: Address of the file to fetch.
        filename: Local path the file should end up at.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with a non-zero status.
        FileNotFoundError: If the ``wget`` executable is not available.
    """
    if os.path.exists(filename):
        return

    print(f"Downloading {filename}...")
    partial_path = f"{filename}.part"
    try:
        # check=True turns a failed download into an exception instead of
        # silently continuing with a missing or truncated file.
        subprocess.run(["wget", url, "-O", partial_path], check=True)
        os.replace(partial_path, filename)
    finally:
        # wget creates its -O target even when the transfer fails; clean it up.
        # After a successful os.replace the partial file no longer exists.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {filename}")

# Download the WavTokenizer config and checkpoint used by the audio tokenizer.
# Both URLs must use the Hub's "/resolve/<revision>/" path, which serves the raw
# file; "/blob/<revision>/" returns the HTML file-viewer page instead.
# NOTE(review): the config comes from the medium-speech repo while the checkpoint
# comes from the large-speech repo — confirm this pairing is intended.
download_if_not_exists(
    "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
    "wavtokenizer_config.yaml"
)
download_if_not_exists(
    "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_v2.ckpt",
    "wavtokenizer_model.ckpt"
)

# Initialize the model (this runs when the app starts)
def initialize_model():
    """Build the YarnGPT language model and its audio tokenizer.

    Returns:
        tuple: ``(model, audio_tokenizer)``. The tokenizer is an
        ``AudioTokenizer`` built from the ``saheedniyi/YarnGPT`` checkpoint id
        and the local WavTokenizer files; the model is the causal LM loaded
        from the same checkpoint id and moved to the tokenizer's device.
    """
    checkpoint = "saheedniyi/YarnGPT"

    # Arguments are passed positionally: checkpoint id, WavTokenizer
    # checkpoint path, WavTokenizer config path.
    tokenizer = AudioTokenizer(
        checkpoint,
        "wavtokenizer_model.ckpt",
        "wavtokenizer_config.yaml",
    )

    language_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype="auto",
    )
    # Keep the model on the same device the tokenizer reports.
    language_model = language_model.to(tokenizer.device)

    return language_model, tokenizer

# Generate audio from text
def generate_speech(text, speaker_name):
    """Synthesize ``text`` with the voice ``speaker_name`` and return a WAV path.

    Uses the module-level ``model`` and ``audio_tokenizer``.

    Args:
        text: The text to turn into speech.
        speaker_name: Speaker identifier forwarded to
            ``audio_tokenizer.create_prompt``.

    Returns:
        str: Path of a newly created WAV file, saved with a 24000 Hz sample
        rate, containing the generated audio.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front instead of running a full generation on it.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Build the prompt and turn it into model input ids.
    prompt = audio_tokenizer.create_prompt(text, speaker_name)
    input_ids = audio_tokenizer.tokenize_prompt(prompt)

    # Generate output
    output = model.generate(
        input_ids=input_ids,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4000,
    )

    # Model output -> audio codes -> audio.
    codes = audio_tokenizer.get_codes(output)
    audio = audio_tokenizer.get_audio(codes)

    # Write each result to its own file: a single shared "output.wav" could be
    # overwritten by another request before this one's audio has been served.
    # delete=False keeps the file on disk after the handle is closed so it can
    # be read back by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        temp_path = handle.name
    torchaudio.save(temp_path, audio, sample_rate=24000)

    return temp_path

# Load the model and tokenizer once at startup; generate_speech reads both
# as module-level globals.
print("Loading model...")
model, audio_tokenizer = initialize_model()
print("Model loaded!")

# Speaker names offered in the dropdown.
speakers = [
    "idera",
    "emma",
    "jude",
    "osagie",
    "tayo",
    "zainab",
    "joke",
    "regina",
    "remi",
    "umar",
    "chinenye",
]

# Build the UI components first, then wire them into the interface.
_text_input = gr.Textbox(lines=5, placeholder="Enter text here...")
_speaker_input = gr.Dropdown(choices=speakers, label="Speaker", value="idera")
_audio_output = gr.Audio(type="filepath")

demo = gr.Interface(
    fn=generate_speech,
    inputs=[_text_input, _speaker_input],
    outputs=_audio_output,
    title="YarnGPT: Nigerian Accented Text-to-Speech",
    description="Generate natural-sounding Nigerian accented speech from text.",
)

demo.launch()