import os
import gradio as gr
import torch
import torchaudio
import uroman
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from outetts.wav_tokenizer.decoder import WavTokenizer
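# Note: besides the yarngpt repository cloned below, this app assumes the following
# pip packages are installed (e.g. via a requirements.txt on the Space): gradio,
# torch, torchaudio, transformers, outetts, uroman, and numpy.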

# Constants and paths
MODEL_PATH = "saheedniyi/YarnGPT2b"
WAV_TOKENIZER_CONFIG_PATH = "wavtokenizer_config.yaml"
WAV_TOKENIZER_MODEL_PATH = "wavtokenizer_model.ckpt"

# Download the WavTokenizer config and checkpoint, and clone the YarnGPT repo, at
# startup. The clone has to happen before importing yarngpt below, otherwise the
# import fails in a fresh environment; files already present are not re-downloaded.
if not os.path.exists(WAV_TOKENIZER_CONFIG_PATH):
    os.system(f"wget -O {WAV_TOKENIZER_CONFIG_PATH} https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml")
if not os.path.exists(WAV_TOKENIZER_MODEL_PATH):
    os.system(f"wget -O {WAV_TOKENIZER_MODEL_PATH} https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt")
if not os.path.exists("yarngpt"):
    os.system("git clone https://github.com/saheedniyi02/yarngpt.git")

# Import the YarnGPT AudioTokenizer from the cloned repository
from yarngpt.audiotokenizer import AudioTokenizerV2

# Initialize the model and tokenizer
def initialize_model():
    audio_tokenizer = AudioTokenizerV2(
        MODEL_PATH, 
        WAV_TOKENIZER_MODEL_PATH, 
        WAV_TOKENIZER_CONFIG_PATH
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH, 
        torch_dtype="auto"
    ).to(audio_tokenizer.device)
    
    return model, audio_tokenizer

# Load the model and audio tokenizer once at startup
model, audio_tokenizer = initialize_model()

# Available voices and languages
VOICES = ["idera", "jude", "kemi", "tunde", "funmi"]
LANGUAGES = ["english", "yoruba", "igbo", "hausa", "pidgin"]

# Function to generate speech
def generate_speech(text, language, voice, temperature=0.1, rep_penalty=1.1):
    if not text or not text.strip():
        return None, "Please enter some text to convert to speech."
    
    try:
        # Create prompt
        prompt = audio_tokenizer.create_prompt(text, lang=language, speaker_name=voice)
        
        # Tokenize prompt
        input_ids = audio_tokenizer.tokenize_prompt(prompt)
        
        # Generate the audio token sequence (do_sample=True so the temperature
        # slider actually affects sampling)
        output = model.generate(
            input_ids=input_ids,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=rep_penalty,
            max_length=4000,
        )
        
        # Convert to audio
        codes = audio_tokenizer.get_codes(output)
        audio = audio_tokenizer.get_audio(codes)
        
        # Save audio to file
        temp_audio_path = "output.wav"
        torchaudio.save(temp_audio_path, audio, sample_rate=24000)
        
        return temp_audio_path, f"Successfully generated speech for: {text[:50]}..."
    
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"

# Create the Gradio interface
with gr.Blocks(title="YarnGPT - Nigerian Accented Text-to-Speech") as demo:
    gr.Markdown("# YarnGPT - Nigerian Accented Text-to-Speech")
    gr.Markdown("Generate speech with Nigerian accents using YarnGPT model.")
    
    with gr.Tab("Basic TTS"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to convert to speech", 
                    placeholder="Enter text here...",
                    lines=5
                )
                language = gr.Dropdown(
                    label="Language", 
                    choices=LANGUAGES, 
                    value="english"
                )
                voice = gr.Dropdown(
                    label="Voice", 
                    choices=VOICES, 
                    value="idera"
                )
                temperature = gr.Slider(
                    label="Temperature", 
                    minimum=0.1, 
                    maximum=1.0, 
                    value=0.1, 
                    step=0.1
                )
                rep_penalty = gr.Slider(
                    label="Repetition Penalty", 
                    minimum=1.0, 
                    maximum=2.0, 
                    value=1.1, 
                    step=0.1
                )
                generate_btn = gr.Button("Generate Speech")
            
            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech")
                status_output = gr.Textbox(label="Status")
    
    generate_btn.click(
        generate_speech, 
        inputs=[text_input, language, voice, temperature, rep_penalty], 
        outputs=[audio_output, status_output]
    )
    
    gr.Markdown("""
    ## About YarnGPT
    YarnGPT is a text-to-speech model with Nigerian accents. It supports multiple languages and voices.
    
    ### Credits
    - Model by [saheedniyi](https://huggingface.co/saheedniyi/YarnGPT2b)
    - [Original Repository](https://github.com/saheedniyi02/yarngpt)
    """)

# Launch the app
demo.launch()