Spaces:

okewunmi
/

tts

Running

App Files Files Community

tts / app.py

okewunmi

Update app.py

d4a2e16 verified 2 months ago

raw

history blame

2.78 kB

	import gradio as gr
	import torch
	import torchaudio
	import os
	import re
	import subprocess
	from transformers import AutoModelForCausalLM
	from yarngpt_utils import AudioTokenizer

	# Download model files if they don't exist
	def download_if_not_exists(url, filename):
	    """Download ``url`` to ``filename`` with ``wget`` unless the file exists.

	    The transfer is written to ``<filename>.part`` and renamed to
	    ``filename`` only after ``wget`` exits successfully, so an interrupted
	    or failed download is never mistaken for a complete file on a later run.

	    Args:
	        url: Address handed to ``wget``.
	        filename: Local path the downloaded file is stored under.

	    Raises:
	        subprocess.CalledProcessError: If ``wget`` exits with a non-zero
	            status.
	        OSError: If ``wget`` cannot be started or the final rename fails.
	    """
	    if os.path.exists(filename):
	        return

	    print(f"Downloading {filename}...")
	    partial_path = f"{filename}.part"
	    try:
	        # check=True surfaces a failed transfer immediately instead of
	        # letting the app continue with a missing or truncated file.
	        subprocess.run(["wget", url, "-O", partial_path], check=True)
	        os.replace(partial_path, filename)
	    finally:
	        # After a successful rename the partial file no longer exists, so
	        # this only cleans up what a failed wget run left behind.
	        if os.path.exists(partial_path):
	            os.remove(partial_path)
	    print(f"Downloaded {filename}")

	# Fetch the WavTokenizer config and checkpoint that initialize_model() passes
	# to AudioTokenizer. Both URLs must use the Hub's "/resolve/" endpoint, which
	# serves the raw file; "/blob/" returns the HTML file-viewer page instead.
	# NOTE: a "wavtokenizer_model.ckpt" fetched earlier through the "/blob/" URL
	# is not a real checkpoint and must be deleted by hand, because existing
	# files are never downloaded again.
	download_if_not_exists(
	    "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
	    "wavtokenizer_config.yaml",
	)
	download_if_not_exists(
	    "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_v2.ckpt",
	    "wavtokenizer_model.ckpt",
	)

	# Initialize the model (this runs when the app starts)
	def initialize_model():
	    """Create the audio tokenizer and load the YarnGPT causal language model.

	    Returns:
	        A ``(model, audio_tokenizer)`` tuple. The model has been moved to
	        ``audio_tokenizer.device``.
	    """
	    repo_id = "saheedniyi/YarnGPT"
	    config_file = "wavtokenizer_config.yaml"
	    checkpoint_file = "wavtokenizer_model.ckpt"

	    # Positional order: model repo, checkpoint path, then config path.
	    tokenizer = AudioTokenizer(repo_id, checkpoint_file, config_file)

	    language_model = AutoModelForCausalLM.from_pretrained(
	        repo_id, torch_dtype="auto"
	    )
	    # Keep the model on the same device the tokenizer reports.
	    language_model = language_model.to(tokenizer.device)

	    return language_model, tokenizer

	# Generate audio from text
	def generate_speech(text, speaker_name):
	    """Synthesize ``text`` with the chosen speaker and return a WAV file path.

	    Uses the module-level ``model`` and ``audio_tokenizer``.

	    Args:
	        text: Text to convert to speech.
	        speaker_name: Speaker name passed to ``audio_tokenizer.create_prompt``.

	    Returns:
	        Path of a newly created ``.wav`` file saved at a 24000 Hz sample rate.

	    Raises:
	        gr.Error: If ``text`` is empty or contains only whitespace.
	    """
	    import tempfile  # local import: the file's import block does not have it

	    # Reject blank input up front rather than running generation on it.
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to synthesize.")

	    prompt = audio_tokenizer.create_prompt(text, speaker_name)
	    input_ids = audio_tokenizer.tokenize_prompt(prompt)

	    # NOTE(review): no do_sample flag is passed; if the model's generation
	    # config uses greedy decoding, `temperature` may have no effect - confirm.
	    output = model.generate(
	        input_ids=input_ids,
	        temperature=0.1,
	        repetition_penalty=1.1,
	        max_length=4000,
	    )

	    # Generated token ids -> audio codes -> waveform.
	    codes = audio_tokenizer.get_codes(output)
	    audio = audio_tokenizer.get_audio(codes)

	    # Write every result to its own file: a single shared "output.wav" can
	    # be overwritten by another request before it has been served.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
	        temp_path = handle.name
	    torchaudio.save(temp_path, audio, sample_rate=24000)

	    return temp_path

	# Load the model and tokenizer once at startup; generate_speech() reads
	# these module-level names on every request.
	print("Loading model...")
	model, audio_tokenizer = initialize_model()
	print("Model loaded!")

	# Speaker names offered in the dropdown.
	speakers = [
	    "idera",
	    "emma",
	    "jude",
	    "osagie",
	    "tayo",
	    "zainab",
	    "joke",
	    "regina",
	    "remi",
	    "umar",
	    "chinenye",
	]

	# Input and output components for the interface.
	text_input = gr.Textbox(lines=5, placeholder="Enter text here...")
	speaker_input = gr.Dropdown(choices=speakers, label="Speaker", value="idera")
	audio_output = gr.Audio(type="filepath")

	demo = gr.Interface(
	    fn=generate_speech,
	    inputs=[text_input, speaker_input],
	    outputs=audio_output,
	    title="YarnGPT: Nigerian Accented Text-to-Speech",
	    description="Generate natural-sounding Nigerian accented speech from text.",
	)

	demo.launch()