GPT_SoTyde / app.py
AngeT10's picture
Update app.py
af37368 verified
raw
history blame
No virus
3.45 kB
import os
import tempfile
import zipfile
from typing import Optional
from urllib.parse import urlparse

import gradio as gr
import requests
import torch
from pydub import AudioSegment
from TTS.api import TTS
# Constants
# File extensions accepted as reference audio; used to validate download URLs,
# to pick audio files out of an extracted zip, and to restrict the upload widget.
AUDIO_FORMATS = [".wav", ".mp3", ".flac", ".mp4"]
# Language codes offered in the UI dropdown; the selected code is forwarded to
# the TTS model as its `language` argument.
LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko", "hi"]
# Device setup
# Use the first CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# TTS model setup
# Model identifier handed to the TTS library. The model is instantiated once at
# import time and this module-level instance is what
# AudioProcessor.synthesize_text calls.
MODEL_PATH = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(MODEL_PATH).to(device)
class AudioProcessor:
    """Prepare reference audio and run voice-cloning synthesis.

    Synthesis uses the module-level ``tts`` model. Every file this class
    creates gets a unique name in the system temporary directory, so
    concurrent or successive requests never overwrite each other's audio
    (the previous fixed ``temp.wav`` / ``./output.wav`` paths did).
    """

    @staticmethod
    def _new_temp_wav(prefix: str) -> str:
        """Create an empty, uniquely named ``.wav`` file and return its path."""
        handle, path = tempfile.mkstemp(prefix=prefix, suffix=".wav")
        # Only the reserved path is needed; the writers below reopen it by name.
        os.close(handle)
        return path

    def convert_to_wav(self, input_audio_file: str) -> str:
        """Return the path of a WAV version of ``input_audio_file``.

        Args:
            input_audio_file: Path of the audio file to convert.

        Returns:
            ``input_audio_file`` unchanged when its extension is already
            ``.wav`` (case-insensitive); otherwise the path of a newly
            created temporary WAV file. In the latter case the original
            file is deleted after a successful conversion.

        Raises:
            Exception: Whatever ``pydub`` raises when the file cannot be
                decoded or exported; the temporary file is removed first.
        """
        file_extension = os.path.splitext(input_audio_file)[-1].lower()
        if file_extension == ".wav":
            return input_audio_file

        wav_path = self._new_temp_wav("reference_")
        try:
            AudioSegment.from_file(input_audio_file).export(wav_path, format="wav")
        except Exception:
            # Do not leave an empty placeholder behind when conversion fails.
            os.remove(wav_path)
            raise
        # Preserved behaviour: the non-WAV source is discarded once converted.
        os.remove(input_audio_file)
        return wav_path

    def synthesize_text(self, text: str, input_audio_file: str, language: str) -> str:
        """Synthesize ``text`` in the voice of the reference recording.

        Args:
            text: Text to speak.
            input_audio_file: Path of the reference (speaker) audio; it is
                converted to WAV first via :meth:`convert_to_wav`.
            language: Language code passed to the TTS model.

        Returns:
            Path of the generated WAV file. The file lives in the system
            temporary directory; the caller owns it and may delete it.
        """
        speaker_wav = self.convert_to_wav(input_audio_file)
        output_path = self._new_temp_wav("output_")
        try:
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=language,
                file_path=output_path,
            )
        except Exception:
            # Remove the reserved output file if synthesis did not complete.
            os.remove(output_path)
            raise
        return output_path
def download_audio_file(url: str, timeout: float = 30.0) -> Optional[str]:
    """Download an audio file from ``url`` into the working directory.

    Args:
        url: Location of the audio file. The extension is taken from the
            URL *path*, so query strings and fragments are ignored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The name of the saved file (``temp`` plus the original extension),
        or ``None`` when the request fails or the server answers with an
        HTTP error status.

    Raises:
        ValueError: If the extension is not listed in ``AUDIO_FORMATS``.
    """
    # Validate before touching the network so unsupported URLs fail fast.
    file_extension = os.path.splitext(urlparse(url).path)[-1].lower()
    if file_extension not in AUDIO_FORMATS:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    try:
        response = requests.get(url, timeout=timeout)
        # Without this an HTML error page would be written out as "audio".
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading audio file: {e}")
        return None

    file_name = f"temp{file_extension}"
    with open(file_name, "wb") as f:
        f.write(response.content)
    return file_name
def extract_zip_file(zip_file: str, extract_dir: str = ".") -> bool:
    """Extract every member of ``zip_file`` into ``extract_dir``.

    Args:
        zip_file: Path of the archive to extract.
        extract_dir: Destination directory. Defaults to the current working
            directory, which is where the archive was always extracted
            before this parameter existed. Pass a dedicated directory for
            untrusted archives so members cannot overwrite application
            files.

    Returns:
        ``True`` when extraction succeeded, ``False`` when the file is not
        a valid zip archive (the error is printed).
    """
    try:
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
    except zipfile.BadZipFile as e:  # BadZipfile is only a deprecated alias
        print(f"Error extracting zip file: {e}")
        return False
    return True
def synthesize_audio(text: str, input_file: gr.File, language: str) -> Optional[str]:
    """Gradio handler: clone the voice in ``input_file`` and speak ``text``.

    Args:
        text: Text to synthesize.
        input_file: Uploaded file object exposing its path as ``.name``.
            Either a single audio file or a zip archive containing exactly
            one top-level audio file.
        language: Language code forwarded to the TTS model.

    Returns:
        Path of the generated audio file, or ``None`` when nothing was
        uploaded.

    Raises:
        gr.Error: If the zip archive cannot be extracted, or does not
            contain exactly one top-level audio file. Raising (instead of
            returning the message) keeps the text from being interpreted
            as a file path by the ``gr.Audio`` output.
    """
    if input_file is None:
        return None

    uploaded_path = input_file.name
    if uploaded_path.lower().endswith(".zip"):
        if not extract_zip_file(uploaded_path):
            # Previously this path fell through and hit an unbound variable.
            raise gr.Error("Error: The uploaded zip file could not be extracted.")

        # Consider only what this archive contained, not every audio file that
        # happens to be in the working directory (e.g. leftovers of earlier runs).
        with zipfile.ZipFile(uploaded_path) as archive:
            members = archive.namelist()
        audio_suffixes = tuple(AUDIO_FORMATS)
        candidates = [
            member
            for member in members
            # Top-level entries only: mirrors the old os.listdir('.') scan and
            # rejects names carrying directory components such as "../".
            if member == os.path.basename(member)
            and member.lower().endswith(audio_suffixes)
            and os.path.isfile(member)
        ]
        if len(candidates) != 1:
            raise gr.Error("Error: Please select a single audio file from the extracted files.")
        input_audio_file = candidates[0]
    else:
        input_audio_file = uploaded_path

    return AudioProcessor().synthesize_text(text, input_audio_file, language)
# Gradio UI wiring: three inputs (free text, an uploaded audio/zip file, a
# language dropdown) are passed positionally to synthesize_audio, whose return
# value is rendered by an audio player that expects a file path.
# The single example row supplies no file (None) for the upload input.
# NOTE(review): `height` and `width` may not be accepted by gr.Interface in
# newer Gradio releases - confirm against the installed version.
iface = gr.Interface(
fn=synthesize_audio,
inputs=["text", gr.File(label="Input File", file_types=[".zip", *AUDIO_FORMATS]), gr.Dropdown(choices=LANGUAGES, label="Language")],
outputs=gr.Audio(type='filepath'),
title='Voice Clone',
description=""" by [Angetyde](https://youtube.com/@Angetyde?si=7nusP31nTumIkPTF) and [Tony Assi](https://www.tonyassi.com/ ) use this colab with caution <3. Clone any voice with a model and generate a speech waveform.""",
examples=[["Hello! My name is Voice Clone. What is your name?", None, "en"]],
height=600,
width=1200,
)
# Start the web server as soon as the module is executed.
iface.launch()