# Hugging Face Space (status: Running)
# File size: 3,445 Bytes
import os
import tempfile
import zipfile
from urllib.parse import urlparse

import gradio as gr
import requests
import torch
from TTS.api import TTS
from pydub import AudioSegment
# Constants
# File extensions accepted as audio, both for uploads and for downloads.
AUDIO_FORMATS = [
    ".wav",
    ".mp3",
    ".flac",
    ".mp4",
]
# Language codes offered in the UI dropdown and handed to the TTS model.
LANGUAGES = [
    "en",
    "es",
    "fr",
    "de",
    "it",
    "pt",
    "pl",
    "tr",
    "ru",
    "nl",
    "cs",
    "ar",
    "zh-cn",
    "ja",
    "hu",
    "ko",
    "hi",
]

# Device setup: use the first CUDA device when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# TTS model setup: the model is loaded once at import time and shared by
# every synthesis call in this module.
MODEL_PATH = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(MODEL_PATH).to(device)
class AudioProcessor:
    """Prepare reference audio and run speech synthesis with the shared model.

    The methods keep no per-instance state; synthesis goes through the
    module-level ``tts`` object.
    """

    def convert_to_wav(self, input_audio_file: str) -> str:
        """Return the path of a WAV version of ``input_audio_file``.

        Args:
            input_audio_file: Path of the source audio file.

        Returns:
            ``input_audio_file`` itself when its extension is already
            ``.wav`` (compared case-insensitively); otherwise the path of a
            newly created temporary WAV file.

        Note:
            When a conversion takes place, the source file is deleted after
            the WAV has been written.
        """
        file_extension = os.path.splitext(input_audio_file)[-1].lower()
        if file_extension == ".wav":
            return input_audio_file

        # Decode first so that no temporary file is created for unreadable input.
        audio = AudioSegment.from_file(input_audio_file)
        wav_path = self._new_wav_path()
        audio.export(wav_path, format="wav")
        os.remove(input_audio_file)
        return wav_path

    def synthesize_text(self, text: str, input_audio_file: str, language: str) -> str:
        """Synthesize ``text`` and return the path of the generated WAV file.

        Args:
            text: Text to speak.
            input_audio_file: Path of the audio passed to the model as
                ``speaker_wav``; converted to WAV first when necessary.
            language: Language code passed through to the model.

        Returns:
            Path of a newly created WAV file holding the synthesized audio.
        """
        speaker_wav = self.convert_to_wav(input_audio_file)
        # A unique output path per call: a fixed name in the working directory
        # would be overwritten by any other request running at the same time.
        output_path = self._new_wav_path()
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_wav,
            language=language,
            file_path=output_path,
        )
        return output_path

    @staticmethod
    def _new_wav_path() -> str:
        """Create an empty, uniquely named ``.wav`` temp file and return its path."""
        fd, path = tempfile.mkstemp(suffix=".wav")
        # Only the reserved name is needed; writers reopen the file by path.
        os.close(fd)
        return path
def download_audio_file(url: str, timeout: float = 30.0) -> str:
    """Download the audio file at ``url`` into a temporary file.

    Args:
        url: URL whose path ends in one of the ``AUDIO_FORMATS`` extensions.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the downloaded file, or ``None`` when the request fails
        (connection problem, timeout, invalid URL, or an HTTP error status).

    Raises:
        ValueError: If the extension in the URL is not in ``AUDIO_FORMATS``.
    """
    # Read the extension from the URL path only, so a query string or
    # fragment ("clip.mp3?token=abc") does not end up in the extension.
    file_extension = os.path.splitext(urlparse(url).path)[-1].lower()
    # Reject unsupported formats before spending any network traffic.
    if file_extension not in AUDIO_FORMATS:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx replies into an exception so that an error page is
        # never written to disk as if it were audio.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading audio file: {e}")
        return None

    # Unique file per download; delete=False keeps it on disk for the caller.
    with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as f:
        f.write(response.content)
    return f.name
def extract_zip_file(zip_file: str, extract_to: "str | None" = None) -> bool:
    """Extract every member of ``zip_file``.

    Args:
        zip_file: Path of the archive to extract.
        extract_to: Directory to extract into. ``None`` (the default)
            extracts into the current working directory.

    Returns:
        ``True`` when extraction succeeds, ``False`` when the file is not a
        valid zip archive.
    """
    try:
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            # NOTE(review): member sizes are not limited here; callers handling
            # untrusted archives may want to inspect them first.
            zip_ref.extractall(path=extract_to)
    except zipfile.BadZipFile as e:
        print(f"Error extracting zip file: {e}")
        return False
    return True
def synthesize_audio(text: str, input_file: gr.File, language: str) -> str:
    """Synthesize ``text`` using the uploaded file as the speaker audio.

    Args:
        text: Text to synthesize.
        input_file: Uploaded file whose path is available as ``.name``;
            either an audio file or a ``.zip`` holding exactly one audio file.
        language: Language code passed through to the TTS model.

    Returns:
        Path of the generated audio file, or ``None`` when nothing was
        uploaded.

    Raises:
        gr.Error: If a zip upload cannot be read or does not contain exactly
            one audio file.
    """
    if input_file is None:
        return None

    upload_path = input_file.name
    if upload_path.lower().endswith(".zip"):
        input_audio_file = _extract_single_audio(upload_path)
    else:
        input_audio_file = upload_path

    audio_processor = AudioProcessor()
    return audio_processor.synthesize_text(text, input_audio_file, language)


def _extract_single_audio(zip_path: str) -> str:
    """Extract the one audio member of ``zip_path`` and return its path."""
    audio_suffixes = tuple(AUDIO_FORMATS)
    try:
        with zipfile.ZipFile(zip_path, "r") as archive:
            # Look only at the archive's own members. Scanning the working
            # directory would also pick up audio left behind by earlier runs.
            # Dot-files (e.g. "__MACOSX/._clip.wav") are skipped.
            candidates = [
                name
                for name in archive.namelist()
                if name.lower().endswith(audio_suffixes)
                and not os.path.basename(name).startswith(".")
            ]
            if len(candidates) != 1:
                # Raised rather than returned: the output component expects a
                # file path, so a message string would be treated as one.
                raise gr.Error("Error: Please select a single audio file from the extracted files.")
            # A fresh directory per upload keeps requests from overwriting
            # each other's files; extract() returns the path it created.
            return archive.extract(candidates[0], path=tempfile.mkdtemp())
    except zipfile.BadZipFile as e:
        print(f"Error extracting zip file: {e}")
        raise gr.Error("Error: The uploaded zip file could not be read.") from e
# Web UI: a text prompt, a reference audio file (or a zip containing one) and
# a language go in; the path of the synthesized audio comes out.
iface = gr.Interface(
    fn=synthesize_audio,
    inputs=[
        "text",
        gr.File(label="Input File", file_types=[".zip", *AUDIO_FORMATS]),
        gr.Dropdown(choices=LANGUAGES, label="Language"),
    ],
    outputs=gr.Audio(type='filepath'),
    title='Voice Clone',
    description=""" by [Angetyde](https://youtube.com/@Angetyde?si=7nusP31nTumIkPTF) and [Tony Assi](https://www.tonyassi.com/ ) use this colab with caution <3. Clone any voice with a model and generate a speech waveform.""",
    # The example has no file, so synthesize_audio returns None for it.
    examples=[["Hello! My name is Voice Clone. What is your name?", None, "en"]],
    # NOTE(review): confirm that the installed Gradio version still accepts
    # height/width on gr.Interface.
    height=600,
    width=1200,
)
# Start the app as soon as the module is executed.
iface.launch()