Pedro_Lab_XTTS_demo

Paused

App Files

Blakus committed 25 days ago

Commit

789fcd7

verified ·

1 Parent(s): b4511bb

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -316

app.py CHANGED Viewed

@@ -6,420 +6,279 @@ import subprocess
 import logging
 from pathlib import Path
 import torch
-import torchaudio
 import gradio as gr
-from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
 from huggingface_hub import hf_hub_download
 import scipy.io.wavfile as wavfile
-# Configurar logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 # Configuración inicial
 os.environ["COQUI_TOS_AGREED"] = "1"
-# Configurar variables de entorno para evitar warnings
-if "OMP_NUM_THREADS" not in os.environ or not os.environ["OMP_NUM_THREADS"]:
-    os.environ["OMP_NUM_THREADS"] = "1"
-# Suprimir warnings de DeepSpeed si no es necesario
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning, module="deepspeed")
-warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
 class PedroTTSApp:
     def __init__(self):
         self.model = None
         self.config = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        logger.info(f"Usando dispositivo: {self.device}")
-    def check_and_install(self, package):
-        """Verifica e instala paquetes faltantes"""
-        try:
-            __import__(package)
-        except ImportError:
-            logger.info(f"{package} no está instalado. Instalando...")
-            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
     def setup_model(self):
         """Descarga y configura el modelo"""
         try:
-            logger.info("Descargando y configurando el modelo...")
             repo_id = "Blakus/Pedro_Lab_XTTS"
             local_dir = Path(get_user_data_dir("tts")) / "tts_models--multilingual--multi-dataset--xtts_v2"
             local_dir.mkdir(parents=True, exist_ok=True)
             files_to_download = ["config.json", "model.pth", "vocab.json"]
             for file_name in files_to_download:
-                logger.info(f"Descargando {file_name} de {repo_id}")
-                hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=str(local_dir))
-            config_path = local_dir / "config.json"
-            checkpoint_path = local_dir / "model.pth"
-            vocab_path = local_dir / "vocab.json"
-            # Verificar que los archivos existen
-            for path in [config_path, checkpoint_path, vocab_path]:
-                if not path.exists():
-                    raise FileNotFoundError(f"Archivo requerido no encontrado: {path}")
             self.config = XttsConfig()
-            self.config.load_json(str(config_path))
             self.model = Xtts.init_from_config(self.config)
-            # Desactivar DeepSpeed para evitar problemas en Spaces
-            use_deepspeed = False
-            if self.device == "cuda" and torch.cuda.is_available():
-                try:
-                    # Intentar con DeepSpeed solo si está disponible y funcional
-                    self.model.load_checkpoint(
-                        self.config,
-                        checkpoint_path=str(checkpoint_path),
-                        vocab_path=str(vocab_path),
-                        eval=True,
-                        use_deepspeed=True
-                    )
-                    use_deepspeed = True
-                except Exception as deepspeed_error:
-                    logger.warning(f"DeepSpeed falló, cargando sin DeepSpeed: {deepspeed_error}")
-                    self.model.load_checkpoint(
-                        self.config,
-                        checkpoint_path=str(checkpoint_path),
-                        vocab_path=str(vocab_path),
-                        eval=True,
-                        use_deepspeed=False
-                    )
-            else:
-                self.model.load_checkpoint(
-                    self.config,
-                    checkpoint_path=str(checkpoint_path),
-                    vocab_path=str(vocab_path),
-                    eval=True,
-                    use_deepspeed=False
-                )
             if self.device == "cuda" and torch.cuda.is_available():
                 self.model.cuda()
-                logger.info(f"Modelo cargado en GPU (DeepSpeed: {use_deepspeed})")
             else:
                 logger.info("Modelo cargado en CPU")
         except Exception as e:
-            logger.error(f"Error al configurar el modelo: {e}")
             raise
-    def validate_input(self, text, audio_file):
-        """Valida los parámetros de entrada"""
-        if not text or len(text.strip()) < 2:
-            return False, "El texto debe tener al menos 2 caracteres."
-        if len(text) > 600:
-            return False, "El texto no puede exceder los 600 caracteres."
-        if not audio_file:
-            return False, "Debe seleccionar un audio de referencia."
-        # Verificar que el archivo de audio existe
-        if not os.path.exists(audio_file):
-            return False, f"El archivo de audio de referencia no existe: {audio_file}"
-        return True, ""
-    def predict(self, prompt, language, reference_audio, speed):
-        """Genera la síntesis de voz"""
         try:
-            # Validar entrada
-            is_valid, error_msg = self.validate_input(prompt, reference_audio)
-            if not is_valid:
-                return None, error_msg
-            # Limpiar y preparar el texto
-            prompt = prompt.strip()
-            prompt = re.sub(r'\s+', ' ', prompt)  # Normalizar espacios
-            if not self.model:
-                return None, "Modelo no cargado correctamente."
-            # Parámetros optimizados para mejor calidad
-            inference_params = {
-                "temperature": 0.65,
-                "length_penalty": 1.2,
-                "repetition_penalty": 2.2,
-                "top_k": 40,
-                "top_p": 0.75,
-                "enable_text_splitting": True,
-                "speed": max(0.5, min(2.0, speed))  # Asegurar rango válido
-            }
-            logger.info(f"Generando audio para: '{prompt[:50]}...' en idioma: {language}")
-            # Obtener embeddings de condicionamiento
             gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
                 audio_path=reference_audio
             )
             start_time = time.time()
             # Generar audio
             out = self.model.inference(
-                prompt,
                 language,
                 gpt_cond_latent,
                 speaker_embedding,
-                **inference_params
             )
             inference_time = time.time() - start_time
-            # Generar nombre de archivo único
             timestamp = int(time.time())
-            output_path = f"pedro_labattaglia_TTS_{timestamp}.wav"
-            # Guardar audio con verificación
             sample_rate = self.config.audio.get("output_sample_rate", 22050)
             wavfile.write(output_path, sample_rate, out["wav"])
-            # Verificar que el archivo se creó correctamente
-            if not os.path.exists(output_path):
-                raise Exception("Error al guardar el archivo de audio")
-            # Calcular métricas
             audio_length = len(out["wav"]) / sample_rate
-            real_time_factor = inference_time / audio_length if audio_length > 0 else float('inf')
-            metrics_text = f"""Tiempo de generación: {inference_time:.2f} segundos
-Duración del audio: {audio_length:.2f} segundos
-Factor de tiempo real: {real_time_factor:.2f}x
-Frecuencia de muestreo: {sample_rate} Hz
-Dispositivo: {self.device.upper()}"""
-            logger.info(f"Audio generado exitosamente: {output_path}")
-            return output_path, metrics_text
         except Exception as e:
-            error_msg = f"Error durante la generación: {str(e)}"
             logger.error(error_msg)
             return None, error_msg
-def create_gradio_interface():
-    """Crea la interfaz de Gradio"""
-    # Inicializar la aplicación
-    app = PedroTTSApp()
-    try:
-        app.setup_model()
-    except Exception as e:
-        logger.error(f"Error al inicializar la aplicación: {e}")
-        raise
-    # Configuración de opciones
-    supported_languages = [
-        ("Español", "es"),
-        ("English", "en")
-    ]
-    reference_audios = [
-        ("Serio", "serio.wav"),
-        ("Neutral", "neutral.wav"),
         ("Alegre", "alegre.wav"),
         ("Neutral Inglés", "neutral_ingles.wav")
     ]
-    # Tema personalizado
-    theme = gr.themes.Soft(
-        primary_hue="blue",
-        secondary_hue="gray",
-    ).set(
-        body_background_fill='*neutral_50',
-        body_background_fill_dark='*neutral_900',
-    )
-    description = """
-    # 🎙️ Sintetizador de Voz - Pedro Labattaglia
-    Sintetizador de voz de alta calidad con la voz del reconocido locutor argentino Pedro Labattaglia.
-    ## 📖 Instrucciones de uso:
-    1. **Seleccione el idioma** (Español o English)
-    2. **Elija un audio de referencia** que determine el tono y estilo
-    3. **Ajuste la velocidad** del habla según su preferencia
-    4. **Escriba el texto** que desea sintetizar (2-600 caracteres)
-    5. **Presione "Generar Voz"** y espere el resultado
-    > ⚡ El proceso puede tomar unos segundos dependiendo de la longitud del texto.
-    """
     # Crear interfaz
-    with gr.Blocks(theme=theme, title="Pedro Labattaglia TTS") as demo:
-        gr.Markdown(description)
-        # Imagen centrada
-        with gr.Row():
-            with gr.Column():
-                gr.Image(
-                    "https://www.labattaglia.com.ar/images/about_me_pic2.jpg",
-                    label="Pedro Labattaglia",
-                    show_label=True,
-                    container=True,
-                    height=300,
-                    width=300
-                )
-        # Controles principales
         with gr.Row():
             with gr.Column(scale=2):
-                with gr.Group():
-                    gr.Markdown("### ⚙️ Configuración")
-                    language_selector = gr.Dropdown(
-                        label="🌐 Idioma",
-                        choices=supported_languages,
-                        value="es",
-                        interactive=True
-                    )
-                    reference_audio = gr.Dropdown(
-                        label="🎵 Audio de referencia",
-                        choices=reference_audios,
-                        value="neutral.wav",
-                        interactive=True
-                    )
-                    speed_slider = gr.Slider(
-                        minimum=0.5,
-                        maximum=2.0,
-                        value=1.0,
-                        step=0.1,
-                        label="🎛️ Velocidad del habla",
-                        interactive=True
-                    )
-                with gr.Group():
-                    gr.Markdown("### 📝 Texto a Sintetizar")
-                    input_text = gr.Textbox(
-                        label="Texto",
-                        placeholder="Escriba aquí el texto que desea convertir a voz...",
-                        lines=4,
-                        max_lines=8,
-                        interactive=True
-                    )
-                    with gr.Row():
-                        char_count = gr.Textbox(
-                            label="Contador de caracteres",
-                            value="0/600",
-                            interactive=False,
-                            scale=1
-                        )
-                        generate_button = gr.Button(
-                            "🎙️ Generar Voz",
-                            variant="primary",
-                            scale=2,
-                            size="lg"
-                        )
             with gr.Column(scale=1):
-                with gr.Group():
-                    gr.Markdown("### 🎧 Resultado")
-                    generated_audio = gr.Audio(
-                        label="Audio generado",
-                        interactive=False,
-                        show_download_button=True
-                    )
-                    metrics_output = gr.Textbox(
-                        label="📊 Métricas de generación",
-                        value="Métricas aparecerán aquí después de generar el audio...",
-                        lines=6,
-                        interactive=False
-                    )
-        # Actualizar contador de caracteres
-        def update_char_count(text):
-            count = len(text) if text else 0
-            return f"{count}/600"
-        input_text.change(
-            update_char_count,
-            inputs=[input_text],
-            outputs=[char_count]
-        )
-        # Configurar generación
-        generate_button.click(
-            app.predict,
-            inputs=[input_text, language_selector, reference_audio, speed_slider],
-            outputs=[generated_audio, metrics_output]
         )
-        # Información adicional
-        with gr.Row():
-            gr.Markdown("""
-            ### ℹ️ Información Adicional
-            - **Calidad:** Audio de alta fidelidad a 22kHz
-            - **Idiomas soportados:** Español e Inglés
-            - **Longitud máxima:** 600 caracteres por generación
-            - **Tiempo de procesamiento:** Variable según la longitud del texto
-            """)
     return demo
 def main():
-    """Función principal"""
     try:
-        # Configurar variables de entorno para Hugging Face Spaces
-        os.environ.setdefault("GRADIO_SERVER_NAME", "0.0.0.0")
-        os.environ.setdefault("GRADIO_SERVER_PORT", "7860")
-        # Configurar OpenMP para evitar warnings
-        if "OMP_NUM_THREADS" not in os.environ or not os.environ["OMP_NUM_THREADS"]:
-            os.environ["OMP_NUM_THREADS"] = "1"
-        demo = create_gradio_interface()
-        # Detectar si estamos en Hugging Face Spaces
-        is_spaces = os.getenv("SPACE_ID") is not None
-        # Configuración de lanzamiento
-        launch_kwargs = {
             "show_error": True,
             "quiet": False,
-            "favicon_path": None,
         }
-        if is_spaces:
-            # Configuración específica para Hugging Face Spaces
-            launch_kwargs.update({
-                "server_name": "0.0.0.0",
-                "server_port": 7860,
-                "share": False,  # No necesario en Spaces
-            })
-        else:
-            # Configuración para desarrollo local
-            launch_kwargs.update({
-                "server_name": "127.0.0.1",
-                "server_port": 7860,
-                "share": True,  # Crear link compartible para desarrollo local
-                "auth": [("Pedro Labattaglia", "PL2024"), ("Invitado", "PLTTS2024")]
-            })
-        logger.info(f"Lanzando aplicación {'en Hugging Face Spaces' if is_spaces else 'localmente'}...")
-        demo.launch(**launch_kwargs)
     except Exception as e:
-        logger.error(f"Error al lanzar la aplicación: {e}")
-        # En caso de error, intentar lanzamiento básico
         try:
-            logger.info("Intentando lanzamiento básico...")
-            demo = create_gradio_interface()
-            demo.launch(share=True, show_error=True)
-        except Exception as fallback_error:
-            logger.error(f"Error en lanzamiento básico: {fallback_error}")
-            raise
 if __name__ == "__main__":
     main()

 import logging
 from pathlib import Path
 import torch
 import gradio as gr
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
 from huggingface_hub import hf_hub_download
 import scipy.io.wavfile as wavfile
+import warnings
+# Suprimir warnings
+warnings.filterwarnings("ignore")
 # Configuración inicial
 os.environ["COQUI_TOS_AGREED"] = "1"
+os.environ["OMP_NUM_THREADS"] = "1"
+# Configurar logging
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
 class PedroTTSApp:
     def __init__(self):
         self.model = None
         self.config = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Inicializando en dispositivo: {self.device}")
     def setup_model(self):
         """Descarga y configura el modelo"""
         try:
+            logger.info("Configurando modelo XTTS...")
+            # Configuración del repositorio
             repo_id = "Blakus/Pedro_Lab_XTTS"
             local_dir = Path(get_user_data_dir("tts")) / "tts_models--multilingual--multi-dataset--xtts_v2"
             local_dir.mkdir(parents=True, exist_ok=True)
+            # Descargar archivos necesarios
             files_to_download = ["config.json", "model.pth", "vocab.json"]
             for file_name in files_to_download:
+                file_path = local_dir / file_name
+                if not file_path.exists():
+                    logger.info(f"Descargando {file_name}...")
+                    hf_hub_download(
+                        repo_id=repo_id,
+                        filename=file_name,
+                        local_dir=str(local_dir)
+                    )
+                else:
+                    logger.info(f"{file_name} ya existe")
+            # Configurar modelo
+            config_path = str(local_dir / "config.json")
+            checkpoint_path = str(local_dir / "model.pth")
+            vocab_path = str(local_dir / "vocab.json")
             self.config = XttsConfig()
+            self.config.load_json(config_path)
             self.model = Xtts.init_from_config(self.config)
+            # Cargar sin DeepSpeed para mayor compatibilidad
+            self.model.load_checkpoint(
+                self.config,
+                checkpoint_path=checkpoint_path,
+                vocab_path=vocab_path,
+                eval=True,
+                use_deepspeed=False
+            )
             if self.device == "cuda" and torch.cuda.is_available():
                 self.model.cuda()
+                logger.info("Modelo cargado en GPU")
             else:
+                self.model.cpu()
                 logger.info("Modelo cargado en CPU")
+            logger.info("Modelo configurado exitosamente")
         except Exception as e:
+            logger.error(f"Error configurando modelo: {e}")
             raise
+    def generate_speech(self, text, language, reference_audio, speed):
+        """Genera el audio de voz"""
         try:
+            # Validaciones básicas
+            if not text or len(text.strip()) < 2:
+                return None, "❌ El texto debe tener al menos 2 caracteres"
+            if len(text) > 600:
+                return None, "❌ El texto no puede exceder 600 caracteres"
+            if not reference_audio:
+                return None, "❌ Seleccione un audio de referencia"
+            text = text.strip()
+            logger.info(f"Generando audio para: '{text[:50]}...'")
+            # Obtener embeddings
             gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
                 audio_path=reference_audio
             )
             start_time = time.time()
             # Generar audio
             out = self.model.inference(
+                text,
                 language,
                 gpt_cond_latent,
                 speaker_embedding,
+                temperature=0.7,
+                length_penalty=1.0,
+                repetition_penalty=2.0,
+                top_k=50,
+                top_p=0.8,
+                speed=speed,
+                enable_text_splitting=True
             )
             inference_time = time.time() - start_time
+            # Guardar audio
             timestamp = int(time.time())
+            output_path = f"output_{timestamp}.wav"
             sample_rate = self.config.audio.get("output_sample_rate", 22050)
             wavfile.write(output_path, sample_rate, out["wav"])
+            # Métricas
             audio_length = len(out["wav"]) / sample_rate
+            rtf = inference_time / audio_length if audio_length > 0 else 0
+            metrics = f"""✅ Generación completada
+🕐 Tiempo: {inference_time:.2f}s
+📏 Duración: {audio_length:.2f}s
+⚡ Factor RT: {rtf:.2f}x
+🎵 Sample Rate: {sample_rate}Hz"""
+            return output_path, metrics
         except Exception as e:
+            error_msg = f"❌ Error: {str(e)}"
             logger.error(error_msg)
             return None, error_msg
+# Inicializar aplicación global
+app = PedroTTSApp()
+def create_interface():
+    """Crear interfaz Gradio simplificada"""
+    # Configurar opciones
+    languages = [("Español", "es"), ("English", "en")]
+    audio_refs = [
+        ("Neutral", "neutral.wav"),
+        ("Serio", "serio.wav"),
         ("Alegre", "alegre.wav"),
         ("Neutral Inglés", "neutral_ingles.wav")
     ]
     # Crear interfaz
+    with gr.Blocks(
+        title="Pedro Labattaglia TTS",
+        theme=gr.themes.Soft()
+    ) as demo:
+        gr.Markdown("""
+        # 🎙️ Pedro Labattaglia - Síntesis de Voz
+        Generador de voz con IA usando la voz del locutor Pedro Labattaglia
+        """)
         with gr.Row():
             with gr.Column(scale=2):
+                # Controles
+                language = gr.Dropdown(
+                    choices=languages,
+                    value="es",
+                    label="🌐 Idioma"
+                )
+                reference = gr.Dropdown(
+                    choices=audio_refs,
+                    value="neutral.wav",
+                    label="🎵 Estilo de voz"
+                )
+                speed = gr.Slider(
+                    0.5, 2.0, 1.0, 0.1,
+                    label="⚡ Velocidad"
+                )
+                text_input = gr.Textbox(
+                    label="📝 Texto a sintetizar",
+                    placeholder="Escriba el texto aquí...",
+                    lines=4,
+                    max_lines=6
+                )
+                generate_btn = gr.Button(
+                    "🎙️ Generar Voz",
+                    variant="primary",
+                    size="lg"
+                )
             with gr.Column(scale=1):
+                # Resultados
+                audio_output = gr.Audio(
+                    label="🎧 Audio Generado",
+                    show_download_button=True
+                )
+                metrics_output = gr.Textbox(
+                    label="📊 Estado",
+                    value="Listo para generar audio...",
+                    lines=6
+                )
+        # Conectar función
+        generate_btn.click(
+            fn=app.generate_speech,
+            inputs=[text_input, language, reference, speed],
+            outputs=[audio_output, metrics_output]
         )
+        gr.Markdown("""
+        ### ℹ️ Información
+        - **Longitud**: 2-600 caracteres
+        - **Idiomas**: Español e Inglés
+        - **Calidad**: 22kHz
+        """)
     return demo
 def main():
+    """Función principal simplificada"""
     try:
+        logger.info("Iniciando aplicación...")
+        # Configurar modelo
+        app.setup_model()
+        # Crear interfaz
+        demo = create_interface()
+        # Configuración de lanzamiento para Spaces
+        launch_config = {
+            "server_name": "0.0.0.0",
+            "server_port": 7860,
+            "share": True,  # Siempre activar para Spaces
             "show_error": True,
             "quiet": False,
+            "inbrowser": False
         }
+        logger.info("Lanzando interfaz...")
+        demo.launch(**launch_config)
     except Exception as e:
+        logger.error(f"Error crítico: {e}")
+        # Intentar lanzamiento mínimo
         try:
+            demo = gr.Interface(
+                fn=lambda: "Error en la configuración",
+                inputs=gr.Textbox("Error"),
+                outputs=gr.Textbox("Error"),
+                title="Error de Configuración"
+            )
+            demo.launch(share=True)
+        except:
+            pass
 if __name__ == "__main__":
     main()