Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,29 @@
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
import os
|
|
|
4 |
import numpy as np
|
5 |
import soundfile as sf
|
6 |
from tempfile import NamedTemporaryFile
|
|
|
7 |
|
8 |
# Get API token from environment variable
|
9 |
API_TOKEN = os.environ.get("HF_API_TOKEN") # Use your token here
|
10 |
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large"
|
11 |
headers = {"Authorization": f"Bearer {API_TOKEN}"}
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def save_audio_to_tempfile(audio_data, sample_rate):
|
14 |
"""Save raw audio data to a temporary WAV file."""
|
15 |
with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
@@ -20,7 +34,7 @@ def query(audio_input):
|
|
20 |
try:
|
21 |
# Check if input is None (no audio provided)
|
22 |
if audio_input is None:
|
23 |
-
return "Please
|
24 |
|
25 |
# Handle microphone input (returns a tuple: (sample_rate, audio_data))
|
26 |
if isinstance(audio_input, tuple):
|
@@ -29,44 +43,81 @@ def query(audio_input):
|
|
29 |
print(f"Audio data shape: {audio_data.shape}")
|
30 |
audio_path = save_audio_to_tempfile(audio_data, sample_rate)
|
31 |
print(f"Temporary file saved at: {audio_path}")
|
|
|
|
|
|
|
|
|
32 |
else:
|
33 |
-
return "Invalid input. Please
|
34 |
|
35 |
-
#
|
36 |
-
with
|
37 |
-
|
|
|
|
|
38 |
|
39 |
-
#
|
40 |
-
|
|
|
41 |
|
42 |
-
#
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
return response.json().get("text", "No transcription found in response.")
|
48 |
except Exception as e:
|
49 |
-
return f"Error during API request: {str(e)}"
|
50 |
finally:
|
51 |
-
# Clean up the temporary
|
52 |
if "audio_path" in locals() and os.path.exists(audio_path):
|
53 |
os.remove(audio_path)
|
54 |
print(f"Temporary file deleted: {audio_path}")
|
|
|
|
|
|
|
55 |
|
56 |
# Gradio interface
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
# Launch the app
|
72 |
-
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
import os
|
4 |
+
import time
|
5 |
import numpy as np
|
6 |
import soundfile as sf
|
7 |
from tempfile import NamedTemporaryFile
|
8 |
+
import subprocess
|
# --- Hugging Face Inference API configuration ---
# The token is read from the environment; never hard-code credentials.
API_TOKEN = os.environ.get("HF_API_TOKEN")
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large"
headers = {"Authorization": f"Bearer {API_TOKEN}"}
14 |
|
15 |
+
def preprocess_audio_with_ffmpeg(input_path, output_path):
    """Resample audio to 16 kHz mono WAV using FFmpeg.

    Parameters
    ----------
    input_path : str
        Path to the source audio file.
    output_path : str
        Destination WAV path; overwritten if it already exists.

    Raises
    ------
    subprocess.CalledProcessError
        If FFmpeg exits with a non-zero status (``check=True``).
    FileNotFoundError
        If the ``ffmpeg`` binary is not on PATH.
    """
    command = [
        "ffmpeg",
        "-hide_banner",          # suppress FFmpeg's version banner
        "-loglevel", "error",    # only surface real errors in the logs
        "-i", input_path,        # input file
        "-ar", "16000",          # resample to 16 kHz (Whisper's expected rate)
        "-ac", "1",              # downmix to mono
        "-y",                    # overwrite output file if it exists
        output_path,             # output file
    ]
    # argv-list form (shell=False) avoids shell injection on user-supplied paths.
    subprocess.run(command, check=True)
|
26 |
+
|
27 |
def save_audio_to_tempfile(audio_data, sample_rate):
|
28 |
"""Save raw audio data to a temporary WAV file."""
|
29 |
with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
|
|
def query(audio_input):
    """Transcribe audio with the Hugging Face Whisper Inference API.

    Parameters
    ----------
    audio_input : tuple | str | None
        Microphone input as ``(sample_rate, numpy_samples)`` (Gradio
        ``type="numpy"``) or an uploaded file path; ``None`` when no audio
        was provided.

    Returns
    -------
    tuple[str, str | None, str | None]
        ``(transcription_or_error, playback_audio_path, transcript_file_path)``.
    """
    audio_path = None
    processed_audio_path = None
    try:
        # Check if input is None (no audio provided)
        if audio_input is None:
            return "Please record audio or upload an audio file.", None, None

        # Handle microphone input (returns a tuple: (sample_rate, audio_data))
        if isinstance(audio_input, tuple):
            # NOTE(review): this unpack was elided in the diff; the order is
            # implied by the uses of sample_rate/audio_data below — confirm.
            sample_rate, audio_data = audio_input
            print(f"Audio data shape: {audio_data.shape}")
            audio_path = save_audio_to_tempfile(audio_data, sample_rate)
            print(f"Temporary file saved at: {audio_path}")
        # Handle file upload (returns a file path)
        elif isinstance(audio_input, str):
            audio_path = audio_input
            print(f"Uploaded file path: {audio_path}")
        else:
            return "Invalid input. Please record audio or upload an audio file.", None, None

        # Preprocess the audio using FFmpeg (16 kHz mono WAV)
        with NamedTemporaryFile(suffix=".wav", delete=False) as processed_temp_file:
            processed_audio_path = processed_temp_file.name
        preprocess_audio_with_ffmpeg(audio_path, processed_audio_path)
        print(f"Processed audio saved at: {processed_audio_path}")

        # Read the processed audio file
        with open(processed_audio_path, "rb") as f:
            data = f.read()

        # Send the request to the Inference API with retry logic
        max_retries = 5
        retry_delay = 30  # seconds between retries while the model loads
        for attempt in range(max_retries):
            # BUG FIX: the original passed both data= and json= to
            # requests.post; requests ignores json when data is given, so
            # the {"language": "kaz"} payload was never sent. Send only the
            # raw audio bytes (the API does not accept a JSON body alongside
            # binary audio anyway).
            response = requests.post(API_URL, headers=headers, data=data)

            if response.status_code == 200:
                transcription = response.json().get("text", "No transcription found in response.")
                # BUG FIX: gr.File needs a path to a real file; the original
                # returned a fabricated filename that did not exist on disk.
                with NamedTemporaryFile(
                    mode="w", suffix=".txt", delete=False, encoding="utf-8"
                ) as txt_file:
                    txt_file.write(transcription)
                    transcript_path = txt_file.name
                return transcription, audio_path, transcript_path
            elif response.status_code == 503:  # Model is loading
                print(f"Model is loading. Attempt {attempt + 1}/{max_retries}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return f"Error: {response.status_code}, {response.text}", None, None

        return "Model is still loading. Please try again later.", None, None
    except Exception as e:
        return f"Error during API request: {str(e)}", None, None
    finally:
        # BUG FIX: only delete the intermediate FFmpeg output. The original
        # also deleted audio_path — the very file returned to the UI for
        # playback, and, for uploads, the user's own file.
        if processed_audio_path and os.path.exists(processed_audio_path):
            os.remove(processed_audio_path)
            print(f"Processed temporary file deleted: {processed_audio_path}")
97 |
|
98 |
# Gradio interface
|
99 |
+
with gr.Blocks() as demo:
|
100 |
+
gr.Markdown("# Kazakh Speech-to-Text")
|
101 |
+
gr.Markdown("Record audio or upload an audio file to transcribe speech in Kazakh using Hugging Face's Inference API.")
|
102 |
+
|
103 |
+
with gr.Row():
|
104 |
+
audio_input = gr.Audio(
|
105 |
+
label="Record or Upload Audio",
|
106 |
+
sources=["microphone", "upload"],
|
107 |
+
type="numpy" # Get audio as a NumPy array for microphone input
|
108 |
+
)
|
109 |
+
|
110 |
+
with gr.Row():
|
111 |
+
transcription_output = gr.Textbox(label="Transcription", lines=4)
|
112 |
+
audio_playback = gr.Audio(label="Playback Audio", visible=True)
|
113 |
+
download_button = gr.File(label="Download Transcription")
|
114 |
+
|
115 |
+
submit_button = gr.Button("Submit")
|
116 |
+
submit_button.click(
|
117 |
+
fn=query,
|
118 |
+
inputs=[audio_input],
|
119 |
+
outputs=[transcription_output, audio_playback, download_button]
|
120 |
+
)
|
121 |
|
122 |
# Launch the app
|
123 |
+
demo.launch()
|