Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,29 @@
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
import os
|
|
|
4 |
import numpy as np
|
5 |
import soundfile as sf
|
6 |
from tempfile import NamedTemporaryFile
|
|
|
7 |
|
8 |
# Get API token from environment variable
|
9 |
API_TOKEN = os.environ.get("HF_API_TOKEN") # Use your token here
|
10 |
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large"
|
11 |
headers = {"Authorization": f"Bearer {API_TOKEN}"}
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def save_audio_to_tempfile(audio_data, sample_rate):
|
14 |
"""Save raw audio data to a temporary WAV file."""
|
15 |
with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
@@ -20,7 +34,7 @@ def query(audio_input):
|
|
20 |
try:
|
21 |
# Check if input is None (no audio provided)
|
22 |
if audio_input is None:
|
23 |
-
return "Please
|
24 |
|
25 |
# Handle microphone input (returns a tuple: (sample_rate, audio_data))
|
26 |
if isinstance(audio_input, tuple):
|
@@ -29,44 +43,81 @@ def query(audio_input):
|
|
29 |
print(f"Audio data shape: {audio_data.shape}")
|
30 |
audio_path = save_audio_to_tempfile(audio_data, sample_rate)
|
31 |
print(f"Temporary file saved at: {audio_path}")
|
|
|
|
|
|
|
|
|
32 |
else:
|
33 |
-
return "Invalid input. Please
|
34 |
|
35 |
-
#
|
36 |
-
with
|
37 |
-
|
|
|
|
|
38 |
|
39 |
-
#
|
40 |
-
|
|
|
41 |
|
42 |
-
#
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
return response.json().get("text", "No transcription found in response.")
|
48 |
except Exception as e:
|
49 |
-
return f"Error during API request: {str(e)}"
|
50 |
finally:
|
51 |
-
# Clean up the temporary
|
52 |
if "audio_path" in locals() and os.path.exists(audio_path):
|
53 |
os.remove(audio_path)
|
54 |
print(f"Temporary file deleted: {audio_path}")
|
|
|
|
|
|
|
55 |
|
56 |
# Gradio interface
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
# Launch the app
|
72 |
-
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
import os
|
4 |
+
import time
|
5 |
import numpy as np
|
6 |
import soundfile as sf
|
7 |
from tempfile import NamedTemporaryFile
|
8 |
+
import subprocess
|
# --- Hugging Face Inference API configuration ---
# The token is read from the environment; never hard-code credentials.
API_TOKEN = os.environ.get("HF_API_TOKEN")
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large"
headers = {"Authorization": f"Bearer {API_TOKEN}"}
14 |
|
15 |
+
def preprocess_audio_with_ffmpeg(input_path, output_path):
    """Resample audio to 16 kHz mono WAV using FFmpeg.

    Parameters
    ----------
    input_path : str
        Path to the source audio file.
    output_path : str
        Destination WAV path; overwritten if it already exists.

    Raises
    ------
    subprocess.CalledProcessError
        If FFmpeg exits with a non-zero status (``check=True``).
    FileNotFoundError
        If the ``ffmpeg`` binary is not on PATH.
    """
    command = [
        "ffmpeg",
        "-hide_banner",          # suppress FFmpeg's version banner
        "-loglevel", "error",    # only surface real errors in the logs
        "-i", input_path,        # input file
        "-ar", "16000",          # resample to 16 kHz (Whisper's expected rate)
        "-ac", "1",              # downmix to mono
        "-y",                    # overwrite output file if it exists
        output_path,             # output file
    ]
    # argv-list form (shell=False) avoids shell injection on user-supplied paths.
    subprocess.run(command, check=True)
|
26 |
+
|
27 |
def save_audio_to_tempfile(audio_data, sample_rate):
|
28 |
"""Save raw audio data to a temporary WAV file."""
|
29 |
with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
|
|
def query(audio_input):
    """Transcribe audio with the Hugging Face Whisper Inference API.

    Parameters
    ----------
    audio_input : tuple | str | None
        Microphone input as ``(sample_rate, numpy_samples)`` (Gradio
        ``type="numpy"``) or an uploaded file path; ``None`` when no audio
        was provided.

    Returns
    -------
    tuple[str, str | None, str | None]
        ``(transcription_or_error, playback_audio_path, transcript_file_path)``.
    """
    audio_path = None
    processed_audio_path = None
    try:
        # Check if input is None (no audio provided)
        if audio_input is None:
            return "Please record audio or upload an audio file.", None, None

        # Handle microphone input (returns a tuple: (sample_rate, audio_data))
        if isinstance(audio_input, tuple):
            # NOTE(review): this unpack was elided in the diff; the order is
            # implied by the uses of sample_rate/audio_data below — confirm.
            sample_rate, audio_data = audio_input
            print(f"Audio data shape: {audio_data.shape}")
            audio_path = save_audio_to_tempfile(audio_data, sample_rate)
            print(f"Temporary file saved at: {audio_path}")
        # Handle file upload (returns a file path)
        elif isinstance(audio_input, str):
            audio_path = audio_input
            print(f"Uploaded file path: {audio_path}")
        else:
            return "Invalid input. Please record audio or upload an audio file.", None, None

        # Preprocess the audio using FFmpeg (16 kHz mono WAV)
        with NamedTemporaryFile(suffix=".wav", delete=False) as processed_temp_file:
            processed_audio_path = processed_temp_file.name
        preprocess_audio_with_ffmpeg(audio_path, processed_audio_path)
        print(f"Processed audio saved at: {processed_audio_path}")

        # Read the processed audio file
        with open(processed_audio_path, "rb") as f:
            data = f.read()

        # Send the request to the Inference API with retry logic
        max_retries = 5
        retry_delay = 30  # seconds between retries while the model loads
        for attempt in range(max_retries):
            # BUG FIX: the original passed both data= and json= to
            # requests.post; requests ignores json when data is given, so
            # the {"language": "kaz"} payload was never sent. Send only the
            # raw audio bytes (the API does not accept a JSON body alongside
            # binary audio anyway).
            response = requests.post(API_URL, headers=headers, data=data)

            if response.status_code == 200:
                transcription = response.json().get("text", "No transcription found in response.")
                # BUG FIX: gr.File needs a path to a real file; the original
                # returned a fabricated filename that did not exist on disk.
                with NamedTemporaryFile(
                    mode="w", suffix=".txt", delete=False, encoding="utf-8"
                ) as txt_file:
                    txt_file.write(transcription)
                    transcript_path = txt_file.name
                return transcription, audio_path, transcript_path
            elif response.status_code == 503:  # Model is loading
                print(f"Model is loading. Attempt {attempt + 1}/{max_retries}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return f"Error: {response.status_code}, {response.text}", None, None

        return "Model is still loading. Please try again later.", None, None
    except Exception as e:
        return f"Error during API request: {str(e)}", None, None
    finally:
        # BUG FIX: only delete the intermediate FFmpeg output. The original
        # also deleted audio_path — the very file returned to the UI for
        # playback, and, for uploads, the user's own file.
        if processed_audio_path and os.path.exists(processed_audio_path):
            os.remove(processed_audio_path)
            print(f"Processed temporary file deleted: {processed_audio_path}")
97 |
|
98 |
# Gradio interface
|
99 |
+
with gr.Blocks() as demo:
|
100 |
+
gr.Markdown("# Kazakh Speech-to-Text")
|
101 |
+
gr.Markdown("Record audio or upload an audio file to transcribe speech in Kazakh using Hugging Face's Inference API.")
|
102 |
+
|
103 |
+
with gr.Row():
|
104 |
+
audio_input = gr.Audio(
|
105 |
+
label="Record or Upload Audio",
|
106 |
+
sources=["microphone", "upload"],
|
107 |
+
type="numpy" # Get audio as a NumPy array for microphone input
|
108 |
+
)
|
109 |
+
|
110 |
+
with gr.Row():
|
111 |
+
transcription_output = gr.Textbox(label="Transcription", lines=4)
|
112 |
+
audio_playback = gr.Audio(label="Playback Audio", visible=True)
|
113 |
+
download_button = gr.File(label="Download Transcription")
|
114 |
+
|
115 |
+
submit_button = gr.Button("Submit")
|
116 |
+
submit_button.click(
|
117 |
+
fn=query,
|
118 |
+
inputs=[audio_input],
|
119 |
+
outputs=[transcription_output, audio_playback, download_button]
|
120 |
+
)
|
121 |
|
122 |
# Launch the app
|
123 |
+
demo.launch()
|