bektim committed on
Commit
865b00a
·
verified ·
1 Parent(s): a585ac9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -28
app.py CHANGED
@@ -1,15 +1,29 @@
1
  import gradio as gr
2
  import requests
3
  import os
 
4
  import numpy as np
5
  import soundfile as sf
6
  from tempfile import NamedTemporaryFile
 
7
 
8
  # Get API token from environment variable
9
  API_TOKEN = os.environ.get("HF_API_TOKEN") # Use your token here
10
  API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large"
11
  headers = {"Authorization": f"Bearer {API_TOKEN}"}
12
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def save_audio_to_tempfile(audio_data, sample_rate):
14
  """Save raw audio data to a temporary WAV file."""
15
  with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
@@ -20,7 +34,7 @@ def query(audio_input):
20
  try:
21
  # Check if input is None (no audio provided)
22
  if audio_input is None:
23
- return "Please provide an audio file or record from the microphone."
24
 
25
  # Handle microphone input (returns a tuple: (sample_rate, audio_data))
26
  if isinstance(audio_input, tuple):
@@ -29,44 +43,81 @@ def query(audio_input):
29
  print(f"Audio data shape: {audio_data.shape}")
30
  audio_path = save_audio_to_tempfile(audio_data, sample_rate)
31
  print(f"Temporary file saved at: {audio_path}")
 
 
 
 
32
  else:
33
- return "Invalid input. Please provide an audio file or record from the microphone."
34
 
35
- # Read the audio file
36
- with open(audio_path, "rb") as f:
37
- data = f.read()
 
 
38
 
39
- # Send the request to the Inference API
40
- response = requests.post(API_URL, headers=headers, data=data)
 
41
 
42
- # Check for errors
43
- if response.status_code != 200:
44
- return f"Error: {response.status_code}, {response.text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- # Return the transcription
47
- return response.json().get("text", "No transcription found in response.")
48
  except Exception as e:
49
- return f"Error during API request: {str(e)}"
50
  finally:
51
- # Clean up the temporary file
52
  if "audio_path" in locals() and os.path.exists(audio_path):
53
  os.remove(audio_path)
54
  print(f"Temporary file deleted: {audio_path}")
 
 
 
55
 
56
  # Gradio interface
57
- interface = gr.Interface(
58
- fn=query,
59
- inputs=gr.Audio(
60
- label="Record from Microphone",
61
- sources=["microphone"], # Only microphone input
62
- type="numpy" # Get audio as a NumPy array
63
- ),
64
- outputs=gr.Textbox(label="Transcription"),
65
- title="Whisper Speech-to-Text (Microphone Only)",
66
- description="Record audio from your microphone to transcribe speech using Hugging Face's Inference API.",
67
- examples=None,
68
- cache_examples=False
69
- )
 
 
 
 
 
 
 
 
 
70
 
71
  # Launch the app
72
- interface.launch()
 
1
  import gradio as gr
2
  import requests
3
  import os
4
+ import time
5
  import numpy as np
6
  import soundfile as sf
7
  from tempfile import NamedTemporaryFile
8
+ import subprocess
9
 
10
  # Get API token from environment variable
11
  API_TOKEN = os.environ.get("HF_API_TOKEN") # Use your token here
12
  API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large"
13
  headers = {"Authorization": f"Bearer {API_TOKEN}"}
14
 
15
def preprocess_audio_with_ffmpeg(input_path, output_path):
    """Resample audio to 16 kHz mono WAV using FFmpeg.

    Whisper expects 16 kHz mono input, so every recording/upload is
    normalized here before being sent to the Inference API.

    Args:
        input_path: Path to the source audio file (any format FFmpeg reads).
        output_path: Path the converted WAV is written to (overwritten if it
            already exists).

    Raises:
        RuntimeError: If FFmpeg exits with a non-zero status; the captured
            stderr is included so the failure is diagnosable from the UI
            error message (query() surfaces str(e) to the user).
        FileNotFoundError: If the ffmpeg executable is not installed.
    """
    command = [
        "ffmpeg",
        "-i", input_path,  # Input file
        "-ar", "16000",    # Resample to 16 kHz
        "-ac", "1",        # Convert to mono
        "-y",              # Overwrite output file if it exists
        output_path,       # Output file
    ]
    # Capture FFmpeg's chatty console output instead of letting it interleave
    # with the app log; on failure, surface stderr in the raised error rather
    # than an opaque CalledProcessError with no detail.
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(
            f"FFmpeg failed with exit code {result.returncode}: "
            f"{result.stderr.strip()}"
        )
27
  def save_audio_to_tempfile(audio_data, sample_rate):
28
  """Save raw audio data to a temporary WAV file."""
29
  with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
 
34
def query(audio_input):
    """Transcribe recorded or uploaded audio via the HF Inference API.

    Args:
        audio_input: Either a ``(sample_rate, numpy_array)`` tuple from the
            Gradio microphone component, a filesystem path string for an
            uploaded file, or ``None`` when nothing was provided.

    Returns:
        A 3-tuple ``(transcription_text, playback_audio_path, download_path)``.
        On any error or missing input, the last two elements are ``None``.
    """
    audio_path = None
    processed_audio_path = None
    is_temp_recording = False  # True when we created audio_path ourselves
    keep_audio = False         # True once audio_path is handed back for playback
    try:
        # Check if input is None (no audio provided)
        if audio_input is None:
            return "Please record audio or upload an audio file.", None, None

        # Handle microphone input (returns a tuple: (sample_rate, audio_data))
        if isinstance(audio_input, tuple):
            sample_rate, audio_data = audio_input
            print(f"Audio data shape: {audio_data.shape}")
            audio_path = save_audio_to_tempfile(audio_data, sample_rate)
            is_temp_recording = True
            print(f"Temporary file saved at: {audio_path}")
        # Handle file upload (returns a file path)
        elif isinstance(audio_input, str):
            audio_path = audio_input
            print(f"Uploaded file path: {audio_path}")
        else:
            return "Invalid input. Please record audio or upload an audio file.", None, None

        # Preprocess the audio using FFmpeg (16 kHz mono, as Whisper expects)
        with NamedTemporaryFile(suffix=".wav", delete=False) as processed_temp_file:
            processed_audio_path = processed_temp_file.name
        preprocess_audio_with_ffmpeg(audio_path, processed_audio_path)
        print(f"Processed audio saved at: {processed_audio_path}")

        # Read the processed audio file
        with open(processed_audio_path, "rb") as f:
            data = f.read()

        # Send the request to the Inference API with retry logic while the
        # hosted model is cold-loading (HTTP 503).
        # NOTE(review): the original passed both ``data=`` and
        # ``json={"language": "kaz"}`` to requests.post; requests ignores
        # ``json`` whenever ``data`` is given, so the language parameter was
        # never actually sent. The raw-audio-bytes endpoint takes no JSON
        # body, so the ineffective kwarg is dropped here.
        max_retries = 5
        retry_delay = 30  # Wait 30 seconds between retries
        for attempt in range(max_retries):
            response = requests.post(
                API_URL,
                headers=headers,
                data=data,
                timeout=120,  # don't hang the UI forever on a stuck request
            )

            if response.status_code == 200:
                transcription = response.json().get("text", "No transcription found in response.")
                # Write the transcription to a real file so the gr.File
                # download component has something to serve (the original
                # returned a filename that was never created on disk).
                with NamedTemporaryFile(
                    mode="w", suffix=".txt", delete=False, encoding="utf-8"
                ) as txt_file:
                    txt_file.write(transcription)
                    transcript_path = txt_file.name
                keep_audio = True  # playback component needs this file alive
                return transcription, audio_path, transcript_path
            elif response.status_code == 503:  # Model is loading
                print(f"Model is loading. Attempt {attempt + 1}/{max_retries}. Retrying in {retry_delay} seconds...")
                if attempt < max_retries - 1:  # no point sleeping after the last try
                    time.sleep(retry_delay)
            else:
                return f"Error: {response.status_code}, {response.text}", None, None

        return "Model is still loading. Please try again later.", None, None
    except Exception as e:
        return f"Error during API request: {str(e)}", None, None
    finally:
        # Clean up the intermediate 16 kHz file; it is never returned.
        if processed_audio_path and os.path.exists(processed_audio_path):
            os.remove(processed_audio_path)
            print(f"Processed temporary file deleted: {processed_audio_path}")
        # Delete our own mic recording only when it is NOT being returned for
        # playback (the original deleted it unconditionally in finally, so the
        # playback component always pointed at a removed file). Uploaded files
        # are Gradio-managed temp files and are left for Gradio to reap.
        if is_temp_recording and not keep_audio and audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
            print(f"Temporary file deleted: {audio_path}")
97
 
98
# Gradio interface: mic/upload input on top, transcription + playback +
# download underneath, wired to query() by a single Submit button.
with gr.Blocks() as demo:
    gr.Markdown("# Kazakh Speech-to-Text")
    gr.Markdown("Record audio or upload an audio file to transcribe speech in Kazakh using Hugging Face's Inference API.")

    with gr.Row():
        audio_in = gr.Audio(
            label="Record or Upload Audio",
            sources=["microphone", "upload"],
            type="numpy",  # microphone input is delivered as a NumPy array
        )

    with gr.Row():
        text_out = gr.Textbox(label="Transcription", lines=4)
        playback_out = gr.Audio(label="Playback Audio", visible=True)
        download_out = gr.File(label="Download Transcription")

    run_button = gr.Button("Submit")
    run_button.click(
        fn=query,
        inputs=[audio_in],
        outputs=[text_out, playback_out, download_out],
    )

# Launch the app
demo.launch()