allandclive committed on
Commit
4aeda1d
·
1 Parent(s): 5c0f6b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -40
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import torch
3
  import librosa
4
- import numpy as np
5
  from transformers import pipeline
6
  from stitched_model import CombinedModel
7
 
@@ -10,34 +9,7 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
10
  # Load the model
11
  model = CombinedModel("facebook/mms-1b-all", "Sunbird/sunbird-mul-en-mbart-merged", device=device)
12
 
13
def preprocess_audio(audio_file):
    """Load an audio file as 16 kHz mono and split it into 10-second chunks.

    The last chunk is zero-padded on the right so every chunk has exactly
    the same number of samples.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to load.

    Returns
    -------
    list of numpy.ndarray
        Equal-length chunks of the waveform.
    """
    speech, _sample_rate = librosa.load(audio_file, sr=16000, mono=True)
    chunk_size = 10 * 16000  # 10 seconds of samples at 16 kHz
    # np.pad with a zero-width pad is a no-op for full-size chunks, so a
    # single expression handles both the interior and the trailing chunk.
    return [
        np.pad(speech[start:start + chunk_size],
               (0, chunk_size - len(speech[start:start + chunk_size])))
        for start in range(0, len(speech), chunk_size)
    ]
24
-
25
def transcribe(chunks):
    """Run each audio chunk through the combined ASR + translation model.

    Per-chunk transcriptions are concatenated with no separator; per-chunk
    translations are joined with single spaces.

    Parameters
    ----------
    chunks : iterable
        Fixed-length waveform chunks (as produced by ``preprocess_audio``).

    Returns
    -------
    tuple
        ``(transcription, translation)`` as two strings.
    """
    transcribed_parts = []
    translated_parts = []
    # Inference only — disable autograd for the whole loop.
    with torch.no_grad():
        for chunk in chunks:
            batch = torch.tensor([chunk])  # add a batch dimension
            text, translated = model({"audio": batch})
            transcribed_parts.append(text)
            # model returns the translation wrapped in a sequence; keep
            # only its first element, as the original code did.
            translated_parts.append(translated[0])
    return "".join(transcribed_parts), " ".join(translated_parts)
39
-
40
def process_audio(audio_file_mic=None, audio_file_upload=None):
    """Gradio entry point: transcribe and translate whichever audio was given.

    The microphone recording takes precedence over an uploaded file.

    Parameters
    ----------
    audio_file_mic : str, optional
        Path to audio recorded from the microphone.
    audio_file_upload : str, optional
        Path to an uploaded audio file.

    Returns
    -------
    tuple or str
        ``(transcription, translation)`` on success, or an error message
        string when neither input was provided.
    """
    audio_file = audio_file_mic or audio_file_upload
    if not audio_file:
        return "Please upload an audio file or record one"

    chunks = preprocess_audio(audio_file)
    transcription, translation = transcribe(chunks)
    return transcription, translation
51
 
52
# Text shown above the interface in the UI.
description = '''Luganda to English Speech Translation'''

# Two optional audio sources feed the same handler; both outputs are plain
# text boxes (transcription first, then translation).
audio_inputs = [
    gr.Audio(source="microphone", type="filepath", label="Record Audio"),
    gr.Audio(source="upload", type="filepath", label="Upload Audio"),
]
text_outputs = [
    gr.Textbox(label="Transcription"),
    gr.Textbox(label="Translation"),
]

iface = gr.Interface(
    fn=process_audio,
    inputs=audio_inputs,
    outputs=text_outputs,
    description=description,
)
iface.launch()
 
1
  import gradio as gr
2
  import torch
3
  import librosa
 
4
  from transformers import pipeline
5
  from stitched_model import CombinedModel
6
 
 
9
  # Load the model
10
  model = CombinedModel("facebook/mms-1b-all", "Sunbird/sunbird-mul-en-mbart-merged", device=device)
11
 
12
def transcribe(audio_file_mic=None, audio_file_upload=None):
    """Transcribe Luganda speech and translate it to English.

    The microphone recording takes precedence over an uploaded file. The
    whole clip is fed to the model in a single pass (no chunking).

    Parameters
    ----------
    audio_file_mic : str, optional
        Path to audio recorded from the microphone.
    audio_file_upload : str, optional
        Path to an uploaded audio file.

    Returns
    -------
    tuple
        ``(transcription, translation)``. On the no-input error path a
        message and an empty string are returned so that BOTH Gradio
        output components receive a value.
    """
    if audio_file_mic:
        audio_file = audio_file_mic
    elif audio_file_upload:
        audio_file = audio_file_upload
    else:
        # Bug fix: the Interface declares two outputs, so this path must
        # also yield two values — the original returned a lone string,
        # which Gradio cannot map onto two Textbox components.
        return "Please upload an audio file or record one", ""

    # Load the audio at the 16 kHz mono rate the model expects.
    speech, sample_rate = librosa.load(audio_file, sr=16000, mono=True)

    # Process the audio and perform transcription (inference only, so
    # autograd is disabled).
    speech_tensor = torch.tensor([speech])  # add a batch dimension
    with torch.no_grad():
        transcription, translation = model({"audio": speech_tensor})

    return transcription, translation
29
 
30
# Text shown above the interface in the UI.
description = '''Luganda to English Speech Translation'''

# Two optional audio sources feed the same handler; both outputs are plain
# text boxes (transcription first, then translation).
audio_inputs = [
    gr.Audio(source="microphone", type="filepath", label="Record Audio"),
    gr.Audio(source="upload", type="filepath", label="Upload Audio"),
]
text_outputs = [
    gr.Textbox(label="Transcription"),
    gr.Textbox(label="Translation"),
]

iface = gr.Interface(
    fn=transcribe,
    inputs=audio_inputs,
    outputs=text_outputs,
    description=description,
)
iface.launch()