Update app.py
app.py CHANGED
@@ -59,16 +59,34 @@ LANGUANGE_MAP = {
 }
 
 
-processor = WhisperProcessor.from_pretrained(model_id)
-model = WhisperForConditionalGeneration.from_pretrained(model_id)
-model.eval()
-model.to(device)
-
-
-bos_token_id = processor.tokenizer.all_special_ids[-106]
-decoder_input_ids = torch.tensor([bos_token_id]).to(device)
+from pytube import YouTube
+import whisper
+
+# define function for transcription
+def transcribe(Microphone, File_Upload):
+    warn_output = ""
+    if (Microphone is not None) and (File_Upload is not None):
+        warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
+                      "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+        file = Microphone
+
+    elif (Microphone is None) and (File_Upload is None):
+        return "ERROR: You have to either use the microphone or upload an audio file"
+
+    elif Microphone is not None:
+        file = Microphone
+    else:
+        file = File_Upload
+
+    language = None
+
+    options = whisper.DecodingOptions(without_timestamps=True)
+
+    loaded_model = whisper.load_model("base")
+    transcript = loaded_model.transcribe(file, language=language)
+
+    return detect_language(transcript["text"])
 
 
 def detect_language(sentence):
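This hunk swaps the transformers-based Whisper setup for the standalone openai-whisper package, plus a pytube import that the shown lines never use. Below is a minimal sketch of the new pieces end to end; it assumes `pip install pytube openai-whisper` and ffmpeg on PATH, and the YouTube wiring is a guess rather than code from app.py:

from pytube import YouTube
import whisper

# pytube's usual audio-only download pattern (the YouTube import is not
# exercised anywhere in this diff, so this wiring is illustrative only).
yt = YouTube("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # any video URL
audio_path = yt.streams.filter(only_audio=True).first().download(filename="yt_audio.mp4")

# The same call pattern the new transcribe() uses internally:
# language=None lets whisper auto-detect the spoken language.
model = whisper.load_model("base")
result = model.transcribe(audio_path, language=None)
print(result["text"])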
@@ -80,7 +98,18 @@ def detect_language(sentence):
     predictions = torch.nn.functional.softmax(output.logits, dim=-1)
     probability, pred_idx = torch.max(predictions, dim=-1)
     language = LANGUANGE_MAP[pred_idx.item()]
-    return language, probability.item()
+    return sentence, language, probability.item()
+
+
+"""
+processor = WhisperProcessor.from_pretrained(model_id)
+model = WhisperForConditionalGeneration.from_pretrained(model_id)
+model.eval()
+model.to(device)
+
+
+bos_token_id = processor.tokenizer.all_special_ids[-106]
+decoder_input_ids = torch.tensor([bos_token_id]).to(device)
 
 
 def process_audio_file(file, sampling_rate):
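detect_language now returns the input sentence alongside the prediction. The scoring itself is the standard softmax-then-argmax over classifier logits; a self-contained illustration with made-up numbers (the logits here are dummies, not app.py's):

import torch

# Dummy logits for a 3-way classifier; values are invented for illustration.
logits = torch.tensor([[1.2, 0.3, 2.8]])
predictions = torch.nn.functional.softmax(logits, dim=-1)
probability, pred_idx = torch.max(predictions, dim=-1)
print(pred_idx.item(), probability.item())  # 2 0.7788... -> class 2 wins with ~78% confidence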
@@ -123,7 +152,7 @@ def transcribe(Microphone, File_Upload):
     language, probability = detect_language(transcription)
 
     return transcription.capitalize(), language, probability
-
+"""
 
 examples=['sample1.mp3', 'sample2.mp3', 'sample3.mp3']
 examples = [[f"./{f}"] for f in examples]
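The examples list presumably feeds a Gradio interface further down in app.py, outside this diff. A hedged sketch of how the new transcribe() might be wired up, assuming Gradio 3.x; the component arguments are guesses, not the file's actual code:

import gradio as gr

# Hypothetical wiring consistent with transcribe(Microphone, File_Upload)
# and the examples list above; the real interface is not shown in this diff.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Microphone"),
        gr.Audio(source="upload", type="filepath", label="File_Upload"),
    ],
    outputs=["text", "text", "number"],
    examples=examples,
).launch()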