Spaces:

Sunbird
/

sb-mms-inference

Sleeping

App Files Files Community

akera committed on Feb 20, 2024

Commit

d4afb45

verified ·

1 Parent(s): 361f06d

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -28

app.py CHANGED Viewed

@@ -5,52 +5,73 @@ import librosa
 import json
 import os
 import huggingface_hub
 # with open('ISO_codes.json', 'r') as file:
 #     iso_codes = json.load(file)
-# Language codes defined for this version of the app (their use is outside this hunk).
-languages = ["lug", "ach", "nyn", "teo"]
 # Hugging Face token read from the HF_TOKEN environment variable (None when unset).
 auth_token = os.environ.get("HF_TOKEN")
-# Log this process in to the Hugging Face Hub with that token.
-from huggingface_hub import login
-login(token=auth_token)
-# Load the CTC model and its processor once, at import time; both loads pass the token.
-model_id = "Sunbird/sunbird-mms"
-model = Wav2Vec2ForCTC.from_pretrained(model_id, use_auth_token=auth_token)
-processor = Wav2Vec2Processor.from_pretrained(model_id, use_auth_token=auth_token)
-def transcribe(audio_file_mic=None, audio_file_upload=None, language="Luganda (lug)"):
-    """Transcribe a recorded or uploaded audio file with the module-level model.
-
-    Args:
-        audio_file_mic: Audio file handed to ``librosa.load``; used when truthy.
-        audio_file_upload: Audio file used when ``audio_file_mic`` is falsy.
-        language: Passed unchanged to ``set_target_lang`` and ``load_adapter``.
-            NOTE(review): the default "Luganda (lug)" does not match the
-            "lug"-style codes in ``languages`` - confirm it is a valid value.
-
-    Returns:
-        The decoded transcription, or a prompt message when neither audio
-        argument is provided.
-    """
-    # The microphone recording takes priority over the uploaded file.
-    if audio_file_mic:
-        audio_file = audio_file_mic
-    elif audio_file_upload:
-        audio_file = audio_file_upload
-    else:
-        return "Please upload an audio file or record one"
-    # Make sure audio is 16kHz
-    speech, sample_rate = librosa.load(audio_file)
-    if sample_rate != 16000:
-        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
-    # Keep the same model in memory and simply switch out the language adapters by calling load_adapter() for the model and set_target_lang() for the tokenizer
-    language_code = language
-    processor.tokenizer.set_target_lang(language_code)
-    model.load_adapter(language_code)
-    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
-    # Inference only: run the forward pass without tracking gradients.
-    with torch.no_grad():
-        outputs = model(**inputs).logits
-    # Take the highest-scoring token id at each position of the first (only) item.
-    ids = torch.argmax(outputs, dim=-1)[0]
-    transcription = processor.decode(ids)
-    return transcription
 description = '''ASR with salt-mms'''
-iface = gr.Interface(fn=transcribe,
                      inputs=[
                          gr.Audio(source="microphone", type="filepath", label="Record Audio"),
                          gr.Audio(source="upload", type="filepath", label="Upload Audio"),

 import json
 import os
 import huggingface_hub
+from transformers import pipeline
 # with open('ISO_codes.json', 'r') as file:
 #     iso_codes = json.load(file)
+# languages = ["lug", "ach", "nyn", "teo"]
 auth_token = os.environ.get("HF_TOKEN")
+# Display name -> language code for every language the app can transcribe.
+target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
+languages = list(target_lang_options)
+# `target_lang` used to be read here without ever being assigned, so importing
+# the module raised NameError. Keep any value already in scope; otherwise fall
+# back to Luganda, the default language of the commented-out `transcribe`.
+target_lang = globals().get("target_lang", "Luganda")
+target_lang_code = target_lang_options[target_lang]
+# English is served by facebook/mms-1b-all; every other language by Sunbird/sunbird-mms.
+model_id = "facebook/mms-1b-all" if target_lang_code == "eng" else "Sunbird/sunbird-mms"
+# Transcribe audio using custom model
+def transcribe_audio(input_file, target_lang_code,
+                      device, model_id=model_id,
+                      chunk_length_s=10, stride_length_s=(4, 2), return_timestamps="word"):
+    """Transcribe an audio file with a speech-recognition pipeline.
+
+    Args:
+        input_file: Path to an audio file (what ``gr.Audio(type="filepath")``
+            supplies) or a binary file-like object exposing ``read()``.
+        target_lang_code: Language code given to the tokenizer and used to
+            select the model adapter, e.g. ``"lug"``.
+        device: Device argument forwarded to ``pipeline``.
+        model_id: Hub id of the model to load; defaults to the module-level choice.
+        chunk_length_s: Forwarded to the pipeline call.
+        stride_length_s: Forwarded to the pipeline call.
+        return_timestamps: Forwarded to the pipeline call.
+
+    Returns:
+        Whatever the pipeline returns for the given audio.
+    """
+    # The Hub token lives in the module-level `auth_token`; the previously used
+    # name `hf_auth_token` is not defined anywhere and raised NameError.
+    pipe = pipeline(model=model_id, device=device, token=auth_token)
+    pipe.tokenizer.set_target_lang(target_lang_code)
+    pipe.model.load_adapter(target_lang_code)
+    # Accept an open binary file as before, but also a plain path: a str has no
+    # `.read()`, and the pipeline can load a local file from its filename.
+    if hasattr(input_file, "read"):
+        audio_data = input_file.read()
+    else:
+        audio_data = os.fspath(input_file)
+    output = pipe(audio_data, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
+    return output
+# def transcribe(audio_file_mic=None, audio_file_upload=None, language="Luganda (lug)"):
+#     if audio_file_mic:
+#         audio_file = audio_file_mic
+#     elif audio_file_upload:
+#         audio_file = audio_file_upload
+#     else:
+#         return "Please upload an audio file or record one"
+#     # Make sure audio is 16kHz
+#     speech, sample_rate = librosa.load(audio_file)
+#     if sample_rate != 16000:
+#         speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
+#     # Keep the same model in memory and simply switch out the language adapters by calling load_adapter() for the model and set_target_lang() for the tokenizer
+#     language_code = language
+#     processor.tokenizer.set_target_lang(language_code)
+#     model.load_adapter(language_code)
+#     inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
+#     with torch.no_grad():
+#         outputs = model(**inputs).logits
+#     ids = torch.argmax(outputs, dim=-1)[0]
+#     transcription = processor.decode(ids)
+#     return transcription
 description = '''ASR with salt-mms'''
+iface = gr.Interface(fn=transcribe_audio,
                      inputs=[
                          gr.Audio(source="microphone", type="filepath", label="Record Audio"),
                          gr.Audio(source="upload", type="filepath", label="Upload Audio"),