smajumdar committed
Commit 14db2b1 · 1 Parent(s): 0dfaf95

Finalize HF demo

Signed-off-by: smajumdar <[email protected]>

Files changed (1):
  1. app.py (+149 -22)
app.py CHANGED
@@ -4,23 +4,37 @@ import uuid
 import tempfile
 import subprocess
 import re
+import time
 
 import gradio as gr
 import pytube as pt
 
 import nemo.collections.asr as nemo_asr
+import torch
+
 import speech_to_text_buffered_infer_ctc as buffered_ctc
 import speech_to_text_buffered_infer_rnnt as buffered_rnnt
+from nemo.utils import logging
 
 # Set NeMo cache dir as /tmp
 from nemo import constants
-os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo"
 
+os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo/"
+
+
+SAMPLE_RATE = 16000  # Default sample rate for ASR
+BUFFERED_INFERENCE_DURATION_THRESHOLD = 60.0  # 60 second and above will require chunked inference.
 
-SAMPLE_RATE = 16000
 TITLE = "NeMo ASR Inference on Hugging Face"
 DESCRIPTION = "Demo of all languages supported by NeMo ASR"
 DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"
+DEFAULT_BUFFERED_EN_MODEL = "nvidia/stt_en_conformer_transducer_large"
+
+# Pre-download and cache the model in disk space
+logging.setLevel(logging.ERROR)
+tmp_model = nemo_asr.models.ASRModel.from_pretrained(DEFAULT_BUFFERED_EN_MODEL, map_location='cpu')
+del tmp_model
+logging.setLevel(logging.INFO)
 
 MARKDOWN = f"""
 # {TITLE}
@@ -32,6 +46,13 @@ CSS = """
 p.big {
     font-size: 20px;
 }
+
+/* From https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/blob/main/app.py */
+
+.result {display:flex;flex-direction:column}
+.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%;font-size:20px;}
+.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+.result_item_error {background-color:#ff7070;color:white;align-self:start}
 """
 
 ARTICLE = """
@@ -58,6 +79,9 @@ for info in hf_infos:
 
 SUPPORTED_MODEL_NAMES = sorted(list(SUPPORTED_MODEL_NAMES))
 
+# DEBUG FILTER
+SUPPORTED_MODEL_NAMES = list(filter(lambda x: "en" in x and "conformer_transducer_large" in x, SUPPORTED_MODEL_NAMES))
+
 model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}
 
 SUPPORTED_LANG_MODEL_DICT = {}
@@ -77,6 +101,14 @@ for lang in SUPPORTED_LANG_MODEL_DICT.keys():
     SUPPORTED_LANG_MODEL_DICT[lang] = model_ids
 
 
+def get_device():
+    gpu_available = torch.cuda.is_available()
+    if gpu_available:
+        return torch.cuda.get_device_name()
+    else:
+        return "CPU"
+
+
 def parse_duration(audio_file):
     """
     FFMPEG to calculate durations. Libraries can do it too, but filetypes cause different libraries to behave differently.
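
The parse_duration docstring above only names the approach (shell out to FFMPEG rather than trust format-specific libraries). For illustration, duration probing via ffprobe might look like the sketch below; probe_duration is a hypothetical helper, not code from this commit, and it assumes ffprobe is on PATH alongside FFMPEG.

```python
import json
import subprocess


def probe_duration(audio_file: str) -> float:
    """Return the duration of audio_file in seconds, as reported by ffprobe (illustrative sketch)."""
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        audio_file,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return float(json.loads(result.stdout)["format"]["duration"])
```
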
@@ -108,7 +140,7 @@ def resolve_model_type(model_name: str) -> str:
         return 'ctc'
 
     # Model specific maps
-    elif 'jasper' in model_name:
+    if 'jasper' in model_name:
         return 'ctc'
     elif 'quartznet' in model_name:
         return 'ctc'
@@ -116,9 +148,8 @@ def resolve_model_type(model_name: str) -> str:
         return 'ctc'
     elif 'contextnet' in model_name:
         return 'ctc'
-    else:
-        # Unknown model type
-        return None
+
+    return None
 
 
 def resolve_model_stride(model_name) -> int:
@@ -185,6 +216,16 @@ def extract_result_from_manifest(filepath, model_name) -> (bool, str):
     return False, f"Could not perform inference on model with name : {model_name}"
 
 
+def build_html_output(s: str, style: str = "result_item_success"):
+    return f"""
+    <div class='result'>
+        <div class='result_item {style}'>
+            {s}
+        </div>
+    </div>
+    """
+
+
 def infer_audio(model_name: str, audio_file: str) -> str:
     """
     Main method that switches from HF inference for small audio files to Buffered CTC/RNNT mode for long audio files.
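
The build_html_output helper added above simply wraps a message in the result / result_item CSS classes introduced earlier in this commit. A brief usage sketch (illustrative only; the exact whitespace of the returned HTML follows the f-string above):

```python
ok_html = build_html_output("Transcription Time : 1.234 s")
# roughly: <div class='result'><div class='result_item result_item_success'> ... </div></div>

err_html = build_html_output(
    "Error:- Failed to convert audio file to wav.", style="result_item_error"
)
```
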
@@ -195,17 +236,18 @@ def infer_audio(model_name: str, audio_file: str) -> str:
 
     Returns:
         str which is the transcription if successful.
+        str which is HTML output of logs.
     """
     # Parse the duration of the audio file
     duration = parse_duration(audio_file)
 
-    if duration > 60.0:  # Longer than one minute; use buffered mode
+    if duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:  # Longer than one minute; use buffered mode
         # Process audio to be of wav type (possible youtube audio)
         audio_file = convert_audio(audio_file)
 
         # If audio file transcoding failed, let user know
         if audio_file is None:
-            return "Failed to convert audio file to wav."
+            return "Error:- Failed to convert audio file to wav."
 
         # Extract audio dir from resolved audio filepath
         audio_dir = os.path.split(audio_file)[0]
@@ -214,7 +256,7 @@ def infer_audio(model_name: str, audio_file: str) -> str:
         model_stride = resolve_model_stride(model_name)
 
         if model_stride < 0:
-            return f"Failed to compute the model stride for model with name : {model_name}"
+            return f"Error:- Failed to compute the model stride for model with name : {model_name}"
 
         # Process model type (CTC/RNNT/Hybrid)
         model_type = resolve_model_type(model_name)
@@ -266,7 +308,7 @@ def infer_audio(model_name: str, audio_file: str) -> str:
                 pass
 
             if RESULT is None:
-                return f"Could not parse model type; failed to perform inference with model {model_name}!"
+                return f"Error:- Could not parse model type; failed to perform inference with model {model_name}!"
 
         elif model_type == 'ctc':
 
@@ -303,9 +345,10 @@ def infer_audio(model_name: str, audio_file: str) -> str:
             return extract_result_from_manifest('output.json', model_name)[-1]
 
         else:
-            return f"Could not parse model type; failed to perform inference with model {model_name}!"
+            return f"Error:- Could not parse model type; failed to perform inference with model {model_name}!"
 
     else:
+        # Obtain Gradio Model function from cache of models
         if model_name in model_dict:
             model = model_dict[model_name]
         else:
@@ -317,7 +360,7 @@ def infer_audio(model_name: str, audio_file: str) -> str:
             return transcriptions
         else:
             error = (
-                f"Could not find model {model_name} in list of available models : "
+                f"Error:- Could not find model {model_name} in list of available models : "
                 f"{list([k for k in model_dict.keys()])}"
             )
             return error
@@ -334,30 +377,60 @@ def transcribe(microphone, audio_file, model_name):
         audio_data = microphone
 
     elif (microphone is None) and (audio_file is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
+        warn_output = "ERROR: You have to either use the microphone or upload an audio file"
 
     elif microphone is not None:
         audio_data = microphone
     else:
         audio_data = audio_file
 
+    time_diff = None
     try:
         # Use HF API for transcription
+        start = time.time()
         transcriptions = infer_audio(model_name, audio_data)
+        end = time.time()
+        time_diff = end - start
 
     except Exception as e:
         transcriptions = ""
-        warn_output = warn_output + "\n\n"
+        warn_output = warn_output
+
+        if warn_output != "":
+            warn_output += "<br><br>"
+
         warn_output += (
             f"The model `{model_name}` is currently loading and cannot be used "
-            f"for transcription.\n"
+            f"for transcription.<br>"
             f"Please try another model or wait a few minutes."
         )
 
-    return warn_output + transcriptions
+    # Built HTML output
+    if warn_output != "":
+        html_output = build_html_output(warn_output, style="result_item_error")
+    else:
+        if transcriptions.startswith("Error:-"):
+            html_output = build_html_output(transcriptions, style="result_item_error")
+        else:
+            audio_duration = parse_duration(audio_data)
+
+            output = f"Successfully transcribed on {get_device()} ! <br>" f"Transcription Time : {time_diff: 0.3f} s"
+
+            if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
+                output += f""" <br><br>
+                Note: Audio duration was {audio_duration: 0.3f} s, so model had to be downloaded, initialized, and then
+                buffered inference was used. <br>
+
+                Please rerun again in order to measure the time taken for just inference with pre-downloaded model. <br>
+                """
+
+            html_output = build_html_output(output)
+
+    return transcriptions, html_output
 
 
 def _return_yt_html_embed(yt_url):
+    """ Obtained from https://huggingface.co/spaces/whisper-event/whisper-demo """
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
@@ -367,6 +440,7 @@ def _return_yt_html_embed(yt_url):
 
 
 def yt_transcribe(yt_url, model_name):
+    """ Modified from https://huggingface.co/spaces/whisper-event/whisper-demo """
     yt = pt.YouTube(yt_url)
     html_embed_str = _return_yt_html_embed(yt_url)
 
@@ -374,15 +448,57 @@ def yt_transcribe(yt_url, model_name):
     file_uuid = str(uuid.uuid4().hex)
     file_uuid = f"{tempdir}/{file_uuid}.mp3"
 
+    # Download YT Audio temporarily
+    download_time_start = time.time()
+
     stream = yt.streams.filter(only_audio=True)[0]
     stream.download(filename=file_uuid)
 
+    download_time_end = time.time()
+
+    # Get audio duration
+    audio_duration = parse_duration(file_uuid)
+
+    # Perform transcription
+    infer_time_start = time.time()
+
     text = infer_audio(model_name, file_uuid)
 
-    return html_embed_str, text
+    infer_time_end = time.time()
+
+    if text.startswith("Error:-"):
+        html_output = build_html_output(text, style='result_item_error')
+    else:
+        html_output = f"""
+        Successfully transcribed on {get_device()} ! <br>
+        Audio Download Time : {download_time_end - download_time_start: 0.3f} s <br>
+        Transcription Time : {infer_time_end - infer_time_start: 0.3f} s <br>
+        """
+
+        if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
+            html_output += f""" <br>
+            Note: Audio duration was {audio_duration: 0.3f} s, so model had to be downloaded, initialized, and then
+            buffered inference was used. <br>
+
+            Please rerun again in order to measure the time taken for just inference with pre-downloaded model. <br>
+            """
+
+        html_output = build_html_output(html_output)
+
+    return text, html_embed_str, html_output
 
 
 def create_lang_selector_component(default_en_model=DEFAULT_EN_MODEL):
+    """
+    Utility function to select a langauge from a dropdown menu, and simultanously update another dropdown
+    containing the corresponding model checkpoints for that language.
+
+    Args:
+        default_en_model: str name of a default english model that should be the set default.
+
+    Returns:
+        Gradio components for lang_selector (Dropdown menu) and models_in_lang (Dropdown menu)
+    """
     lang_selector = gr.components.Dropdown(
         choices=sorted(list(SUPPORTED_LANGUAGES)), value="en", type="value", label="Languages", interactive=True,
     )
@@ -406,6 +522,9 @@ def create_lang_selector_component(default_en_model=DEFAULT_EN_MODEL):
     return lang_selector, models_in_lang
 
 
+"""
+Define the GUI
+"""
 demo = gr.Blocks(title=TITLE, css=CSS)
 
 with demo:
@@ -419,9 +538,12 @@ with demo:
         lang_selector, models_in_lang = create_lang_selector_component()
 
         transcript = gr.components.Label(label='Transcript')
+        audio_html_output = gr.components.HTML()
 
         run = gr.components.Button('Transcribe')
-        run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])
+        run.click(
+            transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript, audio_html_output]
+        )
 
     with gr.Tab("Transcribe Youtube"):
         yt_url = gr.components.Textbox(
@@ -429,14 +551,19 @@ with demo:
         )
 
         lang_selector_yt, models_in_lang_yt = create_lang_selector_component(
-            default_en_model='nvidia/stt_en_conformer_transducer_large'
+            default_en_model=DEFAULT_BUFFERED_EN_MODEL
         )
 
-        embedded_video = gr.components.HTML()
+        with gr.Row():
+            run = gr.components.Button('Transcribe YouTube')
+            embedded_video = gr.components.HTML()
+
         transcript = gr.components.Label(label='Transcript')
+        yt_html_output = gr.components.HTML()
 
-        run = gr.components.Button('Transcribe YouTube')
-        run.click(yt_transcribe, inputs=[yt_url, models_in_lang_yt], outputs=[embedded_video, transcript])
+        run.click(
+            yt_transcribe, inputs=[yt_url, models_in_lang_yt], outputs=[transcript, embedded_video, yt_html_output]
+        )
 
     gr.components.HTML(ARTICLE)
 
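
The diff does not touch the app's entry point, which presumably lives in an unchanged part of app.py. For context, a Gradio Blocks demo such as this one is typically started with a pattern like the following; this is an assumption for illustration, not code from this commit.

```python
if __name__ == "__main__":
    demo.queue()   # serialize requests so long transcriptions do not overlap
    demo.launch()  # start the Gradio server for the Space
```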