import gradio as gr
import librosa

from asr.run_asr import run_asr_inference, load_asr_model
from nlu.run_nlu import run_nlu_inference, load_nlu_model
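# Cascaded SLU: the ASR model transcribes French speech, and the NLU model
# predicts the intent and slot labels from the transcription.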

############### strings
mhubert_link = '[mHuBERT-147 model](https://huggingface.co/utter-project/mHuBERT-147)'
massive_link = '[Speech-MASSIVE dataset](https://huggingface.co/datasets/FBK-MT/Speech-MASSIVE)'
blog_post_link = '[blog post](https://huggingface.co/blog/mzboito/naver-demo-french-slu)'
title = "# DEMO: French Spoken Language Understanding using mHuBERT-147 and Speech-MASSIVE"
description = [
    f"""
    **Interspeech 2024 DEMO.** Cascaded SLU using {mhubert_link} and {massive_link} components.
    """,
    """This demo runs on a CPU node, so you may experience lag depending on network latency and the number of concurrent requests.""",
    """**This demo currently does not support Safari. Please access it using a Chrome browser.**""",
    f"""For more details on the implementation, check our {blog_post_link}.""",
]

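# Example utterances from the Speech-MASSIVE test set, with their gold
# transcriptions, slot labels, and intents (parallel lists, one entry per file).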
examples = [
    "resources/audios/utt_286.wav",
    "resources/audios/utt_2414.wav",
    "resources/audios/utt_16032.wav",
    "resources/audios/utt_3060.wav",
    "resources/audios/utt_1264.wav",
    "resources/audios/utt_9912.wav",
    "resources/audios/utt_14684.wav",
    "resources/audios/utt_5410.wav",
]
transcriptions = [
    "allume les lumières dans la cuisine",
    "je veux commander une pizza chez michael's pizza",
    "veuillez envoyer un e-mail à sally concernant la réunion de demain",
    "quelles sont les nouvelles de newsource",
    "mon réveil est-il réglé pour demain matin",
    "olly combien de temps dois-je faire bouillir les oeufs",
    "qui est le premier ministre de russie",
    "lis moi les derniers gros titres du new york times"
]
intents = [
    "iot_hue_lighton",
    "takeaway_order",
    "email_sendemail",
    "news_query",
    "alarm_query",
    "cooking_recipe",
    "qa_factoid",
    "news_query"
]
slots = [
    ["Other", "Other", "Other", "Other", "Other", "house_place"],
    ["Other", "Other", "Other", "Other", "food_type", "Other", "business_name", "business_name"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "person", "Other", "Other", "event_name", "Other", "date"],
    ["Other", "Other", "Other", "Other", "Other", "media_type"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "date", "timeofday"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "Other", "cooking_type", "Other", "food_type"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "place_name"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "Other", "media_type", "media_type", "media_type"],
]


utter_ack_text = """This is an output of the European Project UTTER (Unified Transcription and Translation for Extended Reality) funded by the European Union’s Horizon Europe Research and Innovation programme under grant agreement number 101070631.
For more information please visit https://he-utter.eu/"""

ack_authors = """This demo was made by [Beomseok Lee](https://mt.fbk.eu/author/blee/) and [Marcely Zanon Boito](https://sites.google.com/view/mzboito/marcely-zanon-boito)."""

eu_logo = """<img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/EU_flag.jpg" width="100" height="100">"""
utter_logo = """<a href="https://he-utter.eu/" target="_blank"><img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/Utter_logo.png" width="50" height="50"></a>"""
nle_logo = """<a href="https://europe.naverlabs.com/" target="_blank"><img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/NAVERLABS_2_BLACK.png" width="100" height="100"></a>"""
fbk_logo = """<a href="https://mt.fbk.eu/" target="_blank"><img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/FBK_logo.png" width="100" height="100"></a>"""


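# Markdown reference table shown below the examples: gold transcription,
# slot labels, and intent for each audio file.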
table = "\n".join(
    ["| File | Transcription | Slots | Intent |", "| --- | --- | --- | --- |"]
    + [
        f"| {example.split('/')[-1]} | {transcription} | {slot} | {intent} |"
        for example, transcription, slot, intent in zip(examples, transcriptions, slots, intents)
    ]
)

############### calls

def run_inference(audio_file):
    # Load the recording and resample it to 16 kHz; librosa returns a
    # float32 waveform array together with the sampling rate.
    array, sampling_rate = librosa.load(audio_file, sr=16000)
    audio = {'sampling_rate': sampling_rate, 'array': array}
    # Cascade: the ASR transcription is fed to the NLU model.
    transcription = run_asr_inference(asr_model, processor, audio)
    structured_output = run_nlu_inference(nlu_model, tokenizer, transcription)
    return structured_output

############### app

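# Load both models once at startup so that all requests share them.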
asr_model, processor = load_asr_model()
nlu_model, tokenizer = load_nlu_model()

demo = gr.Blocks(
    title=title.removeprefix("# "),  # browser tab title, without the Markdown heading marker
    analytics_enabled=False,
    theme=gr.themes.Base(),
)

with demo:
    gr.Markdown(title)
    for line in description:
        gr.Markdown(line)

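    # Audio input: microphone recording or file upload, passed to run_inference
    # as a WAV file path and capped at 20 seconds.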
    with gr.Row():
        waveform_options = gr.WaveformOptions(sample_rate=16000)
        
        audio_file = gr.Audio(
            label="Audio file", 
            sources=['microphone','upload'],
            type="filepath",
            format='wav',
            waveform_options=waveform_options,
            show_download_button=False,
            show_share_button=False,
            max_length=20,
            )
        
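    # The structured NLU output is rendered as highlighted text, presumably as
    # (token, slot-label) spans plus the predicted intent.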
    output = gr.HighlightedText(label="ASR result + NLU result")

    gr.Button("Run Inference", variant='primary').click(
        run_inference,
        concurrency_limit=2,
        inputs=audio_file,
        outputs=output,
    )

    with gr.Row():
        gr.Examples(label="Speech-MASSIVE test utterances:", inputs=audio_file, examples=examples)
    gr.Markdown(table)

    gr.Markdown("# Aknowledgments")
    gr.Markdown(utter_ack_text)    
    gr.Markdown(ack_authors)
    
    with gr.Row():
        gr.Markdown(eu_logo)
        gr.Markdown(utter_logo)
        gr.Markdown(nle_logo)
        gr.Markdown(fbk_logo)

demo.queue()
demo.launch()