awacke1 committed on
Commit
d41626c
·
1 Parent(s): b2d63dc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys
3
+ import gradio as gr
4
+ from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
5
+
6
# Send every log record to stdout, stamped as "MM/DD/YYYY HH:MM:SS - LEVEL - name - message".
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Module-level logger; DEBUG so that nothing emitted by this module is filtered out.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
13
+
14
+
15
# Checkpoints used by run() when model_size == "300M", keyed by display language.
# Each row is (language, model-id suffix, whether the checkpoint ships a language model).
LARGE_MODEL_BY_LANGUAGE = {
    language: {
        "model_id": f"jonatasgrosman/wav2vec2-large-xlsr-53-{slug}",
        "has_lm": has_lm,
    }
    for language, slug, has_lm in (
        ("Arabic", "arabic", False),
        ("Chinese", "chinese-zh-cn", False),
        ("Dutch", "dutch", True),
        ("English", "english", True),
        ("Finnish", "finnish", False),
        ("French", "french", True),
        ("German", "german", True),
        ("Greek", "greek", False),
        ("Hungarian", "hungarian", False),
        ("Italian", "italian", True),
        ("Japanese", "japanese", False),
        ("Persian", "persian", False),
        ("Polish", "polish", True),
        ("Portuguese", "portuguese", True),
        ("Russian", "russian", True),
        ("Spanish", "spanish", True),
    )
}

# Checkpoints used by run() for any other model_size; all of them have a language model.
XLARGE_MODEL_BY_LANGUAGE = {
    language: {
        "model_id": f"jonatasgrosman/wav2vec2-xls-r-1b-{language.lower()}",
        "has_lm": True,
    }
    for language in (
        "Dutch",
        "English",
        "French",
        "German",
        "Italian",
        "Polish",
        "Portuguese",
        "Russian",
        "Spanish",
    )
}

# The container given by HF has 16GB of RAM, so the number of models that can be
# loaded has to be limited: only the languages of the smaller table are offered
# (instead of sorted(LARGE_MODEL_BY_LANGUAGE)).
LANGUAGES = sorted(XLARGE_MODEL_BY_LANGUAGE)

# Loaded model instances, keyed by model id, reused across calls to run().
CACHED_MODELS_BY_ID = {}
52
+
53
+
54
def _render_history(history):
    """Render the accumulated result entries as a single HTML fragment.

    Each entry is a dict with an "error_message" key. When that value is not
    None the entry is rendered as an error box; otherwise it is rendered as a
    success box using its "model_id", "decoding_type" and "transcription" keys.

    All interpolated values are HTML-escaped so that neither the transcription
    nor a caller-supplied language name can inject markup into the page.
    """
    # Local import so this block does not depend on a file-level import.
    from html import escape

    parts = ["<div class='result'>"]
    for item in history:
        if item["error_message"] is not None:
            parts.append(
                "<div class='result_item result_item_error'>"
                f"{escape(str(item['error_message']))}</div>"
            )
            continue

        model_id = escape(str(item["model_id"]))
        url_suffix = " + LM" if item["decoding_type"] == "LM" else ""
        parts.append("<div class='result_item result_item_success'>")
        parts.append(
            f'<strong><a target="_blank" href="https://huggingface.co/{model_id}">'
            f"{model_id}{url_suffix}</a></strong><br/><br/>"
        )
        parts.append(f"{escape(str(item['transcription']))}<br/>")
        parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)


def run(input_file, language, decoding_type, history, model_size="300M"):
    """Transcribe an audio file and return the rendered result history.

    Args:
        input_file: Path of the audio file handed to the ASR pipeline. When it
            is missing (None or empty) an error entry is recorded instead of
            calling the pipeline.
        language: Key into the model tables (one of the display languages).
        decoding_type: "LM" to decode with the checkpoint's language model;
            any other value decodes without one.
        history: List of previous result entries, or a falsy value to start a
            new list. The list is extended in place.
        model_size: "300M" selects LARGE_MODEL_BY_LANGUAGE; any other value
            selects XLARGE_MODEL_BY_LANGUAGE.

    Returns:
        A ``(html_output, history)`` tuple: the HTML for every entry recorded
        so far, and the updated history list.
    """
    logger.info(
        "Running ASR %s-%s-%s for %s", language, model_size, decoding_type, input_file
    )

    history = history or []

    if model_size == "300M":
        model = LARGE_MODEL_BY_LANGUAGE.get(language)
    else:
        model = XLARGE_MODEL_BY_LANGUAGE.get(language)

    if model is None:
        history.append({
            "error_message": f"Model size {model_size} not found for {language} language :("
        })
    elif decoding_type == "LM" and not model["has_lm"]:
        history.append({
            "error_message": f"LM not available for {language} language :("
        })
    elif not input_file:
        # Nothing was recorded: report it like the other failures rather than
        # letting the pipeline raise on a missing input.
        history.append({
            "error_message": "No audio input provided, please record something first :("
        })
    else:
        model_id = model["model_id"]

        # Load each model at most once; later calls reuse the cached instance.
        model_instance = CACHED_MODELS_BY_ID.get(model_id)
        if model_instance is None:
            model_instance = AutoModelForCTC.from_pretrained(model_id)
            CACHED_MODELS_BY_ID[model_id] = model_instance

        # The two decoding modes differ only in the processor class and in
        # whether a decoder is handed to the pipeline.
        if decoding_type == "LM":
            processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
            decoder = processor.decoder
        else:
            processor = Wav2Vec2Processor.from_pretrained(model_id)
            decoder = None

        asr = pipeline(
            "automatic-speech-recognition",
            model=model_instance,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            decoder=decoder,
        )

        transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]

        logger.info("Transcription for %s: %s", input_file, transcription)

        history.append({
            "model_id": model_id,
            "language": language,
            "model_size": model_size,
            "decoding_type": decoding_type,
            "transcription": transcription,
            "error_message": None,
        })

    return _render_history(history), history
116
+
117
+
118
# Build the Gradio UI around run(): the three visible inputs map to its first
# three parameters, and the "state" input/output pair carries the history list
# that run() receives as its fourth argument and returns as its second result.
demo = gr.Interface(
    fn=run,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
        gr.inputs.Radio(label="Language", choices=LANGUAGES),
        gr.inputs.Radio(label="Decoding type", choices=["greedy", "LM"]),
        # Model size selector is disabled, so run() always uses its default size:
        # gr.inputs.Radio(label="Model size", choices=["300M", "1B"]),
        "state",
    ],
    outputs=[
        gr.outputs.HTML(label="Outputs"),
        "state",
    ],
    title="Automatic Speech Recognition",
    description="",
    css="""
    .result {display:flex;flex-direction:column}
    .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
    .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
    .result_item_error {background-color:#ff7070;color:white;align-self:start}
    """,
    allow_screenshot=False,
    allow_flagging="never",
    theme="grass",
)

# Start serving the interface with request queueing enabled.
demo.launch(enable_queue=True)