ahmedJaafari committed
Commit 85959f4 · Parent: 0e5b41a

Update app.py

Files changed (1): app.py (+32, -28)
app.py CHANGED
@@ -1,32 +1,36 @@
-import librosa
-import gradio as gr
-import numpy as np
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import soundfile as sf
 import torch
-
-# load model and tokenizer
-processor = Wav2Vec2Processor.from_pretrained("ahmedJaafari/Annarabic3.2", use_auth_token=st.secrets["AnnarabicToken"])
-model = Wav2Vec2ForCTC.from_pretrained("ahmedJaafari/Annarabic3.2", use_auth_token=st.secrets["AnnarabicToken"])
-
-def speech2text(audio):
-    sr, data = audio
-
-    # resample to 16hz
-    data_16hz = librosa.resample(data[:,0].astype(np.float32),sr,16000)
-
-    # tokenize
-    input_values = processor([data_16hz], return_tensors="pt", padding="longest").input_values # Batch size 1
-
-    # retrieve logits
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import gradio as gr
+import sox
+import os
+
+def convert(inputfile, outfile):
+    sox_tfm = sox.Transformer()
+    sox_tfm.set_output_format(
+        file_type="wav", channels=1, encoding="signed-integer", rate=16000, bits=16
+    )
+    sox_tfm.build(inputfile, outfile)
+api_token = os.getenv("AnnarabicToken")
+model_name = "ahmedJaafari/Annarabic3.2"
+processor = Wav2Vec2Processor.from_pretrained(model_name, use_auth_token=api_token)
+model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=api_token)
+def parse_transcription(wav_file):
+    filename = wav_file.name.split('.')[0]
+    convert(wav_file.name, filename + "16k.wav")
+    speech, _ = sf.read(filename + "16k.wav")
+    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
     logits = model(input_values).logits
-
-    # take argmax and decode
     predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)
-
-    return transcription[0] # batch size 1
-
-iface = gr.Interface(speech2text, "microphone", "text")
-
-iface.launch()
+    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
+    return transcription
+output = gr.outputs.Textbox(label="The transcript")
+input_ = gr.inputs.Audio(source="microphone", type="file")
+gr.Interface(parse_transcription, inputs=input_, outputs=[output],
+             analytics_enabled=False,
+             show_tips=False,
+             theme='huggingface',
+             layout='vertical',
+             title="Speech Recognition for Darija",
+             description="Speech Recognition Live Demo for Darija",
+             enable_queue=True).launch(inline=False)
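
For reference, the resampling step this commit introduces can be exercised on its own. A minimal sketch, assuming the pysox package (and the SoX binary it wraps) is installed; "input.wav" and "input16k.wav" are hypothetical filenames, not part of the commit:

import sox

# Convert a recording to 16 kHz mono signed 16-bit WAV, the format
# Wav2Vec2 models expect, mirroring the convert() helper added above.
tfm = sox.Transformer()
tfm.set_output_format(file_type="wav", channels=1,
                      encoding="signed-integer", rate=16000, bits=16)
tfm.build("input.wav", "input16k.wav")  # hypothetical input/output paths

This replaces the previous librosa-based resampling and, together with reading the auth token from the AnnarabicToken environment variable via os.getenv (how Spaces exposes repository secrets) instead of st.secrets, removes the app's implicit Streamlit dependency.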