yashsrivastava committed on
Commit
e9e9175
·
1 Parent(s): 7c3bb58

Upload aap.py

Browse files
Files changed (1) hide show
  1. aap.py +63 -0
aap.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[ ]:
5
+
6
+
7
+ import soundfile as sf
8
+ import torch
9
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
10
+ import argparse
11
+ from glob import glob
12
+ import torchaudio
13
+ import subprocess
14
+ import gradio as gr
15
+
16
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
17
+
18
def get_filename(wav_file):
    """Convert an audio file to a 16 kHz, 16-bit, mono WAV using ``sox``.

    Args:
        wav_file: Path of the source audio file.

    Returns:
        Path of the converted copy, ``/tmp/<stem>_16.wav``, where ``<stem>``
        is the source file name without its extension. The path is returned
        even if ``sox`` fails, because its exit status is not checked.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import shlex

    # splitext removes the real extension whatever its length; slicing off a
    # fixed four characters is only right for three-letter extensions.
    stem = os.path.splitext(wav_file.split('/')[-1])[0]
    filename_new = '/tmp/' + stem + '_16.wav'

    # The command is interpreted by a shell, so both paths are quoted: an
    # unquoted name containing spaces would be split into several arguments
    # and one containing shell metacharacters would be executed.
    command = "sox {} -r {} -b 16 -c 1 {}".format(
        shlex.quote(wav_file), 16000, shlex.quote(filename_new)
    )
    subprocess.call(command, shell=True)
    return filename_new
25
+
26
_MODEL_NAME = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

# Holds the (processor, model) pair once loaded so that repeated requests do
# not reload the pretrained weights.
_MODEL_CACHE = {}


def _load_model():
    """Return the (processor, model) pair, loading it on first use only."""
    if "pair" not in _MODEL_CACHE:
        _MODEL_CACHE["pair"] = (
            Wav2Vec2Processor.from_pretrained(_MODEL_NAME),
            Wav2Vec2ForCTC.from_pretrained(_MODEL_NAME),
        )
    return _MODEL_CACHE["pair"]


def parse_transcription(wav_file):
    """Transcribe a recorded audio file to text with a Wav2Vec2 CTC model.

    Args:
        wav_file: File-like object whose ``name`` attribute is the path of
            the recorded audio.

    Returns:
        The decoded transcription string.
    """
    # Function-scope import keeps this block self-contained.
    import os

    processor, model = _load_model()

    # Convert the recording with get_filename, read the samples, and remove
    # the converted copy so that /tmp does not fill up over time.
    converted_path = get_filename(wav_file.name)
    try:
        audio_input, sample_rate = sf.read(converted_path)
    finally:
        try:
            os.remove(converted_path)
        except FileNotFoundError:
            # Nothing to clean up when the conversion produced no file.
            pass

    # Pad input values and return a PyTorch tensor. The rate read from the
    # file is passed through instead of a hard-coded 16 kHz.
    # NOTE(review): presumably the processor rejects a rate that differs from
    # the model's expected one -- confirm against the transformers docs.
    input_values = processor(
        audio_input, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the token ids of the single item in the batch.
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
49
+
50
+
51
+ # In[ ]:
52
+
53
+
54
+ import gradio as gr
55
# Texts shown on the demo page.
title = "Speech-to-Text-English"
description = "Upload a English audio clip, and let AI do the hard work of transcribing."

# Audio is captured from the microphone and passed to the handler as a file.
microphone_input = gr.inputs.Audio(
    label="Record Audio File",
    type="file",
    source="microphone",
)

# Wire the transcription function to the audio input and a text output.
demo = gr.Interface(
    fn=parse_transcription,
    inputs=microphone_input,
    outputs="text",
    title=title,
    description=description,
)
demo.launch(inline=False)
63
+