nunenuh committed on
Commit
476b1c3
·
1 Parent(s): 36c3ab2

feat: add infer, utils and change app.py for gradio

Browse files
app.py CHANGED
@@ -1,21 +1,28 @@
1
- from transformers import pipeline
2
  import gradio as gr
3
 
4
- p = pipeline("automatic-speech-recognition")
5
 
6
- def transcribe(audio, state=""):
7
- text = p(audio)["text"]
8
- state += text + " "
9
- return state, state
 
 
10
 
11
- gr.Interface(
12
- fn=transcribe,
13
  inputs=[
14
- gr.Audio(source="microphone", type="filepath", streaming=True),
15
- "state"
 
 
 
16
  ],
17
- outputs=[
18
- "textbox",
19
- "state"
20
- ],
21
- live=True).launch()
 
 
 
 
1
+ from src import infer, utils
2
  import gradio as gr
3
 
 
4
 
5
# Example rows offered in the UI. Each row is matched positionally to the
# three inputs declared below (language, microphone audio, uploaded audio).
# NOTE(review): the sample path sits in the microphone slot and the language
# slot is None — confirm this is the intended mapping.
audio_examples = [
    [None, "assets/audio/male-indonesian.wav", None],
    [None, "assets/audio/female-indonesian.wav", None],
    [None, "assets/audio/male-english.wav", None],
    [None, "assets/audio/female-english.wav", None],
]

# Single-page Gradio UI that forwards the selected language and whichever
# audio source was provided to ``infer.predict`` and shows the returned text.
demo = gr.Interface(
    fn=infer.predict,
    inputs=[
        gr.Radio(
            label="Language",
            choices=["indonesian", "english"],
            value="indonesian",
        ),
        gr.Audio(label="Speak", source="microphone", type="numpy"),
        gr.Audio(label="Upload audio", source="upload", type="numpy"),
    ],
    outputs=[gr.TextArea(label="Output Text")],
    title="OpenAI Whisper Base",
    description=utils.parsing_text("assets/descriptions.md"),
    # Fixed: a stray empty string literal in front of this call made the
    # whole module a syntax error.
    article=utils.parsing_text("assets/articles.md"),
    examples=audio_examples,
)

demo.launch()
assets/articles.md ADDED
@@ -0,0 +1 @@
 
 
1
+ articles file
assets/audio/readme.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ---
2
+ noteId: "b807e330385e11eeb3554fca5e7f7a03"
3
+ tags: []
4
+
5
+ ---
6
+
assets/descriptions.md ADDED
@@ -0,0 +1 @@
 
 
1
+ description files
src/__init__.py ADDED
File without changes
src/infer.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import *
3
+ from src import utils
4
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
5
+
6
# Checkpoint identifier handed to both ``from_pretrained`` calls below.
# (Plain string: the previous f-string had no placeholders.)
model_name: str = "openai/whisper-base"
# Loaded once at import time so every ``predict`` call reuses them.
processor: Any = WhisperProcessor.from_pretrained(model_name)
model: Any = WhisperForConditionalGeneration.from_pretrained(model_name)

# Sampling rate (Hz) reported to the processor for the preprocessed waveform.
sample_rate: int = 16000
# Full-scale magnitude of signed 16-bit PCM (2 ** 15). The previous value,
# 32678.0, was a digit transposition. Not referenced by the code shown in
# this module; kept for any external importer.
float_factor: float = 32768.0
12
+
13
+
14
def predict(language, mic_audio=None, audio=None):
    """Transcribe one audio clip with the module-level Whisper model.

    The microphone recording wins when both sources are supplied; the
    uploaded clip is only used when no microphone recording exists.

    Args:
        language: Language value forwarded to
            ``processor.get_decoder_prompt_ids``.
        mic_audio: ``(sampling_rate, waveform)`` pair, or ``None``.
        audio: ``(sampling_rate, waveform)`` pair, or ``None``.

    Returns:
        The first decoded transcription, or the literal hint
        ``"(please provide audio)"`` when neither source is given.
    """
    clip = mic_audio if mic_audio is not None else audio
    if clip is None:
        return "(please provide audio)"
    sampling_rate, waveform = clip

    # Decoder prompt that pins the language and the "transcribe" task.
    decoder_prompt = processor.get_decoder_prompt_ids(language=language, task="transcribe")

    features = processor(
        audio=utils.preprocess_audio(sampling_rate, waveform),
        sampling_rate=sample_rate,
        return_tensors="pt",
    )
    token_ids = model.generate(**features, max_length=400, forced_decoder_ids=decoder_prompt)
    return processor.batch_decode(token_ids, skip_special_tokens=True)[0]
src/utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import librosa
3
+ import torch
4
+ from pathlib import Path
5
+
6
# Sampling rate (Hz) every waveform is resampled to before being returned.
sample_rate: int = 16000
# Full-scale magnitude of signed 16-bit PCM (2 ** 15). The previous value,
# 32678.0, was a digit transposition and scaled samples slightly wrong.
float_factor: float = 32768.0


def preprocess_audio(sampling_rate, waveform):
    """Scale, down-mix, resample and trim a raw waveform.

    Args:
        sampling_rate: Sampling rate of ``waveform`` in Hz.
        waveform: NumPy array of samples. Assumed to be int16 PCM laid out
            as ``(samples,)`` or ``(samples, channels)`` — TODO confirm
            against the caller.

    Returns:
        A ``torch.Tensor`` holding at most 30 seconds of audio at
        ``sample_rate`` Hz.
    """
    waveform = waveform / float_factor

    # Only multi-channel (2-D) input needs a down-mix. The previous
    # ``len(waveform) > 1`` test measured the sample count, not the
    # number of channels.
    if waveform.ndim > 1:
        waveform = librosa.to_mono(waveform.T)

    if sampling_rate != sample_rate:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=sample_rate)

    # Keep no more than 30 seconds of samples.
    waveform = waveform[: sample_rate * 30]
    return torch.tensor(waveform)
21
+
22
+
23
def parsing_text(filepath: str) -> str:
    """Return the contents of a ``.txt`` or ``.md`` file as a string.

    Args:
        filepath: Path to the file; the suffix check is case-insensitive.

    Returns:
        The file's full text, decoded as UTF-8.

    Raises:
        ValueError: If the suffix is neither ``.txt`` nor ``.md``.
    """
    path = Path(filepath)
    if path.suffix.lower() not in ('.txt', '.md'):
        raise ValueError("Invalid file type. Only '.txt' and '.md' files are supported.")

    # Pin the encoding so the result does not depend on the host locale.
    return path.read_text(encoding="utf-8")
29
+
tests/__init__.py ADDED
File without changes