Lguyogiro committed
Commit faee479 · Parent(s): d819c6b

try new approach

Files changed (4)
  1. .oldapp.py.swp +0 -0
  2. app.py +41 -126
  3. asr.py +4 -3
  4. oldapp.py +153 -0
.oldapp.py.swp ADDED
Binary file (16.4 kB).
 
app.py CHANGED
@@ -1,19 +1,10 @@
-import time
-import datetime
-import logging
-import soundfile
 import streamlit as st
 from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
 import numpy as np
 import pydub
-from pathlib import Path
-
+from transformers import pipeline
 from asr import load_model, inference
 
-LOG_DIR = "./logs"
-DATA_DIR = "./data"
-logger = logging.getLogger(__name__)
-
 
 # Define a custom audio processor to handle microphone input
 class AudioProcessor(AudioProcessorBase):
@@ -33,121 +24,45 @@ class AudioProcessor(AudioProcessorBase):
             return combined
         return None
 
-
-def upload_audio() -> Path:
-    # Upload audio file
-    uploaded_file = st.file_uploader("Choose a audio file(wav, mp3, flac)", type=['wav','mp3','flac'])
-    if uploaded_file is not None:
-        # Save audio file
-        audio_data, samplerate = soundfile.read(uploaded_file)
-
-        # Make save directory
-        now = datetime.datetime.now()
-        now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
-        audio_dir = Path(DATA_DIR) / f"{now_time}"
-        audio_dir.mkdir(parents=True, exist_ok=True)
-
-        audio_path = audio_dir / uploaded_file.name
-        soundfile.write(audio_path, audio_data, samplerate)
-
-        # Show audio file
-        with open(audio_path, 'rb') as audio_file:
-            audio_bytes = audio_file.read()
-
-        st.audio(audio_bytes, format=uploaded_file.type)
-
-        return audio_path
-
-@st.cache_resource(show_spinner=False)
-def call_load_model():
-    generator = load_model()
-    return generator
-
-def main():
-    st.header("Speech-to-Text app with streamlit")
-    st.markdown(
-        """
-        This STT app is using a fine-tuned MMS ASR model.
-        """
-    )
-
-    audio_path = upload_audio()
-    logger.info(f"Uploaded audio file: {audio_path}")
-
-    with st.spinner(text="Wait for loading ASR Model..."):
-        generator = call_load_model()
-
-    if audio_path is not None:
-        start_time = time.time()
-        with st.spinner(text='Wait for inference...'):
-            output = inference(generator, audio_path)
-
-        end_time = time.time()
-
-        process_time = time.gmtime(end_time - start_time)
-        process_time = time.strftime("%H hour %M min %S secs", process_time)
-
-        st.success(f"Inference finished in {process_time}.")
-        st.write(f"output: {output['text']}")
-
-    st.title("Microphone Input for ASR")
-
-    # Initialize the audio processor
-    audio_processor = AudioProcessor()
-
-    webrtc_streamer(
-        key="audio",
-        mode=WebRtcMode.SENDONLY,
-        audio_processor_factory=lambda: audio_processor,
-        media_stream_constraints={"audio": True, "video": False},
-    )
-
-
-    if st.button("Process Audio"):
-        audio_data = audio_processor.get_audio_data()
-        if audio_data is not None:
-            # Convert the NumPy array to a WAV-like audio segment
-            audio_segment = pydub.AudioSegment(
-                audio_data.tobytes(),
-                frame_rate=16000,  # Default WebRTC audio frame rate
-                sample_width=2,    # 16-bit audio
-                channels=1         # Mono
-            )
-            # Save or process audio_segment as needed
-            st.success("Audio captured successfully!")
-            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
-        else:
-            st.warning("No audio data captured!")
-
-
-    if st.button("Transcribe Audio"):
-        if audio_data is not None:
-            # Perform ASR on the audio segment
-            transcription = inference(generator, audio_segment.raw_data)
-            st.text_area("Transcription", transcription["text"])
-        else:
-            st.warning("No audio data to transcribe!")
-
-
-if __name__ == "__main__":
-    # Setting logger
-    logger.setLevel(logging.INFO)
-
-    formatter = logging.Formatter("%(levelname)8s %(asctime)s %(name)s %(message)s")
-
-    stream_handler = logging.StreamHandler()
-    stream_handler.setFormatter(formatter)
-    logger.addHandler(stream_handler)
-
-    now = datetime.datetime.now()
-    now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
-    log_dir = Path(LOG_DIR)
-    log_dir.mkdir(parents=True, exist_ok=True)
-    log_file = log_dir / f"{now_time}.log"
-    file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
-    file_handler.setFormatter(formatter)
-    logger.addHandler(file_handler)
-
-    logger.info('Start App')
-
-    main()
+# Title of the app
+st.title("Real-Time Speech-to-Text")
+
+# Initialize the audio processor
+audio_processor = AudioProcessor()
+
+# WebRTC streamer to capture microphone input
+webrtc_streamer(
+    key="audio",
+    mode=WebRtcMode.SENDONLY,
+    audio_processor_factory=lambda: audio_processor,
+    media_stream_constraints={"audio": True, "video": False},
+)
+
+# Load a pre-trained ASR pipeline from Hugging Face
+@st.cache_resource
+def load_asr_model():
+    return load_model()
+
+asr_model = load_asr_model()
+
+# Button to process audio and perform ASR
+if st.button("Transcribe Audio"):
+    audio_data = audio_processor.get_audio_data()
+    if audio_data is not None:
+        # Convert the NumPy array to a WAV-like audio segment
+        audio_segment = pydub.AudioSegment(
+            audio_data.tobytes(),
+            frame_rate=16000,  # Default WebRTC audio frame rate
+            sample_width=2,    # 16-bit audio
+            channels=1         # Mono
+        )
+
+        # Perform ASR on the audio segment
+        st.info("Transcribing audio...")
+        transcription = inference(asr_model, audio_segment.raw_data)
+
+        # Display transcription
+        st.text_area("Transcription", transcription["text"], height=200)
+    else:
+        st.warning("No audio data captured! Please speak into your microphone.")
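
A note on the capture path above: the AudioSegment is built with frame_rate=16000, but browsers commonly deliver WebRTC audio at 48 kHz. Below is a minimal sketch of a down-mix/resample step that could sit between get_audio_data() and inference(); the helper name, the assumed 48 kHz capture rate, and the 16 kHz target are illustrative assumptions, not part of this commit.

import numpy as np
import pydub

def prepare_for_asr(audio_data: np.ndarray, capture_rate: int = 48000) -> np.ndarray:
    # Wrap the raw int16 samples; capture_rate is assumed here, not probed.
    segment = pydub.AudioSegment(
        audio_data.tobytes(),
        frame_rate=capture_rate,
        sample_width=2,  # 16-bit PCM
        channels=1,      # mono
    )
    # MMS/Wav2Vec2-style models expect 16 kHz input.
    segment = segment.set_frame_rate(16000)
    samples = np.array(segment.get_array_of_samples(), dtype=np.float32)
    return samples / 32768.0  # scale the int16 range to [-1.0, 1.0]

Resampling before inference keeps the model's expected sampling rate decoupled from whatever rate the browser actually negotiates.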
asr.py CHANGED
@@ -16,9 +16,10 @@ def load_model():
     model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang=target_lang, ignore_mismatched_sizes=True, use_safetensors=True, use_auth_token=hf_token)
 
 
-def inference(model, audio_path):
-    arr, rate = read_audio_data(audio_path)
-    inputs = processor(arr.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")
+def inference(model, raw_data):
+    # arr, rate = read_audio_data(audio_path)
+    # arr.squeeze().numpy(), ...
+    inputs = processor(raw_data, sampling_rate=16_000, return_tensors="pt")
 
     with torch.no_grad():
         outputs = model(**inputs).logits
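
Note that app.py now hands inference() the bytes from audio_segment.raw_data, i.e. raw 16-bit PCM, while a Wav2Vec2-style processor normally expects a float array of samples. A hedged sketch of the decode step that would likely be needed before the processor call (the helper name is hypothetical):

import numpy as np

def bytes_to_float_array(raw_data: bytes) -> np.ndarray:
    # Interpret raw 16-bit little-endian PCM and normalize to [-1.0, 1.0].
    samples = np.frombuffer(raw_data, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0

# inputs = processor(bytes_to_float_array(raw_data), sampling_rate=16_000, return_tensors="pt")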
oldapp.py ADDED
@@ -0,0 +1,153 @@
+import time
+import datetime
+import logging
+import soundfile
+import streamlit as st
+from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
+import numpy as np
+import pydub
+from pathlib import Path
+
+from asr import load_model, inference
+
+LOG_DIR = "./logs"
+DATA_DIR = "./data"
+logger = logging.getLogger(__name__)
+
+
+# Define a custom audio processor to handle microphone input
+class AudioProcessor(AudioProcessorBase):
+    def __init__(self):
+        self.audio_data = []
+
+    def recv_audio(self, frame):
+        # Convert the audio frame to a NumPy array
+        audio_array = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
+        self.audio_data.append(audio_array)
+        return frame
+
+    def get_audio_data(self):
+        # Combine all captured audio data
+        if self.audio_data:
+            combined = np.concatenate(self.audio_data, axis=0)
+            return combined
+        return None
+
+
+def upload_audio() -> Path:
+    # Upload audio file
+    uploaded_file = st.file_uploader("Choose a audio file(wav, mp3, flac)", type=['wav','mp3','flac'])
+    if uploaded_file is not None:
+        # Save audio file
+        audio_data, samplerate = soundfile.read(uploaded_file)
+
+        # Make save directory
+        now = datetime.datetime.now()
+        now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
+        audio_dir = Path(DATA_DIR) / f"{now_time}"
+        audio_dir.mkdir(parents=True, exist_ok=True)
+
+        audio_path = audio_dir / uploaded_file.name
+        soundfile.write(audio_path, audio_data, samplerate)
+
+        # Show audio file
+        with open(audio_path, 'rb') as audio_file:
+            audio_bytes = audio_file.read()
+
+        st.audio(audio_bytes, format=uploaded_file.type)
+
+        return audio_path
+
+@st.cache_resource(show_spinner=False)
+def call_load_model():
+    generator = load_model()
+    return generator
+
+def main():
+    st.header("Speech-to-Text app with streamlit")
+    st.markdown(
+        """
+        This STT app is using a fine-tuned MMS ASR model.
+        """
+    )
+
+    audio_path = upload_audio()
+    logger.info(f"Uploaded audio file: {audio_path}")
+
+    with st.spinner(text="Wait for loading ASR Model..."):
+        generator = call_load_model()
+
+    if audio_path is not None:
+        start_time = time.time()
+        with st.spinner(text='Wait for inference...'):
+            output = inference(generator, audio_path)
+
+        end_time = time.time()
+
+        process_time = time.gmtime(end_time - start_time)
+        process_time = time.strftime("%H hour %M min %S secs", process_time)
+
+        st.success(f"Inference finished in {process_time}.")
+        st.write(f"output: {output['text']}")
+
+    st.title("Microphone Input for ASR")
+
+    # Initialize the audio processor
+    audio_processor = AudioProcessor()
+
+    webrtc_streamer(
+        key="audio",
+        mode=WebRtcMode.SENDONLY,
+        audio_processor_factory=lambda: audio_processor,
+        media_stream_constraints={"audio": True, "video": False},
+    )
+
+
+    if st.button("Process Audio"):
+        audio_data = audio_processor.get_audio_data()
+        if audio_data is not None:
+            # Convert the NumPy array to a WAV-like audio segment
+            audio_segment = pydub.AudioSegment(
+                audio_data.tobytes(),
+                frame_rate=16000,  # Default WebRTC audio frame rate
+                sample_width=2,    # 16-bit audio
+                channels=1         # Mono
+            )
+            # Save or process audio_segment as needed
+            st.success("Audio captured successfully!")
+            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
+        else:
+            st.warning("No audio data captured!")
+
+
+    if st.button("Transcribe Audio"):
+        if audio_data is not None:
+            # Perform ASR on the audio segment
+            transcription = inference(generator, audio_segment.raw_data)
+            st.text_area("Transcription", transcription["text"])
+        else:
+            st.warning("No audio data to transcribe!")
+
+
+if __name__ == "__main__":
+    # Setting logger
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter("%(levelname)8s %(asctime)s %(name)s %(message)s")
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    now = datetime.datetime.now()
+    now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
+    log_dir = Path(LOG_DIR)
+    log_dir.mkdir(parents=True, exist_ok=True)
+    log_file = log_dir / f"{now_time}.log"
+    file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    logger.info('Start App')
+
+    main()
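
One aside on the AudioProcessor kept in both files: streamlit_webrtc's AudioProcessorBase documents a recv(frame) callback (with recv_queued for batches), so the recv_audio method above may never be invoked depending on the installed version; PyAV's frame.to_ndarray() also already returns a NumPy array, making the extra np.frombuffer pass likely redundant. A minimal sketch under those assumptions, worth verifying against the version actually pinned for this Space:

import av
import numpy as np
from streamlit_webrtc import AudioProcessorBase

class BufferingAudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.chunks = []

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # to_ndarray() yields the samples directly; flatten interleaved channels.
        self.chunks.append(frame.to_ndarray().flatten().astype(np.int16))
        return frame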