try new approach
.oldapp.py.swp
ADDED
Binary file (16.4 kB)
app.py
CHANGED
@@ -1,19 +1,10 @@
-import time
-import datetime
-import logging
-import soundfile
 import streamlit as st
 from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
 import numpy as np
 import pydub
-from
-
+from transformers import pipeline
 from asr import load_model, inference

-LOG_DIR = "./logs"
-DATA_DIR = "./data"
-logger = logging.getLogger(__name__)
-

 # Define a custom audio processor to handle microphone input
 class AudioProcessor(AudioProcessorBase):
@@ -33,121 +24,45 @@ class AudioProcessor(AudioProcessorBase):
             return combined
         return None

-
-def upload_audio() -> Path:
-    # Upload audio file
-    uploaded_file = st.file_uploader("Choose a audio file(wav, mp3, flac)", type=['wav','mp3','flac'])
-    if uploaded_file is not None:
-        # Save audio file
-        audio_data, samplerate = soundfile.read(uploaded_file)
-
-        # Make save directory
-        now = datetime.datetime.now()
-        now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
-        audio_dir = Path(DATA_DIR) / f"{now_time}"
-        audio_dir.mkdir(parents=True, exist_ok=True)
-
-        audio_path = audio_dir / uploaded_file.name
-        soundfile.write(audio_path, audio_data, samplerate)
-
-        # Show audio file
-        with open(audio_path, 'rb') as audio_file:
-            audio_bytes = audio_file.read()
-
-        st.audio(audio_bytes, format=uploaded_file.type)
-
-        return audio_path
-
-@st.cache_resource(show_spinner=False)
-def call_load_model():
-    generator = load_model()
-    return generator
-
-def main():
-    st.header("Speech-to-Text app with streamlit")
-    st.markdown(
-        """
-        This STT app is using a fine-tuned MMS ASR model.
-        """
-    )
-
-    audio_path = upload_audio()
-    logger.info(f"Uploaded audio file: {audio_path}")
-
-    with st.spinner(text="Wait for loading ASR Model..."):
-        generator = call_load_model()
-
-    if audio_path is not None:
-        start_time = time.time()
-        with st.spinner(text='Wait for inference...'):
-            output = inference(generator, audio_path)
-
-        end_time = time.time()
-
-        process_time = time.gmtime(end_time - start_time)
-        process_time = time.strftime("%H hour %M min %S secs", process_time)
-
-        st.success(f"Inference finished in {process_time}.")
-        st.write(f"output: {output['text']}")
-
-    st.title("Microphone Input for ASR")
-
-    # Initialize the audio processor
-    audio_processor = AudioProcessor()
-
-    webrtc_streamer(
-        key="audio",
-        mode=WebRtcMode.SENDONLY,
-        audio_processor_factory=lambda: audio_processor,
-        media_stream_constraints={"audio": True, "video": False},
-    )
-
-
-    if st.button("Process Audio"):
-        audio_data = audio_processor.get_audio_data()
-        if audio_data is not None:
-            # Convert the NumPy array to a WAV-like audio segment
-            audio_segment = pydub.AudioSegment(
-                audio_data.tobytes(),
-                frame_rate=16000, # Default WebRTC audio frame rate
-                sample_width=2, # 16-bit audio
-                channels=1 # Mono
-            )
-            # Save or process audio_segment as needed
-            st.success("Audio captured successfully!")
-            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
-        else:
-            st.warning("No audio data captured!")
-
-
-    if st.button("Transcribe Audio"):
-        if audio_data is not None:
-            # Perform ASR on the audio segment
-            transcription = inference(generator, audio_segment.raw_data)
-            st.text_area("Transcription", transcription["text"])
-        else:
-            st.warning("No audio data to transcribe!")
-
-
-if __name__ == "__main__":
-    # Setting logger
-    logger.setLevel(logging.INFO)
-
-    formatter = logging.Formatter("%(levelname)8s %(asctime)s %(name)s %(message)s")
-
-    stream_handler = logging.StreamHandler()
-    stream_handler.setFormatter(formatter)
-    logger.addHandler(stream_handler)
-
-    now = datetime.datetime.now()
-    now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
-    log_dir = Path(LOG_DIR)
-    log_dir.mkdir(parents=True, exist_ok=True)
-    log_file = log_dir / f"{now_time}.log"
-    file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
-    file_handler.setFormatter(formatter)
-    logger.addHandler(file_handler)
-
-    logger.info('Start App')
-
-    main()
+# Title of the app
+st.title("Real-Time Speech-to-Text")
+
+# Initialize the audio processor
+audio_processor = AudioProcessor()
+
+# WebRTC streamer to capture microphone input
+webrtc_streamer(
+    key="audio",
+    mode=WebRtcMode.SENDONLY,
+    audio_processor_factory=lambda: audio_processor,
+    media_stream_constraints={"audio": True, "video": False},
+)
+
+# Load a pre-trained ASR pipeline from Hugging Face
+@st.cache_resource
+def load_asr_model():
+    return load_model()
+
+asr_model = load_asr_model()
+
+# Button to process audio and perform ASR
+if st.button("Transcribe Audio"):
+    audio_data = audio_processor.get_audio_data()
+    if audio_data is not None:
+        # Convert the NumPy array to a WAV-like audio segment
+        audio_segment = pydub.AudioSegment(
+            audio_data.tobytes(),
+            frame_rate=16000,  # Default WebRTC audio frame rate
+            sample_width=2,  # 16-bit audio
+            channels=1  # Mono
+        )
+
+        # Perform ASR on the audio segment
+        st.info("Transcribing audio...")
+        transcription = inference(asr_model, audio_segment.raw_data)
+
+        # Display transcription
+        st.text_area("Transcription", transcription["text"], height=200)
+    else:
+        st.warning("No audio data captured! Please speak into your microphone.")
+
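Note that the new code passes audio_segment.raw_data (raw 16-bit PCM bytes) straight to inference, while Wav2Vec2-style feature extractors normally expect a float waveform sampled at 16 kHz. A conversion step along the following lines may therefore be needed before calling the model. This is a minimal sketch, assuming 16-bit mono capture; to_float_waveform is a hypothetical helper, not part of this commit.

import numpy as np
import pydub

def to_float_waveform(segment: pydub.AudioSegment) -> np.ndarray:
    # Resample if the browser delivered a different rate (WebRTC capture is often 48 kHz).
    segment = segment.set_frame_rate(16000)
    # Scale int16 samples to float32 in [-1.0, 1.0], the range Wav2Vec2 feature extractors expect.
    samples = np.array(segment.get_array_of_samples(), dtype=np.float32)
    return samples / 32768.0

# Possible usage inside the "Transcribe Audio" branch:
# waveform = to_float_waveform(audio_segment)
# transcription = inference(asr_model, waveform)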
asr.py
CHANGED
@@ -16,9 +16,10 @@ def load_model():
     model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang=target_lang, ignore_mismatched_sizes=True, use_safetensors=True, use_auth_token=hf_token)


-def inference(model,
-    arr, rate = read_audio_data(audio_path)
-
+def inference(model, raw_data):
+    # arr, rate = read_audio_data(audio_path)
+    # arr.squeeze().numpy(), ...
+    inputs = processor(raw_data, sampling_rate=16_000, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs).logits
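The hunk stops at the logits; the text returned in transcription["text"] presumably comes from the usual greedy CTC decode afterwards. A sketch of that step, assuming processor is the Wav2Vec2Processor loaded alongside the model in asr.py (decode_logits is a hypothetical helper, not part of this commit):

import torch

def decode_logits(processor, logits):
    # Greedy CTC decoding: pick the most likely token per frame,
    # then let the processor collapse repeats and blanks into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return {"text": processor.batch_decode(predicted_ids)[0]}

# e.g. at the end of inference(): return decode_logits(processor, outputs)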
oldapp.py
ADDED
@@ -0,0 +1,153 @@
+import time
+import datetime
+import logging
+import soundfile
+import streamlit as st
+from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
+import numpy as np
+import pydub
+from pathlib import Path
+
+from asr import load_model, inference
+
+LOG_DIR = "./logs"
+DATA_DIR = "./data"
+logger = logging.getLogger(__name__)
+
+
+# Define a custom audio processor to handle microphone input
+class AudioProcessor(AudioProcessorBase):
+    def __init__(self):
+        self.audio_data = []
+
+    def recv_audio(self, frame):
+        # Convert the audio frame to a NumPy array
+        audio_array = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
+        self.audio_data.append(audio_array)
+        return frame
+
+    def get_audio_data(self):
+        # Combine all captured audio data
+        if self.audio_data:
+            combined = np.concatenate(self.audio_data, axis=0)
+            return combined
+        return None
+
+
+def upload_audio() -> Path:
+    # Upload audio file
+    uploaded_file = st.file_uploader("Choose a audio file(wav, mp3, flac)", type=['wav','mp3','flac'])
+    if uploaded_file is not None:
+        # Save audio file
+        audio_data, samplerate = soundfile.read(uploaded_file)
+
+        # Make save directory
+        now = datetime.datetime.now()
+        now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
+        audio_dir = Path(DATA_DIR) / f"{now_time}"
+        audio_dir.mkdir(parents=True, exist_ok=True)
+
+        audio_path = audio_dir / uploaded_file.name
+        soundfile.write(audio_path, audio_data, samplerate)
+
+        # Show audio file
+        with open(audio_path, 'rb') as audio_file:
+            audio_bytes = audio_file.read()
+
+        st.audio(audio_bytes, format=uploaded_file.type)
+
+        return audio_path
+
+@st.cache_resource(show_spinner=False)
+def call_load_model():
+    generator = load_model()
+    return generator
+
+def main():
+    st.header("Speech-to-Text app with streamlit")
+    st.markdown(
+        """
+        This STT app is using a fine-tuned MMS ASR model.
+        """
+    )
+
+    audio_path = upload_audio()
+    logger.info(f"Uploaded audio file: {audio_path}")
+
+    with st.spinner(text="Wait for loading ASR Model..."):
+        generator = call_load_model()
+
+    if audio_path is not None:
+        start_time = time.time()
+        with st.spinner(text='Wait for inference...'):
+            output = inference(generator, audio_path)
+
+        end_time = time.time()
+
+        process_time = time.gmtime(end_time - start_time)
+        process_time = time.strftime("%H hour %M min %S secs", process_time)
+
+        st.success(f"Inference finished in {process_time}.")
+        st.write(f"output: {output['text']}")
+
+    st.title("Microphone Input for ASR")
+
+    # Initialize the audio processor
+    audio_processor = AudioProcessor()
+
+    webrtc_streamer(
+        key="audio",
+        mode=WebRtcMode.SENDONLY,
+        audio_processor_factory=lambda: audio_processor,
+        media_stream_constraints={"audio": True, "video": False},
+    )
+
+
+    if st.button("Process Audio"):
+        audio_data = audio_processor.get_audio_data()
+        if audio_data is not None:
+            # Convert the NumPy array to a WAV-like audio segment
+            audio_segment = pydub.AudioSegment(
+                audio_data.tobytes(),
+                frame_rate=16000, # Default WebRTC audio frame rate
+                sample_width=2, # 16-bit audio
+                channels=1 # Mono
+            )
+            # Save or process audio_segment as needed
+            st.success("Audio captured successfully!")
+            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
+        else:
+            st.warning("No audio data captured!")
+
+
+    if st.button("Transcribe Audio"):
+        if audio_data is not None:
+            # Perform ASR on the audio segment
+            transcription = inference(generator, audio_segment.raw_data)
+            st.text_area("Transcription", transcription["text"])
+        else:
+            st.warning("No audio data to transcribe!")
+
+
+if __name__ == "__main__":
+    # Setting logger
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter("%(levelname)8s %(asctime)s %(name)s %(message)s")
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    now = datetime.datetime.now()
+    now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
+    log_dir = Path(LOG_DIR)
+    log_dir.mkdir(parents=True, exist_ok=True)
+    log_file = log_dir / f"{now_time}.log"
+    file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    logger.info('Start App')
+
+    main()