try new approach
.oldapp.py.swp
ADDED
Binary file (16.4 kB)
app.py
CHANGED
@@ -1,19 +1,10 @@
-import time
-import datetime
-import logging
-import soundfile
 import streamlit as st
 from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
 import numpy as np
 import pydub
-from
-
+from transformers import pipeline
 from asr import load_model, inference

-LOG_DIR = "./logs"
-DATA_DIR = "./data"
-logger = logging.getLogger(__name__)
-

 # Define a custom audio processor to handle microphone input
 class AudioProcessor(AudioProcessorBase):
@@ -33,121 +24,45 @@ class AudioProcessor(AudioProcessorBase):
             return combined
         return None

-
-def upload_audio() -> Path:
-    # Upload audio file
-    uploaded_file = st.file_uploader("Choose a audio file(wav, mp3, flac)", type=['wav','mp3','flac'])
-    if uploaded_file is not None:
-        # Save audio file
-        audio_data, samplerate = soundfile.read(uploaded_file)
-
-        # Make save directory
-        now = datetime.datetime.now()
-        now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
-        audio_dir = Path(DATA_DIR) / f"{now_time}"
-        audio_dir.mkdir(parents=True, exist_ok=True)
-
-        audio_path = audio_dir / uploaded_file.name
-        soundfile.write(audio_path, audio_data, samplerate)
-
-        # Show audio file
-        with open(audio_path, 'rb') as audio_file:
-            audio_bytes = audio_file.read()
-
-        st.audio(audio_bytes, format=uploaded_file.type)
-
-        return audio_path
-
-@st.cache_resource(show_spinner=False)
-def call_load_model():
-    generator = load_model()
-    return generator
-
-def main():
-    st.header("Speech-to-Text app with streamlit")
-    st.markdown(
-        """
-        This STT app is using a fine-tuned MMS ASR model.
-        """
-    )
-
-    audio_path = upload_audio()
-    logger.info(f"Uploaded audio file: {audio_path}")
-
-    with st.spinner(text="Wait for loading ASR Model..."):
-        generator = call_load_model()
-
-    if audio_path is not None:
-        start_time = time.time()
-        with st.spinner(text='Wait for inference...'):
-            output = inference(generator, audio_path)
-
-        end_time = time.time()
-
-        process_time = time.gmtime(end_time - start_time)
-        process_time = time.strftime("%H hour %M min %S secs", process_time)
-
-        st.success(f"Inference finished in {process_time}.")
-        st.write(f"output: {output['text']}")
-
-    st.title("Microphone Input for ASR")
-
-    # Initialize the audio processor
-    audio_processor = AudioProcessor()
-
-    webrtc_streamer(
-        key="audio",
-        mode=WebRtcMode.SENDONLY,
-        audio_processor_factory=lambda: audio_processor,
-        media_stream_constraints={"audio": True, "video": False},
-    )
-
-
-    if st.button("Process Audio"):
-        audio_data = audio_processor.get_audio_data()
-        if audio_data is not None:
-            # Convert the NumPy array to a WAV-like audio segment
-            audio_segment = pydub.AudioSegment(
-                audio_data.tobytes(),
-                frame_rate=16000, # Default WebRTC audio frame rate
-                sample_width=2, # 16-bit audio
-                channels=1 # Mono
-            )
-            # Save or process audio_segment as needed
-            st.success("Audio captured successfully!")
-            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
-        else:
-            st.warning("No audio data captured!")
-
-
-    if st.button("Transcribe Audio"):
-        if audio_data is not None:
-            # Perform ASR on the audio segment
-            transcription = inference(generator, audio_segment.raw_data)
-            st.text_area("Transcription", transcription["text"])
-        else:
-            st.warning("No audio data to transcribe!")
-
-
-if __name__ == "__main__":
-    # Setting logger
-    logger.setLevel(logging.INFO)
-
-    formatter = logging.Formatter("%(levelname)8s %(asctime)s %(name)s %(message)s")
-
-    stream_handler = logging.StreamHandler()
-    stream_handler.setFormatter(formatter)
-    logger.addHandler(stream_handler)
-
-    now = datetime.datetime.now()
-    now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
-    log_dir = Path(LOG_DIR)
-    log_dir.mkdir(parents=True, exist_ok=True)
-    log_file = log_dir / f"{now_time}.log"
-    file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
-    file_handler.setFormatter(formatter)
-    logger.addHandler(file_handler)
-
-    logger.info('Start App')
-
-    main()
+# Title of the app
+st.title("Real-Time Speech-to-Text")
+
+# Initialize the audio processor
+audio_processor = AudioProcessor()
+
+# WebRTC streamer to capture microphone input
+webrtc_streamer(
+    key="audio",
+    mode=WebRtcMode.SENDONLY,
+    audio_processor_factory=lambda: audio_processor,
+    media_stream_constraints={"audio": True, "video": False},
+)
+
+# Load a pre-trained ASR pipeline from Hugging Face
+@st.cache_resource
+def load_asr_model():
+    return load_model()
+
+asr_model = load_asr_model()
+
+# Button to process audio and perform ASR
+if st.button("Transcribe Audio"):
+    audio_data = audio_processor.get_audio_data()
+    if audio_data is not None:
+        # Convert the NumPy array to a WAV-like audio segment
+        audio_segment = pydub.AudioSegment(
+            audio_data.tobytes(),
+            frame_rate=16000,  # Default WebRTC audio frame rate
+            sample_width=2,  # 16-bit audio
+            channels=1  # Mono
+        )
+
+        # Perform ASR on the audio segment
+        st.info("Transcribing audio...")
+        transcription = inference(asr_model, audio_segment.raw_data)
+
+        # Display transcription
+        st.text_area("Transcription", transcription["text"], height=200)
+    else:
+        st.warning("No audio data captured! Please speak into your microphone.")
+
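Note that the new code passes audio_segment.raw_data (raw 16-bit PCM bytes) straight to inference, while Wav2Vec2-style feature extractors normally expect a float waveform sampled at 16 kHz. A conversion step along the following lines may therefore be needed before calling the model. This is a minimal sketch, assuming 16-bit mono capture; to_float_waveform is a hypothetical helper, not part of this commit.

import numpy as np
import pydub

def to_float_waveform(segment: pydub.AudioSegment) -> np.ndarray:
    # Resample if the browser delivered a different rate (WebRTC capture is often 48 kHz).
    segment = segment.set_frame_rate(16000)
    # Scale int16 samples to float32 in [-1.0, 1.0], the range Wav2Vec2 feature extractors expect.
    samples = np.array(segment.get_array_of_samples(), dtype=np.float32)
    return samples / 32768.0

# Possible usage inside the "Transcribe Audio" branch:
# waveform = to_float_waveform(audio_segment)
# transcription = inference(asr_model, waveform)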
asr.py
CHANGED
@@ -16,9 +16,10 @@ def load_model():
     model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang=target_lang, ignore_mismatched_sizes=True, use_safetensors=True, use_auth_token=hf_token)


-def inference(model,
-    arr, rate = read_audio_data(audio_path)
-
+def inference(model, raw_data):
+    # arr, rate = read_audio_data(audio_path)
+    # arr.squeeze().numpy(), ...
+    inputs = processor(raw_data, sampling_rate=16_000, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs).logits
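The hunk stops at the logits; the text returned in transcription["text"] presumably comes from the usual greedy CTC decode afterwards. A sketch of that step, assuming processor is the Wav2Vec2Processor loaded alongside the model in asr.py (decode_logits is a hypothetical helper, not part of this commit):

import torch

def decode_logits(processor, logits):
    # Greedy CTC decoding: pick the most likely token per frame,
    # then let the processor collapse repeats and blanks into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return {"text": processor.batch_decode(predicted_ids)[0]}

# e.g. at the end of inference(): return decode_logits(processor, outputs)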
oldapp.py
ADDED
@@ -0,0 +1,153 @@
+import time
+import datetime
+import logging
+import soundfile
+import streamlit as st
+from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
+import numpy as np
+import pydub
+from pathlib import Path
+
+from asr import load_model, inference
+
+LOG_DIR = "./logs"
+DATA_DIR = "./data"
+logger = logging.getLogger(__name__)
+
+
+# Define a custom audio processor to handle microphone input
+class AudioProcessor(AudioProcessorBase):
+    def __init__(self):
+        self.audio_data = []
+
+    def recv_audio(self, frame):
+        # Convert the audio frame to a NumPy array
+        audio_array = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
+        self.audio_data.append(audio_array)
+        return frame
+
+    def get_audio_data(self):
+        # Combine all captured audio data
+        if self.audio_data:
+            combined = np.concatenate(self.audio_data, axis=0)
+            return combined
+        return None
+
+
+def upload_audio() -> Path:
+    # Upload audio file
+    uploaded_file = st.file_uploader("Choose a audio file(wav, mp3, flac)", type=['wav','mp3','flac'])
+    if uploaded_file is not None:
+        # Save audio file
+        audio_data, samplerate = soundfile.read(uploaded_file)
+
+        # Make save directory
+        now = datetime.datetime.now()
+        now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
+        audio_dir = Path(DATA_DIR) / f"{now_time}"
+        audio_dir.mkdir(parents=True, exist_ok=True)
+
+        audio_path = audio_dir / uploaded_file.name
+        soundfile.write(audio_path, audio_data, samplerate)
+
+        # Show audio file
+        with open(audio_path, 'rb') as audio_file:
+            audio_bytes = audio_file.read()
+
+        st.audio(audio_bytes, format=uploaded_file.type)
+
+        return audio_path
+
+@st.cache_resource(show_spinner=False)
+def call_load_model():
+    generator = load_model()
+    return generator
+
+def main():
+    st.header("Speech-to-Text app with streamlit")
+    st.markdown(
+        """
+        This STT app is using a fine-tuned MMS ASR model.
+        """
+    )
+
+    audio_path = upload_audio()
+    logger.info(f"Uploaded audio file: {audio_path}")
+
+    with st.spinner(text="Wait for loading ASR Model..."):
+        generator = call_load_model()
+
+    if audio_path is not None:
+        start_time = time.time()
+        with st.spinner(text='Wait for inference...'):
+            output = inference(generator, audio_path)
+
+        end_time = time.time()
+
+        process_time = time.gmtime(end_time - start_time)
+        process_time = time.strftime("%H hour %M min %S secs", process_time)
+
+        st.success(f"Inference finished in {process_time}.")
+        st.write(f"output: {output['text']}")
+
+    st.title("Microphone Input for ASR")
+
+    # Initialize the audio processor
+    audio_processor = AudioProcessor()
+
+    webrtc_streamer(
+        key="audio",
+        mode=WebRtcMode.SENDONLY,
+        audio_processor_factory=lambda: audio_processor,
+        media_stream_constraints={"audio": True, "video": False},
+    )
+
+
+    if st.button("Process Audio"):
+        audio_data = audio_processor.get_audio_data()
+        if audio_data is not None:
+            # Convert the NumPy array to a WAV-like audio segment
+            audio_segment = pydub.AudioSegment(
+                audio_data.tobytes(),
+                frame_rate=16000, # Default WebRTC audio frame rate
+                sample_width=2, # 16-bit audio
+                channels=1 # Mono
+            )
+            # Save or process audio_segment as needed
+            st.success("Audio captured successfully!")
+            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
+        else:
+            st.warning("No audio data captured!")
+
+
+    if st.button("Transcribe Audio"):
+        if audio_data is not None:
+            # Perform ASR on the audio segment
+            transcription = inference(generator, audio_segment.raw_data)
+            st.text_area("Transcription", transcription["text"])
+        else:
+            st.warning("No audio data to transcribe!")
+
+
+if __name__ == "__main__":
+    # Setting logger
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter("%(levelname)8s %(asctime)s %(name)s %(message)s")
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    now = datetime.datetime.now()
+    now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
+    log_dir = Path(LOG_DIR)
+    log_dir.mkdir(parents=True, exist_ok=True)
+    log_file = log_dir / f"{now_time}.log"
+    file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    logger.info('Start App')
+
+    main()