Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from streamlit_webrtc import webrtc_streamer
|
3 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2Model
|
4 |
+
import torch
|
5 |
+
import numpy as np
|
6 |
+
import wave
|
7 |
+
import io
|
8 |
+
|
9 |
+
# Load Wav2Vec 2.0 model and processor
@st.cache_resource
def _load_wav2vec2(checkpoint="facebook/wav2vec2-base-960h"):
    """Load the Wav2Vec 2.0 processor and model once per server process.

    Streamlit re-executes this script from top to bottom on every user
    interaction; without caching, both ``from_pretrained`` calls would be
    repeated on each rerun.

    Args:
        checkpoint: Hugging Face model identifier passed to both
            ``from_pretrained`` calls.

    Returns:
        A ``(processor, model)`` tuple.
    """
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2Model.from_pretrained(checkpoint),
    )


# Module-level names kept so the rest of the script can keep using them.
processor, model = _load_wav2vec2()
|
12 |
+
|
13 |
+
# Function to generate embeddings
|
14 |
+
def generate_embedding(audio, samplerate):
    """Return one embedding per input by averaging the model's hidden states.

    Args:
        audio: Raw waveform handed to the module-level ``processor``.
        samplerate: Sampling rate of ``audio``, forwarded to the processor
            as ``sampling_rate``.

    Returns:
        A tensor holding ``last_hidden_state`` averaged over dim 1
        (the time axis).
    """
    features = processor(
        audio,
        sampling_rate=samplerate,
        return_tensors="pt",
        padding=True,
    )
    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        hidden_states = model(features.input_values).last_hidden_state
    # Collapse the time axis into a single vector per input.
    return torch.mean(hidden_states, dim=1)
|
19 |
+
|
20 |
+
# Streamlit Interface
st.title("Live Audio Recording and Embedding with Wav2Vec 2.0")
st.write("Record your audio using the browser and generate embeddings.")

# Imported here so this section is self-contained; the package itself is
# already a dependency of this file (see the webrtc_streamer import).
from streamlit_webrtc import WebRtcMode


def _frame_to_mono(frame):
    """Decode one received audio frame into mono float32 samples in [-1, 1].

    The frame's own sample format and channel layout are used instead of
    assuming float32 mono, because reinterpreting the raw bytes with the
    wrong dtype yields meaningless values.
    """
    samples = frame.to_ndarray()
    n_channels = max(1, len(frame.layout.channels))
    if frame.format.is_planar:
        # Planar layout: one row per channel.
        mono = samples.reshape(n_channels, -1).mean(axis=0)
    else:
        # Packed layout: channels are interleaved in a single row.
        mono = samples.reshape(-1, n_channels).mean(axis=1)
    if np.issubdtype(samples.dtype, np.integer):
        limits = np.iinfo(samples.dtype)
        if limits.min == 0:
            # Unsigned PCM is centred on the midpoint of its range.
            half_range = (limits.max + 1) / 2.0
            mono = (mono - half_range) / half_range
        else:
            mono = mono / float(-limits.min)
    return mono.astype(np.float32)


def _resample(signal, source_rate, target_rate):
    """Linearly interpolate ``signal`` from ``source_rate`` to ``target_rate``."""
    if source_rate == target_rate or signal.size == 0:
        return signal
    n_out = int(round(signal.size * target_rate / source_rate))
    source_times = np.arange(signal.size) / float(source_rate)
    target_times = np.arange(n_out) / float(target_rate)
    # NOTE(review): plain linear interpolation has no anti-alias filter;
    # swap in a proper resampler if audio quality matters.
    return np.interp(target_times, source_times, signal).astype(np.float32)


# WebRTC audio recording
webrtc_ctx = webrtc_streamer(
    key="audio",
    # The mode must be a WebRtcMode enum member, not its name as a string.
    mode=WebRtcMode.SENDONLY,
    media_stream_constraints={"audio": True, "video": False},
    async_processing=False,
)

if webrtc_ctx.audio_receiver:
    audio_frames = webrtc_ctx.audio_receiver.get_frames()

    if not audio_frames:
        # Nothing to decode, save, or embed on this run.
        st.info("No audio frames received yet.")
    else:
        # Convert the received frames to one mono waveform
        samplerate = 16000  # Default sample rate for Wav2Vec2
        mono_audio = np.concatenate(
            [_frame_to_mono(frame) for frame in audio_frames]
        )
        # Assumes every frame in the batch shares the first frame's rate.
        audio_array = _resample(
            mono_audio, audio_frames[0].sample_rate, samplerate
        )

        # Process and save the audio
        file_name = "recorded_audio.wav"
        # The WAV header below declares 16-bit mono PCM, so write exactly that.
        pcm16 = (np.clip(audio_array, -1.0, 1.0) * 32767.0).astype("<i2")
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(samplerate)
            wf.writeframes(pcm16.tobytes())

        st.audio(file_name, format="audio/wav")

        # Generate embedding
        # NOTE(review): very short clips may be rejected by the model;
        # confirm whether a minimum length check is needed here.
        embedding = generate_embedding(audio_array, samplerate)
        st.success("Audio embedding generated!")
        st.write("Embedding Shape:", embedding.shape)
        st.write("Embedding Values:", embedding.numpy())
|