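"""Echo Mind 🧠 — a Streamlit Space that records a voice clip, transcribes it
with Whisper Large V3 Turbo, and re-speaks the transcription in the speaker's
own voice via Zonos voice cloning.

Run locally (assuming this file is saved as app.py):

    streamlit run app.py
"""
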
import streamlit as st
import tempfile
import os
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
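
# Assumed dependencies: streamlit, torch, torchaudio, transformers, and the
# zonos package (assumed to come from https://github.com/Zyphra/Zonos).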

st.set_page_config(page_title="Echo Mind 🧠", page_icon="🧠")
st.title("Echo Mind 🧠")
st.write("Voice Transcription with Whisper Large V3 Turbo and Zonos Speech Synthesis")


@st.cache_resource
def load_whisper_model():
    """Load the Whisper ASR pipeline once and reuse it across Streamlit reruns."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3-turbo"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=False, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe


@st.cache_resource
def load_zonos_model():
    """Load the Zonos TTS model once and reuse it across Streamlit reruns."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)
    return model


def transcribe_audio(audio_path, pipe):
    # The ASR pipeline accepts a file path directly
    result = pipe(audio_path)
    return result["text"]


def synthesize_speech(text, audio_path, zonos_model):
    """Generate speech using Zonos with the voice from the input audio."""
    try:
        # Load the input audio to extract speaker characteristics
        wav, sampling_rate = torchaudio.load(audio_path)
        speaker = zonos_model.make_speaker_embedding(wav, sampling_rate)
        # Prepare conditioning and generate speech
        cond_dict = make_cond_dict(text=text, speaker=speaker, language="en-us")
        conditioning = zonos_model.prepare_conditioning(cond_dict)
        codes = zonos_model.generate(conditioning)
        wavs = zonos_model.autoencoder.decode(codes).cpu()
        # Save to a temporary file (NamedTemporaryFile avoids the race-prone tempfile.mktemp)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        torchaudio.save(output_path, wavs[0], zonos_model.autoencoder.sampling_rate)
        return output_path
    except Exception as e:
        st.error(f"Speech synthesis error: {e}")
        return None


# Load the Whisper model
with st.spinner("Loading Whisper Large V3 Turbo model..."):
    pipe = load_whisper_model()
st.success("Model loaded!")

# Load the Zonos model
with st.spinner("Loading Zonos voice synthesis model..."):
    zonos_model = load_zonos_model()
st.success("Voice synthesis model loaded!")

audio_bytes = st.audio_input("Record")

if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")
    # Save the recording to a temporary file, replacing the file from a previous rerun
    old_path = st.session_state.get("audio_path")
    if old_path and os.path.exists(old_path):
        os.unlink(old_path)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        # getvalue() returns the full recording even if the buffer was already read
        tmp_file.write(audio_bytes.getvalue())
        audio_path = tmp_file.name
    st.session_state.audio_path = audio_path

    col1, col2 = st.columns(2)
    with col1:
        if st.button("Transcribe Audio"):
            with st.spinner("Transcribing with Whisper Large V3 Turbo..."):
                transcription = transcribe_audio(audio_path, pipe)
                st.session_state.transcription = transcription
            st.subheader("Transcription")
            st.write(transcription)
    with col2:
        if st.button("Speak Transcription") and "transcription" in st.session_state:
            with st.spinner("Synthesizing speech with Zonos..."):
                output_path = synthesize_speech(
                    st.session_state.transcription,
                    st.session_state.audio_path,
                    zonos_model,
                )
            if output_path:
                st.subheader("Synthesized Speech")
                with open(output_path, "rb") as f:
                    synthesized_bytes = f.read()  # avoid shadowing the recorder's audio_bytes
                st.audio(synthesized_bytes, format="audio/wav")
                # Clean up the synthesized file once its bytes are in memory
                os.unlink(output_path)