import torch
import librosa
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor, SeamlessM4Tv2Model, pipeline
import numpy as np
import soundfile as sf
import tempfile
# Load the ASR model and processor
asr_model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
asr_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
# Load the SeamlessM4T model and processor
translator_model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
translator_processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
# Load the Hausa text-to-speech pipeline
tts = pipeline("text-to-speech", model="Baghdad99/hausa_voice_tts")
def translate_speech(audio_file_path):
    # Load the audio file as a floating-point time series, resampled to 16 kHz
    audio_data, sample_rate = librosa.load(audio_file_path, sr=16000)

    # Prepare the input dictionary from the resampled audio
    input_dict = asr_processor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)

    # Use the ASR model to get the logits
    with torch.no_grad():
        logits = asr_model(input_dict.input_values.to("cpu")).logits

    # Get the predicted IDs
    pred_ids = torch.argmax(logits, dim=-1)[0]

    # Decode the predicted IDs to get the transcription
    transcription = asr_processor.decode(pred_ids)
    print(f"Transcription: {transcription}")
    # Prepare the input dictionary for the translator
    text_inputs = translator_processor(text=transcription, src_lang="eng", return_tensors="pt")

    # Translate the transcription into Hausa; generate_speech=False makes the
    # model return text tokens rather than a waveform, and "hau" is the
    # three-letter code SeamlessM4T uses for Hausa
    output_tokens = translator_model.generate(**text_inputs, tgt_lang="hau", generate_speech=False)

    # Decode the translated token IDs, skipping special tokens such as <pad> and </s>
    translated_text_str = translator_processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
    print(f"Translated text string: {translated_text_str}")
    # Use the text-to-speech pipeline to synthesise the translated text
    synthesised_speech = tts(translated_text_str)

    # Check that the synthesised speech contains 'audio'
    if "audio" not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return

    # The TTS pipeline reports its own sampling rate alongside the waveform
    sampling_rate = synthesised_speech.get("sampling_rate", 16000)

    # Flatten the audio data and scale it to the int16 range Gradio expects
    synthesised_speech_data = synthesised_speech["audio"].flatten()
    synthesised_speech_int16 = (synthesised_speech_data * 32767).astype(np.int16)

    return sampling_rate, synthesised_speech_int16
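
# A minimal sketch of the Gradio wiring this Space presumably uses (the
# interface itself is not shown in this excerpt, so the labels and layout
# below are assumptions): a filepath Audio input feeds translate_speech,
# and the (sampling_rate, int16 array) tuple it returns is a valid value
# for a Gradio Audio output component.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(type="filepath", label="English speech"),
    outputs=gr.Audio(label="Hausa speech"),
    title="English-to-Hausa speech translation",
)

if __name__ == "__main__":
    iface.launch()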