import os

import librosa
import torch
from elevenlabs import set_api_key, generate, play
from huggingface_hub.hf_api import HfFolder
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    MarianMTModel,
    MarianTokenizer,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# API credentials are read from the environment rather than hardcoded in source.
access_token = os.environ.get("HF_TOKEN", "")
if access_token:
    HfFolder.save_token(access_token)

eleven_api_key = os.environ.get("ELEVEN_API_KEY", "")
if eleven_api_key:
    set_api_key(eleven_api_key)
# Hindi -> English translation model (mBART fine-tuned on hin-eng).
tokenizer_hi_en = AutoTokenizer.from_pretrained("vasudevgupta/mbart-bhasha-hin-eng")
model_translate_hi_en = AutoModelForSeq2SeqLM.from_pretrained("vasudevgupta/mbart-bhasha-hin-eng")

# Hindi speech-to-text model (wav2vec 2.0 with a CTC head).
processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
model_speech = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
def parse_transcription(wav_file):
    """Transcribe a Hindi audio file to text with the wav2vec 2.0 model."""
    # Load the audio resampled to the 16 kHz rate the model was trained on.
    audio_input, sample_rate = librosa.load(wav_file, sr=16000)
    # Normalize/pad the waveform and return a PyTorch tensor.
    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values
    # Inference: compute per-frame logits and take the argmax over the vocabulary.
    with torch.no_grad():
        logits = model_speech(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # CTC-decode the predicted ids into a transcription string.
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
def hindi_to_english(text):
    """Translate Hindi text to English with the mBART hin-eng model."""
    inputs = tokenizer_hi_en.encode(
        text, return_tensors="pt", padding=True, max_length=512, truncation=True
    )
    outputs = model_translate_hi_en.generate(
        inputs, max_length=128, num_beams=4, early_stopping=True
    )
    # Skip special tokens (<pad>, <s>, ...) instead of stripping them by hand.
    translated = tokenizer_hi_en.decode(outputs[0], skip_special_tokens=True).strip().lower()
    return translated
def translate_english_to_hindi(input_text):
    """Translate English text to Hindi with the Helsinki-NLP MarianMT model."""
    # Load the pre-trained English-to-Hindi translation model and tokenizer.
    # Note: this reloads the weights on every call; caching them at module
    # level, as done for the other models above, would be faster.
    model_name = "Helsinki-NLP/opus-mt-en-hi"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    # Tokenize the input text and generate the translation.
    inputs = tokenizer(input_text, return_tensors="pt", padding=True)
    translated_ids = model.generate(inputs.input_ids)
    # Decode the translated output, skipping special tokens.
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text
def hindi_tts(text):
    """Synthesize speech for the given text with ElevenLabs."""
    # NOTE: "eleven_monolingual_v1" is an English-only model; for Hindi text a
    # multilingual model such as "eleven_multilingual_v1" is likely needed.
    audio = generate(
        text=text,
        voice="Customer Service",
        model="eleven_monolingual_v1",
    )
    return audio
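
# A minimal end-to-end usage sketch, not part of the original file. It assumes
# HF_TOKEN and ELEVEN_API_KEY are set in the environment and that a recording
# exists at the hypothetical path "sample_hindi.wav": transcribe the Hindi
# speech, translate it to English, translate the (here simply echoed) reply
# back to Hindi, and synthesize it.
if __name__ == "__main__":
    hindi_text = parse_transcription("sample_hindi.wav")  # hypothetical input file
    english_text = hindi_to_english(hindi_text)
    print("Transcription:", hindi_text)
    print("English:", english_text)
    hindi_reply = translate_english_to_hindi(english_text)
    audio = hindi_tts(hindi_reply)
    play(audio)  # playback via the elevenlabs package requires ffmpeg's ffplay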