"""Hindi speech/translation helpers.

Pipeline pieces: Hindi speech-to-text (wav2vec2 CTC), Hindi->English and
English->Hindi translation (mBART / MarianMT), and Hindi text-to-speech
via the ElevenLabs API.
"""

import argparse  # NOTE(review): unused here; kept in case a CLI is added elsewhere
from functools import lru_cache

import librosa
import soundfile as sf  # NOTE(review): unused here; kept from original imports
import torch
import torchaudio  # NOTE(review): unused here; kept from original imports
from elevenlabs import clone, generate, play, set_api_key  # NOTE(review): set_api_key/clone/play unused
from huggingface_hub.hf_api import HfFolder
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    MarianMTModel,
    MarianTokenizer,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    pipeline,  # NOTE(review): unused here; kept from original imports
)

# SECURITY: a real Hugging Face token is hard-coded and committed here.
# It must be treated as compromised — revoke it and load the token from an
# environment variable (e.g. os.environ["HF_TOKEN"]) instead.
HfFolder.save_token('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')
access_token = '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'

# Hindi -> English translation model (mBART fine-tuned on hin-eng).
tokenizer_en_hn = AutoTokenizer.from_pretrained("vasudevgupta/mbart-bhasha-hin-eng")
model_translate_en_hm = AutoModelForSeq2SeqLM.from_pretrained("vasudevgupta/mbart-bhasha-hin-eng")

# Hindi ASR model (wav2vec2 with CTC head).
processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
model_speech = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")


def parse_transcription(wav_file):
    """Transcribe a Hindi audio file to text.

    Args:
        wav_file: path to an audio file readable by librosa.

    Returns:
        The decoded transcription string.
    """
    # Resample to 16 kHz — the rate the wav2vec2 model was trained on.
    audio_input, sample_rate = librosa.load(wav_file, sr=16000)
    input_values = processor(
        audio_input, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    # Inference only: disable autograd to avoid building a gradient graph.
    with torch.no_grad():
        logits = model_speech(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)


def hindi_to_english(text):
    """Translate Hindi text to English and return it lower-cased.

    Args:
        text: Hindi source string.

    Returns:
        Lower-cased English translation with special tokens removed.
    """
    inputs = tokenizer_en_hn.encode(
        text, return_tensors="pt", padding=True, max_length=512, truncation=True
    )
    outputs = model_translate_en_hm.generate(
        inputs, max_length=128, num_beams=4, early_stopping=True
    )
    # BUG FIX: the original chained .replace('', '') calls replaced the
    # *empty* string — a no-op — so special tokens such as </s> leaked into
    # the result. skip_special_tokens strips them properly.
    translated = tokenizer_en_hn.decode(outputs[0], skip_special_tokens=True)
    return translated.strip().lower()


@lru_cache(maxsize=1)
def _load_en_hi_translation():
    """Load the English->Hindi MarianMT tokenizer/model once and cache them.

    The original code reloaded (and potentially re-downloaded) the model on
    every call; caching preserves the lazy first-call load but makes
    subsequent calls cheap.
    """
    model_name = "Helsinki-NLP/opus-mt-en-hi"
    return MarianTokenizer.from_pretrained(model_name), MarianMTModel.from_pretrained(model_name)


def translate_english_to_hindi(input_text):
    """Translate English text to Hindi.

    Args:
        input_text: English source string.

    Returns:
        The Hindi translation with special tokens removed.
    """
    tokenizer, model = _load_en_hi_translation()
    inputs = tokenizer(input_text, return_tensors="pt", padding=True)
    translated_ids = model.generate(inputs.input_ids)
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)


def hindi_tts(text):
    """Synthesize speech for *text* via the ElevenLabs API.

    Args:
        text: text to speak.

    Returns:
        The audio payload returned by elevenlabs.generate (raw audio bytes).
    """
    # NOTE(review): set_api_key is imported but never called anywhere in this
    # file; generate() presumably relies on the ELEVEN_API_KEY environment
    # variable — confirm before deploying.
    audio = generate(
        text=text,
        voice="Customer Service",
        model="eleven_monolingual_v1",
    )
    return audio