import gradio as gr
import numpy as np
import torch
from transformers import AutoProcessor, SeamlessM4Tv2Model

# Sampling rate expected by the SeamlessM4T feature extractor.
TARGET_SR = 16000


class SeamlessM4TApp:
    """Speech-to-text transcription backed by SeamlessM4T-v2-large."""

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # The v2 checkpoint must be loaded with the v2 model class;
        # SeamlessM4TModel targets the v1 architecture and does not match
        # the "facebook/seamless-m4t-v2-large" weights.
        self.processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
        self.model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
        self.model.to(self.device)

    def transcribe_audio(self, audio):
        """Transcribe uploaded/recorded audio to English text.

        Args:
            audio: ``(sample_rate, data)`` tuple from a ``gr.Audio``
                component with ``type="numpy"`` (``data`` may be int PCM
                or float, mono or multi-channel), or ``None`` when the
                user submitted without providing audio.

        Returns:
            The transcription string, or an error message on failure.
        """
        try:
            if audio is None:
                return "No audio provided."
            sample_rate, data = audio

            # Normalize to mono float32 in [-1, 1]; Gradio delivers
            # int16 PCM by default.
            samples = np.asarray(data)
            waveform = torch.as_tensor(samples, dtype=torch.float32)
            if np.issubdtype(samples.dtype, np.integer):
                waveform = waveform / float(np.iinfo(samples.dtype).max)
            if waveform.ndim > 1:
                waveform = waveform.mean(dim=-1)

            # The feature extractor expects 16 kHz input; the processor
            # does not resample mismatched-rate arrays itself. Linear
            # interpolation is a dependency-free approximation of proper
            # band-limited resampling.
            if sample_rate != TARGET_SR:
                new_len = int(round(waveform.shape[0] * TARGET_SR / sample_rate))
                waveform = torch.nn.functional.interpolate(
                    waveform[None, None, :],
                    size=new_len,
                    mode="linear",
                    align_corners=False,
                )[0, 0]

            # The processor requires raw sample arrays — it cannot load
            # a file path (the original code passed one).
            audio_inputs = self.processor(
                audios=waveform.numpy(),
                sampling_rate=TARGET_SR,
                return_tensors="pt",
            ).to(self.device)

            # `task` is not a valid generate() kwarg for this model;
            # generate_speech=False requests text-only output.
            with torch.no_grad():
                output_tokens = self.model.generate(
                    **audio_inputs,
                    tgt_lang="eng",
                    generate_speech=False,
                )

            # With generate_speech=False the token ids are the first
            # element of the returned output.
            return self.processor.decode(
                output_tokens[0].tolist()[0],
                skip_special_tokens=True,
            )
        except Exception as e:
            return f"Error during transcription: {str(e)}"


def create_interface():
    """Build the Gradio interface around a fresh SeamlessM4TApp."""
    app = SeamlessM4TApp()

    interface = gr.Interface(
        fn=app.transcribe_audio,
        # Gradio 4 replaced `source=` with `sources=[...]`; both upload
        # and microphone are allowed, matching the description below.
        # type="numpy" hands us (sample_rate, samples) directly.
        inputs=gr.Audio(
            type="numpy",
            label="Upload Audio",
            sources=["microphone", "upload"],
        ),
        outputs=gr.Textbox(label="Transcription"),
        title="SeamlessM4T Speech-to-Text",
        description="Upload audio or use microphone to transcribe speech to text using SeamlessM4T model.",
    )
    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch()