"""Meeting transcription tool.

Transcribes an uploaded audio file with a distil-Whisper ASR pipeline,
then asks an instruction-tuned LLM (via the Hugging Face Inference API)
to reformat the raw transcript into organized meeting minutes, exposed
through a Gradio interface.
"""

import os
import tempfile

import gradio as gr
import torch
from huggingface_hub import InferenceClient, notebook_login
from transformers import pipeline

# Interactive Hugging Face Hub login (intended for notebook environments).
notebook_login()

TEXT_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
AUDIO_MODEL_NAME = "distil-whisper/distil-large-v3"
BATCH_SIZE = 8

# Use the first CUDA device when available; otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Speech-to-text pipeline; chunking lets it handle audio longer than the
# model's native 30-second window.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=AUDIO_MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Client for the hosted chat-completion endpoint used by organize_text().
client = InferenceClient()


def transcribe(audio_input):
    """Convert an audio file to text.

    Args:
        audio_input: Filesystem path to the audio file (Gradio supplies
            this because the input component uses ``type="filepath"``).

    Returns:
        The transcribed text.

    Raises:
        gr.Error: If no audio file was submitted.
    """
    if audio_input is None:
        raise gr.Error("No audio file submitted.")
    output = pipe(
        audio_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return output["text"]


def build_messages(meeting_transcript) -> list:
    """Build the chat messages asking the LLM to organize a transcript.

    Args:
        meeting_transcript: Raw transcript text to embed in the user prompt.

    Returns:
        A list of ``{"role", "content"}`` message dicts.
    """
    # BUGFIX: corrected typo "assitant" -> "assistant" in the system prompt.
    system_input = "You are an assistant that organizes meeting minutes."
    user_input = """Take this raw meeting transcript and return an organized version.
Here is the transcript:

{meeting_transcript}
""".format(meeting_transcript=meeting_transcript)
    messages = [
        {"role": "system", "content": system_input},
        {"role": "user", "content": user_input},
    ]
    return messages


def organize_text(meeting_transcript):
    """Ask the LLM to turn a raw transcript into organized minutes.

    Args:
        meeting_transcript: Raw transcript text.

    Returns:
        The model's organized version of the transcript.
    """
    messages = build_messages(meeting_transcript)
    # Fixed seed keeps the output reproducible across identical requests.
    response = client.chat_completion(
        messages,
        model=TEXT_MODEL_NAME,
        max_tokens=250,
        seed=430,
    )
    return response.choices[0].message.content


def meeting_transcript_tool(audio_input):
    """End-to-end tool: transcribe the audio, then organize the text.

    Args:
        audio_input: Filesystem path to the uploaded audio file.

    Returns:
        Organized meeting minutes produced by the LLM.
    """
    meeting_text = transcribe(audio_input)
    organized_text = organize_text(meeting_text)
    return organized_text


demo = gr.Interface(
    fn=meeting_transcript_tool,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(show_copy_button=True),
    title="The Complete Meeting Transcription tool",
)

# Guarded so importing this module (e.g. for testing) does not start the
# web server; running it as a script behaves exactly as before.
if __name__ == "__main__":
    demo.launch()