"""Gradio front-end for a multi-modal (text / image / audio) Phi-2 LLM.

Collects a text prompt plus optional image and audio (uploaded file or
microphone recording), shows them in a chatbot, runs inference on CPU,
and appends the model's answer to the conversation.
"""

import gradio as gr
from PIL import Image

from inference.main import MultiModalPhi2

# Single shared model instance; CPU-only inference with modest sampling.
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)

theme = gr.themes.Default(primary_hue="blue").set(
    loader_color="#FF0000",
    button_primary_background_fill="*primary_200",
    button_primary_background_fill_hover="*primary_300",
)


def add_content(chatbot, text, image, audio_upload, audio_mic) -> gr.Chatbot:
    """Append the user's text/image/audio entries to the chat history.

    Each provided modality becomes its own chatbot message (files are
    passed as 1-tuples, Gradio's convention for media messages).

    Raises:
        gr.Error: when no input of any kind was provided.
    """
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    # A microphone recording takes precedence over an uploaded audio file;
    # run() mirrors this precedence so the displayed clip is the one used.
    if audio_mic is not None:
        chatbot.append(((audio_mic,), None))
        audioflag = True
    elif audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        raise gr.Error("Enter a valid text, image or audio")
    return chatbot


def clear_data():
    """Reset all input widgets and empty the chat history.

    Returned in the order of the Clear button's ``outputs`` wiring:
    (prompt, image, audio_upload, audio_mic, chatbot).
    """
    return None, None, None, None, []


def run(history, text, image, audio_upload, audio_mic):
    """Run inference on the collected inputs and append the answer.

    Returns the updated history plus ``None`` for each input widget so
    they are cleared after a successful submission.
    """
    if text in [None, ""]:
        text = None
    # Same precedence as add_content: microphone first, then upload,
    # so the clip shown in the chat is the one sent to the model.
    if audio_mic is not None:
        audio = audio_mic
    elif audio_upload is not None:
        audio = audio_upload
    else:
        audio = None
    if image is not None:
        # The Image component hands us a file path; the model wants PIL.
        image = Image.open(image)
    outputs = multimodal_phi2(text, audio, image)
    # Show the model output verbatim — the previous .title() call mangled
    # responses by title-casing every word ("don't" -> "Don'T").
    history.append((None, outputs))
    return history, None, None, None, None


with gr.Blocks(theme=theme) as demo:
    gr.Markdown("## 🤖 Multi-modal LLM")
    gr.Markdown("This is a multi-modal LLM that takes text, image and audio as inputs.")

    with gr.Row():
        chatbot = gr.Chatbot(
            avatar_images=("🧑", "🤖"),
            height=560,
        )
    with gr.Row():
        image = gr.Image(type="filepath", value=None)
        audio_upload = gr.Audio(source="upload", type="filepath")
        audio_mic = gr.Audio(source="microphone", type="filepath", format="mp3")
    with gr.Row():
        prompt = gr.Textbox(
            placeholder="Ask anything", lines=2, label="Query", value=None, scale=4
        )
    with gr.Row():
        submit = gr.Button(value="Submit", variant="primary")
        clear = gr.Button(value="Clear")

    # Validate/echo the inputs first; only on success run inference,
    # which also clears the input widgets via its extra None returns.
    submit.click(
        add_content,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot],
    ).success(
        run,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot, prompt, image, audio_upload, audio_mic],
    )
    clear.click(
        clear_data,
        outputs=[prompt, image, audio_upload, audio_mic, chatbot],
    )

demo.launch()