Spaces:

jkorstad
/

Llama-3.2-11B-Vision-Instruct-Chat

Runtime error

App Files Files Community

Llama-3.2-11B-Vision-Instruct-Chat / app.py

jkorstad

Update app.py

5e0039f verified 13 days ago

raw

history blame contribute delete

3.02 kB

	import gradio as gr
	import spaces
	import os
	import torch
	from transformers import AutoProcessor, MllamaForConditionalGeneration
	from PIL import Image, ImageOps
	import whisper

	# Hugging Face access token, read from the environment.
	hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
	if hf_token is None or hf_token == "":
	    # Stop at startup when the token is missing or empty; it is passed to
	    # both from_pretrained() calls below.
	    raise ValueError("HUGGING_FACE_HUB_TOKEN not found.")

	# Model checkpoint and its matching processor, both loaded with the token.
	model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
	_model_load_options = {
	    "token": hf_token,
	    "torch_dtype": torch.bfloat16,  # load the weights as bfloat16
	    "device_map": "auto",  # device placement is left to the library
	}
	model = MllamaForConditionalGeneration.from_pretrained(
	    model_name, **_model_load_options
	)
	processor = AutoProcessor.from_pretrained(model_name, token=hf_token)

	@spaces.GPU
	def predict(image, text):
	    """Answer a text prompt about an image with the loaded vision model.

	    Args:
	        image: The image to ask about. The UI passes the value of a
	            ``gr.Image(type="pil")`` component, which is ``None`` when the
	            user submits without an image.
	        text: The user's question or instruction about the image.

	    Returns:
	        The model's generated reply as a stripped string, without the
	        echoed prompt.

	    Raises:
	        gr.Error: If no image was provided; Gradio shows the message to
	            the user instead of an opaque processor failure.
	    """
	    # NOTE(review): @spaces.GPU presumably requests a GPU for the duration
	    # of each call on Hugging Face Spaces - confirm against the `spaces` docs.
	    if image is None:
	        raise gr.Error("Please provide an image.")

	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image"},
	                {"type": "text", "text": text},
	            ],
	        }
	    ]

	    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
	    # The chat template has already rendered the special tokens, so the
	    # processor must not add them a second time (matches the model card).
	    inputs = processor(
	        image,
	        input_text,
	        add_special_tokens=False,
	        return_tensors="pt",
	    ).to(model.device)
	    outputs = model.generate(**inputs, max_new_tokens=250)

	    # generate() returns the prompt tokens followed by the continuation.
	    # Decode only the continuation instead of splitting the full text on the
	    # word "assistant", which broke when the prompt itself contained that
	    # word and raised IndexError when the word was absent.
	    prompt_length = inputs["input_ids"].shape[-1]
	    generated_tokens = outputs[0][prompt_length:]
	    return processor.decode(generated_tokens, skip_special_tokens=True).strip()

	# Optional Whisper speech-to-text (STT) model; disabled while audio transcription is a work in progress
	#@spaces.GPU
	#def transcribe_audio(audio):
	# result = whisper.transcribe(audio, model="base")
	# return result["text"]

	# Example photos, rotated according to their EXIF orientation, and the
	# prompt paired with each one (same order in both lists).
	example_images = [
	    ImageOps.exif_transpose(Image.open(filename))
	    for filename in (
	        "Illustration by @twentyone21___.jpg",
	        "Kynda Coffee.jpg",
	        "Cowboy Hat.jpg",
	        "Norway.JPG",
	    )
	]
	example_prompts = [
	    "Describe the photo",
	    "Search for the business name on his t-shirt to get a description of where the person is in Texas.",
	    "Describe the photo",
	    "Where do you think this photo was taken based on the architecture?",
	]

	# Gradio UI: one tab with an image input, a text prompt, and the model output.
	with gr.Blocks() as demo:
	    gr.Markdown("# Image Question Answering and Optional (WIP) Audio Transcription")

	    with gr.Tab("Image & Text Prompt"):
	        image_input = gr.Image(type="pil", label="Image Input")
	        text_input = gr.Textbox(label="Text Input")
	        output = gr.Textbox(label="Output")

	        # Clicking Submit runs predict() on the current image and prompt.
	        submit_button = gr.Button("Submit")
	        submit_button.click(predict, inputs=[image_input, text_input], outputs=output)

	        # One example row per (image, prompt) pair, wired to the two inputs.
	        example_rows = [list(pair) for pair in zip(example_images, example_prompts)]
	        gr.Examples(examples=example_rows, inputs=[image_input, text_input])

	    # Audio transcription tab, disabled while it is a work in progress:
	    # with gr.Tab("Audio Transcription (WIP) Prompt"):
	    #     gr.load("models/openai/whisper-large-v3")
	    #     audio_input = gr.Audio(label="Audio Input")
	    #     text_output = gr.Textbox(label="Transcribed Text")
	    #     gr.Button("Transcribe").click(transcribe_audio, inputs=audio_input, outputs=text_output)
	    #     gr.Button("Submit").click(predict, inputs=[image_input, text_output], outputs=output)

	demo.launch()