liudongqing
The first version, only recognizes the image
56427b4
raw
history blame
1.05 kB
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
import torch
from threading import Thread
import gradio as gr
from gradio import FileData
import spaces
# Hugging Face Hub checkpoint shared by the model weights and the processor.
model_id = "meta-llama/Llama-3.2-11B-Vision"

# Load the weights in bfloat16; device placement is delegated to
# `device_map="auto"`.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The processor is loaded from the checkpoint id. Passing the instantiated
# model object here (as the code previously did) is not a valid
# name-or-path argument for `from_pretrained`.
processor = AutoProcessor.from_pretrained(model_id)
@spaces.GPU
def score_it(input_img):
    """Ask the vision model to extract the text shown in an image.

    Args:
        input_img: Image object exposing PIL-style ``convert`` and
            ``resize`` methods, or ``None`` when no image was supplied.

    Returns:
        The newly generated text with the prompt and special tokens
        removed, or an empty string when ``input_img`` is ``None``.
    """
    # Nothing uploaded: return an empty answer instead of failing on
    # ``None.convert``.
    if input_img is None:
        return ""

    image = input_img.convert("RGB").resize((224, 224))
    prompt = "<|image|><|begin_of_text|>extract the text in this picture"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=30)

    # The generated sequence starts with the prompt tokens; keep only the
    # continuation so the caller does not see the prompt echoed back.
    prompt_len = inputs["input_ids"].shape[-1]
    text = processor.decode(output[0][prompt_len:], skip_special_tokens=True)

    # Keep the console log, but also return the text so the UI can show it.
    print(text)
    return text
# `score_it` is a single-input, single-output function, so it is served
# through `gr.Interface`. `gr.ChatInterface` takes no `inputs`/`outputs`
# arguments and calls its handler as fn(message, history), which does not
# match `score_it(input_img)`.
# NOTE: the former `stop_btn="Stop Generation"` label was specific to the
# chat UI and is omitted here; the handler is not a streaming generator.
demo = gr.Interface(
    fn=score_it,
    title="Upload your English script and get the score",
    # `score_it` calls `.convert(...)`/`.resize(...)`, so request a PIL image
    # rather than the component's default numpy array.
    inputs=[gr.Image(type="pil")],
    outputs=["text"],
)
demo.launch(debug=True)