"""Gradio demo that asks Llama 3.2 Vision to extract the text from an uploaded image."""

from threading import Thread

import gradio as gr
import spaces
import torch
from gradio import FileData
from transformers import (
    AutoProcessor,
    MllamaForConditionalGeneration,
    TextIteratorStreamer,
)

model_id = "meta-llama/Llama-3.2-11B-Vision"

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# The processor must be loaded from the checkpoint id, not from the model object.
processor = AutoProcessor.from_pretrained(model_id)


@spaces.GPU
def score_it(input_img):
    """Run the vision model on an uploaded image and return the generated text.

    Args:
        input_img: The uploaded picture as a PIL image (the UI component is
            configured with ``type="pil"``), or ``None`` if nothing was uploaded.

    Returns:
        The text generated by the model, without the echoed prompt or
        special tokens.

    Raises:
        gr.Error: If no image was provided.
    """
    if input_img is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please upload an image first.")

    # NOTE(review): the processor performs its own resizing; shrinking to
    # 224x224 beforehand may make small text unreadable -- confirm this is wanted.
    image = input_img.convert("RGB").resize((224, 224))
    prompt = "<|image|><|begin_of_text|>extract the text in this picture"

    inputs = processor(images=image, text=prompt, return_tensors="pt").to(
        model.device
    )
    output = model.generate(**inputs, max_new_tokens=30)

    # generate() returns prompt + continuation; keep only the new tokens so
    # the prompt is not echoed back to the user.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = output[0][prompt_length:]
    return processor.decode(generated_tokens, skip_special_tokens=True).strip()


# gr.Interface (not gr.ChatInterface) is the component that accepts
# `inputs`/`outputs` and calls `fn` with the image alone. `type="pil"` makes
# Gradio pass a PIL image, which `score_it` relies on for `.convert()`.
demo = gr.Interface(
    fn=score_it,
    title="Upload your English script and get the score",
    inputs=[gr.Image(type="pil")],
    outputs=["text"],
)

demo.launch(debug=True)