"""Gradio demo: upload an image and extract its text with Llama-3.2-11B-Vision.

Loads a ZipNN-compressed Llama-3.2 vision model and serves a single-image
text-extraction UI.
"""
from threading import Thread

import gradio as gr
import torch
from gradio import FileData
from transformers import (
    AutoProcessor,
    MllamaForConditionalGeneration,
    TextIteratorStreamer,
)

import spaces
from zipnn import zipnn_hf

# Patch huggingface_hub so the ZipNN-compressed weights are decompressed
# transparently on download.
zipnn_hf()

model_id = "royleibov/Llama-3.2-11B-Vision-Instruct-ZipNN-Compressed"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# BUG FIX: the processor must be loaded from the model id (a string / repo
# path), not the model object itself.
processor = AutoProcessor.from_pretrained(model_id)


@spaces.GPU
def score_it(input_img):
    """Extract the text visible in an uploaded image.

    Args:
        input_img: PIL image supplied by the Gradio ``Image`` component.

    Returns:
        str: the model's decoded text output.
    """
    image = input_img.convert("RGB").resize((224, 224))
    prompt = "<|image|><|begin_of_text|>extract the text in this picture"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=30)
    # BUG FIX: the handler must *return* the text for Gradio to display it;
    # the original only printed it, so the UI always showed an empty result.
    # skip_special_tokens strips <|image|>/<|begin_of_text|> markers.
    return processor.decode(output[0], skip_special_tokens=True)


# BUG FIX: gr.ChatInterface takes no inputs=/outputs= kwargs and expects a
# (message, history) handler; gr.Interface matches score_it's one-argument
# signature. The ChatInterface-only stop_btn kwarg is dropped.
demo = gr.Interface(
    fn=score_it,
    title="Upload your English script and get the score",
    # type="pil" so .convert()/.resize() work — the modern Gradio default
    # delivers a numpy array, which has neither method.
    inputs=gr.Image(type="pil"),
    outputs="text",
)

demo.launch(debug=True)