File size: 1,113 Bytes
56427b4
 
 
5bb878a
56427b4
 
53a41f9
 
 
 
 
 
56427b4
 
 
 
 
 
5bb878a
56427b4
5bb878a
 
56427b4
 
 
5bb878a
56427b4
 
5bb878a
56427b4
 
5bb878a
 
56427b4
5bb878a
56427b4
 
 
 
5bb878a
56427b4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Gradio demo: OCR-style text extraction from an image using a ZipNN-compressed
# Llama 3.2 11B Vision (Mllama) checkpoint.
# NOTE(review): TextIteratorStreamer, Thread, and FileData are imported but never
# used below — presumably left over from a streaming-chat variant; confirm before
# removing.
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
import torch
from threading import Thread
import gradio as gr
from gradio import FileData
import spaces  # Hugging Face Spaces GPU decorator (@spaces.GPU)

from zipnn import zipnn_hf

# Monkey-patches huggingface_hub so ZipNN-compressed weights are decompressed
# transparently during from_pretrained downloads.
zipnn_hf()

# Hub id of the ZipNN-compressed Llama 3.2 11B Vision Instruct checkpoint.
model_id = "royleibov/Llama-3.2-11B-Vision-Instruct-ZipNN-Compressed"

# Load the vision-language model in bfloat16, sharded automatically across
# available devices.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# BUG FIX: from_pretrained expects a model id/path string, not the loaded
# model object — passing `model` here raises at startup. Use `model_id`.
processor = AutoProcessor.from_pretrained(model_id)


@spaces.GPU
def score_it(input_img):
    """Extract the text visible in *input_img* using the vision model.

    Args:
        input_img: A PIL image (Gradio supplies it when the Image component
            uses type="pil").

    Returns:
        str: The model's generated answer, decoded without special tokens.
    """
    image = input_img.convert("RGB").resize((224, 224))

    prompt = "<|image|><|begin_of_text|>extract the text in this picture"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=30)
    # BUG FIX: the original printed the result and returned None, so the
    # Gradio output component always stayed empty. Return only the newly
    # generated tokens (skip the echoed prompt) as clean text.
    prompt_len = inputs["input_ids"].shape[-1]
    return processor.decode(output[0][prompt_len:], skip_special_tokens=True)


# BUG FIX: gr.ChatInterface does not accept inputs=/outputs= and requires an
# fn(message, history) signature, which score_it(input_img) does not match —
# the app would fail at construction. gr.Interface is the correct wrapper for
# a single image-in / text-out function. type="pil" is required because
# score_it calls PIL methods (.convert/.resize) on the input.
demo = gr.Interface(
    fn=score_it,
    title="Upload your English script and get the score",
    inputs=gr.Image(type="pil"),
    outputs="text",
)

demo.launch(debug=True)