File size: 2,070 Bytes
772f47b
 
0071505
 
 
 
 
 
772f47b
0071505
 
 
 
 
 
 
 
 
 
 
772f47b
0071505
 
 
 
772f47b
0071505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
772f47b
 
 
 
 
 
0071505
772f47b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import time
import uuid

import gradio as gr
import spaces
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor


# Hugging Face model id: Qari, an Arabic-OCR fine-tune of Qwen2-VL-2B-Instruct.
model_name = "oddadmix/Qari-OCR-0.1-VL-2B-Instruct"
# Load the vision-language model straight onto the GPU at import time;
# torch_dtype="auto" keeps the checkpoint's native precision.
model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype="auto",
                device_map="cuda"
            )
# Processor bundles the tokenizer with the image preprocessor for this model.
processor = AutoProcessor.from_pretrained(model_name)
# Upper bound on newly generated tokens per OCR request.
max_tokens = 2000

@spaces.GPU
def perform_ocr(image):
    """Extract plain text from a document-page image with the Qari model.

    The image is saved to a uniquely named temporary PNG so it can be
    referenced by a ``file://`` URI in the chat message, and the file is
    removed again even if generation fails.

    Args:
        image: PIL.Image.Image of a single document page.

    Returns:
        str: the model's plain-text transcription of the page.
    """
    # BUG FIX: uuid.uuid4() returns a UUID object; the original
    # `uuid.uuid4() + ".png"` raised TypeError. Format it as a string.
    src = f"{uuid.uuid4()}.png"

    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
    image.save(src)

    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{src}"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        # Render the chat template (text only) and gather the vision inputs.
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
        # Drop the prompt tokens so only newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
    finally:
        # BUG FIX: cleanup now runs on every exit path, and `os` is imported
        # at module level (it was missing entirely before).
        if os.path.exists(src):
            os.remove(src)
    print(output_text)
    return output_text

# BUG FIX: perform_ocr calls ``image.save(...)`` — a PIL.Image method — so the
# input component must deliver a PIL image, not a numpy array (the original
# ``type="numpy"`` made every request fail with AttributeError).
iface = gr.Interface(
    fn=perform_ocr,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(),
    live=True,
    title="Qari Arabic OCR",
    description="Upload an image to extract text in real-time."
)

iface.launch()