Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,24 +1,67 @@
 import gradio as gr
-import easyocr
 import time
+import spaces
+from PIL import Image
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
+import uuid
 
+
+model_name = "oddadmix/Qari-OCR-0.1-VL-2B-Instruct"
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="cuda"
+)
+processor = AutoProcessor.from_pretrained(model_name)
+max_tokens = 2000
+
+@spaces.GPU
 def perform_ocr(image):
-
-
-
-
-    for text in result:
-        yield text
-        time.sleep(0.5)  # Simulate streaming delay
+    src = uuid.uuid4() + ".png"
+
+    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
+    image.save(src)
 
-
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{src}"},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to("cuda")
+    generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    os.remove(src)
+    print(output_text)
+    return output_text
 
 iface = gr.Interface(
     fn=perform_ocr,
     inputs=gr.Image(type="numpy"),
     outputs=gr.Textbox(),
     live=True,
-    title="
+    title="Qari Arabic OCR",
     description="Upload an image to extract text in real-time."
 )
 
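As committed, the new perform_ocr has a few bugs worth flagging: uuid.uuid4() + ".png" raises a TypeError because a UUID object cannot be concatenated with a string; os.remove(src) is called but os is never imported; and gr.Image(type="numpy") hands the function a NumPy array, which has no .save() method, leaving the PIL.Image import unused. A minimal corrected sketch of the function follows, assuming the model, processor, max_tokens, and process_vision_info globals defined earlier in the file; it is not the committed code.

import os
import uuid

from PIL import Image

# Corrected sketch only; `spaces`, `model`, `processor`, `max_tokens`, and
# `process_vision_info` are assumed to be the globals defined above.
@spaces.GPU
def perform_ocr(image):
    # str() is needed: a uuid.UUID cannot be concatenated with a str.
    src = str(uuid.uuid4()) + ".png"

    prompt = (
        "Below is the image of one page of a document, as well as some raw "
        "textual content that was previously extracted for it. Just return "
        "the plain text representation of this document as if you were "
        "reading it naturally. Do not hallucinate."
    )

    # gr.Image(type="numpy") passes a NumPy array; it has no .save(), so
    # convert through PIL before writing the temporary file.
    Image.fromarray(image).save(src)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{src}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    try:
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
        # Strip the prompt tokens so only newly generated text is decoded.
        trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
    finally:
        # The committed file calls os.remove without importing os; the import
        # above fixes that, and try/finally ensures cleanup even on failure.
        os.remove(src)

    return output_text

Two design notes. The @spaces.GPU decorator is what lets this run on ZeroGPU hardware (the "Running on Zero" badge above): the Space holds no GPU at rest, and one is attached for the duration of each decorated call. Separately, live=True re-runs the function whenever the input changes, which is a questionable fit for a full 2B-parameter generation pass; dropping live=True in favor of a plain submit button would likely serve better than the streaming behavior the removed easyocr version simulated with yield and time.sleep.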