oddadmix committed on
Commit
0071505
·
verified ·
1 Parent(s): 1546d8c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -10
app.py CHANGED
@@ -1,24 +1,67 @@
1
  import gradio as gr
2
- import easyocr
3
  import time
 
 
 
 
 
 
4
 
 
 
 
 
 
 
 
 
 
 
 
5
  def perform_ocr(image):
6
- reader = easyocr.Reader(['en', 'ar']) # Add more languages if needed
7
- result = reader.readtext(image, detail=0)
8
-
9
- def stream_results():
10
- for text in result:
11
- yield text
12
- time.sleep(0.5) # Simulate streaming delay
13
 
14
- return stream_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  iface = gr.Interface(
17
  fn=perform_ocr,
18
  inputs=gr.Image(type="numpy"),
19
  outputs=gr.Textbox(),
20
  live=True,
21
- title="OCR Streaming App",
22
  description="Upload an image to extract text in real-time."
23
  )
24
 
 
1
import os
import time
import uuid

import gradio as gr
import spaces
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
9
 
10
+
11
# Load the Qari OCR checkpoint and its processor once at startup so every
# Gradio request reuses the same objects instead of reloading the weights.
model_name = "oddadmix/Qari-OCR-0.1-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name, torch_dtype="auto", device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_name)

# Upper bound on newly generated tokens per OCR request.
max_tokens = 2000
19
+
20
@spaces.GPU
def perform_ocr(image):
    """Run OCR on an uploaded image with the Qari Qwen2-VL model.

    Parameters
    ----------
    image : numpy.ndarray or PIL.Image.Image
        Image from the Gradio input; ``gr.Image(type="numpy")`` delivers
        a numpy array.

    Returns
    -------
    str
        Plain-text transcription produced by the model.
    """
    # qwen_vl_utils resolves images from a file URI, so the upload is
    # persisted under a unique temporary name to avoid clashes between
    # concurrent requests.
    # BUG FIX: uuid.uuid4() + ".png" raised TypeError (UUID + str).
    src = f"{uuid.uuid4()}.png"

    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."

    # BUG FIX: a numpy array has no .save(); convert to PIL first.
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
    pil_image.save(src)

    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{src}"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
        # Strip the prompt tokens so only newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        # BUG FIX: `os` was used without being imported (fixed at file
        # level); try/finally also guarantees the temp file is removed
        # even when generation raises.
        os.remove(src)

    print(output_text)
    return output_text
58
 
59
# Gradio UI wiring: numpy image in, plain text out.
# live=True re-runs OCR automatically whenever the input image changes.
iface = gr.Interface(
    fn=perform_ocr,
    inputs=gr.Image(type="numpy"),
    outputs=gr.Textbox(),
    live=True,
    title="Qari Arabic OCR",
    description="Upload an image to extract text in real-time.",
)
67