intuitive262 committed on
Commit 4902aa0 · 1 Parent(s): 10c178b

Uploaded code files

Files changed (2)
  1. app.py +88 -73
  2. requirements.txt +7 -7
app.py CHANGED
@@ -1,88 +1,103 @@
- import gradio as gr
- import numpy as np
- from PIL import Image
  import torch
- from transformers import TrOCRProcessor, VisionEncoderDecoderModel
  import re

- # Load the first OCR model (Microsoft's TrOCR)
- ms_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
- ms_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-
- # Load the second OCR model (Surya-OCR)
- surya_processor = TrOCRProcessor.from_pretrained("suryavarmaaddala/suryaocr")
- surya_model = VisionEncoderDecoderModel.from_pretrained("suryavarmaaddala/suryaocr")
-
- def preprocess_image(image):
-     if isinstance(image, str):
-         image = Image.open(image).convert("RGB")
-     elif isinstance(image, np.ndarray):
-         image = Image.fromarray(image).convert("RGB")
-     return image
-
- def microsoft_ocr(image):
-     image = preprocess_image(image)
-     pixel_values = ms_processor(image, return_tensors="pt").pixel_values
-
-     generated_ids = ms_model.generate(pixel_values)
-     generated_text = ms_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-     return generated_text
-
- def surya_ocr(image):
-     image = preprocess_image(image)
-     pixel_values = surya_processor(image, return_tensors="pt").pixel_values
-
-     generated_ids = surya_model.generate(pixel_values)
-     generated_text = surya_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-     return generated_text
-
  def post_process_text(text):
-     # Simple post-processing to split into lines
-     return '\n'.join(text.split('. '))
-
- def search_text(text, query):
-     try:
-         pattern = re.compile(query, re.IGNORECASE)
-         lines = text.split('\n')
-         matching_lines = [line for line in lines if pattern.search(line)]
-         return '\n'.join(matching_lines) if matching_lines else "No matches found."
-     except re.error:
-         return "Invalid regex pattern. Please try again."
-
- def process_and_search(image, search_query):
-     try:
-         ms_text = microsoft_ocr(image)
-         surya_text = surya_ocr(image)
-
-         result = f"Microsoft OCR Result:\n{ms_text}\n\nSurya OCR Result:\n{surya_text}"
-         processed_text = post_process_text(result)
-
-         search = None
-         if search_query:
-             search = search_text(processed_text, search_query)
-         return image, processed_text, search
-     except Exception as e:
-         return None, f"An error occurred: {str(e)}", None
-
- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column(scale=1):
-             image_input = gr.Image(type="filepath", label="Upload your image")
-             search_query_input = gr.Textbox(label="Enter search query")
-             submit_button = gr.Button("Submit")
-
-         with gr.Column(scale=2):
-             displayed_image = gr.Image(label="Uploaded Image")
-             ocr_result = gr.Textbox(label="OCR Result", lines=10)
-             search_result = gr.Textbox(label="Search Result", lines=5)
-
-     submit_button.click(
-         fn=process_and_search,
-         inputs=[image_input, search_query_input],
-         outputs=[displayed_image, ocr_result, search_result]
-     )
-
- if __name__ == "__main__":
-     demo.launch()
 
+ import gradio as gr
+ from byaldi import RAGMultiModalModel
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
  import torch
+ from qwen_vl_utils import process_vision_info
+ from PIL import Image
+ import os
  import re

+ rag = RAGMultiModalModel.from_pretrained("vidore/colpali")
+ vlm = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/Qwen2-VL-2B-Instruct",
+     torch_dtype=torch.float16,
+     trust_remote_code=True,
+     device_map="auto",
+ )
+
+ rag.index(
+     input_path="./test1.png",
+     index_name="index",
+     store_collection_with_index=False,
+     overwrite=True,
+ )
+
+ text_query = "What is the text content displayed in the image?"
+ res = rag.search(text_query, k=1)
+
+ image = Image.open("./test2.jpg")
+ image_index = res[0]["page_num"] - 1
+
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
+
+ def extract_text(image, query):
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": query},
+             ],
+         }
+     ]
+
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+     inputs = inputs.to(vlm.device)
+     with torch.no_grad():
+         generated_ids = vlm.generate(**inputs, max_new_tokens=200, temperature=0.7, top_p=0.9)
+     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+     return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
  def post_process_text(text):
+     # Remove extra whitespace
+     text = re.sub(r'\s+', ' ', text).strip()
+     # Remove repeated phrases (which sometimes occur in multi-pass extraction)
+     phrases = text.split('. ')
+     unique_phrases = list(dict.fromkeys(phrases))
+     text = '. '.join(unique_phrases)
+     return text
+
+ def ocr(image):
+     queries = [
+         "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
+         "Look closely at the image and list any text you see, no matter how small or unclear.",
+         "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
+     ]
+
+     all_extracted_text = []
+     for query in queries:
+         extracted_text = extract_text(image, query)
+         all_extracted_text.append(extracted_text)
+
+     # Combine and deduplicate the results
+     final_text = "\n".join(set(all_extracted_text))
+
+     final_text = post_process_text(final_text)
+     return final_text
+
+
+ def main_fun(image, keyword):
+     ext_text = ocr(image)
+
+     highlight_text = ext_text
+     if keyword:
+         highlight_text = re.sub(f'({re.escape(keyword)})', r'<span style="background-color: yellow;">\1</span>', ext_text, flags=re.IGNORECASE)
+
+     return ext_text, highlight_text
+
+ iface = gr.Interface(
+     fn=main_fun,
+     inputs=[
+         gr.Image(type="pil", label="Upload an Image"),
+         gr.Textbox(label="Enter search term")
+     ],
+     outputs=[
+         gr.Textbox(label="Extracted Text"),
+         gr.HTML(label="Search Results")
+     ],
+     title="Document Search using OCR (English/Hindi)"
+ )
+
+ iface.launch()
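
For anyone reviewing the new pipeline, a minimal way to exercise it without launching the Gradio UI is sketched below. It assumes app.py has already been executed (or its functions imported); the image path and the keyword "invoice" are placeholders, not files or terms shipped with this commit.

    # Hypothetical smoke test for the committed OCR + keyword-highlight flow.
    from PIL import Image

    sample = Image.open("./test2.jpg")                 # any local document image
    plain_text, highlighted = main_fun(sample, "invoice")
    print(plain_text)    # combined text from the three Qwen2-VL prompts
    print(highlighted)   # same text with the keyword wrapped in a highlight <span>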
 
requirements.txt CHANGED
@@ -1,10 +1,10 @@
  gradio
- Pillow
- surya-ocr
  torch
- transformers
- tiktoken
  torchvision
- verovio
- accelerate
- rapidfuzz

  gradio
+ byaldi
+ qwen-vl-utils
+ numpy==1.24.4
+ Pillow==10.3.0
+ Requests==2.31.0
  torch
  torchvision
+ git+https://github.com/huggingface/transformers.git
+ accelerate
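
One note on the updated requirements: transformers is installed from the GitHub main branch, presumably because Qwen2-VL support was only available on the development branch when this commit was made. After pip install -r requirements.txt, an optional sanity check (a sketch, not part of the repo) is to confirm the Qwen2-VL classes resolve:

    # Optional check that the git install of transformers exposes Qwen2-VL.
    import transformers
    from transformers import Qwen2VLForConditionalGeneration, AutoProcessor  # should import cleanly
    print(transformers.__version__)  # git installs report a dev version ending in .dev0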