Spaces:

gauri-sharan
/

test-two

Sleeping

App Files Community

gauri-sharan committed on Sep 29, 2024

Commit

40f7360

verified ·

1 Parent(s): d85fa29

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -71

app.py CHANGED Viewed

@@ -6,34 +6,43 @@ import torch
 from PIL import Image
 import os
 import traceback
 import re
-# Load models
-rag_model = RAGMultiModalModel.from_pretrained("vidore/colpali")
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16
-)
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True)
-extracted_text = ""  # Store the extracted text globally for keyword search
-def ocr_and_extract(image, text_query=None):
     global extracted_text
     try:
         # Save the uploaded image temporarily
         temp_image_path = "temp_image.jpg"
         image.save(temp_image_path)
-        # Index the image with Byaldi
         rag_model.index(
             input_path=temp_image_path,
-            index_name="image_index",
             store_collection_with_index=False,
-            overwrite=True
         )
         # Perform the search query on the indexed image
-        results = rag_model.search(text_query, k=1)
         # Prepare the input for Qwen2-VL
         image_data = Image.open(temp_image_path)
@@ -43,31 +52,33 @@ def ocr_and_extract(image, text_query=None):
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image_data},
-                    {"type": "text", "text": text_query},
                 ],
             }
         ]
-        # Process input for Qwen2-VL
         text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         image_inputs, _ = process_vision_info(messages)
         inputs = processor(
             text=[text_input],
             images=image_inputs,
             padding=True,
             return_tensors="pt",
-        )
-        qwen_model.to("cuda")
-        inputs = {k: v.to("cuda") for k, v in inputs.items()}
         # Generate the output with Qwen2-VL
         generated_ids = qwen_model.generate(**inputs, max_new_tokens=50)
-        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        # Store the extracted text for keyword search
-        extracted_text = output_text[0]
         os.remove(temp_image_path)
         return extracted_text
@@ -77,57 +88,36 @@ def ocr_and_extract(image, text_query=None):
         traceback.print_exc()
         return f"Error: {error_message}"
-def search_keywords(keyword):
-    global extracted_text
     if not extracted_text:
         return "No text extracted yet. Please upload an image."
-    # Perform basic keyword search within the extracted text
-    if re.search(rf"\b{re.escape(keyword)}\b", extracted_text, re.IGNORECASE):
-        highlighted_text = re.sub(rf"({re.escape(keyword)})", r"<mark>\1</mark>", extracted_text, flags=re.IGNORECASE)
-        return f"Keyword found! {highlighted_text}"
-    else:
-        return "Keyword not found in the extracted text."
-# Gradio interface
-image_input = gr.Image(type="pil")
-text_output = gr.Textbox(label="Extracted Text", interactive=True)
-keyword_search = gr.Textbox(label="Enter keywords to search")
-search_button = gr.Button("Search Keywords")
-search_output = gr.HTML()
-extract_button = gr.Button("Extract Text")
-# Layout update
-iface = gr.Interface(
-    fn=ocr_and_extract,
-    inputs=[image_input],
-    outputs=[text_output],
-    title="Image OCR with Byaldi + Qwen2-VL",
-    description="Upload an image containing Hindi and English text for OCR. Then, search for specific keywords.",
-)
-# Keyword search layout
-iface_search = gr.Interface(
-    fn=search_keywords,
-    inputs=[keyword_search],
-    outputs=[search_output],
-)
-# Move extract button above the text output
-def combined_interface(image, keyword):
-    ocr_text = ocr_and_extract(image)
-    search_result = search_keywords(keyword)
-    return ocr_text, search_result
-combined_iface = gr.Interface(
-    fn=combined_interface,
-    inputs=[image_input, keyword_search],
-    outputs=[text_output, search_output],
-    live=True,
-    title="Image OCR & Keyword Search",
-    description="Extract text from the image and search for specific keywords."
-)
-# Launch the app
-combined_iface.launch()

 from PIL import Image
 import os
 import traceback
+import spaces
 import re
+# Check if CUDA is available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# Load the Byaldi and Qwen2-VL models
+rag_model = RAGMultiModalModel.from_pretrained("vidore/colpali")  # Byaldi model
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16
+).to(device)  # Move Qwen2-VL to GPU
+# Processor for Qwen2-VL
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True)
+# Global variable to store extracted text
+extracted_text = ""
+@spaces.GPU(duration=120)  # Increased GPU duration to 120 seconds
+def ocr_and_extract(image):
     global extracted_text
     try:
         # Save the uploaded image temporarily
         temp_image_path = "temp_image.jpg"
         image.save(temp_image_path)
+        # Index the image with Byaldi, and force overwrite of the existing index
         rag_model.index(
             input_path=temp_image_path,
+            index_name="image_index",  # Reuse the same index
             store_collection_with_index=False,
+            overwrite=True  # Overwrite the index for every new image
         )
         # Perform the search query on the indexed image
+        results = rag_model.search("", k=1)
         # Prepare the input for Qwen2-VL
         image_data = Image.open(temp_image_path)
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image_data},
                 ],
             }
         ]
+        # Process the message and prepare for Qwen2-VL
         text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         image_inputs, _ = process_vision_info(messages)
+        # Build the processor inputs and move them to the selected device (CUDA if available, otherwise CPU)
         inputs = processor(
             text=[text_input],
             images=image_inputs,
             padding=True,
             return_tensors="pt",
+        ).to(device)
         # Generate the output with Qwen2-VL
         generated_ids = qwen_model.generate(**inputs, max_new_tokens=50)
+        output_text = processor.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        # Drop every output line that contains "you are a helpful assistant", "assistant", "user" or "system" (case-insensitive substring match)
+        filtered_output = [line for line in output_text[0].split("\n") if not any(kw in line.lower() for kw in ["you are a helpful assistant", "assistant", "user", "system"])]
+        extracted_text = "\n".join(filtered_output).strip()
+        # Clean up the temporary file
         os.remove(temp_image_path)
         return extracted_text
         traceback.print_exc()
         return f"Error: {error_message}"
+def search_keywords(keywords):
     if not extracted_text:
         return "No text extracted yet. Please upload an image."
+    # Highlight matching keywords in the extracted text
+    highlighted_text = extracted_text
+    for keyword in keywords.split():
+        highlighted_text = re.sub(f"({re.escape(keyword)})", r"<mark>\1</mark>", highlighted_text, flags=re.IGNORECASE)
+    # Return the highlighted text
+    return highlighted_text
+# Gradio interface for image input and keyword search
+with gr.Blocks() as iface:
+    # Image upload and text extraction section
+    with gr.Column():
+        img_input = gr.Image(type="pil", label="Upload an Image")
+        extracted_output = gr.Textbox(label="Extracted Text", interactive=False)
+        # Functionality to trigger the OCR and extraction
+        img_button = gr.Button("Extract Text")
+        img_button.click(fn=ocr_and_extract, inputs=img_input, outputs=extracted_output)
+    # Keyword search section
+    with gr.Column():
+        search_input = gr.Textbox(label="Enter keywords to search")
+        search_output = gr.HTML(label="Search Results")
+        # Functionality to search within the extracted text
+        search_button = gr.Button("Search")
+        search_button.click(fn=search_keywords, inputs=search_input, outputs=search_output)
+iface.launch()