Spaces:

ChaseHan
/

Latex2Layout_PDF_Layout_Parsing

Running

App Files Files Community

ChaseHan committed 18 days ago

Commit

1ff383d

verified ·

1 Parent(s): 3ab79b5

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -105

app.py CHANGED Viewed

@@ -2,135 +2,206 @@ import gradio as gr
 import cv2
 import numpy as np
 import os
-import tempfile
-from ultralytics import YOLO
-# Define the model path for Latex2Layout
-model_path = "latex2layout_object_detection_yolov8.pt"
-# Check if the model file exists before loading
-if not os.path.exists(model_path):
-    raise FileNotFoundError(f"Model file not found at {model_path}")
-# Load the Latex2Layout model
-try:
-    model = YOLO(model_path)
-except Exception as e:
-    raise RuntimeError(f"Failed to load Latex2Layout model: {e}")
-def detect_and_visualize(image):
     """
-    Perform object detection on the uploaded image and visualize the results.
     Args:
-        image: The uploaded image as a numpy array.
     Returns:
-        annotated_image: Image with bounding boxes drawn.
-        yolo_annotations: Annotations in YOLO format as a string.
     """
-    # Validate input image
-    if image is None or not isinstance(image, np.ndarray):
-        raise ValueError("Invalid image input: Please upload a valid image.")
-    # Run object detection with error handling
     try:
-        results = model(image)
     except Exception as e:
-        raise RuntimeError(f"Error during Latex2Layout detection: {e}")
-    # Extract results from the first frame
-    result = results[0]
-    annotated_image = image.copy()
-    yolo_annotations = []
-    # Get image dimensions
-    img_height, img_width = image.shape[:2]
-    # Process each detected object
-    for box in result.boxes:
-        # Extract bounding box coordinates
-        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-        # Get confidence and class details
-        conf = float(box.conf[0])
-        cls_id = int(box.cls[0])
-        cls_name = result.names[cls_id]
-        # Assign a random color to the class
-        color = tuple(np.random.randint(0, 255, 3).tolist())
-        # Draw bounding box on the image
-        cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
-        # Create and draw label with confidence
-        label = f"{cls_name} {conf:.2f}"
-        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
-        cv2.rectangle(annotated_image, (x1, y1 - label_height - 5), (x1 + label_width, y1), color, -1)
-        cv2.putText(annotated_image, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
-        # Convert bounding box to YOLO format (normalized coordinates)
-        x_center = (x1 + x2) / (2 * img_width)
-        y_center = (y1 + y2) / (2 * img_height)
-        width = (x2 - x1) / img_width
-        height = (y2 - y1) / img_height
-        yolo_annotations.append(f"{cls_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")
-    # Combine annotations into a single string
-    yolo_annotations_str = "\n".join(yolo_annotations) if yolo_annotations else "No objects detected."
-    return annotated_image, yolo_annotations_str
-def save_yolo_annotations(yolo_annotations_str):
     """
-    Save YOLO annotations to a temporary file and return its path.
     Args:
-        yolo_annotations_str: Annotations string in YOLO format.
     Returns:
-        file_path: Path to the saved annotation file.
     """
-    # Handle empty annotations
-    if not yolo_annotations_str or yolo_annotations_str == "No objects detected.":
-        raise ValueError("No annotations available to save.")
-    # Save annotations to a temporary file with error handling
     try:
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
-        temp_file_path = temp_file.name
-        with open(temp_file_path, "w") as f:
-            f.write(yolo_annotations_str)
-        return temp_file_path
     except Exception as e:
-        raise RuntimeError(f"Failed to save annotations: {e}")
-# Build the Gradio interface
-with gr.Blocks(title="Latex2Layout Object Detection Visualization") as demo:
-    gr.Markdown("# Latex2Layout Object Detection Visualization")
-    gr.Markdown("Upload an image to detect objects using the Latex2Layout model. View the results with bounding boxes and download annotations in YOLO format.")
     with gr.Row():
-        with gr.Column():
             input_image = gr.Image(label="Upload Image", type="numpy")
-            detect_btn = gr.Button("Start Detection")
-        with gr.Column():
             output_image = gr.Image(label="Detection Results")
-            yolo_annotations = gr.Textbox(label="YOLO Annotations", lines=10)
-            download_btn = gr.Button("Download YOLO Annotations")
-            download_file = gr.File(label="Download Annotations")
-    # Define button click events
     detect_btn.click(
-        fn=detect_and_visualize,
         inputs=[input_image],
-        outputs=[output_image, yolo_annotations]
     )
-    download_btn.click(
-        fn=save_yolo_annotations,
-        inputs=[yolo_annotations],
-        outputs=[download_file]
     )
 # Launch the application

 import cv2
 import numpy as np
 import os
+import requests
+import json
+from PIL import Image
+import io
+import base64
+from openai import OpenAI
+# API endpoints
+# NOTE: the URL below is a placeholder; detect_layout() POSTs the image to it,
+# so no real detections are returned until an actual endpoint is configured.
+YOLO_API_ENDPOINT = "https://api.example.com/yolo"  # Replace with actual YOLO API endpoint
+# Qwen API configuration
+# Base URL passed to the OpenAI client created in qa_about_layout()
+QWEN_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+# Model name sent with each chat completion request
+QWEN_MODEL_ID = "qwen2.5-vl-3b-instruct"
+def encode_image(image_array):
     """
+    Serialize an image array to PNG and return it as base64 text.
     Args:
+        image_array: numpy array holding the image pixels
     Returns:
+        The PNG bytes of the image, base64-encoded and decoded as UTF-8
     """
+    # Render the array into an in-memory PNG through PIL
+    png_buffer = io.BytesIO()
+    Image.fromarray(image_array).save(png_buffer, format='PNG')
+    # b64encode yields bytes; hand back a plain string instead
+    encoded = base64.b64encode(png_buffer.getvalue())
+    return encoded.decode("utf-8")
+def detect_layout(image):
+    """
+    Perform layout detection on the uploaded image using YOLO API.
+    The image is PNG-encoded and POSTed to YOLO_API_ENDPOINT as a multipart
+    upload. The JSON response is iterated as a sequence of detections, each
+    read through its 'bbox' (x1, y1, x2, y2), 'class' and 'confidence' keys.
+    Args:
+        image: The uploaded image as a numpy array
+    Returns:
+        annotated_image: Copy of the image with boxes and labels drawn, or
+            None if no image was given or detection failed
+        layout_info: The detections as indented JSON text, or an error
+            message if no image was given or detection failed
+    """
+    if image is None:
+        return None, "Error: No image uploaded."
+    # Encode the array as PNG bytes for the multipart upload
+    img_byte_arr = io.BytesIO()
+    Image.fromarray(image).save(img_byte_arr, format='PNG')
+    files = {'image': ('image.png', img_byte_arr.getvalue(), 'image/png')}
     try:
+        # Call YOLO API. Without a timeout requests waits indefinitely, which
+        # would leave the UI stuck if the endpoint never answers; a timeout
+        # raises and is reported by the handler below.
+        response = requests.post(YOLO_API_ENDPOINT, files=files, timeout=60)
+        response.raise_for_status()
+        detection_results = response.json()
+        # Create a copy of the image for visualization
+        annotated_image = image.copy()
+        # One color per class name, so every box of a class is drawn alike
+        class_colors = {}
+        # Draw detection results
+        for detection in detection_results:
+            x1, y1, x2, y2 = (int(v) for v in detection['bbox'])
+            cls_name = detection['class']
+            conf = detection['confidence']
+            if cls_name not in class_colors:
+                class_colors[cls_name] = tuple(np.random.randint(0, 255, 3).tolist())
+            color = class_colors[cls_name]
+            # Draw bounding box and label
+            cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
+            label = f'{cls_name} {conf:.2f}'
+            (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+            # Filled strip behind the text keeps the white label readable
+            cv2.rectangle(annotated_image, (x1, y1 - label_height - 5), (x1 + label_width, y1), color, -1)
+            cv2.putText(annotated_image, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+        # Format layout information for Qwen
+        layout_info = json.dumps(detection_results, indent=2)
+        return annotated_image, layout_info
     except Exception as e:
+        return None, f"Error during layout detection: {str(e)}"
+def qa_about_layout(image, question, layout_info, api_key):
     """
+    Answer questions about the layout using Qwen2.5-VL API.
+    The image is sent as a base64 data URL together with the question, and
+    the detected layout is embedded in the system prompt.
     Args:
+        image: The uploaded image
+        question: User's question about the layout
+        layout_info: Layout detection results from YOLO
+        api_key: User's Qwen API key
     Returns:
+        answer: Qwen's answer to the question, or a message describing the
+            missing input or the error that occurred
     """
+    if image is None or not question:
+        return "Please upload an image and ask a question."
+    if not layout_info:
+        return "No layout information available. Please detect layout first."
+    if not api_key:
+        return "Please enter your Qwen API key."
     try:
+        # Encode image to base64
+        base64_image = encode_image(image)
+        # Initialize OpenAI client for Qwen API
+        client = OpenAI(
+            api_key=api_key,
+            base_url=QWEN_BASE_URL,
+        )
+        # Prepare system prompt with layout information
+        system_prompt = f"""You are a helpful assistant specialized in analyzing document layouts.
+        The following layout information has been detected in the image:
+        {layout_info}
+        Please answer questions about the layout based on this information and the image."""
+        # Prepare messages for API call
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": system_prompt}]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        # encode_image() emits PNG bytes, so the data URL
+                        # must declare image/png rather than image/jpeg
+                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
+                    },
+                    {"type": "text", "text": question},
+                ],
+            }
+        ]
+        # Call Qwen API
+        completion = client.chat.completions.create(
+            model=QWEN_MODEL_ID,
+            messages=messages,
+        )
+        return completion.choices[0].message.content
     except Exception as e:
+        return f"Error during QA: {str(e)}"
+# Create Gradio interface
+# Components are created inside nested Row/Column contexts; the order of the
+# statements below is the order in which they are laid out.
+with gr.Blocks(title="Latex2Layout QA System") as demo:
+    gr.Markdown("# Latex2Layout QA System")
+    gr.Markdown("Upload an image, detect layout elements, and ask questions about the layout.")
+    # First row: image upload and detect button next to the detection output
     with gr.Row():
+        with gr.Column(scale=1):
             input_image = gr.Image(label="Upload Image", type="numpy")
+            detect_btn = gr.Button("Detect Layout")
+            gr.Markdown("**Tip**: Upload a clear image for optimal detection results.")
+        with gr.Column(scale=1):
             output_image = gr.Image(label="Detection Results")
+            # Receives the second value returned by detect_layout and is
+            # later passed to qa_about_layout as its layout_info argument
+            layout_info = gr.Textbox(label="Layout Information", lines=10)
+    # Second row: API key and question inputs next to the answer box
+    with gr.Row():
+        with gr.Column(scale=1):
+            # The key is supplied by the user and passed to qa_about_layout
+            # on each click of the "Ask Question" button
+            api_key_input = gr.Textbox(
+                label="Qwen API Key",
+                placeholder="Enter your Qwen API key here",
+                type="password"
+            )
+            question_input = gr.Textbox(label="Ask a question about the layout")
+            qa_btn = gr.Button("Ask Question")
+        with gr.Column(scale=1):
+            answer_output = gr.Textbox(label="Answer", lines=5)
+    # Event handlers
+    # Detect button: run detect_layout on the uploaded image
     detect_btn.click(
+        fn=detect_layout,
         inputs=[input_image],
+        outputs=[output_image, layout_info]
     )
+    # Ask button: inputs are listed in qa_about_layout's parameter order
+    qa_btn.click(
+        fn=qa_about_layout,
+        inputs=[input_image, question_input, layout_info, api_key_input],
+        outputs=[answer_output]
     )
 # Launch the application