Spaces:

MLBench
/

OCR_Term_Exctraction

Sleeping

File size: 7,004 Bytes

fdb32ca

# import os
# import cv2
# import re
# import numpy as np
# from PIL import Image, ImageDraw, ImageFont
# from paddleocr import PaddleOCR
# from pdf2image import convert_from_path
# import gradio as gr

# # Specify the path to the Poppler bin directory
# poppler_path = r"C:\\poppler\\poppler-24.08.0\\Library\\bin"

# # Function to check proximity of bounding boxes
# def are_boxes_close(box1, box2, y_threshold=50):
#     y1_center = (box1[0][1] + box1[2][1]) / 2
#     y2_center = (box2[0][1] + box2[2][1]) / 2
#     return abs(y1_center - y2_center) <= y_threshold

# # Function to extract terms with specific rules
# def extract_specific_terms(ocr_results):
#     extracted_terms = []

#     for line in ocr_results[0]:
#         detected_text = line[1][0]  # Extracted text
#         box = line[0]  # Bounding box of the detected text

#         if re.match(r"Bill of Lading:\s*\d+", detected_text):
#             extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

#         elif re.match(r"Page:\s*\w+", detected_text):
#             extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

#         elif detected_text in ["Shipper", "Receiver", "Carrier"]:
#             extracted_terms.append({'detected_text': detected_text + " Signature", 'bounding_box': box})

#         elif detected_text == "Signature":
#             extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

#     return extracted_terms

# # Function to annotate image with detected terms
# def annotate_image_with_terms(image, terms):
#     pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
#     draw = ImageDraw.Draw(pil_image)

#     font_size = 40
#     try:
#         font = ImageFont.truetype("arial.ttf", font_size)
#     except IOError:
#         font = ImageFont.load_default()

#     for term in terms:
#         box = term['bounding_box']
#         detected_text = term['detected_text']

#         points = [(int(x[0]), int(x[1])) for x in box]
#         draw.polygon(points, outline="blue", width=2)
#         position = (points[0][0], points[0][1] - font_size - 5)
#         draw.text(position, detected_text, fill="red", font=font)

#     return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

# # Main processing function
# def process_file(file):
#     ocr = PaddleOCR(lang='en')
#     extracted_terms = []

#     if file.name.endswith(".pdf"):
#         images = convert_from_path(file.name, poppler_path=poppler_path)
#         processed_images = []
#         for image in images:
#             image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
#             ocr_results = ocr.ocr(image_np, cls=True)
#             extracted_terms = extract_specific_terms(ocr_results)
#             annotated_image = annotate_image_with_terms(image_np, extracted_terms)
#             processed_images.append(annotated_image)

#         return [Image.fromarray(img) for img in processed_images]

#     else:
#         image = cv2.imread(file.name)
#         ocr_results = ocr.ocr(image, cls=True)
#         extracted_terms = extract_specific_terms(ocr_results)
#         annotated_image = annotate_image_with_terms(image, extracted_terms)
#         return Image.fromarray(annotated_image)

# # Gradio Interface
# def gradio_interface(file):
#     result = process_file(file)
#     if isinstance(result, list):
#         return result[0]  # Display only the first page
#     return result

# iface = gr.Interface(
#     fn=gradio_interface,
#     inputs=gr.File(label="Upload an Image or PDF", file_types=[".pdf", ".png", ".jpg", ".jpeg"]),
#     outputs="image",
#     live=True,
#     title="OCR Term Extraction",
#     description="Upload an image or PDF containing text to detect and annotate terms such as 'Bill of Lading', 'Page', and signatures.",
#     allow_flagging="never"
# )
# iface.launch()



import os
import cv2
import re
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from paddleocr import PaddleOCR
import gradio as gr

# Function to check proximity of bounding boxes
def are_boxes_close(box1, box2, y_threshold=50):
    """Tell whether two boxes lie at roughly the same vertical position.

    For each box the y values of its points at index 0 and index 2 are
    averaged to get a vertical centre (presumably these are opposite corners
    of the box -- confirm against the OCR output format).

    Args:
        box1: Indexable of points, each point indexable as ``(x, y)``.
        box2: Same layout as ``box1``.
        y_threshold: Largest allowed distance between the two centres.

    Returns:
        True when the centres differ by at most ``y_threshold``.
    """
    centres = [(box[0][1] + box[2][1]) / 2 for box in (box1, box2)]
    vertical_gap = abs(centres[0] - centres[1])
    return vertical_gap <= y_threshold

# Function to extract terms with specific rules
def extract_specific_terms(ocr_results):
    """Pick out the bill-of-lading terms from an OCR result.

    Only the first element of ``ocr_results`` is examined. Each entry in it
    is read as ``entry[0]`` (bounding box) and ``entry[1][0]`` (text).

    An entry is kept when its text:
      * starts with ``Bill of Lading:`` followed by digits,
      * starts with ``Page:`` followed by a word character,
      * is exactly ``Shipper``, ``Receiver`` or ``Carrier`` (the label is
        then suffixed with `` Signature``), or
      * is exactly ``Signature``.

    Args:
        ocr_results: OCR output, or ``None``/empty. The first element may
            itself be ``None`` (PaddleOCR reports an image with no detected
            text this way).

    Returns:
        A list of ``{'detected_text': str, 'bounding_box': box}`` dicts in
        the order the entries were encountered; empty when nothing matched
        or there was nothing to scan.
    """
    # Nothing recognised at all: avoid iterating over None / indexing [].
    if not ocr_results or not ocr_results[0]:
        return []

    # Built once, outside the loop. ``match`` anchors at the start only, so
    # trailing text after the number / page value is accepted.
    bill_of_lading = re.compile(r"Bill of Lading:\s*\d+")
    page_marker = re.compile(r"Page:\s*\w+")
    signature_parties = ("Shipper", "Receiver", "Carrier")

    extracted_terms = []
    for line in ocr_results[0]:
        box = line[0]  # Bounding box of the detected text
        detected_text = line[1][0]  # Extracted text

        if (
            bill_of_lading.match(detected_text)
            or page_marker.match(detected_text)
            or detected_text == "Signature"
        ):
            label = detected_text
        elif detected_text in signature_parties:
            label = detected_text + " Signature"
        else:
            continue

        extracted_terms.append({'detected_text': label, 'bounding_box': box})

    return extracted_terms

# Function to annotate image with detected terms
def annotate_image_with_terms(image, terms):
    """Draw each term's outline and label onto a copy of ``image``.

    The input is converted from BGR to RGB for drawing with Pillow and
    converted back to BGR before being returned.

    Args:
        image: Array accepted by ``cv2.cvtColor`` with ``COLOR_BGR2RGB``.
        terms: Iterable of dicts with a ``'bounding_box'`` (sequence of
            points, each indexable as x, y) and a ``'detected_text'`` label.

    Returns:
        The annotated picture as a BGR array.
    """
    font_size = 20
    label_offset = font_size + 5  # gap between the first point and the label's top

    canvas = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    pen = ImageDraw.Draw(canvas)

    # Fall back to Pillow's built-in font when arial.ttf cannot be loaded.
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except IOError:
        font = ImageFont.load_default()

    for term in terms:
        outline = term['bounding_box']
        label = term['detected_text']

        corners = [(int(pt[0]), int(pt[1])) for pt in outline]
        pen.polygon(corners, outline="blue", width=2)

        # The label goes above the first point of the box.
        first_x, first_y = corners[0]
        pen.text((first_x, first_y - label_offset), label, fill="red", font=font)

    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)

# Main processing function
def process_file(file):
    """Run OCR on an uploaded image and return it with matched terms drawn on.

    Args:
        file: Either the image path itself (``str`` / ``os.PathLike``) or an
            object whose ``name`` attribute holds that path.

    Returns:
        PIL.Image.Image: The annotated picture in RGB channel order.

    Raises:
        ValueError: If OpenCV cannot decode an image from the given path.
    """
    # Accept both a bare path and a file-like wrapper exposing ``.name``.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Handle image files (PNG, JPG, JPEG). imread signals failure by
    # returning None rather than raising, so check before going further.
    image = cv2.imread(path)
    if image is None:
        raise ValueError(f"Could not read an image from {path!r}")

    ocr = PaddleOCR(lang='en')
    ocr_results = ocr.ocr(image, cls=True)
    extracted_terms = extract_specific_terms(ocr_results)
    annotated_image = annotate_image_with_terms(image, extracted_terms)

    # annotate_image_with_terms hands back BGR; Pillow expects RGB, so swap
    # the channels or red and blue come out exchanged in the displayed image.
    return Image.fromarray(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB))

# Gradio Interface
def gradio_interface(file):
    """Gradio callback: annotate the uploaded image.

    Args:
        file: The value supplied by the file-upload component, or ``None``
            when there is no file (for example after the upload is cleared
            while the interface runs in live mode).

    Returns:
        Whatever ``process_file`` returns for the upload, or ``None`` when
        no file was supplied.
    """
    # Without this guard a cleared upload would reach process_file as None
    # and fail on attribute access.
    if file is None:
        return None
    return process_file(file)

# Web UI: a single file-upload input routed through gradio_interface, with the
# result shown as an image. live=True re-runs the callback whenever the input
# changes; flagging is switched off.
iface = gr.Interface(
    title="OCR Term Extraction",
    description="Upload an image containing text to detect and annotate terms such as 'Bill of Lading', 'Page', and signatures.",
    fn=gradio_interface,
    inputs=gr.File(file_types=[".png", ".jpg", ".jpeg"], label="Upload an Image"),
    outputs="image",
    allow_flagging="never",
    live=True,
)

# Start serving as soon as this module is executed.
iface.launch()