|
|
|
"""Untitled1.ipynb |
|
Automatically generated by Colaboratory. |
|
Original file is located at |
|
https://colab.research.google.com/drive/1J4fCr7TGzdFvkCeikMAQ5af5ml2Q83W0 |
|
""" |
|
|
|
# Runtime dependency bootstrap: installs CPU-only PyTorch wheels so the
# script works in a fresh (Colab-style) environment.
import os

os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')

# NOTE(review): `os` is imported three times, and several of the imports
# below (glob, cv2, PIL, torch, numpy, tqdm, scipy.ndimage) are not used in
# the visible code — presumably leftovers from the notebook export; confirm
# before pruning.
import os, glob, fitz  # fitz = PyMuPDF, used for PDF page rendering

import cv2

import os

import PIL

import torch

import pandas as pd

import numpy as np

import gradio as gr

from tqdm import tqdm

from scipy import ndimage

from PIL import Image, ImageDraw, ImageFont
|
|
|
|
|
|
|
def unnormalize_box(bbox, width, height):
    """Scale a bbox from 0-1000 normalized coordinates to absolute pixels.

    `bbox` is (x0, y0, x1, y1) on a 0-1000 grid (LayoutLM convention);
    x-coordinates are scaled by `width`, y-coordinates by `height`.
    Returns a new 4-element list of floats.
    """
    scales = (width, height, width, height)
    return [dim * (coord / 1000) for coord, dim in zip(bbox, scales)]
|
|
|
def imageconversion(pdffile):
    """Render the first page of a PDF to a PIL RGB image.

    Also writes the rendered page to "page.jpg" in the working directory
    (side effect preserved from the original implementation).

    Parameters:
        pdffile: path (or object accepted by fitz.open) of the input PDF.

    Returns:
        PIL.Image.Image of page 0.
    """
    doc = fitz.open(pdffile)
    page = doc.load_page(0)
    # NOTE(review): per PyMuPDF, an explicit dpi overrides the zoom matrix,
    # so Matrix(2, 2) is likely redundant here — confirm the intended scale.
    mat = fitz.Matrix(2, 2)
    pix = page.get_pixmap(matrix=mat, dpi=300)
    image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    pix.save("page.jpg")  # save() returns None; the old `t =` binding was dead
    return image
|
|
|
|
|
|
|
def completepreprocess(pdffile, ocr_type="PaddleOCR"):
    """Run OCR + the layout model over every page of a PDF.

    Fixes vs. original: `ocr_type` was read as an undefined free variable even
    though the Gradio interface wires a dropdown as a second input (the call
    would fail) — it is now a parameter with a backward-compatible default.
    `DataFrame.append` (removed in pandas 2.0) is replaced by one pd.concat.

    Parameters:
        pdffile: path to the input PDF (Gradio File input).
        ocr_type: OCR backend to use: "PaddleOCR" or "Pytesseract".

    Returns:
        (annotated_image, dataframe): vertically merged annotated page images
        and the extracted predictions as a pandas DataFrame.
    """
    first_page = imageconversion(pdffile).convert("RGB")
    width, height = first_page.size
    # NOTE(review): these OCR results are shadowed by process_image() inside
    # the loop below and never consumed — kept for behavioral parity; confirm
    # whether they were meant to feed the per-page pipeline.
    if ocr_type == "PaddleOCR":
        words, boxes = process_image_PaddleOCR(first_page, width, height)
    elif ocr_type == "Pytesseract":
        words, boxes = process_image_pytesseract(first_page, width, height)

    frames = []           # one DataFrame per page, concatenated once at the end
    annotated_pages = []  # file names of annotated page images, in page order
    doc = fitz.open(pdffile)
    for i in range(len(doc)):
        page = doc.load_page(i)
        mat = fitz.Matrix(2, 2)
        # NOTE(review): explicit dpi overrides the zoom matrix in PyMuPDF.
        pix = page.get_pixmap(matrix=mat, dpi=200)
        page_path = "page" + str(i) + ".jpg"
        pix.save(page_path)
        image = Image.open(page_path).convert("RGB")

        bbox, preds, words, image = process_image(image)
        im, df = visualize_image(bbox, preds, words, image)
        im.save(page_path)  # overwrite the raw render with the annotated one
        annotated_pages.append(page_path)

        pred_list = [iob_to_label(number) for number in preds]
        _bbox, _preds, _words = process_form(pred_list, words, bbox)
        print('page: ' + str(i) + ' ' + str(len(_preds)) + ' ' + str(len(_words)))
        frames.append(createDataframe(_preds, _words))

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    myDataFrame = pd.concat(frames) if frames else pd.DataFrame()
    im2 = mergeImageVertical(annotated_pages)
    return im2, myDataFrame
|
|
|
|
|
# --- Gradio UI wiring -------------------------------------------------------

# Title shown above the interface; description intentionally left empty.
title = "OCR outputs"

description = ""

# Pin the input/output image panes to a fixed height.
css = """.output_image, .input_image {height: 600px !important}"""

# NOTE(review): two inputs are wired here, so Gradio invokes
# fn(pdf_file, ocr_choice) — confirm completepreprocess accepts the dropdown
# value as its second argument.
# NOTE(review): gr.inputs.* / gr.outputs.* and enable_queue belong to the
# legacy (pre-3.0) Gradio API and are removed in current releases — verify
# the pinned gradio version before running.
iface = gr.Interface(fn=completepreprocess,
                     inputs=[
                         gr.inputs.File(label="PDF"),
                         gr.inputs.Dropdown(label="Select the Open Source OCR", choices=["PaddleOCR", "Pytesseract"]),
                     ],
                     outputs=[gr.outputs.Image(type="pil", label="annotated image"),"dataframe"] ,
                     title=title,
                     description=description,
                     css=css,
                     analytics_enabled = True, enable_queue=True)

# inline=False: render in a separate tab/server rather than inline (notebook
# context); debug=True keeps the process attached and surfaces tracebacks.
iface.launch(inline=False , debug=True)