# Web-viewer residue removed (file-size banner "3,658 Bytes", blame hashes 122792f / a0df5e7, line-number gutter 1-105).
# -*- coding: utf-8 -*-
"""Untitled1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1J4fCr7TGzdFvkCeikMAQ5af5ml2Q83W0
"""
import os
os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')
import os, glob, fitz
import cv2
import os
import PIL
import torch
import pandas as pd
import numpy as np
import gradio as gr
from tqdm import tqdm
from scipy import ndimage
from PIL import Image, ImageDraw, ImageFont
def unnormalize_box(bbox, width, height):
    """Rescale a box from a 0-1000 normalized grid to absolute units.

    Args:
        bbox: Indexable with at least four numeric entries. Entries 0 and 2
            are scaled by ``width``; entries 1 and 3 by ``height``
            (presumably ``[x0, y0, x1, y1]`` -- confirm against callers).
        width: Extent applied to entries 0 and 2.
        height: Extent applied to entries 1 and 3.

    Returns:
        list: Four values, each ``extent * (coordinate / 1000)``.
    """
    # Extents line up index-for-index with the four box coordinates.
    extents = (width, height, width, height)
    return [extents[i] * (bbox[i] / 1000) for i in range(4)]
def imageconversion(pdffile):
    """Render the first page of a PDF as a PIL image.

    As a side effect the rendered page is also written to ``page.jpg`` in the
    current working directory.

    Args:
        pdffile: Whatever ``fitz.open`` accepts as its first argument
            (e.g. a path to the PDF).

    Returns:
        PIL.Image.Image: RGB image built from the pixmap of page 0.
    """
    zoom = 2  # zoom factor
    mat = fitz.Matrix(zoom, zoom)
    # The context manager closes the document even if rendering raises;
    # previously the handle was left open.
    with fitz.open(pdffile) as doc:
        page = doc.load_page(0)
        # NOTE(review): PyMuPDF documents that `matrix` is ignored when `dpi`
        # is supplied, so the zoom matrix probably has no effect here. The
        # call is left unchanged to keep the output identical -- confirm which
        # of the two was intended.
        pix = page.get_pixmap(matrix=mat, dpi=300)
        # Build the image while the pixmap is alive; frombytes copies the data.
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        pix.save("page.jpg")
    return image
def completepreprocess(pdffile, ocr_type=None):
    """Annotate every page of a PDF and collect the extracted fields.

    Each page is rendered to ``page<i>.jpg``, run through ``process_image`` /
    ``visualize_image``, and the annotated picture overwrites that same file.
    Per-page predictions are mapped through ``iob_to_label`` and
    ``process_form`` and turned into a table by ``createDataframe``.

    Args:
        pdffile: The PDF to process; forwarded to ``imageconversion`` and
            ``fitz.open``.
        ocr_type: Name of the OCR engine selected in the UI ("PaddleOCR" or
            "Pytesseract"). The Gradio interface in this file supplies it as
            the second input; previously the function read this name without
            defining or accepting it. Any other value (including the default
            ``None``) skips the first-page OCR pass.

    Returns:
        tuple: ``(merged_image, dataframe)`` -- the result of
        ``mergeImageVertical`` over the annotated page files, and the
        concatenation of all per-page dataframes (an empty ``DataFrame`` when
        the document has no pages).
    """
    first_page = imageconversion(pdffile).convert("RGB")
    width, height = first_page.size

    # NOTE(review): the words/boxes produced by this first-page OCR pass are
    # not consumed by the per-page loop below (process_image is called without
    # them). The calls are kept so any side effects of the helpers are
    # preserved -- confirm whether the selected engine should feed
    # process_image instead.
    if ocr_type == "PaddleOCR":
        process_image_PaddleOCR(first_page, width, height)
    elif ocr_type == "Pytesseract":
        process_image_pytesseract(first_page, width, height)

    page_files = []
    frames = []
    zoom = 2  # zoom factor
    mat = fitz.Matrix(zoom, zoom)

    # Close the document deterministically instead of leaking the handle.
    with fitz.open(pdffile) as doc:
        for i in range(len(doc)):
            page_file = f"page{i}.jpg"
            # NOTE(review): PyMuPDF ignores `matrix` when `dpi` is given;
            # kept as-is to avoid changing the rendered output.
            pix = doc.load_page(i).get_pixmap(matrix=mat, dpi=200)
            pix.save(page_file)

            # convert() loads the pixels, so the file handle can be released
            # before the annotated image overwrites the same path.
            with Image.open(page_file) as rendered:
                image = rendered.convert("RGB")

            bbox, preds, words, image = process_image(image)
            # The dataframe returned by visualize_image was always replaced
            # by createDataframe's result, so only the image is kept.
            im, _ = visualize_image(bbox, preds, words, image)
            im.save(page_file)
            page_files.append(page_file)

            pred_list = [iob_to_label(number) for number in preds]
            _bbox, _preds, _words = process_form(pred_list, words, bbox)
            print(f"page: {i} {len(_preds)} {len(_words)}")
            frames.append(createDataframe(_preds, _words))

    merged = mergeImageVertical(page_files)
    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    # pd.concat raises on an empty list, hence the explicit fallback.
    table = pd.concat(frames) if frames else pd.DataFrame()
    return merged, table
# Static text and styling for the Gradio page.
title = "OCR outputs"
description = ""
css = """.output_image, .input_image {height: 600px !important}"""

# Sample documents that were once offered as `examples=` (currently disabled):
# examples = [["461BHH69.PDF"],["AP-481-RF.PDF"],["DP-095-ML.PDF"],["DQ-231-LL.PDF"],["FK-941-ET.PDF"], ["FL-078-NH.PDF"]
#            ,["14ZZ69.PDF"],["74BCA69.PDF"],["254BEG69.PDF"],["761BJQ69.PDF"],["AB-486-EH.PDF"],["AZ-211-ZA.PDF"], ["CY-073-YV.PDF"]]
#            ["744BJQ69.PDF"], ['tarros_2.jpg'],

# Gradio passes the input components to `fn` positionally, in list order:
# first the uploaded PDF, then the selected OCR engine name.
# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces and `enable_queue`
# belong to older Gradio releases -- confirm the pinned Gradio version before
# upgrading.
iface = gr.Interface(
    fn=completepreprocess,
    inputs=[
        gr.inputs.File(label="PDF"),
        gr.inputs.Dropdown(
            label="Select the Open Source OCR",
            choices=["PaddleOCR", "Pytesseract"],
        ),
    ],
    outputs=[gr.outputs.Image(type="pil", label="annotated image"), "dataframe"],
    title=title,
    description=description,
    css=css,
    analytics_enabled=True,
    enable_queue=True,
)

# A stray trailing "|" (viewer residue) after this call made the file
# unparsable; it has been dropped.
iface.launch(inline=False, debug=True)