|
|
|
"""Untitled1.ipynb |
|
Automatically generated by Colaboratory. |
|
Original file is located at |
|
https://colab.research.google.com/drive/1J4fCr7TGzdFvkCeikMAQ5af5ml2Q83W0 |
|
""" |
|
|
|
# Install CPU-only PyTorch at runtime (Colab/Spaces convention).
import os

os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')

import fitz  # PyMuPDF
import PIL
import torch
import numpy as np
import pandas as pd
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoProcessor, AutoModelForTokenClassification, LayoutLMv3FeatureExtractor
|
|
|
|
|
id2label={0: 'container id', 1: 'seal number', 2: 'container quantity', 3: 'container type', 4: 'tare', 5: 'package quantity', 6: 'weight', 7: 'others'} |
|
# Tesseract options: OCR engine mode 3 (default engine) and page segmentation
# mode 6 (assume a single uniform block of text); OCR language is Spanish.
custom_config = r'--oem 3 --psm 6'
lang = 'spa'
|
|
|
# Assign a random PIL named color to each label id. The original IOB
# 'B-'/'I-' prefix stripping (v[:2]) never applied here because these label
# names carry no such prefixes, so map each label name to a color directly.
label_color_pil = list(PIL.ImageColor.colormap.keys())
label_ints = np.random.randint(0, len(label_color_pil), len(id2label))
label2color = {v: label_color_pil[label_ints[k]] for k, v in id2label.items()}
|
|
|
# apply_ocr=True makes the processor run Tesseract itself, so raw images are
# enough at inference time. The OCR language option is `ocr_lang` (a bare
# `lang` kwarg is silently ignored by the feature extractor).
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True, ocr_lang=lang)

model = AutoModelForTokenClassification.from_pretrained("atatavana/layoutlm_manifesto_bigdataset")
|
|
|
|
|
def unnormalize_box(bbox, width, height):
    """Map a LayoutLM box from the normalized 0-1000 space back to pixel coordinates."""
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]
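# e.g. unnormalize_box([250, 100, 500, 200], width=1000, height=800)
#      -> [250.0, 80.0, 500.0, 160.0]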
|
|
|
def iob_to_label(label):
    """Turn an integer class id into its human-readable label name."""
    return id2label[label]
|
|
|
|
|
def intersect(w, z):
    """Return the intersection of boxes w and z as an int box, or 0 if disjoint."""
    x1 = max(w[0], z[0])
    y1 = max(w[1], z[1])
    x2 = min(w[2], z[2])
    y2 = min(w[3], z[3])
    if x1 > x2 or y1 > y2:  # no horizontal or no vertical overlap
        return 0
    area = (x2 - x1) * (y2 - y1)
    if area > 0:
        return [int(x1), int(y1), int(x2), int(y2)]
    return 0
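# e.g. intersect([0, 0, 100, 50], [80, 10, 200, 60]) -> [80, 10, 100, 50],
#      while disjoint boxes return 0.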
|
|
|
|
|
def process_image(image):
    """Run OCR + LayoutLMv3 over one page image and return per-token
    pixel-space boxes, class-id predictions, decoded words, and the image."""
    width, height = image.size

    # Standalone OCR pass; its words/boxes are not consumed below (the
    # processor re-runs Tesseract internally) but are handy for debugging.
    feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True, ocr_lang=lang)
    encoding_feature_extractor = feature_extractor(image, return_tensors="pt", truncation=True)
    words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes

    # Encode the page; long pages overflow into extra 512-token windows that
    # overlap by a 128-token stride.
    inference_image = [image.convert("RGB")]
    encoding = processor(inference_image, truncation=True, return_offsets_mapping=True, return_tensors="pt",
                         padding="max_length", stride=128, max_length=512, return_overflowing_tokens=True)
    offset_mapping = encoding.pop('offset_mapping')
    overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')

    # Stack the per-window image tensors into one batch.
    encoding['pixel_values'] = torch.stack(list(encoding['pixel_values']))

    with torch.no_grad():
        outputs = model(**encoding)

    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    preds = []
    l_words = []
    bboxes = []
    token_section_num = []

    # A single window squeezes to a flat list; re-wrap it so the loops below
    # always iterate over windows.
    if len(token_boxes) == 512:
        predictions = [predictions]
        token_boxes = [token_boxes]

    for i in range(len(token_boxes)):
        for j in range(len(token_boxes[i])):
            # Validate the raw box before unnormalizing it.
            if np.asarray(token_boxes[i][j]).shape != (4,):
                continue
            elif token_boxes[i][j] == [0, 0, 0, 0]:
                # Padding / special tokens.
                continue

            unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
            if unnormal_box not in bboxes:
                preds.append(predictions[i][j])
                l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
                bboxes.append(unnormal_box)
                token_section_num.append(i)
            else:
                # Same box seen again inside the same window: subword pieces
                # of one OCR word, so glue the decoded text back together.
                _index = bboxes.index(unnormal_box)
                if token_section_num[_index] == i:
                    l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])

    return bboxes, preds, l_words, image
|
|
|
|
|
|
|
def visualize_image(final_bbox, final_preds, l_words, image):
    """Draw each predicted box and label on the page and collect one
    TEXT/LABEL record per box."""
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()

    label2color = {'container id': 'red', 'seal number': 'blue', 'container quantity': 'black',
                   'container type': 'green', 'tare': 'brown', 'package quantity': 'purple',
                   'weight': 'orange', 'others': 'white'}

    json_df = []
    for ix, (prediction, box) in enumerate(zip(final_preds, final_bbox)):
        predicted_label = iob_to_label(prediction).lower()
        draw.rectangle(box, outline=label2color[predicted_label])
        draw.text((box[0] + 10, box[1] - 10), text=predicted_label, fill=label2color[predicted_label], font=font)

        json_dict = {}
        json_dict['TEXT'] = l_words[ix]
        json_dict['LABEL'] = predicted_label

        json_df.append(json_dict)
    return image, json_df
|
|
|
|
|
def mergeCloseBoxes(pr, bb, wr, threshold):
    """Merge same-label boxes that sit within `threshold` px of each other
    horizontally, concatenating their words; 'others' predictions are dropped."""
    idx = 0
    final_bbox = []
    final_preds = []
    final_words = []

    for box, pred, word in zip(bb, pr, wr):
        if pred == 'others':
            continue
        final_bbox.append(box)
        final_preds.append(pred)
        final_words.append(word)
        for b, p, w in zip(bb, pr, wr):
            if p == 'others':
                continue
            elif box == b:
                continue
            else:
                XMIN, YMIN, XMAX, YMAX = box
                xmin, ymin, xmax, ymax = b
                # Widen the current box to the right and the candidate to the
                # left by `threshold`, then test for overlap.
                intsc = intersect([XMIN, YMIN, XMAX + threshold, YMAX], [xmin - threshold, ymin, xmax, ymax])
                if intsc != 0 and pred == p:
                    if box in final_bbox:
                        # Extend the kept box to cover both, and join the words.
                        final_bbox[idx] = [XMIN, min(YMIN, ymin), xmax, max(YMAX, ymax)]
                        final_words[idx] = word + ' ' + w
                        continue
                    print('box: {}, label: {} is close to b: {} with this p: {} --> {}'.format(box, pred, b, p, word + ' ' + w))
        idx = idx + 1
    return final_bbox, final_preds, final_words
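# e.g. two 'container id' fragments at [0, 0, 50, 20] and [60, 0, 110, 20]
# fall within threshold=70 of each other, so they merge into [0, 0, 110, 20]
# with their words joined by a space.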
|
|
|
def createDataframe(preds, words):
    """Lay the merged predictions out as table rows: a new row starts whenever
    the first-seen label repeats, or a cell for that label is already filled."""
    df = pd.DataFrame(columns=['container id', 'seal number', 'container quantity', 'container type',
                               'package quantity', 'tare', 'weight'])
    flag_label = preds[0]
    row_number = -1
    for i in range(len(preds)):
        if preds[i] == flag_label:
            # The anchor label marks the start of a new row.
            row_number = row_number + 1
            df.at[row_number, preds[i]] = words[i]
            continue
        if pd.isna(df[preds[i]].iloc[row_number]):
            df.at[row_number, preds[i]] = words[i]
        else:
            # Cell already occupied: spill into a fresh row.
            row_number = row_number + 1
            df.at[row_number, preds[i]] = words[i]

    return df
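# e.g. preds = ['container id', 'seal number', 'container id'] yields two
# rows: row 0 holds the first id plus its seal number, row 1 the second id.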
|
|
|
def isInside(w, z):
    """True if box w lies entirely inside box z."""
    if w[0] >= z[0] and w[1] >= z[1] and w[2] <= z[2] and w[3] <= z[3]:
        return True
    return False
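# e.g. isInside([10, 10, 20, 20], [0, 0, 100, 100]) -> True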
|
|
|
def removeSimilarItems(final_bbox, final_preds, final_words):
    """Drop any box that lies fully inside another box with the same label
    (duplicates left over from overlapping stride windows)."""
    _bb = []
    _pp = []
    _ww = []
    for i in range(len(final_bbox)):
        contained = False
        for j in range(len(final_bbox)):
            if final_bbox[i] == final_bbox[j]:
                continue
            if isInside(final_bbox[i], final_bbox[j]) and final_preds[i] == final_preds[j]:
                contained = True
                break
        if not contained:
            _bb.append(final_bbox[i])
            _pp.append(final_preds[i])
            _ww.append(final_words[i])
    return _bb, _pp, _ww
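# e.g. with [0, 0, 100, 100] and [10, 10, 20, 20] both labeled 'tare', the
# contained duplicate [10, 10, 20, 20] is dropped and the outer box kept.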
|
|
|
|
|
|
|
def process_form(preds, words, bboxes):
    """Merge neighboring boxes, drop contained duplicates, then sort the
    surviving fields top-to-bottom by their y-min coordinate."""
    final_bbox, final_preds, final_words = mergeCloseBoxes(preds, bboxes, words, 70)
    _bbox, _preds, _words = removeSimilarItems(final_bbox, final_preds, final_words)

    _bbox = [[int(x) for x in item] for item in _bbox]

    data = list(zip(_bbox, _preds, _words))
    sorted_list = sorted(data, key=lambda x: x[0][1])
    _bbox = [item[0] for item in sorted_list]
    _preds = [item[1] for item in sorted_list]
    _words = [item[2] for item in sorted_list]
    return _bbox, _preds, _words
|
|
|
def mergeImageVertical(a):
    """Resize all page images to the smallest page size and stack them vertically."""
    imgs = [Image.open(i) for i in a]

    min_shape = sorted([(np.sum(i.size), i.size) for i in imgs])[0][1]
    imgs_comb = np.vstack([i.resize(min_shape) for i in imgs])
    imgs_comb = Image.fromarray(imgs_comb)
    imgs_comb.save('Trifecta_vertical.jpg')
    return imgs_comb
|
|
|
|
|
|
|
def completepreprocess(pdffile):
    """Render each PDF page, run the model on it, and return the stacked
    annotated pages together with one dataframe of extracted fields."""
    myDataFrame = pd.DataFrame()
    a = []
    # gr.File may hand over a tempfile wrapper rather than a path string.
    path = pdffile.name if hasattr(pdffile, 'name') else pdffile
    doc = fitz.open(path)
    for i in range(0, len(doc)):
        page = doc.load_page(i)
        zoom = 2
        mat = fitz.Matrix(zoom, zoom)
        # Note: in recent PyMuPDF an explicit dpi takes precedence over the matrix.
        pix = page.get_pixmap(matrix=mat, dpi=200)
        pix.save("page" + str(i) + ".jpg")
        image = Image.open("page" + str(i) + ".jpg").convert("RGB")
        bbox, preds, words, image = process_image(image)
        im, df = visualize_image(bbox, preds, words, image)
        im.save("page" + str(i) + ".jpg")
        a.append("page" + str(i) + ".jpg")
        pred_list = [iob_to_label(number) for number in preds]
        _bbox, _preds, _words = process_form(pred_list, words, bbox)
        print('page: ' + str(i) + ' ' + str(len(_preds)) + ' ' + str(len(_words)))
        df = createDataframe(_preds, _words)
        # DataFrame.append was removed in pandas 2.x; concat is the replacement.
        myDataFrame = pd.concat([myDataFrame, df], ignore_index=True)

    im2 = mergeImageVertical(a)
    return im2, myDataFrame
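# Quick local smoke test, bypassing the UI (file name from the examples list):
#   annotated, table = completepreprocess('3pages_messina.pdf')
#   table.to_csv('manifest_fields.csv', index=False)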
|
|
|
|
|
title = "Interactive demo: Manifesto Information Extraction model" |
|
description = "Manifesto Information Extraction - We use Microsoft’s LayoutLMv3 trained on Manifesto Dataset through csv's to predict the labels. To use it, simply upload a PDF or use the example PDF below and click ‘Submit’. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select ‘Open image in new tab’.Train =63 ,Test =15" |
|
|
|
css = """.output_image, .input_image {height: 600px !important}""" |
|
|
|
|
|
|
|
examples = [['3pages_messina.pdf'], ['messina2.jpg'], ['arkas1.jpg'], ['brointermed1.jpg'], ['brointermed2.pdf'], ['tarros_1.jpg'], ['tarros_3.jpg'], ['tarros_4.jpg']] |
|
|
|
iface = gr.Interface(fn=completepreprocess,
                     inputs=gr.File(label="PDF"),
                     outputs=[gr.Image(type="pil", label="annotated image"), gr.Dataframe(label="extracted fields")],
                     title=title,
                     description=description,
                     examples=examples,
                     css=css,
                     analytics_enabled=True)

# queue() replaces the deprecated enable_queue flag in recent Gradio releases.
iface.queue()
iface.launch(inline=False, debug=True)
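# Tip: launching with share=True instead generates a temporary public URL,
# handy when the script runs on a machine without a reachable hostname:
#   iface.launch(share=True, debug=True)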