Spaces:
Runtime error
Delete app.py #1
by fabiomorei161 - opened
app.py DELETED
@@ -1,313 +0,0 @@
import io
import json
import os
import shutil
import time
from collections import Counter
from pathlib import Path

import fitz
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
import torch
import torch.nn.functional as F
from easyocr import Reader
from PIL import Image
from tqdm import tqdm
from transformers import (LayoutLMv3FeatureExtractor,
                          LayoutLMv3ForSequenceClassification,
                          LayoutLMv3Processor, LayoutLMv3TokenizerFast)

# DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE = "cpu"
MICROSOFT_MODEL_NAME = "microsoft/layoutlmv3-base"
MODEL_NAME = "arthur-lima/layoutlmv3-triagem-documentos"


def create_bounding_box(bbox_data, width_scale: float, height_scale: float):
    xs = []
    ys = []
    for x, y in bbox_data:
        xs.append(x)
        ys.append(y)
    left = int(min(xs) * width_scale)
    top = int(min(ys) * height_scale)
    right = int(max(xs) * width_scale)
    bottom = int(max(ys) * height_scale)
    return [left, top, right, bottom]


@st.experimental_singleton
def create_ocr_reader():
    # return Reader(["pt", "en"], gpu=True)
    return Reader(["pt", "en"], gpu=False)


@st.experimental_singleton
def create_processor():
    feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained(MICROSOFT_MODEL_NAME)
    return LayoutLMv3Processor(feature_extractor, tokenizer)


@st.experimental_singleton
def create_model(revision="main"):
    model = LayoutLMv3ForSequenceClassification.from_pretrained(MODEL_NAME, revision=revision)
    return model.eval().to(DEVICE)


def pdf2jpg(src: Path, dest_path: Path = None, dpi=100, limit=None):
    """
    Converts a PDF file to JPG.
    If there are multiple pages, multiple images are generated.
    """
    # Handle the destination paths
    if dest_path is None:
        # No path given
        dest = src.parent / src.stem
    elif dest_path.suffix == "":
        # Only a folder was given
        dest = dest_path / src.stem
    else:
        # A path with a file name was given
        dest = dest_path.parent / dest_path.stem

    zoom = dpi / 72  # zoom factor, standard: 72 dpi
    magnify = fitz.Matrix(zoom, zoom)  # magnifies in x, resp. y direction
    try:
        doc = fitz.open(src)  # open document
        for page in doc:
            pix = page.get_pixmap(matrix=magnify)  # render page to an image
            dest_final_filename = Path(str(dest) + f"-{page.number}.jpg")
            pix.save(dest_final_filename)
        return True
    except Exception as e:
        print(f"\nProblemas na conversão para JPG do arquivo PDF {src}: " + str(e))
        return False


def classifyPDF(
    pdfpath: Path, model, processor, reader: Reader = None, dpi=100
) -> tuple:
    def create_bounding_box(bbox_data, width_scale: float = 1, height_scale: float = 1):
        xs = []
        ys = []
        for x, y in bbox_data:
            xs.append(x)
            ys.append(y)
        left = int(min(xs) * width_scale)
        top = int(min(ys) * height_scale)
        right = int(max(xs) * width_scale)
        bottom = int(max(ys) * height_scale)
        return [left, top, right, bottom]

    # Create a temporary folder for the JPG conversion
    tmp = Path("temp")
    if os.path.exists(tmp):
        tmp = Path("temp_classification")
        shutil.rmtree(tmp, ignore_errors=True)
    os.mkdir(tmp)
    image_path = tmp / Path(pdfpath.name).with_suffix(".jpg")
    pdf2jpg(pdfpath, image_path, dpi)
    if reader is None:
        reader = Reader(["pt", "en"])
    time.sleep(0.5)

    # Check whether there are multiple pages
    if len(os.listdir(tmp)) > 1:
        # Multiple pages: keep the majority class
        results = []
        all_probs = []
        for img in tqdm(os.listdir(tmp)):
            image_path = tmp / img
            # Read each page (as bytes) via OCR
            image = Image.open(image_path)
            with open(image_path, "rb") as f:
                image_bytes = f.read()
            ocr_result = reader.readtext(image_bytes, batch_size=1)
            ocr_page = []
            for bbox, word, confidence in ocr_result:
                ocr_page.append(
                    {"word": word, "bounding_box": create_bounding_box(bbox)}
                )
            with Path(image_path).with_suffix(".json").open("w") as f:
                json.dump(ocr_page, f)

            # Run the prediction
            predicted_class, probabilities = predict(
                image, image_bytes, reader, processor, model
            )
            # result = model.config.id2label[predicted_class]
            results.append(predicted_class)

            if len(all_probs) == 0:
                all_probs = np.array(probabilities)
            else:
                all_probs += np.array(probabilities)
        # The result is the most common class
        result = Counter(results).most_common(1)
        result = result[0][0]
        all_probs = all_probs * (1 / len(os.listdir(tmp)))
        predicted_class, probabilities = result, all_probs

    else:
        # Single page
        image_path = tmp / (os.listdir(tmp)[0])

        # Read the image via OCR
        image = Image.open(image_path)
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        ocr_result = reader.readtext(image_bytes, batch_size=1)
        ocr_page = []
        for bbox, word, confidence in ocr_result:
            ocr_page.append({"word": word, "bounding_box": create_bounding_box(bbox)})
        with image_path.with_suffix(".json").open("w") as f:
            json.dump(ocr_page, f)

        # Run the prediction
        predicted_class, probabilities = predict(
            image, image_bytes, reader, processor, model
        )
        # result = model.config.id2label[predicted_class]

    return predicted_class, probabilities


def predict(
    image: Image.Image,
    image_bytes: bytes,
    reader: Reader,
    processor: LayoutLMv3Processor,
    model: LayoutLMv3ForSequenceClassification,
):
    ocr_result = reader.readtext(image_bytes)

    width, height = image.size
    width_scale = 1000 / width
    height_scale = 1000 / height

    words = []
    boxes = []
    for bbox, word, _ in ocr_result:
        boxes.append(create_bounding_box(bbox, width_scale, height_scale))
        words.append(word)

    encoding = processor(
        image,
        words,
        boxes=boxes,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    with torch.inference_mode():
        output = model(
            input_ids=encoding["input_ids"].to(DEVICE),
            attention_mask=encoding["attention_mask"].to(DEVICE),
            bbox=encoding["bbox"].to(DEVICE),
            pixel_values=encoding["pixel_values"].to(DEVICE),
        )

    logits = output.logits
    predicted_class = logits.argmax()
    probabilities = (
        F.softmax(logits, dim=-1).flatten().tolist()
    )  # convert the logits back into probabilities
    # return model.config.id2label[predicted_class.item()]
    return predicted_class.detach().item(), probabilities


reader = create_ocr_reader()
processor = create_processor()
model = create_model(revision="e34c270")

# Logo
c1, c2, c3 = st.columns([2.7, 5, 1])
c2.image("resources/previsa_cinza.png", width=250)

# Upload boxes
# NOTE: every call below rebinds the same `uploaded_file` variable, so only
# the last uploader ("Upload: Extrato Bancário") feeds the processing block.
col1, col2 = st.columns(2)
with col1:
    uploaded_file = st.file_uploader("Upload: Notas Fiscais de Entrada", ["jpg", "pdf"])
    uploaded_file = st.file_uploader("Upload: Notas Fiscais de Saída", ["jpg", "pdf"])
    uploaded_file = st.file_uploader("Upload: Notas Fiscais de Retenção", ["jpg", "pdf"])
    uploaded_file = st.file_uploader("Upload: Notas Fiscais de Serviços", ["jpg", "pdf"])
with col2:
    uploaded_file = st.file_uploader("Upload: Documentos Aluguel", ["jpg", "pdf"])
    uploaded_file = st.file_uploader("Upload: Documentos Contábeis", ["jpg", "pdf"])
    uploaded_file = st.file_uploader("Upload: Documentos Tributos", ["jpg", "pdf"])
    uploaded_file = st.file_uploader("Upload: Documentos MEI", ["jpg", "pdf"])
    uploaded_file = st.file_uploader("Upload: Extrato Bancário", ["jpg", "pdf"])


def plot_confianca(probabilities, model):
    # Draw the confidence chart
    with st.spinner("Criando gráficos de confiança..."):
        df_predictions = pd.DataFrame(
            {
                "Tipo Documento": list(model.config.id2label.values()),
                "Confiança": probabilities,
            }
        )
        fig = px.bar(df_predictions, x="Tipo Documento", y="Confiança")
        fig.update_layout({"plot_bgcolor": "#FFFFFF"})
        fig.update_traces(marker_color="#fcaf17")
        st.plotly_chart(fig, use_container_width=True)


# Processing
if uploaded_file is not None:
    print(dir(uploaded_file))
    c1, c2, c3 = st.columns([2.4, 5, 1])

    try:
        # Try to decode the upload as a PDF
        if os.path.exists("temp"):
            shutil.rmtree("temp", ignore_errors=True)
        os.mkdir("temp")
        doc = fitz.Document(stream=uploaded_file.getvalue())
        pdfPath = Path("temp/temp.pdf")
        doc.save(pdfPath)

        # Display the first page
        for page in doc:
            pix = page.get_pixmap()
            pix.save("temp/icon-page-1.jpg")
            c2.image("temp/icon-page-1.jpg", "Página do documento", width=300)
            break

        # Run the prediction
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = classifyPDF(pdfPath, model, processor, reader)
        print(probabilities)
    except fitz.fitz.FileDataError:
        # Not a PDF: load the uploaded image
        image_bytes = uploaded_file.getvalue()
        bytes_data = io.BytesIO(image_bytes)
        image = Image.open(bytes_data)

        # Show the image
        c2.image(image, "Página do documento", width=300)

        # Run the prediction
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = predict(
                image, image_bytes, reader, processor, model
            )
    finally:
        # Remove the temporary folders if they still exist
        if os.path.exists("temp"):
            shutil.rmtree("temp", ignore_errors=True)
        if os.path.exists("temp_classification"):
            shutil.rmtree("temp_classification", ignore_errors=True)

    # Show the result on screen
    predicted_label = model.config.id2label[predicted_class]
    st.markdown(f"Tipo do documento previsto: **{predicted_label}**")

    plot_confianca(probabilities, model)
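On recent Streamlit releases, `st.experimental_singleton` is deprecated in favor of `st.cache_resource`. A minimal sketch of the cached factories from app.py under that assumption (the model names are the ones used above; everything else is unchanged):

```python
import streamlit as st
from easyocr import Reader
from transformers import (LayoutLMv3FeatureExtractor,
                          LayoutLMv3ForSequenceClassification,
                          LayoutLMv3Processor, LayoutLMv3TokenizerFast)


# Assumes Streamlit >= 1.18, where st.cache_resource caches heavyweight,
# non-serializable objects (OCR readers, models) across reruns.
@st.cache_resource
def create_ocr_reader():
    return Reader(["pt", "en"], gpu=False)


@st.cache_resource
def create_processor():
    # apply_ocr=False: words and boxes come from EasyOCR, not the extractor
    feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
    return LayoutLMv3Processor(feature_extractor, tokenizer)


@st.cache_resource
def create_model(revision="main"):
    model = LayoutLMv3ForSequenceClassification.from_pretrained(
        "arthur-lima/layoutlmv3-triagem-documentos", revision=revision
    )
    return model.eval().to("cpu")
```

The decorator arguments and call sites stay the same; only the decorator name changes.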
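`predict` scales every OCR box into the 0-1000 coordinate grid that the LayoutLM family expects, using `create_bounding_box` to collapse EasyOCR's four corner points into `[left, top, right, bottom]`. A standalone check of that normalization; the corner coordinates and page size below are made up for illustration:

```python
def create_bounding_box(bbox_data, width_scale: float, height_scale: float):
    # Collapse a quadrilateral into an axis-aligned [left, top, right, bottom] box
    xs = [x for x, _ in bbox_data]
    ys = [y for _, y in bbox_data]
    return [
        int(min(xs) * width_scale),
        int(min(ys) * height_scale),
        int(max(xs) * width_scale),
        int(max(ys) * height_scale),
    ]


# EasyOCR returns boxes as [top-left, top-right, bottom-right, bottom-left] points.
bbox = [[120, 40], [480, 40], [480, 90], [120, 90]]   # illustrative values
width, height = 1240, 1754                            # e.g. an A4 page at ~150 dpi
box = create_bounding_box(bbox, 1000 / width, 1000 / height)
print(box)  # [96, 22, 387, 51] -- every coordinate now lies in 0..1000
```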
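Every `st.file_uploader` call in app.py rebinds the same `uploaded_file` variable, so only a file dropped into the last uploader ("Upload: Extrato Bancário") ever reaches the processing block. If one upload slot per document type was the intent, a hypothetical variant (not part of the deleted file) could give each widget its own `key=` so its state stays separate:

```python
import streamlit as st

# Hypothetical variant: one independent upload slot per document type,
# using the labels from app.py as widget keys.
LABELS = [
    "Upload: Notas Fiscais de Entrada", "Upload: Notas Fiscais de Saída",
    "Upload: Notas Fiscais de Retenção", "Upload: Notas Fiscais de Serviços",
    "Upload: Documentos Aluguel", "Upload: Documentos Contábeis",
    "Upload: Documentos Tributos", "Upload: Documentos MEI",
    "Upload: Extrato Bancário",
]

col1, col2 = st.columns(2)
uploads = {}
for i, label in enumerate(LABELS):
    with (col1 if i < 4 else col2):
        # A unique key gives every uploader its own slot in session state.
        uploads[label] = st.file_uploader(label, ["jpg", "pdf"], key=label)

# Only the files that were actually provided need to be classified.
provided = {label: f for label, f in uploads.items() if f is not None}
```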