Files changed (1) hide show
  1. app.py +0 -313
app.py DELETED
@@ -1,313 +0,0 @@
1
- import io
2
- import json
3
- import os
4
- import shutil
5
- import time
6
- from collections import Counter
7
- from pathlib import Path
8
-
9
- import fitz
10
- import numpy as np
11
- import pandas as pd
12
- import plotly.express as px
13
- import streamlit as st
14
- import torch
15
- import torch.nn.functional as F
16
- from easyocr import Reader
17
- from PIL import Image
18
- from tqdm import tqdm
19
- from transformers import (LayoutLMv3FeatureExtractor,
20
- LayoutLMv3ForSequenceClassification,
21
- LayoutLMv3Processor, LayoutLMv3TokenizerFast)
22
-
23
# Inference runs on the CPU. To use a GPU when one is available, switch to:
# DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE = "cpu"

# Base checkpoint from which the LayoutLMv3 tokenizer is loaded.
MICROSOFT_MODEL_NAME = "microsoft/layoutlmv3-base"
# Backward-compatible alias: the misspelled name is still referenced elsewhere
# in this file, so it must keep resolving to the same value.
MICROSOFT_HODEL_NAME = MICROSOFT_MODEL_NAME
# Checkpoint loaded by create_model() as the sequence-classification model.
MODEL_NAME = "arthur-lima/layoutlmv3-triagem-documentos"
27
-
28
-
29
def create_bounding_box(bbox_data, width_scale: float = 1, height_scale: float = 1):
    """Return the axis-aligned box enclosing a polygon, optionally rescaled.

    Args:
        bbox_data: Iterable of ``(x, y)`` corner points.
        width_scale: Factor applied to the x coordinates (1 means no scaling).
        height_scale: Factor applied to the y coordinates (1 means no scaling).

    Returns:
        ``[left, top, right, bottom]``, each value truncated with ``int()``.

    Raises:
        ValueError: If ``bbox_data`` contains no points.
    """
    points = list(bbox_data)
    if not points:
        raise ValueError("bbox_data must contain at least one (x, y) point")

    xs = [x for x, _ in points]
    ys = [y for _, y in points]
    return [
        int(min(xs) * width_scale),
        int(min(ys) * height_scale),
        int(max(xs) * width_scale),
        int(max(ys) * height_scale),
    ]
40
-
41
-
42
@st.experimental_singleton
def create_ocr_reader():
    """Build the EasyOCR reader for Portuguese and English with the GPU disabled."""
    languages = ["pt", "en"]
    # GPU variant: Reader(languages, gpu=True)
    return Reader(languages, gpu=False)
46
-
47
-
48
@st.experimental_singleton
def create_processor():
    """Assemble the LayoutLMv3 processor from a feature extractor and a tokenizer.

    The feature extractor is created with ``apply_ocr=False``; words and boxes
    are supplied by the caller when the processor is invoked.
    """
    return LayoutLMv3Processor(
        LayoutLMv3FeatureExtractor(apply_ocr=False),
        LayoutLMv3TokenizerFast.from_pretrained(MICROSOFT_HODEL_NAME),
    )
53
-
54
-
55
@st.experimental_singleton
def create_model(revision="main"):
    """Load the classification checkpoint in eval mode on ``DEVICE``.

    Args:
        revision: Revision of ``MODEL_NAME`` passed to ``from_pretrained``.

    Returns:
        The model after ``eval()`` and ``to(DEVICE)``.
    """
    classifier = LayoutLMv3ForSequenceClassification.from_pretrained(
        MODEL_NAME, revision=revision
    )
    classifier = classifier.eval()
    return classifier.to(DEVICE)
59
-
60
def pdf2jpg(src: Path, dest_path: Path=None, dpi=100, limit=None):
    """Render the pages of a PDF file to JPG images, one image per page.

    Each image is written as ``<base>-<page.number>.jpg``.

    Args:
        src: Path of the PDF to convert.
        dest_path: Controls ``<base>``. ``None`` writes next to ``src`` using
            its stem; a path without a suffix is treated as a folder and the
            stem of ``src`` is used inside it; a path with a suffix
            contributes its parent folder and its stem.
        dpi: Rendering resolution; pages are scaled by ``dpi / 72``.
        limit: Maximum number of pages to render, counted from the first
            page. ``None`` renders every page.

    Returns:
        ``True`` when rendering finished, ``False`` if any error occurred
        (the error is printed, not raised).
    """
    # Resolve the output base path (without the "-<n>.jpg" tail)
    if dest_path is None:
        # No destination given
        dest = src.parent / src.stem
    elif dest_path.suffix == "":
        # Only a folder was given
        dest = dest_path / src.stem
    else:
        # A path that includes a file name was given
        dest = dest_path.parent / dest_path.stem

    zoom = dpi / 72  # zoom factor, standard: 72 dpi
    magnify = fitz.Matrix(zoom, zoom)  # magnifies in x, resp. y direction
    try:
        # The context manager closes the document even when rendering fails.
        with fitz.open(src) as doc:
            for page in doc:
                if limit is not None and page.number >= limit:
                    break
                pix = page.get_pixmap(matrix=magnify)  # render page to an image
                dest_final_filename = Path(str(dest) + f"-{page.number}.jpg")
                pix.save(dest_final_filename)
        return True
    except Exception as e:
        # Deliberate best effort: report the problem and signal failure
        # through the return value instead of raising.
        print(f"\nProblemas na conversão para JPG do arquivo PDF {src}: " + str(e))
        return False
88
-
89
def classifyPDF(
    pdfpath: Path, model, processor, reader: Reader = None, dpi=100
) -> tuple:
    """Classify a PDF document from the rendered images of its pages.

    The PDF is rendered to JPG files inside a scratch folder (``temp``, or
    ``temp_classification`` when ``temp`` already exists). Each page is read
    by OCR, the OCR words are stored in a ``.json`` file beside the image,
    and the page is classified with ``predict``. The scratch folder is not
    removed by this function.

    Args:
        pdfpath: Path of the PDF file.
        model: Classification model forwarded to ``predict``.
        processor: Processor forwarded to ``predict``.
        reader: OCR reader; a Portuguese/English ``Reader`` is created when
            omitted.
        dpi: Resolution used to render the pages.

    Returns:
        ``(predicted_class, probabilities)``. For a single page this is the
        result of ``predict`` unchanged. For several pages the class is the
        most frequent per-page prediction and the probabilities are the mean
        of the per-page probabilities (a NumPy array).

    Raises:
        RuntimeError: If no page image could be rendered from the PDF.
    """
    # Scratch folder for the rendered pages
    tmp = Path("temp")
    if os.path.exists(tmp):
        tmp = Path("temp_classification")
    shutil.rmtree(tmp, ignore_errors=True)
    os.mkdir(tmp)
    image_path = tmp / Path(pdfpath.name).with_suffix(".jpg")
    pdf2jpg(pdfpath, image_path, dpi)
    if reader is None:
        reader = Reader(["pt", "en"])
    # NOTE(review): pause kept from the previous implementation; its purpose
    # is not evident from the code - confirm before removing.
    time.sleep(0.5)

    # Only the rendered images count as pages; the OCR .json files written
    # below live in the same folder and must not be counted.
    page_images = sorted(tmp.glob("*.jpg"))
    if not page_images:
        raise RuntimeError(f"No page could be rendered from {pdfpath}")

    if len(page_images) == 1:
        return _classify_page_image(page_images[0], reader, processor, model)

    # Several pages: majority vote for the class, mean for the probabilities
    results = []
    all_probs = []
    for page_image in tqdm(page_images):
        page_class, page_probs = _classify_page_image(
            page_image, reader, processor, model
        )
        results.append(page_class)
        all_probs.append(page_probs)

    predicted_class = Counter(results).most_common(1)[0][0]
    probabilities = np.mean(np.array(all_probs), axis=0)
    return predicted_class, probabilities


def _classify_page_image(image_path: Path, reader, processor, model):
    """OCR one page image, store the OCR words as JSON and classify the page."""
    with open(image_path, "rb") as f:
        image_bytes = f.read()

    ocr_result = reader.readtext(image_bytes, batch_size=1)
    # Scale factors of 1 keep the boxes in the OCR's own coordinates.
    ocr_page = [
        {"word": word, "bounding_box": create_bounding_box(bbox, 1, 1)}
        for bbox, word, _confidence in ocr_result
    ]
    with image_path.with_suffix(".json").open("w") as f:
        json.dump(ocr_page, f)

    # Keep the image open only for the duration of the prediction.
    with Image.open(image_path) as image:
        return predict(image, image_bytes, reader, processor, model)
174
-
175
-
176
def predict(
    image: Image.Image,
    image_bytes: bytes,
    reader: Reader,
    processor: LayoutLMv3Processor,
    model: LayoutLMv3ForSequenceClassification,
):
    """Classify a single page image.

    Args:
        image: The page as a PIL image; its size is used to normalize boxes.
        image_bytes: The encoded image bytes handed to the OCR reader.
        reader: OCR reader whose ``readtext`` yields ``(bbox, word, _)`` items.
        processor: Processor that builds the model inputs from image, words
            and boxes.
        model: Sequence-classification model producing ``logits``.

    Returns:
        ``(predicted_class, probabilities)``: the index of the highest logit
        as an ``int`` and the softmax of the logits as a flat list.
    """
    ocr_result = reader.readtext(image_bytes)

    # Boxes are rescaled from pixel coordinates to a 0-1000 grid.
    width, height = image.size
    width_scale = 1000 / width
    height_scale = 1000 / height

    words = []
    boxes = []
    for bbox, word, _ in ocr_result:
        box = create_bounding_box(bbox, width_scale, height_scale)
        # Clamp to the 0-1000 range: an OCR corner reported slightly outside
        # the image would otherwise yield an out-of-range coordinate.
        boxes.append([min(max(value, 0), 1000) for value in box])
        words.append(word)

    encoding = processor(
        image,
        words,
        boxes=boxes,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    with torch.inference_mode():
        output = model(
            input_ids=encoding["input_ids"].to(DEVICE),
            attention_mask=encoding["attention_mask"].to(DEVICE),
            bbox=encoding["bbox"].to(DEVICE),
            pixel_values=encoding["pixel_values"].to(DEVICE),
        )

    logits = output.logits
    predicted_class = logits.argmax()
    # Turn the logits into probabilities
    probabilities = F.softmax(logits, dim=-1).flatten().tolist()
    return predicted_class.detach().item(), probabilities
221
-
222
-
223
# Create the OCR reader, the processor and the model used by the page below.
reader = create_ocr_reader()
processor = create_processor()
# The model is loaded at a fixed revision of the checkpoint.
model = create_model(revision="e34c270")

# Logo, shown in the middle one of three columns
c1, c2, c3 = st.columns([2.7,5,1])
c2.image("resources/previsa_cinza.png", width=250)
230
-
231
# Upload boxes, laid out in two columns
col1, col2 = st.columns(2)
accepted_types = ["jpg", "pdf"]
with col1:
    uploads = [
        st.file_uploader("Upload: Notas Fiscais de Entrada", accepted_types),
        st.file_uploader("Upload: Notas Fiscais de Saída", accepted_types),
        st.file_uploader("Upload: Notas Fiscais de Retenção", accepted_types),
        st.file_uploader("Upload: Notas Fiscais de Serviços", accepted_types),
    ]
with col2:
    uploads += [
        st.file_uploader("Upload: Documentos Aluguel", accepted_types),
        st.file_uploader("Upload: Documentos Contábeis", accepted_types),
        st.file_uploader("Upload: Documentos Tributos", accepted_types),
        st.file_uploader("Upload: Documentos MEI", accepted_types),
        st.file_uploader("Upload: Extrato Bancário", accepted_types),
    ]

# Each box returns its own file (or None). Use whichever box holds a file so
# that no upload is silently ignored; when several boxes are filled, the one
# declared last takes precedence.
uploaded_file = next(
    (candidate for candidate in reversed(uploads) if candidate is not None),
    None,
)
244
-
245
def plot_confianca(probabilities, model):
    """Draw a bar chart with the confidence obtained for each document type.

    Args:
        probabilities: One confidence value per class, in class-id order.
        model: Object whose ``config.id2label`` maps class ids to labels.
    """
    with st.spinner("Criando gráficos de confiança..."):
        id2label = model.config.id2label
        # Order the labels by class id so they line up with the probabilities
        # regardless of the insertion order of the mapping.
        labels = [id2label[class_id] for class_id in sorted(id2label)]
        df_predictions = pd.DataFrame(
            {
                "Tipo Documento": labels,
                "Confiança": probabilities,
            }
        )
        fig = px.bar(df_predictions, x="Tipo Documento", y="Confiança")
        fig.update_layout({
            'plot_bgcolor': '#FFFFFF'
        })
        fig.update_traces(marker_color='#fcaf17')
        st.plotly_chart(fig, use_container_width=True)
260
-
261
# Processing of the uploaded file
if uploaded_file is not None:
    c1, c2, c3 = st.columns([2.4,5,1])

    try:
        # Try to decode the upload as a PDF
        if os.path.exists("temp"):
            shutil.rmtree("temp", ignore_errors=True)
        os.mkdir("temp")
        pdfPath = Path("temp/temp.pdf")
        # The context manager closes the in-memory document once the copy on
        # disk and the preview have been produced.
        with fitz.Document(stream=uploaded_file.getvalue()) as doc:
            doc.save(pdfPath)

            # Show the first page only
            for page in doc:
                pix = page.get_pixmap()
                pix.save("temp/icon-page-1.jpg")
                c2.image("temp/icon-page-1.jpg", "Página do documento", width=300)
                break

        # Run the prediction
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = classifyPDF(pdfPath, model, processor, reader)
    except fitz.FileDataError:
        # Not readable as a PDF: load the upload as an image instead
        image_bytes = uploaded_file.getvalue()
        bytes_data = io.BytesIO(image_bytes)
        image = Image.open(bytes_data)

        # Show the image
        c2.image(image, "Página do documento", width=300)

        # Run the prediction
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = predict(
                image, image_bytes, reader, processor, model
            )
    finally:
        # Remove the temporary folders if they still exist
        if os.path.exists("temp"):
            shutil.rmtree("temp", ignore_errors=True)
        if os.path.exists("temp_classification"):
            shutil.rmtree("temp_classification", ignore_errors=True)

    # Show the result on the page
    predicted_label = model.config.id2label[predicted_class]
    st.markdown(f"Tipo do documento previsto: **{predicted_label}**")

    plot_confianca(probabilities, model)
312
-
313
-