Gilvan committed · verified
Commit 888d78f · 1 Parent(s): 53b872e

Create app.py

Files changed (1)
app.py +153 -0
app.py ADDED
@@ -0,0 +1,153 @@
+ ######################## All required imports ########################
+ import os
+ import re
+ import pickle
+ import pandas as pd
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from nltk.stem import RSLPStemmer
+ # Imports required for DistilBERT NER
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import torch
+ from sklearn.model_selection import train_test_split
+ from transformers import DistilBertTokenizerFast, DistilBertConfig, DistilBertForTokenClassification
+ from transformers import Trainer, TrainingArguments
+ from torch.utils.data import Dataset, DataLoader, random_split
+ from sklearn.metrics import precision_recall_fscore_support, accuracy_score
+ import evaluate
+ # Imports required for the Gradio interface
+ import gradio as gr
+
+ # Set the device (CPU, or GPU if available)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Download the required NLTK resources
+ nltk.download('stopwords')
+ nltk.download('punkt')
+ nltk.download('rslp')
+
+ # Load the data
+ file_path = "base_info_produtos.csv"
+ df = pd.read_csv(file_path, sep='\t')
+
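+ # Assumption: base_info_produtos.csv is tab-separated and is expected to contain
+ # at least the columns nome, tipo, marca, categoria, cor and modelo used below.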
+ # Set up the text preprocessing
+ stop_words = set(stopwords.words('portuguese'))
+ stemmer = RSLPStemmer()
+
+ def preprocess_text(text):
+     """Preprocess the text by removing stopwords and applying stemming."""
+     words = word_tokenize(text.lower())
+     words = [stemmer.stem(word) for word in words if word.isalnum() and word not in stop_words]
+     return ' '.join(words)
+
+ # Concatenate columns to enrich the product information
+ df.fillna('n/a', inplace=True)
+ df['concatenated'] = (df['nome'] + ' ' + df['tipo'] + ' ' + df['marca'] + ' ' + df['categoria'] + ' ' +
+                       df['cor'] + ' ' + df['modelo'])
+
+ # Apply the text preprocessing
+ df['processed_text'] = df['concatenated'].apply(preprocess_text)
+
+ ######################## TF-IDF ########################
+
+ # Load the pre-built TF-IDF vectorizer and document matrix from disk
+ tfidf_dir = "tfidf_model"
+ vectorizer_path = os.path.join(tfidf_dir, "tfidf_vectorizer.pkl")
+ matrix_path = os.path.join(tfidf_dir, "tfidf_matrix.pkl")
+
+ with open(vectorizer_path, 'rb') as f:
+     vectorizer = pickle.load(f)
+ with open(matrix_path, 'rb') as f:
+     tfidf_matrix = pickle.load(f)
+ print("Modelo TF-IDF carregado com sucesso.")
+
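+ # If the pickled artifacts were missing, they could be rebuilt from the processed
+ # corpus roughly as sketched below (kept commented out; the app assumes the files
+ # in tfidf_model/ were produced offline):
+ #
+ #   vectorizer = TfidfVectorizer()
+ #   tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
+ #   os.makedirs(tfidf_dir, exist_ok=True)
+ #   with open(vectorizer_path, 'wb') as f:
+ #       pickle.dump(vectorizer, f)
+ #   with open(matrix_path, 'wb') as f:
+ #       pickle.dump(tfidf_matrix, f)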
+ def calculate_similarity(product1, product2):
+     """Compute the cosine similarity between two products."""
+     product1_processed = preprocess_text(product1)
+     product2_processed = preprocess_text(product2)
+     product1_tfidf = vectorizer.transform([product1_processed])
+     product2_tfidf = vectorizer.transform([product2_processed])
+     similarity = cosine_similarity(product1_tfidf, product2_tfidf)
+     return min(similarity[0][0], 1.0)
+
+ def search_products(query, top_n=5):
+     """Search for products ranked by TF-IDF cosine similarity to the query."""
+     processed_query = preprocess_text(query)
+     query_tfidf = vectorizer.transform([processed_query])
+     similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
+     top_indices = similarities.argsort()[::-1][:top_n]
+     results = df.iloc[top_indices].copy()
+     # calculate_similarity preprocesses its arguments itself, so pass the raw query here
+     results['probabilidade'] = [calculate_similarity(query, results.iloc[i]['concatenated']) for i in range(len(results))]
+     return results[['nome', 'tipo', 'marca', 'categoria', 'cor', 'modelo', 'probabilidade']]
+
+ def extract_info_from_title(title):
+     """Extract product attributes from a title using TF-IDF similarity."""
+     processed_title = preprocess_text(title)
+     query_tfidf = vectorizer.transform([processed_title])
+     similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
+     top_index = similarities.argsort()[::-1][0]
+     return df.iloc[top_index][['tipo', 'marca', 'categoria', 'cor', 'modelo']]
+
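+ # Illustrative (hypothetical product titles) use of the helpers above:
+ #   search_products("smartphone samsung 128gb", top_n=3)
+ #   extract_info_from_title("Smartphone Samsung Galaxy 128GB Preto")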
+ ######################## NER DISTILBERT ########################
+
+ # Directory of the fine-tuned NER model; the tokenizer is loaded from the same path
+ model_path = "ner_model"
+ tokenizer = "ner_model"
+
+ from collections import defaultdict
+ from transformers import pipeline
+
+ def get_most_cited_label_for_strings(string, model_path, tokenizer, device):
+     """Return, for each word of the input, the entity label with the highest cumulative score."""
+     strings = string.split(" ")
+     classifier = pipeline("ner", model=model_path, tokenizer=tokenizer, device=device)
+     results = {}
+
+     # Keep track of the entities and their positions in the input
+     entities = []
+
+     for idx, word in enumerate(strings):
+         classifier_output = classifier(word)
+         label_scores = defaultdict(float)
+
+         # Aggregate the scores for each label
+         for item in classifier_output:
+             entity = item['entity']
+             score = item['score']
+             label_scores[entity] += score
+
+         # Skip words for which the model returned no entities
+         if not label_scores:
+             continue
+
+         # Find the label with the highest cumulative score
+         most_cited_label = max(label_scores, key=label_scores.get)
+
+         # Store the entity and its position
+         entities.append((idx, most_cited_label))
+
+     # Sort the entities by their original position in the input string
+     entities.sort(key=lambda x: x[0])
+
+     # Build the results dictionary aligned with the original input
+     for position, label in entities:
+         results[strings[position]] = label
+
+     return results
+
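+ # Performance note: the pipeline above is rebuilt on every call and each word is
+ # classified in isolation. A one-shot alternative could look roughly like the
+ # commented sketch below (same model assumed; word grouping handled by the
+ # pipeline's aggregation_strategy):
+ #
+ #   ner_pipe = pipeline("ner", model=model_path, tokenizer=tokenizer,
+ #                       device=device, aggregation_strategy="simple")
+ #   def ner_whole_title(title):
+ #       return {ent["word"]: ent["entity_group"] for ent in ner_pipe(title)}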
+ ######################## GRADIO INTERFACE ########################
+
+ # Enable debug mode via the GRADIO_DEBUG=1 environment variable
+ os.environ["GRADIO_DEBUG"] = "1"
+
+ def search_interface(query):
+     results = search_products(query)
+     return results
+
+ def ner_interface(input_text):
+     ner_predictions = get_most_cited_label_for_strings(input_text, model_path, tokenizer, device)
+     return ner_predictions
+
+ search_demo = gr.Interface(fn=search_interface, inputs="text", outputs="dataframe", title="Busca de produtos")
+ ner_demo = gr.Interface(fn=ner_interface, inputs="text", outputs="json", title="NER Extraction")
+
+ demo = gr.TabbedInterface([search_demo, ner_demo], ["Busca de produtos", "Extração de features NER"])
+ demo.launch()
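+ # Note: on Hugging Face Spaces, demo.launch() is enough; for local testing, a public
+ # link could optionally be requested with demo.launch(share=True).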