Create app.py
app.py
ADDED
@@ -0,0 +1,153 @@
######################## All required imports ########################
import os
import re
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
# Imports required for the DistilBERT NER model
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertConfig, DistilBertForTokenClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import evaluate
# Imports required for the Gradio interface
import gradio as gr

# Set the device (CPU, or a GPU if one is available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download the required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # word_tokenize needs this on newer NLTK releases
nltk.download('rslp')

# Load the data
file_path = "base_info_produtos.csv"
df = pd.read_csv(file_path, sep='\t')

# Set up text preprocessing
stop_words = set(stopwords.words('portuguese'))
stemmer = RSLPStemmer()

def preprocess_text(text):
    """Preprocess the text by removing stopwords and applying stemming."""
    words = word_tokenize(text.lower())
    words = [stemmer.stem(word) for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(words)
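
# Illustrative example (hypothetical title; exact stems depend on the RSLP stemmer):
#   preprocess_text("Notebook Dell Inspiron 15 Prata")
#   -> roughly "notebook dell inspiron 15 prat"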

# Concatenate columns to enrich the information
df.fillna('n/a', inplace=True)
df['concatenated'] = (df['nome'] + ' ' + df['tipo'] + ' ' + df['marca'] + ' ' + df['categoria'] + ' ' +
                      df['cor'] + ' ' + df['modelo'])

# Apply text preprocessing
df['processed_text'] = df['concatenated'].apply(preprocess_text)

######################## TF-IDF ########################

# Check whether the pre-fitted TF-IDF model files already exist
tfidf_dir = "tfidf_model"
vectorizer_path = os.path.join(tfidf_dir, "tfidf_vectorizer.pkl")
matrix_path = os.path.join(tfidf_dir, "tfidf_matrix.pkl")
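
# Sketch (assumption, not part of the original commit): the Space is expected to ship
# these pickles; if they are missing, they could be rebuilt from the processed product
# texts and saved to the same paths before being loaded below.
if not (os.path.exists(vectorizer_path) and os.path.exists(matrix_path)):
    os.makedirs(tfidf_dir, exist_ok=True)
    _vectorizer = TfidfVectorizer()
    _matrix = _vectorizer.fit_transform(df['processed_text'])
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(_vectorizer, f)
    with open(matrix_path, 'wb') as f:
        pickle.dump(_matrix, f)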

with open(vectorizer_path, 'rb') as f:
    vectorizer = pickle.load(f)
with open(matrix_path, 'rb') as f:
    tfidf_matrix = pickle.load(f)
print("TF-IDF model loaded successfully.")

def calculate_similarity(product1, product2):
    """Compute the TF-IDF cosine similarity between two products."""
    product1_processed = preprocess_text(product1)
    product2_processed = preprocess_text(product2)
    product1_tfidf = vectorizer.transform([product1_processed])
    product2_tfidf = vectorizer.transform([product2_processed])
    similarity = cosine_similarity(product1_tfidf, product2_tfidf)
    return min(similarity[0][0], 1.0)

def search_products(query, top_n=5):
    """Search for products based on TF-IDF similarity to the query."""
    processed_query = preprocess_text(query)
    query_tfidf = vectorizer.transform([processed_query])
    similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[::-1][:top_n]
    results = df.iloc[top_indices].copy()
    # Pass the raw query: calculate_similarity applies preprocess_text itself
    results['probabilidade'] = [calculate_similarity(query, results.iloc[i]['concatenated']) for i in range(len(results))]
    return results[['nome', 'tipo', 'marca', 'categoria', 'cor', 'modelo', 'probabilidade']]

def extract_info_from_title(title):
    """Extract product information from a title using TF-IDF similarity."""
    processed_title = preprocess_text(title)
    query_tfidf = vectorizer.transform([processed_title])
    similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    top_index = similarities.argsort()[::-1][0]
    return df.iloc[top_index][['tipo', 'marca', 'categoria', 'cor', 'modelo']]
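
# Illustrative example (hypothetical title; the fields returned come from the closest
# matching row of base_info_produtos.csv):
#   extract_info_from_title("Smartphone Samsung Galaxy S20 128GB Azul")
#   -> a Series with the 'tipo', 'marca', 'categoria', 'cor' and 'modelo' of that row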

######################## NER DISTILBERT ########################

model_path = "ner_model"
tokenizer = "ner_model"  # directory of the fine-tuned tokenizer; pipeline() accepts a path here

from collections import defaultdict
from transformers import pipeline

def get_most_cited_label_for_strings(string, model_path, tokenizer, device):
    """For each whitespace-separated word, return the NER label with the highest cumulative score."""
    strings = string.split(" ")
    classifier = pipeline("ner", model=model_path, tokenizer=tokenizer, device=device)
    results = {}

    # Keep track of entities and their positions
    entities = []

    for idx, word in enumerate(strings):
        classifier_output = classifier(word)
        label_scores = defaultdict(float)

        # Aggregate scores for each label
        for item in classifier_output:
            entity = item['entity']
            score = item['score']
            label_scores[entity] += score

        # Skip words for which the model predicts no entity at all
        if not label_scores:
            continue

        # Find the label with the highest cumulative score
        most_cited_label = max(label_scores, key=label_scores.get)

        # Store the entity and its position
        entities.append((idx, most_cited_label))

    # Sort entities by their original position in the input string
    entities.sort(key=lambda x: x[0])

    # Build the results dictionary aligned with the original input
    for position, label in entities:
        results[strings[position]] = label

    return results
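
# Illustrative example (made-up input and labels; the actual tag set depends on how
# ner_model was fine-tuned):
#   get_most_cited_label_for_strings("camiseta nike azul", model_path, tokenizer, device)
#   -> {"camiseta": "B-TIPO", "nike": "B-MARCA", "azul": "B-COR"}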

######################## GRADIO INTERFACE ########################

# Enable debug mode via the GRADIO_DEBUG=1 environment variable
os.environ["GRADIO_DEBUG"] = "1"

def search_interface(query):
    results = search_products(query)
    return results

def ner_interface(input_text):
    ner_predictions = get_most_cited_label_for_strings(input_text, model_path, tokenizer, device)
    return ner_predictions

search_demo = gr.Interface(fn=search_interface, inputs="text", outputs="dataframe", title="Busca de produtos")
ner_demo = gr.Interface(fn=ner_interface, inputs="text", outputs="json", title="NER Extraction")

demo = gr.TabbedInterface([search_demo, ner_demo], ["Busca de produtos", "Extração de features NER"])
demo.launch()
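
# Usage note (assumption about the intended deployment, not stated in the commit):
# on Hugging Face Spaces the default launch() settings are enough; when running
# locally, demo.launch(share=True) would expose a temporary public link.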