Update app.py
Browse files
app.py
CHANGED
@@ -20,90 +20,74 @@ footer {visibility: hidden}
|
|
20 |
# Initialize the inference client
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
|
22 |
|
|
|
|
|
|
|
23 |
# Função de pré-processamento de texto
|
24 |
def preprocess_text(text):
    """Clean and normalize raw page text before indexing.

    Steps, in order:
      1. Strip page-number markers such as "Page 1", "Page 1 of 10",
         "Página 1" or "Página 1 de 10" (case-insensitive).
      2. Collapse every run of whitespace (including line breaks) into a
         single space and trim both ends.
      3. Lowercase the text and transliterate it to ASCII with ``unidecode``.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized text.
    """
    if not text:
        return ""

    # The leading word boundary keeps the marker from matching inside longer
    # words ("homepage 3"); "de" covers the Portuguese "Página 1 de 10" form,
    # which would otherwise leave a stray " de 10" behind.
    text = re.sub(
        r'\b(?:Página|Page)\s+\d+(?:\s+(?:of|de)\s+\d+)?',
        '',
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r'\s+', ' ', text).strip()
    return unidecode(text.lower())
|
33 |
|
34 |
-
# Configurar o retriever
|
35 |
def initialize_retriever(file_objs, persist_directory="chroma_db"):
|
36 |
"""Carrega documentos PDFs, pré-processa e cria um retriever híbrido."""
|
|
|
37 |
if not file_objs:
|
38 |
-
return
|
39 |
|
40 |
-
# Carregar e pré-processar documentos
|
41 |
documents = []
|
42 |
for file_obj in file_objs:
|
43 |
loader = PyPDFLoader(file_obj.name)
|
44 |
raw_docs = loader.load()
|
45 |
for doc in raw_docs:
|
46 |
doc.page_content = preprocess_text(doc.page_content)
|
47 |
-
# Adicionar metadados (exemplo: página e origem)
|
48 |
doc.metadata.update({"source": os.path.basename(file_obj.name)})
|
49 |
documents.extend(raw_docs)
|
50 |
|
51 |
-
# Dividir em pedaços menores
|
52 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=128)
|
53 |
splits = text_splitter.split_documents(documents)
|
54 |
|
55 |
-
# Criar embeddings e banco de vetores (Chroma)
|
56 |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
57 |
try:
|
58 |
-
# Tentar carregar um banco existente
|
59 |
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
|
60 |
-
vectorstore.add_documents(splits)
|
61 |
except:
|
62 |
-
# Criar um novo banco se não existir
|
63 |
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=persist_directory)
|
64 |
|
65 |
-
# Configurar retriever semântico
|
66 |
semantic_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
|
67 |
-
|
68 |
-
# Configurar retriever lexical (BM25)
|
69 |
bm25_retriever = BM25Retriever.from_documents(splits)
|
70 |
bm25_retriever.k = 2
|
71 |
|
72 |
-
|
73 |
-
ensemble_retriever = EnsembleRetriever(
|
74 |
retrievers=[semantic_retriever, bm25_retriever],
|
75 |
-
weights=[0.6, 0.4]
|
76 |
)
|
77 |
|
78 |
-
return
|
79 |
|
80 |
# Formatar o prompt para RAG
|
81 |
def format_prompt(message, history, retriever=None, system_prompt=None):
    """Build the text prompt sent to the model, optionally with RAG context.

    The prompt is assembled in this order: the ``<s>`` start marker, one
    ``[INST] ... [/INST] answer</s>`` pair per past turn, an optional
    ``[SYS]`` block, an optional ``[CONTEXT]`` block with the retrieved
    passages, and finally the current user message inside ``[INST]``.

    Args:
        message: The current user message.
        history: Iterable of ``(user_prompt, bot_response)`` pairs; ``None``
            is treated as an empty history.
        retriever: Optional object exposing ``get_relevant_documents(query)``
            whose results carry ``metadata`` (dict) and ``page_content``.
        system_prompt: Optional system instruction.

    Returns:
        The fully formatted prompt string.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = ["<s>"]

    for user_prompt, bot_response in history or []:
        parts.append(f"[INST] {user_prompt} [/INST]")
        parts.append(f" {bot_response}</s> ")

    if system_prompt:
        parts.append(f"[SYS] {system_prompt} [/SYS]")

    if retriever:
        docs = retriever.get_relevant_documents(message)
        # Skip the block entirely when nothing was retrieved, so the model
        # is not handed an empty "[CONTEXT]  [/CONTEXT]" section.
        if docs:
            context = "\n".join(
                f"[{doc.metadata.get('source', 'Unknown')}, "
                f"Page {doc.metadata.get('page', 'N/A')}] {doc.page_content}"
                for doc in docs
            )
            parts.append(f"[CONTEXT] {context} [/CONTEXT]")

    parts.append(f"[INST] {message} [/INST]")
    return "".join(parts)
|
102 |
|
103 |
# Função de geração com RAG
|
104 |
def generate(
|
105 |
-
prompt, history,
|
106 |
):
|
|
|
107 |
temperature = float(temperature)
|
108 |
if temperature < 1e-2:
|
109 |
temperature = 1e-2
|
@@ -118,10 +102,7 @@ def generate(
|
|
118 |
seed=42,
|
119 |
)
|
120 |
|
121 |
-
|
122 |
-
formatted_prompt = format_prompt(prompt, history, retriever, system_prompt)
|
123 |
-
|
124 |
-
# Gerar resposta em streaming
|
125 |
stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
|
126 |
output = ""
|
127 |
|
@@ -132,13 +113,10 @@ def generate(
|
|
132 |
# Interface Gradio com RAG
|
133 |
def create_demo():
|
134 |
with gr.Blocks(css=css) as demo:
|
135 |
-
retriever_state = gr.State(value=None)
|
136 |
status = gr.State(value="Nenhum documento carregado")
|
137 |
|
138 |
-
# Título
|
139 |
gr.Markdown("<h1>RAG Chatbot</h1>")
|
140 |
|
141 |
-
# Seção de upload de documentos
|
142 |
with gr.Row():
|
143 |
with gr.Column(scale=1):
|
144 |
gr.Markdown("### Carregar Documentos")
|
@@ -146,21 +124,18 @@ def create_demo():
|
|
146 |
process_btn = gr.Button("Processar Documentos")
|
147 |
status_output = gr.Textbox(label="Status", value="Nenhum documento carregado")
|
148 |
|
149 |
-
# Interface de chat
|
150 |
chat_interface = gr.ChatInterface(
|
151 |
fn=generate,
|
152 |
additional_inputs=[
|
153 |
-
gr.State(value=retriever_state),
|
154 |
gr.Textbox(label="System Prompt", placeholder="Digite um prompt de sistema (opcional)", value=None)
|
155 |
],
|
156 |
title="",
|
157 |
)
|
158 |
|
159 |
-
# Evento para processar documentos
|
160 |
process_btn.click(
|
161 |
fn=initialize_retriever,
|
162 |
inputs=[file_input],
|
163 |
-
outputs=[
|
164 |
)
|
165 |
|
166 |
return demo
|
|
|
20 |
# Initialize the inference client
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")

# Global variable that stores the retriever
# (assigned in initialize_retriever, read in generate)
global_retriever = None
|
25 |
+
|
26 |
# Função de pré-processamento de texto
|
27 |
def preprocess_text(text):
    """Clean and normalize raw page text before indexing.

    Steps, in order:
      1. Strip page-number markers such as "Page 1", "Page 1 of 10",
         "Página 1" or "Página 1 de 10" (case-insensitive).
      2. Collapse every run of whitespace (including line breaks) into a
         single space and trim both ends.
      3. Lowercase the text and transliterate it to ASCII with ``unidecode``.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized text.
    """
    if not text:
        return ""

    # The leading word boundary keeps the marker from matching inside longer
    # words ("homepage 3"); "de" covers the Portuguese "Página 1 de 10" form,
    # which would otherwise leave a stray " de 10" behind.
    text = re.sub(
        r'\b(?:Página|Page)\s+\d+(?:\s+(?:of|de)\s+\d+)?',
        '',
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r'\s+', ' ', text).strip()
    return unidecode(text.lower())
|
33 |
|
34 |
+
# Configurar o retriever
|
35 |
def initialize_retriever(file_objs, persist_directory="chroma_db"):
    """Load PDF files, preprocess them and build the hybrid retriever.

    The resulting ``EnsembleRetriever`` (semantic Chroma search combined
    with lexical BM25, weighted 0.6 / 0.4) is stored in the module-level
    ``global_retriever``.

    Args:
        file_objs: Uploaded files. Each item may be a plain path string or
            an object exposing the path through a ``name`` attribute.
        persist_directory: Directory where the Chroma store is persisted.

    Returns:
        A status message (in Portuguese) meant to be shown in the UI.
    """
    global global_retriever
    if not file_objs:
        return "Nenhum documento carregado."

    documents = []
    for file_obj in file_objs:
        # Accept both file-like upload objects and plain path strings.
        path = getattr(file_obj, "name", file_obj)
        source = os.path.basename(path)
        pages = PyPDFLoader(path).load()
        for page in pages:
            page.page_content = preprocess_text(page.page_content)
            page.metadata["source"] = source
        documents.extend(pages)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=128)
    splits = text_splitter.split_documents(documents)
    if not splits:
        # E.g. image-only PDFs: there is nothing to index, so stop before
        # handing an empty corpus to the vector store and BM25.
        return "Nenhum texto pôde ser extraído dos documentos."

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    try:
        # Reuse the persisted store (if any) and append the new chunks.
        vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        vectorstore.add_documents(splits)
    except Exception:
        # Best-effort fallback: build the store from scratch. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory=persist_directory,
        )

    semantic_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

    # NOTE(review): BM25 only sees the chunks from this call, while the
    # persisted Chroma store may also hold chunks from earlier uploads —
    # confirm this asymmetry is intended.
    bm25_retriever = BM25Retriever.from_documents(splits)
    bm25_retriever.k = 2

    global_retriever = EnsembleRetriever(
        retrievers=[semantic_retriever, bm25_retriever],
        weights=[0.6, 0.4],
    )

    return "Documentos processados com sucesso!"
|
70 |
|
71 |
# Formatar o prompt para RAG
|
72 |
def format_prompt(message, history, retriever=None, system_prompt=None):
    """Build the text prompt sent to the model, optionally with RAG context.

    The prompt is assembled in this order: the ``<s>`` start marker, one
    ``[INST] ... [/INST] answer</s>`` pair per past turn, an optional
    ``[SYS]`` block, an optional ``[CONTEXT]`` block with the retrieved
    passages, and finally the current user message inside ``[INST]``.

    Args:
        message: The current user message.
        history: Iterable of ``(user_prompt, bot_response)`` pairs; ``None``
            is treated as an empty history.
        retriever: Optional object exposing ``get_relevant_documents(query)``
            whose results carry ``metadata`` (dict) and ``page_content``.
        system_prompt: Optional system instruction.

    Returns:
        The fully formatted prompt string.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = ["<s>"]

    for user_prompt, bot_response in history or []:
        parts.append(f"[INST] {user_prompt} [/INST]")
        parts.append(f" {bot_response}</s> ")

    if system_prompt:
        parts.append(f"[SYS] {system_prompt} [/SYS]")

    if retriever:
        docs = retriever.get_relevant_documents(message)
        # Skip the block entirely when nothing was retrieved, so the model
        # is not handed an empty "[CONTEXT]  [/CONTEXT]" section.
        if docs:
            context = "\n".join(
                f"[{doc.metadata.get('source', 'Unknown')}, "
                f"Page {doc.metadata.get('page', 'N/A')}] {doc.page_content}"
                for doc in docs
            )
            parts.append(f"[CONTEXT] {context} [/CONTEXT]")

    parts.append(f"[INST] {message} [/INST]")
    return "".join(parts)
|
85 |
|
86 |
# Função de geração com RAG
|
87 |
def generate(
|
88 |
+
prompt, history, system_prompt=None, temperature=0.2, max_new_tokens=1024, top_p=0.95, repetition_penalty=1.0
|
89 |
):
|
90 |
+
global global_retriever
|
91 |
temperature = float(temperature)
|
92 |
if temperature < 1e-2:
|
93 |
temperature = 1e-2
|
|
|
102 |
seed=42,
|
103 |
)
|
104 |
|
105 |
+
formatted_prompt = format_prompt(prompt, history, global_retriever, system_prompt)
|
|
|
|
|
|
|
106 |
stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
|
107 |
output = ""
|
108 |
|
|
|
113 |
# Interface Gradio com RAG
|
114 |
def create_demo():
|
115 |
with gr.Blocks(css=css) as demo:
|
|
|
116 |
status = gr.State(value="Nenhum documento carregado")
|
117 |
|
|
|
118 |
gr.Markdown("<h1>RAG Chatbot</h1>")
|
119 |
|
|
|
120 |
with gr.Row():
|
121 |
with gr.Column(scale=1):
|
122 |
gr.Markdown("### Carregar Documentos")
|
|
|
124 |
process_btn = gr.Button("Processar Documentos")
|
125 |
status_output = gr.Textbox(label="Status", value="Nenhum documento carregado")
|
126 |
|
|
|
127 |
chat_interface = gr.ChatInterface(
|
128 |
fn=generate,
|
129 |
additional_inputs=[
|
|
|
130 |
gr.Textbox(label="System Prompt", placeholder="Digite um prompt de sistema (opcional)", value=None)
|
131 |
],
|
132 |
title="",
|
133 |
)
|
134 |
|
|
|
135 |
process_btn.click(
|
136 |
fn=initialize_retriever,
|
137 |
inputs=[file_input],
|
138 |
+
outputs=[status_output]
|
139 |
)
|
140 |
|
141 |
return demo
|