DHEIVER committed
Commit ff20866 · verified · 1 parent: 70792e8

Update app.py

Files changed (1):
  1. app.py +14 -39
app.py CHANGED
@@ -20,90 +20,74 @@ footer {visibility: hidden}
 # Initialize the inference client
 client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
 
+# Global variable to store the retriever
+global_retriever = None
+
 # Text preprocessing function
 def preprocess_text(text):
     """Preprocesses the text by removing noise and normalizing it."""
-    # Remove page numbers (e.g., "Página 1", "Page 1 of 10")
     text = re.sub(r'(Página|Page)\s+\d+(?:\s+of\s+\d+)?', '', text, flags=re.IGNORECASE)
-    # Remove repeated spaces and line breaks
     text = re.sub(r'\s+', ' ', text).strip()
-    # Normalize the text (strip accents, convert to lowercase)
     text = unidecode(text.lower())
     return text
 
-# Configure the retriever with preprocessing and advanced indexing
+# Configure the retriever
 def initialize_retriever(file_objs, persist_directory="chroma_db"):
     """Loads PDF documents, preprocesses them, and creates a hybrid retriever."""
+    global global_retriever
     if not file_objs:
-        return None, "Nenhum documento carregado."
+        return "Nenhum documento carregado."
 
-    # Load and preprocess the documents
     documents = []
     for file_obj in file_objs:
         loader = PyPDFLoader(file_obj.name)
         raw_docs = loader.load()
         for doc in raw_docs:
             doc.page_content = preprocess_text(doc.page_content)
-            # Add metadata (e.g., page and source)
             doc.metadata.update({"source": os.path.basename(file_obj.name)})
         documents.extend(raw_docs)
 
-    # Split into smaller chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=128)
     splits = text_splitter.split_documents(documents)
 
-    # Create embeddings and the vector store (Chroma)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     try:
-        # Try to load an existing store
         vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-        vectorstore.add_documents(splits)  # Add the new documents
+        vectorstore.add_documents(splits)
     except:
-        # Create a new store if none exists
         vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=persist_directory)
 
-    # Configure the semantic retriever
     semantic_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
-
-    # Configure the lexical (BM25) retriever
     bm25_retriever = BM25Retriever.from_documents(splits)
     bm25_retriever.k = 2
 
-    # Combine them into a hybrid retriever
-    ensemble_retriever = EnsembleRetriever(
+    global_retriever = EnsembleRetriever(
         retrievers=[semantic_retriever, bm25_retriever],
-        weights=[0.6, 0.4]  # More weight on semantic search
+        weights=[0.6, 0.4]
     )
 
-    return ensemble_retriever, "Documentos processados com sucesso!"
+    return "Documentos processados com sucesso!"
 
 # Format the prompt for RAG
 def format_prompt(message, history, retriever=None, system_prompt=None):
     prompt = "<s>"
-
-    # Add the conversation history
     for user_prompt, bot_response in history:
         prompt += f"[INST] {user_prompt} [/INST]"
         prompt += f" {bot_response}</s> "
-
-    # Add the system instruction, if provided
     if system_prompt:
         prompt += f"[SYS] {system_prompt} [/SYS]"
-
-    # Add retrieved context, if a retriever is available
     if retriever:
         docs = retriever.get_relevant_documents(message)
         context = "\n".join([f"[{doc.metadata.get('source', 'Unknown')}, Page {doc.metadata.get('page', 'N/A')}] {doc.page_content}" for doc in docs])
         prompt += f"[CONTEXT] {context} [/CONTEXT]"
-
-    # Add the user's message
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
 # Generation function with RAG
 def generate(
-    prompt, history, retriever=None, system_prompt=None, temperature=0.2, max_new_tokens=1024, top_p=0.95, repetition_penalty=1.0
+    prompt, history, system_prompt=None, temperature=0.2, max_new_tokens=1024, top_p=0.95, repetition_penalty=1.0
 ):
+    global global_retriever
     temperature = float(temperature)
     if temperature < 1e-2:
         temperature = 1e-2
@@ -118,10 +102,7 @@ def generate(
         seed=42,
     )
 
-    # Format the prompt with the RAG context
-    formatted_prompt = format_prompt(prompt, history, retriever, system_prompt)
-
-    # Generate the response as a stream
+    formatted_prompt = format_prompt(prompt, history, global_retriever, system_prompt)
     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
     output = ""
 
@@ -132,13 +113,10 @@
 # Gradio interface with RAG
 def create_demo():
     with gr.Blocks(css=css) as demo:
-        retriever_state = gr.State(value=None)
         status = gr.State(value="Nenhum documento carregado")
 
-        # Title
         gr.Markdown("<h1>RAG Chatbot</h1>")
 
-        # Document upload section
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("### Carregar Documentos")
@@ -146,21 +124,18 @@ def create_demo():
                 process_btn = gr.Button("Processar Documentos")
                 status_output = gr.Textbox(label="Status", value="Nenhum documento carregado")
 
-        # Chat interface
         chat_interface = gr.ChatInterface(
             fn=generate,
             additional_inputs=[
-                gr.State(value=retriever_state),
                 gr.Textbox(label="System Prompt", placeholder="Digite um prompt de sistema (opcional)", value=None)
             ],
            title="",
         )
 
-        # Event to process the documents
         process_btn.click(
             fn=initialize_retriever,
             inputs=[file_input],
-            outputs=[retriever_state, status_output]
+            outputs=[status_output]
         )
 
     return demo
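
The practical effect of the diff is easier to see outside Gradio: initialize_retriever no longer hands the retriever back through a gr.State round-trip but stashes it in the module-level global_retriever, which generate reads on the next call. Below is a minimal smoke-test sketch of that flow, not part of the commit; it assumes app.py is importable as a module, that a local manual.pdf exists, and that FakeUpload is a hypothetical stand-in for the upload objects Gradio passes to initialize_retriever.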
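# Hypothetical smoke test for the refactored module; not part of the commit.
# The EnsembleRetriever import mirrors the pre-0.2 langchain layout app.py uses.
from langchain.retrievers import EnsembleRetriever

import app  # the module changed in this commit (assumes importing it has no side effects beyond client setup)


class FakeUpload:
    """Stand-in for Gradio's upload objects; only .name (a file path) is read."""
    def __init__(self, name):
        self.name = name


# Before any upload, generate() falls back to plain chat: format_prompt()
# skips the [CONTEXT] block when the retriever argument is None.
assert app.global_retriever is None

# Processing documents now returns only a status string; the hybrid
# retriever lands in app.global_retriever as a side effect.
status = app.initialize_retriever([FakeUpload("manual.pdf")])
print(status)  # "Documentos processados com sucesso!"
assert isinstance(app.global_retriever, EnsembleRetriever)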