Spaces:
Runtime error
Runtime error
Upload 20 files
Browse files- .gitattributes +4 -0
- app.py +35 -0
- cache/index.faiss +3 -0
- cache/index.pkl +3 -0
- data/Codigo-Aeronautico.pdf +0 -0
- data/Codigo-Aguas.pdf +0 -0
- data/Codigo-COMERCIO.pdf +0 -0
- data/Codigo-Civil.pdf +3 -0
- data/Codigo-Derecho-Internacional.pdf +0 -0
- data/Codigo-Justicia-Militar.pdf +0 -0
- data/Codigo-Mineria.pdf +0 -0
- data/Codigo-ORGANICO-DE TRIBUNALES.pdf +0 -0
- data/Codigo-PENAL_.pdf +3 -0
- data/Codigo-Procedimiento-Civil.pdf +0 -0
- data/Codigo-Procedimiento-Penal.pdf +0 -0
- data/Codigo-Procesal-Penal.pdf +0 -0
- data/Codigo-Sanitario.pdf +0 -0
- data/Codigo-Tributario.pdf +0 -0
- data/Codigo_Trabajo.pdf +3 -0
- modelo.py +170 -0
- requirements.txt +12 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
cache/index.faiss filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/Codigo_Trabajo.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data/Codigo-Civil.pdf filter=lfs diff=lfs merge=lfs -text
|
39 |
+
data/Codigo-PENAL_.pdf filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from modelo import get_chain
|
3 |
+
|
4 |
+
# Visual header.
st.markdown(
    "<h1 style='text-align: center; color: yellow;'>Chatbot Códigos de Leyes</h1>",
    unsafe_allow_html=True,  # static markup only; no user input is interpolated here
)
st.header("🤖🦾ChatBot entrenado usando los codigos que componen la ley. Actualizado en 30/01/2024.")

# Greeting shown at the top of every run.
with st.chat_message(name="ai"):
    st.write('🤖 Hola soy tu asistente del dia de hoy, en que te puedo ayudar')

# Conversation history lives in the session state so it survives reruns.
if "mensajes" not in st.session_state:
    st.session_state.mensajes = []

# Replay the stored conversation.
for message in st.session_state.mensajes:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Prompt handler: st.chat_input acts as an input box and a submit button at once.
pregunta = st.chat_input("Ingresa tu pregunta")


@st.cache_resource(show_spinner=False)
def _load_chain():
    """Build the question-answering chain once and reuse it across reruns.

    Streamlit re-executes this script on every interaction; without the cache
    ``get_chain()`` would be called again for each submitted question.
    """
    return get_chain()


with st.spinner("Cargando modelo, por favor espere, mientras puede ingresar su pregunta"):
    chain = _load_chain()

if pregunta:
    # Echo the user's message in the chat.
    with st.chat_message(name="human"):
        st.markdown(pregunta)

    st.session_state.mensajes.append({"role": "human", "content": pregunta})
    with st.spinner("Procesando su respuesta, por favor espere"):
        respuesta = chain.invoke(pregunta)

    # Show the answer and store it so it is replayed on the next rerun.
    with st.chat_message(name="ai"):
        st.markdown(respuesta)
    st.session_state.mensajes.append({"role": "ai", "content": respuesta})
|
cache/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:43d239d642504df57094e1562159f5742780f8ad829c3f3da33fdbf1352d63c7
|
3 |
+
size 33478701
|
cache/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd2951c5b74aa70bed9f02f9bfa68a01d2bfa52af11a2210c02ee89a84bb9717
|
3 |
+
size 11538382
|
data/Codigo-Aeronautico.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Aguas.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-COMERCIO.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Civil.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:355ff58317dca16570b85df30185e73399d51584762f0bad71569f223756b845
|
3 |
+
size 1994787
|
data/Codigo-Derecho-Internacional.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Justicia-Militar.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Mineria.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-ORGANICO-DE TRIBUNALES.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-PENAL_.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:610d1c73d251b9a3128c1100a214dffba35198b176ecf4fabd2884663ecdc206
|
3 |
+
size 1485820
|
data/Codigo-Procedimiento-Civil.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Procedimiento-Penal.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Procesal-Penal.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Sanitario.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo-Tributario.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Codigo_Trabajo.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a343a5e90c2f31642b0c3787bd80fc69909beba5841ce9ba44c56ae1aa103b52
|
3 |
+
size 1008557
|
modelo.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
|
2 |
+
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
3 |
+
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
6 |
+
from langchain_core.retrievers import BaseRetriever
|
7 |
+
from langchain_community.vectorstores import FAISS #Facebook AI Similarity Search
|
8 |
+
from sentence_transformers import CrossEncoder
|
9 |
+
from langchain_core.documents import Document
|
10 |
+
from langchain.prompts import PromptTemplate
|
11 |
+
from langchain.chains.llm import LLMChain
|
12 |
+
from langchain.chains import RetrievalQA
|
13 |
+
from langchain_openai import ChatOpenAI
|
14 |
+
from typing import List
|
15 |
+
import pandas as pd
|
16 |
+
|
17 |
+
# Embedding model used both to build the FAISS index and to embed queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # pretrained model id
    model_kwargs={'device': 'cpu'},  # model configuration options
    encode_kwargs={'normalize_embeddings': False},  # encoding options
)

try:
    # Reuse the index persisted under "cache" when it can be loaded.
    # NOTE(review): newer langchain-community releases refuse pickle-backed
    # loads unless allow_dangerous_deserialization=True is passed, and
    # requirements.txt does not pin a version -- confirm which applies.
    vectorstore = FAISS.load_local("cache", embeddings)
except Exception as exc:
    # Previously a bare ``except:``, which hid the reason for the fallback and
    # also trapped KeyboardInterrupt/SystemExit. Report why the cache was
    # skipped, then rebuild the index from the PDFs.
    print(f"Could not load FAISS index from 'cache' ({exc!r}); rebuilding from data/")

    loader = PyPDFDirectoryLoader("data/")
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=100, length_function=len)
    docs = text_splitter.split_documents(data)

    # Embed every chunk into a new FAISS index and persist it for later runs.
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local("cache")
|
34 |
+
|
35 |
+
# Cross-encoder used to re-rank the FAISS candidates and improve the answers.
model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)  # author's note: by far the best of the models tried


class Reranking_retriever(BaseRetriever):
    """Retriever that re-orders FAISS similarity hits with the cross-encoder."""

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Return the three candidates the cross-encoder scores highest for *query*.

        Args:
            query: The user's question.
            run_manager: Callback manager supplied by the caller; not used here.

        Returns:
            Up to three documents, best first. Empty when the vector store
            returns no candidates.
        """
        # k=10 is the number of candidates fetched before re-ranking.
        busqueda = vectorstore.similarity_search_with_score(query, k=10, fetch_k=15)
        if not busqueda:
            # Nothing to re-rank; skip the cross-encoder call on an empty batch.
            return []

        candidates = [hit[0] for hit in busqueda]
        print([hit[-1] for hit in busqueda])  # debug: vector-store scores of the candidates

        # One (question, passage) pair per candidate for the cross-encoder.
        sentence_combinations = [[query, doc.page_content] for doc in candidates]
        relevance = model.predict(sentence_combinations)
        ranking = relevance.argsort()[::-1]  # candidate indices, highest score first

        return [
            Document(page_content=candidates[i].page_content, metadata=candidates[i].metadata)
            for i in ranking[:3]  # keep the 3 most relevant results
        ]


retriever = Reranking_retriever()
|
60 |
+
|
61 |
+
def get_chain():
    """Assemble and return a RetrievalQA chain over the re-ranking retriever.

    Retrieved documents are each rendered as ``Contexto:\\n<page_content>``,
    stuffed into the ``contexto`` slot of the QA prompt, and answered by
    gpt-3.5-turbo-1106 at temperature 0.

    NOTE: a second ``get_chain`` defined further down in this module rebinds
    the name, so importers receive that later definition rather than this one.
    """
    # How a single retrieved document is rendered before being stuffed.
    per_document_prompt = PromptTemplate(input_variables=["page_content"], template="Contexto:\n{page_content}")

    qa_prompt = PromptTemplate.from_template("""
Usa el siguiente contexto para responder la pregunta.

Contexto
{contexto}

Pregunta: {question}
Respuesta Util:"""
    )

    chat_model = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
    answer_chain = LLMChain(llm=chat_model, prompt=qa_prompt, callbacks=None, verbose=True)

    # Concatenate the rendered documents into the prompt's "contexto" variable.
    stuff_chain = StuffDocumentsChain(
        llm_chain=answer_chain,
        document_variable_name="contexto",
        document_prompt=per_document_prompt,
        callbacks=None,
    )

    return RetrievalQA(
        combine_documents_chain=stuff_chain,
        callbacks=None,
        verbose=True,
        retriever=retriever,
    )
|
86 |
+
|
87 |
+
|
88 |
+
|
89 |
+
|
90 |
+
from langchain_core.runnables import RunnablePassthrough
|
91 |
+
from langchain_core.output_parsers import StrOutputParser
|
92 |
+
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
93 |
+
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
94 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
95 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
96 |
+
from langchain_core.retrievers import BaseRetriever
|
97 |
+
from langchain_community.vectorstores import FAISS #Facebook AI Similarity Search
|
98 |
+
from sentence_transformers import CrossEncoder
|
99 |
+
from langchain_core.documents import Document
|
100 |
+
from langchain.prompts import ChatPromptTemplate
|
101 |
+
from langchain_openai import ChatOpenAI
|
102 |
+
from typing import List
|
103 |
+
import pandas as pd
|
104 |
+
|
105 |
+
# Embedding model used both to build the FAISS index and to embed queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # pretrained model id
    model_kwargs={'device': 'cpu'},  # model configuration options
    encode_kwargs={'normalize_embeddings': False},  # encoding options
)

try:
    # Reuse the index persisted under "cache" when it can be loaded.
    # NOTE(review): newer langchain-community releases refuse pickle-backed
    # loads unless allow_dangerous_deserialization=True is passed, and
    # requirements.txt does not pin a version -- confirm which applies.
    vectorstore = FAISS.load_local("cache", embeddings)
except Exception as exc:
    # Previously a bare ``except:``, which hid the reason for the fallback and
    # also trapped KeyboardInterrupt/SystemExit. Report why the cache was
    # skipped, then rebuild the index from the PDFs.
    print(f"Could not load FAISS index from 'cache' ({exc!r}); rebuilding from data/")

    loader = PyPDFDirectoryLoader("data/")
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=100, length_function=len)
    docs = text_splitter.split_documents(data)

    # Embed every chunk into a new FAISS index and persist it for later runs.
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local("cache")
|
122 |
+
|
123 |
+
# Cross-encoder used to re-rank the FAISS candidates and improve the answers.
model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)  # author's note: by far the best of the models tried


class Reranking_retriever(BaseRetriever):
    """Retriever that re-orders FAISS similarity hits with the cross-encoder."""

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Return the three candidates the cross-encoder scores highest for *query*.

        Args:
            query: The user's question.
            run_manager: Callback manager supplied by the caller; not used here.

        Returns:
            Up to three documents, best first. Empty when the vector store
            returns no candidates.
        """
        # k=10 is the number of candidates fetched before re-ranking.
        busqueda = vectorstore.similarity_search_with_score(query, k=10, fetch_k=15)
        if not busqueda:
            # Nothing to re-rank; skip the cross-encoder call on an empty batch.
            return []

        candidates = [hit[0] for hit in busqueda]
        print([hit[-1] for hit in busqueda])  # debug: vector-store scores of the candidates

        # One (question, passage) pair per candidate for the cross-encoder.
        sentence_combinations = [[query, doc.page_content] for doc in candidates]
        relevance = model.predict(sentence_combinations)
        ranking = relevance.argsort()[::-1]  # candidate indices, highest score first

        return [
            Document(page_content=candidates[i].page_content, metadata=candidates[i].metadata)
            for i in ranking[:3]  # keep the 3 most relevant results
        ]


retriever = Reranking_retriever()
|
148 |
+
|
149 |
+
def get_chain():
    """Build the LCEL question-answering chain used by the app.

    The chain takes the user's question as a plain string, retrieves and
    re-ranks supporting passages, fills the prompt, calls
    gpt-3.5-turbo-1106 at temperature 0 and returns the answer as a string.

    Returns:
        A runnable whose ``invoke(question)`` yields the answer text.
    """
    template = """
Usa el siguiente contexto para responder la pregunta.

Contexto
{contexto}

Pregunta: {pregunta}
Respuesta Util:"""

    def format_docs(docs: List[Document]) -> str:
        """Join the retrieved passages into one plain-text context block."""
        return "\n\n".join(doc.page_content for doc in docs)

    prompt = ChatPromptTemplate.from_template(template)

    # Named ``llm`` so it does not shadow the module-level cross-encoder ``model``.
    llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)

    chain = (
        # The retriever yields Document objects; without format_docs the prompt
        # would receive the Python repr of that list (metadata included)
        # instead of the passage text.
        {"contexto": retriever | format_docs, "pregunta": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return chain
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
sentence-transformers
|
4 |
+
datasets
|
5 |
+
faiss-cpu
|
6 |
+
numpy
|
7 |
+
pandas
|
8 |
+
langchain
|
9 |
+
langchain-community
|
10 |
+
langchain-openai
|
11 |
+
pypdf
|
12 |
+
streamlit
|