Waflon committed
Commit
3a2df8b
1 Parent(s): 08e8255

Upload 20 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ cache/index.faiss filter=lfs diff=lfs merge=lfs -text
+ data/Codigo_Trabajo.pdf filter=lfs diff=lfs merge=lfs -text
+ data/Codigo-Civil.pdf filter=lfs diff=lfs merge=lfs -text
+ data/Codigo-PENAL_.pdf filter=lfs diff=lfs merge=lfs -text
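
These four new rules route the FAISS index and the largest law-code PDFs through Git LFS, so the repository stores small pointer files (shown in the per-file diffs below) instead of the binaries themselves.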
app.py CHANGED
@@ -0,0 +1,35 @@
+ import streamlit as st
+ from modelo import get_chain
+
+ # Page header
+ st.markdown("<h1 style='text-align: center; color: yellow;'>Chatbot Códigos de Leyes</h1>", unsafe_allow_html=True)  # render the title as HTML
+ st.header("🤖🦾ChatBot entrenado usando los codigos que componen la ley. Actualizado en 30/01/2024.")
+
+ with st.chat_message(name="ai"):  # role may be "assistant" or "ai"
+     st.write('🤖 Hola soy tu asistente del dia de hoy, en que te puedo ayudar')
+
+ # Initialize and replay the chat history kept in session state
+ if "mensajes" not in st.session_state:
+     st.session_state.mensajes = []
+
+ for message in st.session_state.mensajes:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ pregunta = st.chat_input("Ingresa tu pregunta")  # prompt handler: input field and submit button in one widget
+ with st.spinner("Cargando modelo, por favor espere, mientras puede ingresar su pregunta"):
+     chain = get_chain()
+
+ if pregunta:
+     # Show the user's message in the chat
+     with st.chat_message(name="human"):
+         st.markdown(pregunta)
+
+     st.session_state.mensajes.append({"role": "human", "content": pregunta})
+     with st.spinner("Procesando su respuesta, por favor espere"):
+         respuesta = chain.invoke(pregunta)
+
+     with st.chat_message(name="ai"):
+         st.markdown(respuesta)
+     st.session_state.mensajes.append({"role": "ai", "content": respuesta})
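
A quick way to exercise the chain outside Streamlit is to call it directly; a minimal sketch (the script name and sample question are hypothetical, and it assumes an OPENAI_API_KEY environment variable is set):

    # smoke_test.py (hypothetical helper, not part of this commit)
    from modelo import get_chain

    chain = get_chain()  # LCEL chain; the later definition in modelo.py wins
    # invoke() takes the question string and returns the model's answer as a string
    print(chain.invoke("¿Qué plazos de prescripción contempla el Código Civil?"))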
cache/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43d239d642504df57094e1562159f5742780f8ad829c3f3da33fdbf1352d63c7
+ size 33478701
cache/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd2951c5b74aa70bed9f02f9bfa68a01d2bfa52af11a2210c02ee89a84bb9717
+ size 11538382
data/Codigo-Aeronautico.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Aguas.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-COMERCIO.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Civil.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:355ff58317dca16570b85df30185e73399d51584762f0bad71569f223756b845
+ size 1994787
data/Codigo-Derecho-Internacional.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Justicia-Militar.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Mineria.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-ORGANICO-DE TRIBUNALES.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-PENAL_.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:610d1c73d251b9a3128c1100a214dffba35198b176ecf4fabd2884663ecdc206
+ size 1485820
data/Codigo-Procedimiento-Civil.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Procedimiento-Penal.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Procesal-Penal.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Sanitario.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo-Tributario.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/Codigo_Trabajo.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a343a5e90c2f31642b0c3787bd80fc69909beba5841ce9ba44c56ae1aa103b52
+ size 1008557
modelo.py ADDED
@@ -0,0 +1,170 @@
+ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_core.retrievers import BaseRetriever
+ from langchain_community.vectorstores import FAISS  # Facebook AI Similarity Search
+ from sentence_transformers import CrossEncoder
+ from langchain_core.documents import Document
+ from langchain.prompts import PromptTemplate
+ from langchain.chains.llm import LLMChain
+ from langchain.chains import RetrievalQA
+ from langchain_openai import ChatOpenAI
+ from typing import List
+ import pandas as pd
+
+ embeddings = HuggingFaceEmbeddings(
+     model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # path to the pre-trained model
+     model_kwargs={'device': 'cpu'},                 # model configuration options
+     encode_kwargs={'normalize_embeddings': False})  # encoding options
+
+ try:
+     vectorstore = FAISS.load_local("cache", embeddings)
+ except Exception:
+     loader = PyPDFDirectoryLoader("data/")
+     data = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=100, length_function=len)
+     docs = text_splitter.split_documents(data)
+
+     # Build the vector store from the split documents and persist it so later runs skip re-indexing
+     vectorstore = FAISS.from_documents(docs, embeddings)
+     vectorstore.save_local("cache")
+
+ # Reranker to improve answers
+ model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)  # by far the best; the others were not useful
+ class Reranking_retriever(BaseRetriever):
+     def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
+         busqueda = vectorstore.similarity_search_with_score(query, k=10, fetch_k=15)  # k=10: total documents to fetch before reranking
+
+         # Each search result is a (Document, score) tuple; unpack them into columns
+         df = pd.DataFrame({
+             'scores': list(map(lambda x: x[-1], busqueda)),
+             'respuestas': list(map(lambda x: x[0].page_content, busqueda)),
+             'metadata': list(map(lambda x: x[0].metadata, busqueda))})
+
+         print(df.scores)
+         respuestas = df.respuestas.to_list()  # list of candidate passages
+
+         sentence_combinations = [[query, respuesta] for respuesta in respuestas]  # build the (query, passage) pairs
+
+         # Cross-encode each pair to score how strongly the passage relates to the question
+         scores = model.predict(sentence_combinations)
+         scores = scores.argsort()[::-1]  # indices sorted from most to least relevant; index 0 is the best match
+
+         docs = []
+         for i in scores[:3]:  # keep the 3 most relevant results
+             docs.append(Document(page_content=df.respuestas[i], metadata=df.metadata[i]))
+         return docs
+
+ retriever = Reranking_retriever()
+
+ def get_chain():
+     QA_CHAIN_PROMPT = PromptTemplate.from_template("""
+     Usa el siguiente contexto para responder la pregunta.
+
+     Contexto
+     {contexto}
+
+     Pregunta: {question}
+     Respuesta Util:""")
+
+     llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
+
+     llm_chain = LLMChain(llm=llm, prompt=QA_CHAIN_PROMPT, callbacks=None, verbose=True)
+
+     document_prompt = PromptTemplate(input_variables=["page_content"], template="Contexto:\n{page_content}")
+
+     combine_documents_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="contexto", document_prompt=document_prompt, callbacks=None)
+
+     chain = RetrievalQA(combine_documents_chain=combine_documents_chain, callbacks=None, verbose=True, retriever=retriever)
+
+     return chain
+
+ # A second, LCEL-based implementation follows. It repeats the setup above and
+ # redefines get_chain, so this later definition is the one app.py actually uses.
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_core.retrievers import BaseRetriever
+ from langchain_community.vectorstores import FAISS  # Facebook AI Similarity Search
+ from sentence_transformers import CrossEncoder
+ from langchain_core.documents import Document
+ from langchain.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI
+ from typing import List
+ import pandas as pd
+
+ embeddings = HuggingFaceEmbeddings(
+     model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # path to the pre-trained model
+     model_kwargs={'device': 'cpu'},                 # model configuration options
+     encode_kwargs={'normalize_embeddings': False})  # encoding options
+
+ try:
+     vectorstore = FAISS.load_local("cache", embeddings)
+ except Exception:
+     loader = PyPDFDirectoryLoader("data/")
+     data = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=100, length_function=len)
+     docs = text_splitter.split_documents(data)
+
+     # Build the vector store from the split documents and persist it so later runs skip re-indexing
+     vectorstore = FAISS.from_documents(docs, embeddings)
+     vectorstore.save_local("cache")
+
+ # Reranker to improve answers
+ model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)  # by far the best; the others were not useful
+ class Reranking_retriever(BaseRetriever):
+     def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
+         busqueda = vectorstore.similarity_search_with_score(query, k=10, fetch_k=15)  # k=10: total documents to fetch before reranking
+
+         # Each search result is a (Document, score) tuple; unpack them into columns
+         df = pd.DataFrame({
+             'scores': list(map(lambda x: x[-1], busqueda)),
+             'respuestas': list(map(lambda x: x[0].page_content, busqueda)),
+             'metadata': list(map(lambda x: x[0].metadata, busqueda))})
+
+         print(df.scores)
+         respuestas = df.respuestas.to_list()  # list of candidate passages
+
+         sentence_combinations = [[query, respuesta] for respuesta in respuestas]  # build the (query, passage) pairs
+
+         # Cross-encode each pair to score how strongly the passage relates to the question
+         scores = model.predict(sentence_combinations)
+         scores = scores.argsort()[::-1]  # indices sorted from most to least relevant; index 0 is the best match
+
+         docs = []
+         for i in scores[:3]:  # keep the 3 most relevant results
+             docs.append(Document(page_content=df.respuestas[i], metadata=df.metadata[i]))
+         return docs
+
+ retriever = Reranking_retriever()
+
+ def get_chain():
+     template = """
+     Usa el siguiente contexto para responder la pregunta.
+
+     Contexto
+     {contexto}
+
+     Pregunta: {pregunta}
+     Respuesta Util:"""
+
+     prompt = ChatPromptTemplate.from_template(template)
+
+     model = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
+
+     # The retriever's documents fill {contexto}; the raw question passes through to {pregunta}
+     chain = (
+         {"contexto": retriever, "pregunta": RunnablePassthrough()}
+         | prompt
+         | model
+         | StrOutputParser()
+     )
+
+     return chain
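
The cross-encoder reranking step stands on its own outside LangChain; a minimal sketch with the same model, where the query and candidate passages are made-up examples:

    from sentence_transformers import CrossEncoder

    model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)
    query = "¿Qué materias regula el Código Sanitario?"  # made-up query
    candidates = [
        "El Código Sanitario rige las cuestiones relacionadas con la salud.",  # made-up passage
        "El Código de Minería regula las concesiones mineras.",                # made-up passage
    ]
    scores = model.predict([[query, c] for c in candidates])  # one relevance score per (query, passage) pair
    best_first = scores.argsort()[::-1]  # indices ordered from most to least relevant
    print([candidates[i] for i in best_first])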
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ torch
+ transformers
+ sentence-transformers
+ datasets
+ faiss-cpu
+ numpy
+ pandas
+ langchain
+ langchain-community
+ langchain-openai
+ pypdf
+ streamlit
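
With these dependencies in place, a typical local run looks like this (assuming an OPENAI_API_KEY environment variable for the ChatOpenAI calls):

    pip install -r requirements.txt
    streamlit run app.py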