samim2024 committed on
Commit
244c875
·
verified ·
1 Parent(s): c222b6a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +66 -0
  2. rag.py +64 -0
  3. requeriments.txt +6 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import streamlit as st
4
+ from streamlit_chat import message
5
+ from rag import ChatPDF
6
+
7
+ st.set_page_config(page_title="ChatPDF")
8
+
9
+
10
def display_messages():
    """Render the stored chat history and reserve a slot for the spinner.

    Each entry of ``st.session_state["messages"]`` is a ``(text, is_user)``
    pair; entries are drawn in order, keyed by their position in the list.
    """
    st.subheader("Chat")

    history = st.session_state["messages"]
    for index, entry in enumerate(history):
        text, from_user = entry
        message(text, is_user=from_user, key=str(index))

    # Empty placeholder stored in session state so a later callback can
    # render its "thinking" spinner directly below the conversation.
    st.session_state["thinking_spinner"] = st.empty()
15
+
16
+
17
def process_input():
    """Send the text typed by the user to the assistant and log both turns.

    Does nothing when the input is missing, empty, or whitespace only.
    Otherwise the stripped question and the assistant's reply are appended to
    ``st.session_state["messages"]`` as ``(text, is_user)`` pairs.
    """
    raw_text = st.session_state["user_input"]
    if not raw_text or len(raw_text.strip()) == 0:
        return

    user_text = raw_text.strip()

    # Show the spinner inside the placeholder reserved under the chat history
    # while the assistant is producing its answer.
    with st.session_state["thinking_spinner"], st.spinner("Thinking"):
        agent_text = st.session_state["assistant"].ask(user_text)

    st.session_state["messages"].extend(
        [
            (user_text, True),
            (agent_text, False),
        ]
    )
25
+
26
+
27
def read_and_save_file():
    """Reset the conversation and ingest every file currently in the uploader.

    The assistant, the message history, and the text input are cleared first.
    Each uploaded file is then copied to a temporary file on disk, because the
    assistant's ``ingest`` takes a file path, and handed to the assistant.

    The temporary file is always removed, even when ingestion raises, so a
    failing upload no longer leaves orphaned files in the temp directory. The
    exception itself still propagates to the caller.
    """
    st.session_state["assistant"].clear()
    st.session_state["messages"] = []
    st.session_state["user_input"] = ""

    for file in st.session_state["file_uploader"]:
        # delete=False: the file must outlive this ``with`` block so it can be
        # re-opened by path during ingestion; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(file.getbuffer())
            file_path = tf.name

        try:
            with st.session_state["ingestion_spinner"], st.spinner(f"Ingesting {file.name}"):
                st.session_state["assistant"].ingest(file_path)
        finally:
            os.remove(file_path)
40
+
41
+
42
def page():
    """Build the ChatPDF page: uploader, chat history, and message input."""
    # Initialise each key on its own rather than only when the whole session
    # state is empty: if anything else has already stored a key, an
    # "is it empty?" check would skip initialisation and the lookups of
    # "messages" / "assistant" further down would raise KeyError.
    if "messages" not in st.session_state:
        st.session_state["messages"] = []
    if "assistant" not in st.session_state:
        # Explicit membership test (not setdefault) so ChatPDF() is only
        # constructed once per session instead of on every rerun.
        st.session_state["assistant"] = ChatPDF()

    st.header("ChatPDF")

    st.subheader("Upload a document")
    st.file_uploader(
        "Upload document",
        type=["pdf"],
        key="file_uploader",
        on_change=read_and_save_file,
        label_visibility="collapsed",
        accept_multiple_files=True,
    )

    # Placeholder where the ingestion spinner is rendered during uploads.
    st.session_state["ingestion_spinner"] = st.empty()

    display_messages()
    st.text_input("Message", key="user_input", on_change=process_input)
63
+
64
+
65
+ if __name__ == "__main__":
66
+ page()
rag.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.vectorstores import Chroma
2
+ from langchain.chat_models import ChatOllama
3
+ from langchain.embeddings import FastEmbedEmbeddings
4
+ from langchain.schema.output_parser import StrOutputParser
5
+ from langchain.document_loaders import PyPDFLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.schema.runnable import RunnablePassthrough
8
+ from langchain.prompts import PromptTemplate
9
+ from langchain.vectorstores.utils import filter_complex_metadata
10
+ #add new import
11
+ from langchain_community.document_loaders.csv_loader import CSVLoader
12
+
13
+
14
+
15
class ChatPDF:
    """Question answering over PDF documents using retrieval plus a local LLM.

    PDFs are split into chunks, embedded, and stored in a Chroma vector store.
    Questions are answered by retrieving the most similar chunks and passing
    them, together with the question, to a Mistral model served by Ollama.
    """

    # Vector store shared by every document ingested since the last clear().
    vector_store = None
    # Retriever over ``vector_store``; None until a document is ingested.
    retriever = None
    # Runnable pipeline retriever -> prompt -> model -> string output.
    chain = None

    def __init__(self):
        """Create the chat model, the text splitter, and the prompt template."""
        self.model = ChatOllama(model="mistral")
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
        self.prompt = PromptTemplate.from_template(
            """
            <s> [INST] You are an assistant for question-answering tasks. Use only the following pieces of retrieved context
            to build an answer for the user. If you don't know the answer, just say that you don't know. Use three sentences
            maximum and keep the answer concise. [/INST] </s>
            [INST] Question: {question}
            Context: {context}
            Answer: [/INST]
            """
        )

    def ingest(self, pdf_file_path: str):
        """Load a PDF, index its chunks, and (re)build the answering chain.

        Documents accumulate: calling ``ingest`` several times makes all of
        the ingested PDFs searchable, until ``clear`` is called.

        Args:
            pdf_file_path: Path of the PDF file to load.
        """
        docs = PyPDFLoader(file_path=pdf_file_path).load()
        chunks = self.text_splitter.split_documents(docs)
        chunks = filter_complex_metadata(chunks)

        if self.vector_store is None:
            self.vector_store = Chroma.from_documents(documents=chunks, embedding=FastEmbedEmbeddings())
        else:
            # Add to the existing store instead of replacing it, so earlier
            # uploads stay queryable when several files are ingested.
            self.vector_store.add_documents(chunks)

        self.retriever = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 3,
                "score_threshold": 0.5,
            },
        )

        self.chain = ({"context": self.retriever, "question": RunnablePassthrough()}
                      | self.prompt
                      | self.model
                      | StrOutputParser())

    def ask(self, query: str):
        """Answer ``query`` from the ingested documents.

        Args:
            query: The user's question.

        Returns:
            The model's answer as a string, or a fixed hint asking for a PDF
            when nothing has been ingested yet.
        """
        if not self.chain:
            return "Please, add a PDF document first."

        return self.chain.invoke(query)

    def clear(self):
        """Forget every ingested document and reset the answering chain."""
        if self.vector_store is not None:
            # Dropping the Python reference alone may not discard the
            # embedded chunks: a store created later can end up reusing the
            # same default collection. Delete the collection explicitly.
            self.vector_store.delete_collection()
        self.vector_store = None
        self.retriever = None
        self.chain = None
requeriments.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ langchain
2
+ streamlit
3
+ streamlit-chat
4
+ fastembed
5
+ chromadb
6
+ pypdf