Paul-Joshi committed
Commit 978d20f · verified · 1 Parent(s): 3d5a8da

Update app.py

Files changed (1)
  1. app.py +129 -79
app.py CHANGED
@@ -1,102 +1,152 @@
- import streamlit as st
- from dotenv import load_dotenv
- from PyPDF2 import PdfReader
- from langchain.text_splitter import CharacterTextSplitter
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
- from langchain.vectorstores import FAISS
- from langchain.chat_models import ChatOpenAI
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
- from css_template import css, bot_template, user_template
- from langchain.llms import HuggingFaceHub
- import os
- # os.environ['FAISS_NO_AVX2'] = '1'
-
- def method_get_pdf_text(pdf_docs):
-     text = ""
-     for pdf in pdf_docs:
-         pdf_reader = PdfReader(pdf)
-         for page in pdf_reader.pages:
-             text += page.extract_text()
-     return text
-
-
- def method_get_text_chunks(text):
-     text_splitter = CharacterTextSplitter(
-         separator="\n\n",
-         chunk_size=1000,
-         chunk_overlap=200,
-         length_function=len,
-         is_separator_regex=False,
-     )
-     chunks = text_splitter.split_text(text)
-     return chunks
-
-
- def method_get_vectorstore(text_chunks):
-     # embeddings = OpenAIEmbeddings()
-     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
-     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-     return vectorstore
-
-
- def method_get_conversation_chain(vectorstore):
-     # llm = ChatOpenAI()
-     llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
-
-     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-     conversation_chain = ConversationalRetrievalChain.from_llm(
-         llm=llm,
-         retriever=vectorstore.as_retriever(),
-         memory=memory
-     )
-     return conversation_chain
-
-
- def method_handle_userinput(user_question):
-     response = st.session_state.conversation({'question': user_question})
-     st.session_state.chat_history = response['chat_history']
-
-     for i, message in enumerate(st.session_state.chat_history):
-         if i % 2 == 0:
-             st.write(user_template.replace(
-                 "{{MSG}}", message.content), unsafe_allow_html=True)
-         else:
-             st.write(bot_template.replace(
-                 "{{MSG}}", message.content), unsafe_allow_html=True)
-
-
- def main():
-     load_dotenv()
-     st.set_page_config(page_title="Converse with multiple PDFs", page_icon=":books:")
-     st.write(css, unsafe_allow_html=True)
-
-     if "conversation" not in st.session_state:
-         st.session_state.conversation = None
-     if "chat_history" not in st.session_state:
-         st.session_state.chat_history = None
-
-     st.header("Converse with multiple PDFs :books:")
-     user_question = st.text_input("Ask a question about your documents:")
-     if user_question:
-         method_handle_userinput(user_question)
-
-     with st.sidebar:
-         st.subheader("Documents Upload")
-         pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Submit'", accept_multiple_files=True)
-         if st.button("Submit"):
-             with st.spinner("Processing"):
-                 # get pdf text
-                 raw_text = method_get_pdf_text(pdf_docs)
-                 # get the text chunks
-                 text_chunks = method_get_text_chunks(raw_text)
-                 # create vector store
-                 vectorstore = method_get_vectorstore(text_chunks)
-                 st.write(text_chunks)
-                 # create conversation chain
-                 st.session_state.conversation = method_get_conversation_chain(vectorstore)
-
-
- if __name__ == '__main__':
-     main()
+ import streamlit as st
+ from dotenv import load_dotenv  # needed for the load_dotenv() call in main()
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_nomic.embeddings import NomicEmbeddings
+
+ from langchain_community.llms import HuggingFaceHub
+
+ # from langchain_core.runnables import RunnablePassthrough
+ # from langchain_core.output_parsers import StrOutputParser
+ # from langchain_core.prompts import ChatPromptTemplate
+
+
+ def method_get_website_text(urls):
+     # Convert the newline-separated string of URLs to a list
+     urls_list = urls.split("\n")
+     # Load each URL, then flatten the per-URL document lists into one list
+     docs = [WebBaseLoader(url).load() for url in urls_list]
+     docs_list = [item for sublist in docs for item in sublist]
+     return docs_list
+
+
+ def method_get_text_chunks(docs_list):
+     # split the documents into chunks
+     text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
+     doc_splits = text_splitter.split_documents(docs_list)
+     return doc_splits
+
+
+ def method_get_vectorstore(doc_splits):
+     # convert text chunks into embeddings and store in vector database
+
+     # create the open-source embedding function
+     embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
+
+     # create a vectorstore from the chunks
+     vector_store = Chroma.from_documents(doc_splits, embeddings)
+     return vector_store
+
+
+ def get_context_retriever_chain(vector_store):
+     # Initialize the retriever
+     retriever = vector_store.as_retriever()
+
+     # Initialize the language model
+     llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature": 0.6, "max_length": 512})
+
+     # Define the response template
+     response_template = """Answer the question based only on the following context:
+     {context}
+     Question: {question}
+     """
+
+     return retriever, llm, response_template
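+
+     # Usage sketch for the three pieces returned above (illustrative, assuming
+     # the caller fills the template with retrieved documents before calling the LLM):
+     #     context = retriever.get_relevant_documents(question)
+     #     prompt = response_template.format(context=context, question=question)
+     #     answer = llm.invoke(prompt)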
+
+
+ # def get_context_retriever_chain(vector_store):
+ #     # llm = ChatOpenAI()
+ #     llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature":0.6, "max_length":512})
+
+ #     retriever = vector_store.as_retriever()
+
+ #     prompt = ChatPromptTemplate.from_messages([
+ #         MessagesPlaceholder(variable_name="chat_history"),
+ #         ("user", "{input}"),
+ #         ("user", "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation")
+ #     ])
+
+ #     retriever_chain = create_history_aware_retriever(llm, retriever, prompt)
+
+ #     return retriever_chain, llm
+
+
+ # def method_get_conversation_chain(retriever_chain, question):
+ #     # Use the retriever chain to generate a response to the user query
+ #     response = retriever_chain(question)
+ #     return response
+
+
+ # def method_get_conversation_chain(retriever_chain, llm, question):
+ #     retriever = vectorstore.as_retriever()
+
+ #     # perform the RAG
+ #     after_rag_template = """Answer the question based only on the following context:
+ #     {context}
+ #     Question: {question}
+ #     """
+ #     after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
+ #     after_rag_chain = (
+ #         {"context": retriever, "question": RunnablePassthrough()}
+ #         | after_rag_prompt
+ #         | model_local
+ #         | StrOutputParser()
+ #     )
+ #     return after_rag_chain.invoke(question)
+
+
+ #     # llm = ChatOpenAI()
+ #     llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
+
+ #     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+ #     conversation_chain = ConversationalRetrievalChain.from_llm(
+ #         llm=llm,
+ #         retriever=vectorstore.as_retriever(),
+ #         memory=memory
+ #     )
+ #     return conversation_chain
+
+
+ def main():
+     load_dotenv()
+     st.set_page_config(page_title="Chat with websites", page_icon="🤖")
+     st.title("Chat with websites")
+
+     # sidebar
+     with st.sidebar:
+         st.header("Settings")
+         website_url = st.text_input("Website URL")
+
+     if website_url is None or website_url == "":
+         st.info("Please enter a website URL")
+     else:
+         # Input fields
+         question = st.text_input("Question")
+
+         # Button to process input
+         if st.button('Query Documents'):
+             with st.spinner('Processing...'):
+                 # get website text
+                 raw_text = method_get_website_text(website_url)
+                 # get the text chunks
+                 doc_splits = method_get_text_chunks(raw_text)
+                 # create vector store
+                 vectorstore = method_get_vectorstore(doc_splits)
+                 st.write(doc_splits)
+                 # retriever_chain = get_context_retriever_chain(vector_store)
+                 # # create conversation chain
+                 # answer = method_get_conversation_chain(retriever_chain, question)
+                 # st.text_area("Answer", value=answer, height=300, disabled=True)
+
+                 # Get the retriever, LLM, and response template
+                 retriever, llm, response_template = get_context_retriever_chain(vectorstore)
+                 # Retrieve the documents relevant to the question
+                 context = retriever.get_relevant_documents(question)
+                 # Fill the template with the retrieved context, then generate the answer
+                 prompt = response_template.format(context=context, question=question)
+                 answer = llm.invoke(prompt)
+                 # Display the generated answer
+                 st.text_area("Answer", value=answer, height=300, disabled=True)
+
+
+ if __name__ == '__main__':
+     main()
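
For reference, a minimal end-to-end sketch of the pipeline this commit moves to (WebBaseLoader -> CharacterTextSplitter -> Chroma with Nomic embeddings -> Mistral via HuggingFaceHub). It assumes HUGGINGFACEHUB_API_TOKEN and NOMIC_API_KEY are set in the environment; the function name and example URL are illustrative, not part of the commit:

from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_nomic.embeddings import NomicEmbeddings
from langchain_community.llms import HuggingFaceHub


def answer_from_website(url: str, question: str) -> str:
    # Load the page, split it into chunks, and index the chunks in Chroma
    docs = WebBaseLoader(url).load()
    splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
    splits = splitter.split_documents(docs)
    vector_store = Chroma.from_documents(splits, NomicEmbeddings(model="nomic-embed-text-v1.5"))
    retriever = vector_store.as_retriever()

    # Retrieve context for the question and ask the LLM to answer from it
    llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1",
                         model_kwargs={"temperature": 0.6, "max_length": 512})
    context = retriever.get_relevant_documents(question)
    prompt = ("Answer the question based only on the following context:\n"
              f"{context}\nQuestion: {question}\n")
    return llm.invoke(prompt)


if __name__ == "__main__":
    print(answer_from_website("https://example.com", "What is this page about?"))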