Paul-Joshi committed on
Commit
ffd548e
1 Parent(s): b24c6a0

Create app.py

Files changed (1)
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
+ import streamlit as st
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.llms import HuggingFaceHub
+
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+
+
+ # Convert a newline-separated string of URLs into a list of loaded documents
+ def method_get_website_text(urls):
+     urls_list = urls.split("\n")
+     docs = [WebBaseLoader(url).load() for url in urls_list]
+     docs_list = [item for sublist in docs for item in sublist]
+     return docs_list
+
+ # Split the text into chunks
+ def method_get_text_chunks(text):
+     # Alternative: CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
+     doc_splits = text_splitter.split_documents(text)
+     return doc_splits
+
+ # Convert text chunks into embeddings and store them in a vector database
+ def method_get_vectorstore(document_chunks):
+     embeddings = HuggingFaceEmbeddings()
+     # Alternative: NomicEmbeddings(model="nomic-embed-text-v1.5")
+
+     # Create a vectorstore from the chunks
+     vector_store = Chroma.from_documents(document_chunks, embeddings)
+     return vector_store
+
+
+ def get_context_retriever_chain(vector_store, question):
+     # Initialize the retriever
+     retriever = vector_store.as_retriever()
+
+     # Define the RAG prompt template
+     after_rag_template = """Answer the question based only on the following context:
+ {context}
+ Question: {question}
+ """
+     after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
+
+     # Initialize the Hugging Face language model (LLM)
+     llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2",
+                          model_kwargs={"temperature": 0.6, "max_length": 1024})
+
+     # Construct the RAG pipeline: retrieve context, fill the prompt,
+     # call the LLM, and parse the output to a string
+     after_rag_chain = (
+         {"context": retriever, "question": RunnablePassthrough()}
+         | after_rag_prompt
+         | llm
+         | StrOutputParser()
+     )
+
+     return after_rag_chain.invoke(question)
+
+ def main():
+     st.set_page_config(page_title="Chat with websites", page_icon="🤖")
+     st.title("Chat with websites")
+
+     # Sidebar
+     with st.sidebar:
+         st.header("Settings")
+         website_url = st.text_input("Website URL")
+
+     if website_url is None or website_url == "":
+         st.info("Please enter a website URL")
+     else:
+         st.subheader('You are going to interact with the website below:', divider='rainbow')
+
+         # Button to pre-process the input. The vector store is kept in
+         # st.session_state so it survives the rerun that Streamlit
+         # triggers when the "Query Documents" button below is clicked.
+         if st.button("Start", type="primary"):
+             with st.spinner('Tokenizing and Embedding the Website Data'):
+                 # Get the website text
+                 raw_text = method_get_website_text(website_url)
+                 # Get the text chunks
+                 doc_splits = method_get_text_chunks(raw_text)
+                 # Create the vector store
+                 st.session_state.vector_store = method_get_vectorstore(doc_splits)
+
+         # Input field for the question
+         question = st.text_input("Question")
+
+         # Button to process the input and get the output
+         if st.button('Query Documents'):
+             if "vector_store" not in st.session_state:
+                 st.warning("Please click Start to index the website first")
+             else:
+                 with st.spinner('Processing...'):
+                     # Generate a response using the RAG pipeline
+                     answer = get_context_retriever_chain(st.session_state.vector_store, question)
+                     # The endpoint echoes the prompt, so keep only the text
+                     # after the question before displaying the answer
+                     split_string = "Question: " + str(question)
+                     result = answer.split(split_string)[-1]
+                     st.text_area("Answer", value=result, height=300, disabled=True)
+
+ if __name__ == '__main__':
+     main()
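
Since this commit adds only app.py, here is a minimal sketch of exercising the same pipeline outside Streamlit (the URL is a placeholder; HuggingFaceHub reads a HUGGINGFACEHUB_API_TOKEN from the environment, and the app itself is launched with streamlit run app.py):

from app import (method_get_website_text, method_get_text_chunks,
                 method_get_vectorstore, get_context_retriever_chain)

docs = method_get_website_text("https://example.com")   # placeholder URL
chunks = method_get_text_chunks(docs)                    # split into 7500-char chunks
store = method_get_vectorstore(chunks)                   # embed into Chroma
print(get_context_retriever_chain(store, "What is this page about?"))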