Spaces:
Runtime error
Runtime error
Paul-Joshi
committed on
Commit
•
ffd548e
1
Parent(s):
b24c6a0
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from langchain_community.document_loaders import WebBaseLoader
|
3 |
+
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
4 |
+
from langchain_community.vectorstores import Chroma
|
5 |
+
from langchain_nomic.embeddings import NomicEmbeddings
|
6 |
+
|
7 |
+
from langchain_community.llms import HuggingFaceHub
|
8 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
|
11 |
+
from langchain_core.runnables import RunnablePassthrough
|
12 |
+
from langchain_core.output_parsers import StrOutputParser
|
13 |
+
from langchain_core.prompts import ChatPromptTemplate
|
14 |
+
|
15 |
+
|
16 |
+
# Convert string of URLs to list
|
17 |
+
def method_get_website_text(urls):
    """Load the documents for every URL listed in *urls*.

    Args:
        urls: Newline-separated string of website URLs. ``None`` is treated
            like an empty string.

    Returns:
        A flat list holding the documents that ``WebBaseLoader(url).load()``
        returned for each URL, in input order. Empty when *urls* contains no
        non-blank line.
    """
    # Strip each line and skip blank ones so that an empty input, a trailing
    # newline or a stray carriage return is never handed to the loader as a URL.
    urls_list = [line.strip() for line in (urls or "").splitlines() if line.strip()]

    docs_list = []
    for url in urls_list:
        # Each load() call returns a list of documents; flatten as we go.
        docs_list.extend(WebBaseLoader(url).load())
    return docs_list
|
22 |
+
|
23 |
+
#split the text into chunks
|
24 |
+
def method_get_text_chunks(text):
    """Split the loaded documents into chunks.

    Args:
        text: The documents to split; passed straight to ``split_documents``.

    Returns:
        The chunks produced by a ``RecursiveCharacterTextSplitter`` created
        with ``chunk_size=7500`` and ``chunk_overlap=100``.
    """
    splitter_options = {"chunk_size": 7500, "chunk_overlap": 100}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(text)
|
29 |
+
|
30 |
+
#convert text chunks into embeddings and store in vector database
|
31 |
+
def method_get_vectorstore(document_chunks):
    """Embed the chunks and index them in a Chroma vector store.

    Args:
        document_chunks: The document chunks to embed and store.

    Returns:
        The vector store built by ``Chroma.from_documents`` from the chunks,
        using a default-constructed ``HuggingFaceEmbeddings`` instance.
    """
    return Chroma.from_documents(document_chunks, HuggingFaceEmbeddings())
|
38 |
+
|
39 |
+
|
40 |
+
def _format_docs(docs):
    """Join the page content of the retrieved documents into one text block."""
    return "\n\n".join(doc.page_content for doc in docs)


def get_context_retriever_chain(vector_store, question):
    """Answer *question* with a retrieval-augmented chain over *vector_store*.

    Args:
        vector_store: Vector store to retrieve context from; must provide
            ``as_retriever()``.
        question: The user's question. It is used both as the retrieval
            query and as the ``{question}`` value in the prompt.

    Returns:
        The string output of the chain (LLM output passed through
        ``StrOutputParser``).
    """
    # Initialize the retriever
    retriever = vector_store.as_retriever()

    # Define the RAG template
    after_rag_template = (
        "Answer the question based only on the following context:\n"
        "{context}\n"
        "Question: {question}\n"
    )

    # Create the RAG prompt template
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)

    # Initialize the Hugging Face language model (LLM)
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        model_kwargs={"temperature": 0.6, "max_length": 1024},
    )

    # Construct the RAG pipeline. The retriever output is piped through
    # _format_docs so the prompt receives the plain page text rather than the
    # repr of a list of Document objects (metadata, escaped newlines, ...).
    after_rag_chain = (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | after_rag_prompt
        | llm
        | StrOutputParser()
    )

    return after_rag_chain.invoke(question)
|
65 |
+
|
66 |
+
def main():
    """Render the "Chat with websites" Streamlit page.

    Flow: the user enters a website URL in the sidebar, clicks "Start" to
    load, chunk and embed the site, then types a question and clicks
    "Query Documents" to get an answer from the RAG chain.

    Streamlit re-executes this function on every interaction and a button
    only returns True during the run triggered by its own click. The vector
    store is therefore kept in ``st.session_state`` so that it is still
    available in the later run triggered by "Query Documents".
    """
    st.set_page_config(page_title="Chat with websites", page_icon="🤖")
    st.title("Chat with websites")

    # sidebar
    with st.sidebar:
        st.header("Settings")
        website_url = st.text_input("Website URL")

    if not website_url:
        st.info("Please enter a website URL")
        return

    st.subheader('Your are gonna interact with the below Website:')
    # A single Start button: capture its state here and act on it below.
    start_clicked = st.button("Start", type="primary")
    st.subheader('Click on the Start button', divider='rainbow')

    # Pre-process the website: load, chunk, embed.
    if start_clicked:
        with st.spinner('Tokenizing and Embedding the Website Data'):
            # get the website documents
            raw_text = method_get_website_text(website_url)
            # get the text chunks
            doc_splits = method_get_text_chunks(raw_text)
            # create the vector store and keep it across reruns, together
            # with the URL it was built from
            st.session_state["vector_store"] = method_get_vectorstore(doc_splits)
            st.session_state["vector_store_url"] = website_url

    # Input fields
    question = st.text_input("Question")

    # Button to process input and get output
    if st.button('Query Documents'):
        # Only reuse the stored index if it was built for the current URL.
        vector_store = None
        if st.session_state.get("vector_store_url") == website_url:
            vector_store = st.session_state.get("vector_store")

        if vector_store is None:
            st.warning("Click on the Start button to process the website before querying.")
            return

        with st.spinner('Processing...'):
            # Generate response using the RAG pipeline
            answer = get_context_retriever_chain(vector_store, question)
            # The model output may echo the prompt; keep only the text after
            # the last "Question: <question>" marker.
            split_string = "Question: " + str(question)
            result = answer.split(split_string)[-1]
            # Display the generated answer
            st.text_area("Answer", value=result, height=300, disabled=True)
|
112 |
+
|
113 |
+
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
|