Paul-Joshi committed on
Commit
ffd548e
1 Parent(s): b24c6a0

Create app.py

Files changed (1)
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
+ import streamlit as st
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.llms import HuggingFaceHub
+
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+
+
+ # Convert a newline-separated string of URLs into a list of loaded documents
+ def method_get_website_text(urls):
+     urls_list = urls.split("\n")
+     docs = [WebBaseLoader(url).load() for url in urls_list]
+     docs_list = [item for sublist in docs for item in sublist]
+     return docs_list
+
+ # Split the text into chunks
+ def method_get_text_chunks(text):
+     # Alternative: CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
+     doc_splits = text_splitter.split_documents(text)
+     return doc_splits
+
+ # Convert text chunks into embeddings and store them in a vector database
+ def method_get_vectorstore(document_chunks):
+     embeddings = HuggingFaceEmbeddings()
+     # Alternative: NomicEmbeddings(model="nomic-embed-text-v1.5")
+
+     # Create a vectorstore from the chunks
+     vector_store = Chroma.from_documents(document_chunks, embeddings)
+     return vector_store
+
+
+ def get_context_retriever_chain(vector_store, question):
+     # Initialize the retriever
+     retriever = vector_store.as_retriever()
+
+     # Define the RAG prompt template
+     after_rag_template = """Answer the question based only on the following context:
+ {context}
+ Question: {question}
+ """
+     after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
+
+     # Initialize the Hugging Face language model (LLM)
+     llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2",
+                          model_kwargs={"temperature": 0.6, "max_length": 1024})
+
+     # Construct the RAG pipeline: retrieve context, fill the prompt,
+     # call the LLM, and parse the output to a string
+     after_rag_chain = (
+         {"context": retriever, "question": RunnablePassthrough()}
+         | after_rag_prompt
+         | llm
+         | StrOutputParser()
+     )
+
+     return after_rag_chain.invoke(question)
+
+ def main():
+     st.set_page_config(page_title="Chat with websites", page_icon="🤖")
+     st.title("Chat with websites")
+
+     # Sidebar
+     with st.sidebar:
+         st.header("Settings")
+         website_url = st.text_input("Website URL")
+
+     if website_url is None or website_url == "":
+         st.info("Please enter a website URL")
+     else:
+         st.subheader('You are going to interact with the website below:', divider='rainbow')
+
+         # Button to pre-process the input. The vector store is kept in
+         # st.session_state so it survives the rerun that Streamlit
+         # triggers when the "Query Documents" button below is clicked.
+         if st.button("Start", type="primary"):
+             with st.spinner('Tokenizing and Embedding the Website Data'):
+                 # Get the website text
+                 raw_text = method_get_website_text(website_url)
+                 # Get the text chunks
+                 doc_splits = method_get_text_chunks(raw_text)
+                 # Create the vector store
+                 st.session_state.vector_store = method_get_vectorstore(doc_splits)
+
+         # Input field for the question
+         question = st.text_input("Question")
+
+         # Button to process the input and get the output
+         if st.button('Query Documents'):
+             if "vector_store" not in st.session_state:
+                 st.warning("Please click Start to index the website first")
+             else:
+                 with st.spinner('Processing...'):
+                     # Generate a response using the RAG pipeline
+                     answer = get_context_retriever_chain(st.session_state.vector_store, question)
+                     # The endpoint echoes the prompt, so keep only the text
+                     # after the question before displaying the answer
+                     split_string = "Question: " + str(question)
+                     result = answer.split(split_string)[-1]
+                     st.text_area("Answer", value=result, height=300, disabled=True)
+
+ if __name__ == '__main__':
+     main()
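
Since this commit adds only app.py, here is a minimal sketch of exercising the same pipeline outside Streamlit (the URL is a placeholder; HuggingFaceHub reads a HUGGINGFACEHUB_API_TOKEN from the environment, and the app itself is launched with streamlit run app.py):

from app import (method_get_website_text, method_get_text_chunks,
                 method_get_vectorstore, get_context_retriever_chain)

docs = method_get_website_text("https://example.com")   # placeholder URL
chunks = method_get_text_chunks(docs)                    # split into 7500-char chunks
store = method_get_vectorstore(chunks)                   # embed into Chroma
print(get_context_retriever_chain(store, "What is this page about?"))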