import streamlit as st
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_nomic.embeddings import NomicEmbeddings

from langchain_community.llms import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
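
# Note: HuggingFaceHub calls the hosted Hugging Face Inference API and expects
# the HUGGINGFACEHUB_API_TOKEN environment variable (or an explicit
# huggingfacehub_api_token argument) to be set before running this app, e.g.:
#   export HUGGINGFACEHUB_API_TOKEN="hf_..."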


# Load each URL (one per line) into LangChain documents
def method_get_website_text(urls):
    urls_list = [url.strip() for url in urls.split("\n") if url.strip()]
    docs = [WebBaseLoader(url).load() for url in urls_list]
    docs_list = [item for sublist in docs for item in sublist]
    return docs_list
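
# Illustrative example: method_get_website_text("https://example.com\nhttps://example.org")
# would return one Document per fetched page; the UI below passes a single URL.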

# Split the documents into overlapping chunks
def method_get_text_chunks(text):
    #text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
    doc_splits = text_splitter.split_documents(text)
    return doc_splits
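
# Note: RecursiveCharacterTextSplitter measures chunk_size in characters by
# default, while the commented-out from_tiktoken_encoder variant counts tokens,
# so the two 7500 limits are not directly comparable.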

# Convert text chunks into embeddings and store them in a vector database
def method_get_vectorstore(document_chunks):
    embeddings = HuggingFaceEmbeddings()
    #embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
    
    # create a vectorstore from the chunks
    vector_store = Chroma.from_documents(document_chunks, embeddings)
    return vector_store
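
# Note: Chroma.from_documents builds an ephemeral in-memory index here; a
# persist_directory argument can be passed to keep it on disk between runs.
# HuggingFaceEmbeddings defaults to sentence-transformers/all-mpnet-base-v2,
# which runs locally, so the first call downloads the model.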

    
# Build the RAG chain over the vector store and answer the given question
def get_context_retriever_chain(vector_store, question):
    # Initialize the retriever
    retriever = vector_store.as_retriever()
    
    # Define the RAG template
    after_rag_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    
    # Create the RAG prompt template
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
    
    # Initialize the Hugging Face language model (LLM)
    llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":1024})
    
    # Construct the RAG pipeline
    after_rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | after_rag_prompt
        | llm
        | StrOutputParser()
    )
    
    return after_rag_chain.invoke(question)
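
# Note: the dict at the head of the chain is LCEL shorthand for RunnableParallel:
# the retriever and RunnablePassthrough each receive the question string, filling
# the {context} and {question} prompt variables. HuggingFaceHub returns raw text
# that may echo the prompt, which main() strips before display.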

def main():
    st.set_page_config(page_title="Chat with websites", page_icon="🤖")
    st.title("Chat with websites")
    
    # sidebar
    with st.sidebar:
        st.header("Settings")
        website_url = st.text_input("Website URL")
    
    if not website_url:
        st.info("Please enter a website URL")
    
    else:
        st.markdown("*You are going to interact with the website below:*")
        st.info("Click on the Start button")

        # Button to pre-process the input; the vector store is kept in
        # st.session_state so it survives Streamlit's rerun on each interaction
        if st.button("Start"):
            with st.spinner('Tokenizing and Embedding the Website Data'):
                # get the website text
                raw_text = method_get_website_text(website_url)
                # split it into chunks
                doc_splits = method_get_text_chunks(raw_text)
                # create the vector store and keep it across reruns
                st.session_state.vector_store = method_get_vectorstore(doc_splits)
        
        # Input field for the question
        question = st.text_input("Question")
        
        # Button to process the input and get the output
        if st.button('Query Documents'):
            if "vector_store" not in st.session_state:
                st.warning("Please click Start to index the website first")
            else:
                with st.spinner('Processing...'):
                    # Generate the response using the RAG pipeline
                    answer = get_context_retriever_chain(st.session_state.vector_store, question)
                    # The model may echo the prompt, so keep only the text after
                    # the "Question: ..." line
                    split_string = "Question: " + str(question)
                    result = answer.split(split_string)[-1]
                    st.text_area("Answer", value=result, height=300, disabled=True)

if __name__ == '__main__':
    main()
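
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py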