import streamlit as st
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_nomic.embeddings import NomicEmbeddings

from langchain_community.llms import HuggingFaceHub

from bs4 import BeautifulSoup
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain import hub
from sentence_transformers import SentenceTransformer
import os

# Convert string of URLs to list
def method_get_website_text(urls):
    """Load the page contents of one or more URLs.

    Args:
        urls: Newline-separated string of URLs.

    Returns:
        A flat list of loaded Document objects (one or more per URL).
    """
    # Strip and skip blank lines: a trailing newline or empty line would
    # otherwise produce WebBaseLoader("") and fail at request time.
    urls_list = [u.strip() for u in urls.split("\n") if u.strip()]
    docs = [WebBaseLoader(url).load() for url in urls_list]
    # Each load() returns a list of documents; flatten into one list.
    docs_list = [item for sublist in docs for item in sublist]
    return docs_list

#split the text into chunks
def method_get_text_chunks(text, chunk_size=7500, chunk_overlap=100):
    """Split loaded documents into overlapping character chunks.

    Args:
        text: List of Document objects (as returned by the web loader).
        chunk_size: Maximum characters per chunk (default keeps the
            original hard-coded value for backward compatibility).
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        List of Document chunks suitable for embedding.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    doc_splits = text_splitter.split_documents(text)
    return doc_splits

#convert text chunks into embeddings and store in vector database
def method_get_vectorstore(document_chunks):
    """Embed the given document chunks and index them in a Chroma store.

    Args:
        document_chunks: List of Document chunks to embed.

    Returns:
        A Chroma vector store built from the chunks.
    """
    # Default HuggingFace sentence-transformer embedding model.
    embedding_model = HuggingFaceEmbeddings()
    return Chroma.from_documents(document_chunks, embedding_model)

    
def get_context_retriever_chain(vector_store,question):
    # Initialize the retriever
    retriever = vector_store.as_retriever()
    
    # Define the RAG template
    after_rag_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    
    # Create the RAG prompt template
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
    
    # Initialize the Hugging Face language model (LLM)
    llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":1024})
    
    # Construct the RAG pipeline
    after_rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | after_rag_prompt
        | llm
        | StrOutputParser()
    )
    
    return after_rag_chain.invoke(question)

def main():
    """Streamlit entry point: chat with the contents of a website via RAG."""
    st.set_page_config(page_title="Chat with websites", page_icon="🤖")
    st.title("Chat with websites")

    # sidebar
    with st.sidebar:
        st.header("Settings")
        website_url = st.text_input("Website URL")

    if website_url is None or website_url == "":
        st.info("Please enter a website URL")

    else:
        st.subheader('You are going to interact with the below Website:')
        st.subheader('Click on the Start button', divider='rainbow')

        # Index the site once and keep the vector store in session state:
        # Streamlit reruns this whole script on every interaction, so a
        # plain local would be gone (NameError) by the time the user
        # clicks "Query Documents". Also: the original had two
        # st.button("Start") widgets with the same label and no keys,
        # which raises DuplicateWidgetID — merged into a single button.
        if st.button("Start", type="primary"):
            with st.spinner('Tokenizing and Embedding the Website Data'):
                # get website text
                raw_text = method_get_website_text(website_url)
                # get the text chunks
                doc_splits = method_get_text_chunks(raw_text)
                # create vector store, persisted across reruns
                st.session_state.vector_store = method_get_vectorstore(doc_splits)

        # Input fields
        question = st.text_input("Question")

        # Button to process input and get output
        if st.button('Query Documents'):
            if "vector_store" not in st.session_state:
                st.warning("Please click Start to index the website first.")
            else:
                with st.spinner('Processing...'):
                    # Generate response using the RAG pipeline
                    answer = get_context_retriever_chain(
                        st.session_state.vector_store, question
                    )
                    # HuggingFaceHub echoes the prompt; keep only the text
                    # after the final "Question: ..." marker.
                    split_string = "Question: " + str(question)
                    result = answer.split(split_string)[-1]
                    st.text_area("Answer", value=result, height=300, disabled=True)

if __name__ == '__main__':
    main()