import streamlit as st
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_nomic.embeddings import NomicEmbeddings

from langchain_community.llms import HuggingFaceHub

from bs4 import BeautifulSoup
# from langchain_core.runnables import RunnablePassthrough
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.prompts import ChatPromptTemplate
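# NOTE (setup assumption): NomicEmbeddings and HuggingFaceHub both call hosted
# APIs, so in a typical deployment the corresponding tokens would be exported
# before launching the app, e.g.:
#   export NOMIC_API_KEY=<your-nomic-key>
#   export HUGGINGFACEHUB_API_TOKEN=<your-hf-token>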


def method_get_website_text(urls):
    # Convert string of URLs to list
    urls_list = urls.split("\n")
    docs = [WebBaseLoader(url).load() for url in urls_list]
    docs_list = [item for sublist in docs for item in sublist]
    return docs_list


def method_get_text_chunks(text):
    # Split the documents into chunks; from_tiktoken_encoder measures
    # chunk_size and chunk_overlap in tokens rather than characters
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
    doc_splits = text_splitter.split_documents(text)
    return doc_splits


def method_get_vectorstore(doc_splits):
    # Convert the text chunks into embeddings and store them in a vector database

    # create the open-source embedding function
    embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")

    # create a vectorstore from the chunks
    vector_store = Chroma.from_documents(doc_splits, embeddings)
    return vector_store

def get_context_retriever_chain(vector_store):
    # Despite the name, this returns the individual components (retriever,
    # LLM, prompt template) rather than a composed chain
    retriever = vector_store.as_retriever()
    
    # Initialize the language model
    llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature": 0.6, "max_length": 512})
    
    # Define the response template
    response_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    
    return retriever, llm, response_template


# def get_context_retriever_chain(vector_store):
#     #llm = ChatOpenAI()
#     llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature":0.6, "max_length":512})

#     retriever = vector_store.as_retriever()
    
#     prompt = ChatPromptTemplate.from_messages([
#       MessagesPlaceholder(variable_name="chat_history"),
#       ("user", "{input}"),
#       ("user", "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation")
#     ])
    
#     retriever_chain = create_history_aware_retriever(llm, retriever, prompt)
    
#     return retriever_chain, llm

# def method_get_conversation_chain(retriever_chain, question):
#     # Use the retriever chain to generate a response to the user query
#     response = retriever_chain(question)
#     return response
    
# def method_get_conversation_chain(retriever_chain,llm,question):
#     retriever = vectorstore.as_retriever()

#     #perform the RAG 
    
#     after_rag_template = """Answer the question based only on the following context:
#     {context}
#     Question: {question}
#     """
#     after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
#     after_rag_chain = (
#         {"context": retriever, "question": RunnablePassthrough()}
#         | after_rag_prompt
#         | model_local
#         | StrOutputParser()
#     )
#     return after_rag_chain.invoke(question) 

    
#     #llm = ChatOpenAI()
#     llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

#     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
#     conversation_chain = ConversationalRetrievalChain.from_llm(
#         llm=llm,
#         retriever=vectorstore.as_retriever(),
#         memory=memory
#     )
#     return conversation_chain
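
# NOTE: the commented-out history-aware variants above rely on helpers that
# are not imported in this file (e.g. ChatPromptTemplate, MessagesPlaceholder,
# create_history_aware_retriever, ConversationBufferMemory,
# ConversationalRetrievalChain); reviving them would require adding the
# matching langchain imports.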


def main():
    st.set_page_config(page_title="Chat with websites", page_icon="🤖")
    st.title("Chat with websites")
    
    # sidebar
    with st.sidebar:
        st.header("Settings")
        website_url = st.text_input("Website URL")
    
    if not website_url:
        st.info("Please enter a website URL")
    
    else:
        # Input fields
        question = st.text_input("Question")
        
        # Button to process input
        if st.button('Query Documents'):
            with st.spinner('Processing...'):
                # get website text
                raw_text = method_get_website_text(website_url)
                # get the text chunks
                doc_splits = method_get_text_chunks(raw_text)
                # create vector store
                vectorstore = method_get_vectorstore(doc_splits)
                # st.write(doc_splits)  # uncomment to inspect the chunks while debugging
                # retriever_chain = get_context_retriever_chain(vector_store)
                # # create conversation chain
                # answer = method_get_conversation_chain(retriever_chain,question)
                # st.text_area("Answer", value=answer, height=300, disabled=True)

                
                # Get the retriever, LLM, and response template
                retriever, llm, response_template = get_context_retriever_chain(vectorstore)
                # Retrieve the documents most relevant to the question
                context_docs = retriever.get_relevant_documents(question)
                context = "\n\n".join(doc.page_content for doc in context_docs)
                # Fill the template with the retrieved context, then ask the LLM
                prompt = response_template.format(context=context, question=question)
                answer = llm(prompt)
                # Display the generated answer
                st.text_area("Answer", value=answer, height=300, disabled=True)

if __name__ == '__main__':
    main()
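
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py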