Spaces:
Runtime error
Runtime error
Paul-Joshi
committed on
Commit
•
b24c6a0
1
Parent(s):
51940eb
Delete app.py
Browse files
app.py
DELETED
@@ -1,117 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
from langchain_community.document_loaders import WebBaseLoader
|
3 |
-
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
4 |
-
from langchain_community.vectorstores import Chroma
|
5 |
-
from langchain_nomic.embeddings import NomicEmbeddings
|
6 |
-
|
7 |
-
from langchain_community.llms import HuggingFaceHub
|
8 |
-
|
9 |
-
from bs4 import BeautifulSoup
|
10 |
-
from langchain_core.runnables import RunnablePassthrough
|
11 |
-
from langchain_core.output_parsers import StrOutputParser
|
12 |
-
from langchain_core.prompts import ChatPromptTemplate
|
13 |
-
|
14 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings
|
15 |
-
from langchain import hub
|
16 |
-
from sentence_transformers import SentenceTransformer
|
17 |
-
import os
|
18 |
-
|
19 |
-
# Load the documents behind each newline-separated URL
|
20 |
-
def method_get_website_text(urls):
    """Load the page content of every URL listed in *urls*.

    Args:
        urls: A string holding one URL per line. Surrounding whitespace is
            ignored and blank lines are skipped.

    Returns:
        A flat list with the documents loaded from every URL, in input
        order; an empty list when *urls* contains no URL.
    """
    # Blank or whitespace-only lines (e.g. a trailing newline or a stray
    # "\r" from a pasted list) must not reach the loader as empty URLs.
    urls_list = [line.strip() for line in urls.splitlines() if line.strip()]

    # Each load() call returns a list of documents; flatten them into one list.
    return [doc for url in urls_list for doc in WebBaseLoader(url).load()]
|
25 |
-
|
26 |
-
# Split the loaded documents into chunks
|
27 |
-
def method_get_text_chunks(text, chunk_size=7500, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        text: The list of documents to split (despite the name, this is the
            document list returned by the loader, not a plain string).
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The list of document chunks; an empty list when *text* is empty.
    """
    # Nothing to split: skip building the splitter altogether.
    if not text:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(text)
|
32 |
-
|
33 |
-
# Convert the text chunks into embeddings and store them in a vector database
|
34 |
-
def method_get_vectorstore(document_chunks):
    """Embed the document chunks and store them in a fresh Chroma collection.

    Args:
        document_chunks: The non-empty list of document chunks to index.

    Returns:
        The Chroma vector store built from *document_chunks*.

    Raises:
        ValueError: If *document_chunks* is empty.
    """
    import uuid

    # Fail early with a clear message, before the embedding model is loaded.
    if not document_chunks:
        raise ValueError("No document chunks to index.")

    embeddings = HuggingFaceEmbeddings()

    # NOTE(review): without an explicit name every call targets Chroma's
    # default collection, and the in-memory client appears to be shared
    # within a process, so re-indexing would mix chunks from earlier
    # websites. A unique collection per call keeps each index isolated.
    collection_name = f"website-{uuid.uuid4().hex}"

    # Create a vector store from the chunks.
    return Chroma.from_documents(
        document_chunks,
        embeddings,
        collection_name=collection_name,
    )
|
41 |
-
|
42 |
-
|
43 |
-
def get_context_retriever_chain(vector_store, question):
    """Answer *question* with a retrieval-augmented pipeline.

    Despite its name, this function builds the chain AND runs it.

    Args:
        vector_store: The vector store used to retrieve context documents.
        question: The user's question.

    Returns:
        The string produced by the chain, or an empty string when
        *question* is empty or only whitespace (no model call is made).
    """
    # A blank question cannot be answered; skip retrieval and the LLM call.
    if not question or not question.strip():
        return ""

    def format_docs(docs):
        """Join the text of the retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    # Initialize the retriever
    retriever = vector_store.as_retriever()

    # Define the RAG template
    after_rag_template = (
        "Answer the question based only on the following context:\n"
        "{context}\n"
        "Question: {question}\n"
    )

    # Create the RAG prompt template
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)

    # Initialize the Hugging Face language model (LLM)
    # NOTE(review): the hosted text-generation endpoint may expect
    # "max_new_tokens" rather than "max_length" — confirm before changing.
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        model_kwargs={"temperature": 0.6, "max_length": 1024},
    )

    # Construct the RAG pipeline. The retrieved documents are reduced to
    # their text; otherwise the prompt would receive the repr of a list of
    # Document objects instead of readable context.
    after_rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | after_rag_prompt
        | llm
        | StrOutputParser()
    )

    return after_rag_chain.invoke(question)
|
68 |
-
|
69 |
-
def main():
    """Render the "Chat with websites" Streamlit page.

    Flow: the user enters a website URL in the sidebar, clicks "Start" to
    load, chunk and embed the site, then asks questions that are answered
    from the indexed content.

    Streamlit re-runs this function from the top on every interaction, so
    the vector store is kept in ``st.session_state`` instead of a local
    variable; a local would no longer exist on the rerun triggered by the
    "Query Documents" button.
    """
    st.set_page_config(page_title="Chat with websites", page_icon="🤖")
    st.title("Chat with websites")

    # sidebar
    with st.sidebar:
        st.header("Settings")
        website_url = st.text_input("Website URL")

    if not website_url:
        st.info("Please enter a website URL")
        return

    st.subheader('Your are gonna interact with the below Website:')
    # A single Start button: two buttons with the same label and no key can
    # collide on their widget ID, and the first one's result was never used.
    start_clicked = st.button("Start", type="primary")
    st.subheader('Click on the Start button', divider='rainbow')

    # Pre-process the website when Start is clicked
    if start_clicked:
        with st.spinner('Tokenizing and Embedding the Website Data'):
            try:
                # get website text
                raw_text = method_get_website_text(website_url)
                # get the text chunks
                doc_splits = method_get_text_chunks(raw_text)
                # create vector store (only when there is something to index)
                vector_store = (
                    method_get_vectorstore(doc_splits) if doc_splits else None
                )
            except Exception as exc:  # UI boundary: report instead of crashing
                vector_store = None
                st.error(f"Could not process the website: {exc}")
            else:
                if vector_store is None:
                    st.error("No text could be extracted from the website.")
        # Remember which URL the store belongs to so a changed URL is not
        # answered from a stale index.
        st.session_state["vector_store"] = vector_store
        st.session_state["vector_store_url"] = website_url

    # Input fields
    question = st.text_input("Question")

    # Button to process input and get output
    if st.button('Query Documents'):
        vector_store = st.session_state.get("vector_store")
        indexed_url = st.session_state.get("vector_store_url")
        if vector_store is None or indexed_url != website_url:
            st.warning("Click Start to process the website before asking a question.")
            return
        if not question.strip():
            st.warning("Please enter a question.")
            return

        with st.spinner('Processing...'):
            # Generate response using the RAG pipeline
            answer = get_context_retriever_chain(vector_store, question)

        # The answer is expected to echo the prompt; keep only what follows
        # the last "Question: <question>" marker.
        split_string = "Question: " + str(question)
        result = answer.split(split_string)[-1]
        # Display the generated answer
        st.text_area("Answer", value=result, height=300, disabled=True)
|
115 |
-
|
116 |
-
# Launch the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|