Spaces:
Runtime error
Runtime error
File size: 4,500 Bytes
a00136a 978d20f febd687 978d20f a00136a 978d20f a00136a 8785aa0 1225077 a00136a 54a2185 664b6f4 07ecaec 70c6a0c a00136a cef4abb d83001b 978d20f a00136a cef4abb 978d20f febd687 cef4abb 290d0b9 978d20f 2ed030b cef4abb 5ac0ce9 07ecaec 2ed030b 664b6f4 e92de57 cef4abb e92de57 1225077 e92de57 978d20f a00136a 978d20f 2d3e495 978d20f 5ac0ce9 5b6c919 5ac0ce9 a00136a 978d20f a00136a 978d20f a00136a 5ac0ce9 1225077 5ac0ce9 ac0a4b5 978d20f 4b928c7 a00136a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import streamlit as st
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_nomic.embeddings import NomicEmbeddings
from langchain_community.llms import HuggingFaceHub
from bs4 import BeautifulSoup
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain import hub
from sentence_transformers import SentenceTransformer
import os
# Convert string of URLs to list
def method_get_website_text(urls):
    """Load the page contents of one or more URLs into LangChain documents.

    Args:
        urls: Newline-separated string of URLs (as typed into the UI).

    Returns:
        A flat list of loaded Document objects, one or more per URL.
    """
    # BUG FIX: the original split blindly on "\n", so blank or
    # whitespace-only lines were handed to WebBaseLoader as URLs and crashed
    # the loader. Strip each line and skip empties.
    urls_list = [u.strip() for u in urls.split("\n") if u.strip()]
    docs = [WebBaseLoader(url).load() for url in urls_list]
    # Each load() returns a list; flatten into a single list of documents.
    docs_list = [item for sublist in docs for item in sublist]
    return docs_list
#split the text into chunks
def method_get_text_chunks(text):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        text: List of LangChain Document objects.

    Returns:
        The list of chunked Document objects produced by the splitter.
    """
    # Recursive splitter: large chunks (7500 chars) with a small overlap so
    # context is not lost at chunk boundaries.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=7500,
        chunk_overlap=100,
    )
    return splitter.split_documents(text)
#convert text chunks into embeddings and store in vector database
def method_get_vectorstore(document_chunks):
    """Embed document chunks and index them in a Chroma vector store.

    Args:
        document_chunks: Chunked Document objects to embed.

    Returns:
        A Chroma vector store built from the chunks.
    """
    # Default HuggingFace sentence-embedding model; a Nomic alternative was
    # considered as well (NomicEmbeddings, model "nomic-embed-text-v1.5").
    embedding_model = HuggingFaceEmbeddings()
    return Chroma.from_documents(document_chunks, embedding_model)
def get_context_retriever_chain(vector_store,question):
    """Run a RAG pipeline over the vector store and answer the question.

    Despite the name, this builds the chain AND invokes it, returning the
    generated answer string (kept for backward compatibility with callers).

    Args:
        vector_store: Chroma store holding the embedded website chunks.
        question: The user's question.

    Returns:
        The raw LLM output string for the question.
    """
    # Prompt constrains the model to answer only from retrieved context.
    rag_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    rag_prompt = ChatPromptTemplate.from_template(rag_template)

    # Hosted Mistral-7B-Instruct via the HuggingFace Hub inference API.
    language_model = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        model_kwargs={"temperature": 0.6, "max_length": 1024},
    )

    # context comes from similarity search; the question passes through as-is.
    pipeline = (
        {"context": vector_store.as_retriever(), "question": RunnablePassthrough()}
        | rag_prompt
        | language_model
        | StrOutputParser()
    )
    return pipeline.invoke(question)
def main():
    """Streamlit entry point: chat-with-websites RAG user interface.

    Flow: the user enters a URL in the sidebar, clicks Start to scrape,
    chunk, embed and index it, then asks questions answered via the RAG
    pipeline in get_context_retriever_chain().
    """
    st.set_page_config(page_title="Chat with websites", page_icon="🤖")
    st.title("Chat with websites")

    # Sidebar: website URL input.
    with st.sidebar:
        st.header("Settings")
        website_url = st.text_input("Website URL")

    # Guard clause: nothing to do until a URL is supplied.
    if not website_url:
        st.info("Please enter a website URL")
        return

    st.subheader('You are going to interact with the below Website:')
    st.subheader('Click on the Start button', divider='rainbow')

    # BUG FIX: the original rendered TWO st.button("Start") widgets with the
    # same label and no key, which raises Streamlit's DuplicateWidgetID
    # runtime error. A single button now both renders and drives indexing.
    if st.button("Start", type="primary"):
        with st.spinner('Tokenizing and Embedding the Website Data'):
            # Scrape -> chunk -> embed/index.
            raw_text = method_get_website_text(website_url)
            doc_splits = method_get_text_chunks(raw_text)
            # BUG FIX: Streamlit reruns the whole script on every widget
            # interaction, so a plain local 'vector_store' was lost before
            # 'Query Documents' was clicked (NameError). Persist it in
            # session_state instead.
            st.session_state.vector_store = method_get_vectorstore(doc_splits)

    # Question input and query trigger.
    question = st.text_input("Question")
    if st.button('Query Documents'):
        if 'vector_store' not in st.session_state:
            # Indexing has not run yet on this session.
            st.warning("Please click Start first to index the website.")
        else:
            with st.spinner('Processing...'):
                answer = get_context_retriever_chain(
                    st.session_state.vector_store, question
                )
                # The raw LLM output echoes the prompt; keep only the text
                # after the final "Question: <question>" marker.
                split_string = "Question: " + str(question)
                result = answer.split(split_string)[-1]
                st.text_area("Answer", value=result, height=300, disabled=True)


if __name__ == '__main__':
    main()