Paul-Joshi committed on
Commit
febd687
·
verified ·
1 Parent(s): 0e981b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -4
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
  from langchain_community.document_loaders import WebBaseLoader
3
- from langchain.text_splitter import CharacterTextSplitter
4
  from langchain_community.vectorstores import Chroma
5
  from langchain_nomic.embeddings import NomicEmbeddings
6
 
@@ -11,9 +11,11 @@ from langchain_core.runnables import RunnablePassthrough
11
  from langchain_core.output_parsers import StrOutputParser
12
  from langchain_core.prompts import ChatPromptTemplate
13
 
 
14
 
15
  def method_get_website_text(urls):
16
  # Convert string of URLs to list
 
17
  urls_list = urls.split("\n")
18
  docs = [WebBaseLoader(url).load() for url in urls_list]
19
  docs_list = [item for sublist in docs for item in sublist]
@@ -22,7 +24,9 @@ def method_get_website_text(urls):
22
 
23
  def method_get_text_chunks(text):
24
  #split the text into chunks
25
- text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
 
 
26
  doc_splits = text_splitter.split_documents(text)
27
  return doc_splits
28
 
@@ -31,7 +35,8 @@ def method_get_vectorstore(document_chunks):
31
  #convert text chunks into embeddings and store in vector database
32
 
33
  # create the open-source embedding function
34
- embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
 
35
 
36
  # create a vectorstore from the chunks
37
  vector_store = Chroma.from_documents(document_chunks, embeddings)
@@ -51,7 +56,7 @@ def get_context_retriever_chain(vector_store, question):
51
  after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
52
 
53
  # Initialize the Hugging Face language model (LLM)
54
- llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2")
55
 
56
  # Construct the RAG pipeline
57
  after_rag_chain = (
 
1
  import streamlit as st
2
  from langchain_community.document_loaders import WebBaseLoader
3
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import Chroma
5
  from langchain_nomic.embeddings import NomicEmbeddings
6
 
 
11
  from langchain_core.output_parsers import StrOutputParser
12
  from langchain_core.prompts import ChatPromptTemplate
13
 
14
+ from langchain.embeddings import HuggingFaceEmbeddings
15
 
16
  def method_get_website_text(urls):
17
  # Convert string of URLs to list
18
+
19
  urls_list = urls.split("\n")
20
  docs = [WebBaseLoader(url).load() for url in urls_list]
21
  docs_list = [item for sublist in docs for item in sublist]
 
24
 
25
  def method_get_text_chunks(text):
26
  #split the text into chunks
27
+
28
+ #text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
29
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
30
  doc_splits = text_splitter.split_documents(text)
31
  return doc_splits
32
 
 
35
  #convert text chunks into embeddings and store in vector database
36
 
37
  # create the open-source embedding function
38
+ #embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
39
+ embeddings = HuggingFaceEmbeddings()
40
 
41
  # create a vectorstore from the chunks
42
  vector_store = Chroma.from_documents(document_chunks, embeddings)
 
56
  after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
57
 
58
  # Initialize the Hugging Face language model (LLM)
59
+ llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":512})
60
 
61
  # Construct the RAG pipeline
62
  after_rag_chain = (