Spaces:
Running
Running
File size: 1,109 Bytes
162cd18 1badade |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from langchain_community.document_loaders.text import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from setup import *
# Ingestion script: load the SageMaker FAQ text file, split it into
# overlapping chunks, and index the chunks in a persisted Chroma collection.

# Source document; a relative path, so it resolves against the current
# working directory.
file = "Amazon_sagemaker_Faq.txt"
loader = TextLoader(file_path=file)

# Read the file once and reuse the result (it was previously loaded twice).
docs = loader.load()
# `pages` is not read anywhere in this script; it is kept as a copy of the
# loaded documents in case another module imports the name.
pages = list(docs)

# Separators are tried in order, so they must run from coarsest to finest:
# paragraph break, line break, space, then single characters. Listing "\n"
# before "\n\n" meant paragraph breaks were never used as a boundary, and
# without the " " / "" fallbacks a single line longer than chunk_size could
# not be split down to size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
    separators=["\n\n", "\n", " ", ""],
)
all_splits = text_splitter.split_documents(docs)
print(f"Split blog post into {len(all_splits)} sub-documents.")

# Relative path so the persisted index is not tied to one machine's
# directory layout.
persist_directory = "./chroma_db"

# `embeddings` is not defined in this file; presumably it is provided by
# `from setup import *` -- confirm against setup.py.
# NOTE(review): looks like every run adds the chunks to the persisted
# collection again -- confirm whether the collection should be reset or
# reused when ./chroma_db already exists.
vector_store = Chroma.from_documents(
    documents=all_splits,
    collection_name='sagemaker-chroma',
    persist_directory=persist_directory,
    embedding=embeddings,
)