AkwabaGPT / utils.py
Monsia's picture
v 0.1.0
ed5def4
raw
history blame
1.25 kB
from pathlib import Path
from typing import List
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
import configs
# Shared embedding model, instantiated once when this module is imported and
# passed to the Chroma store built in process_documents.
# NOTE(review): OpenAIEmbeddings() is called with no arguments, so it
# presumably picks up its credentials/settings from the environment — confirm.
embeddings_model = OpenAIEmbeddings()
def process_documents(doc_storage_path: str) -> Chroma:
    """Index every ``*.txt`` file of a directory into the persistent Chroma store.

    Each ``*.txt`` file found directly inside ``doc_storage_path`` is loaded
    with ``TextLoader``, split by a ``RecursiveCharacterTextSplitter``
    configured from ``configs.CHUNK_SIZE`` and ``configs.CHUNK_OVERLAP``, and
    the resulting chunks are added to the Chroma store located at
    ``configs.STORE_FILE``. The store is persisted once all files are handled.

    Args:
        doc_storage_path: Directory that is searched (non-recursively) for
            ``*.txt`` files.

    Returns:
        The Chroma vector store that the chunks were added to. If no file
        matched, this is simply the store as opened from ``configs.STORE_FILE``.
    """
    print("doc preprocessing...")
    doc_directory = Path(doc_storage_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=configs.CHUNK_SIZE, chunk_overlap=configs.CHUNK_OVERLAP
    )
    doc_search = Chroma(
        persist_directory=configs.STORE_FILE, embedding_function=embeddings_model
    )

    for file_path in doc_directory.glob("*.txt"):
        documents = TextLoader(str(file_path)).load()
        chunks: List[Document] = text_splitter.split_documents(documents)
        if not chunks:
            # Nothing to embed for this file (e.g. it is empty); skip it
            # rather than handing the store an empty batch.
            continue
        # Add to the store opened above. The previous code called the
        # from_documents classmethod through the instance, which builds a
        # brand-new store object on every call instead of reusing this one.
        doc_search.add_documents(chunks)

    # Persist once, after every file has been added.
    doc_search.persist()
    print("doc preprocessing end.")
    return doc_search
def format_docs(docs):
    """Return the ``page_content`` of every item in *docs* joined by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents in input order, separated by
        ``"\\n\\n"``; an empty string when *docs* is empty.
    """
    separator = "\n\n"
    contents = []
    for doc in docs:
        contents.append(doc.page_content)
    return separator.join(contents)