|
from pathlib import Path |
|
from typing import List |
|
|
|
from langchain.schema import Document |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores.chroma import Chroma |
|
from langchain_community.document_loaders import TextLoader |
|
from langchain_openai import OpenAIEmbeddings |
|
|
|
import configs |
|
|
|
# Module-level embeddings client, created once at import time and reused by
# process_documents() both when opening the Chroma store and when indexing.
embeddings_model = OpenAIEmbeddings()
|
|
|
|
|
def process_documents(doc_storage_path: str) -> "Chroma":
    """Chunk every ``*.txt`` file in a directory and index the chunks in Chroma.

    Each matching file is loaded with ``TextLoader``, split with a
    ``RecursiveCharacterTextSplitter`` configured from ``configs.CHUNK_SIZE``
    and ``configs.CHUNK_OVERLAP``, and the resulting chunks are added to the
    Chroma store opened on ``configs.STORE_FILE``.

    Args:
        doc_storage_path: Directory whose top-level ``*.txt`` files are
            indexed (the glob is not recursive).

    Returns:
        The Chroma store, after all chunks have been added and ``persist()``
        has been called.
    """
    print("doc preprocessing...")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=configs.CHUNK_SIZE, chunk_overlap=configs.CHUNK_OVERLAP
    )
    doc_search = Chroma(
        persist_directory=configs.STORE_FILE, embedding_function=embeddings_model
    )

    for file_path in Path(doc_storage_path).glob("*.txt"):
        documents = TextLoader(str(file_path)).load()
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Nothing to embed for this file (e.g. it is empty); skip it
            # rather than handing the vector store an empty batch.
            continue
        # Add to the store opened above. The previous code called the
        # ``from_documents`` classmethod through the instance and rebound
        # ``doc_search`` to the result, constructing a fresh store object
        # per file instead of reusing this one.
        doc_search.add_documents(chunks)

    doc_search.persist()
    print("doc preprocessing end.")
    return doc_search
|
|
|
|
|
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the page contents in input order, each pair
        separated by two newlines; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = (doc.page_content for doc in docs)
    return separator.join(page_texts)
|
|