import config
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
from typing import List


def split_documents(chunk_size: int, knowledge_base: List[LangchainDocument]) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    # Separators are tried in order, from coarsest (markdown headers, code fences,
    # horizontal rules) to finest (single characters), so chunks break at natural
    # document boundaries whenever possible.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
    # Measure chunk length with the embedding model's own tokenizer so that
    # `chunk_size` is expressed in tokens rather than characters.
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(config.EMBEDDING_MODEL_NAME),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # 10% overlap between consecutive chunks
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in tqdm(knowledge_base):
        docs_processed += text_splitter.split_documents([doc])

    # Drop chunks whose text duplicates one already seen.
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique
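

# Example usage (a minimal sketch): the raw texts and metadata below are
# illustrative assumptions, not part of this module; only `config.EMBEDDING_MODEL_NAME`
# is assumed to exist, as it already is above.
if __name__ == "__main__":
    raw_texts = [
        "# Title\n\nSome markdown body text about the knowledge base...",
        "## Section\n\nAnother document with a different structure.",
    ]
    knowledge_base = [
        LangchainDocument(page_content=text, metadata={"source": f"doc_{i}"})
        for i, text in enumerate(raw_texts)
    ]
    chunks = split_documents(chunk_size=512, knowledge_base=knowledge_base)
    print(f"Split {len(knowledge_base)} documents into {len(chunks)} unique chunks")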