Spaces:
Runtime error
Runtime error
import glob | |
import os | |
from langchain_text_splitters import MarkdownHeaderTextSplitter | |
from langchain_community.document_loaders import UnstructuredMarkdownLoader | |
# Directory scanned for Markdown files. The trailing slash is required because
# the glob pattern is built by plain string concatenation (path_to_data + "*.md").
path_to_data = "./data/"
def process_markdown():
    """Split every ``*.md`` file directly under ``path_to_data`` on its headers.

    Each file is read as raw Markdown text and passed to
    ``MarkdownHeaderTextSplitter.split_text``, which expects a single string.
    Files that cannot be read are reported on stdout and skipped so that one
    bad file does not abort the whole run.

    Returns:
        A list with one entry per successfully read file; each entry is the
        result of ``split_text`` for that file. The list is empty when no
        Markdown files were found or none could be read.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    files = glob.glob(path_to_data + "*.md")
    print(files)

    docs = []
    for file in files:
        print(file)
        try:
            # Read the raw Markdown text: the splitter works on a string that
            # still contains the "#" header markers. The previous code appended
            # the list of Document objects returned by a loader, which
            # split_text() cannot process.
            with open(file, encoding="utf-8") as handle:
                docs.append(handle.read())
        except (OSError, UnicodeDecodeError) as e:
            # Best-effort: report the failure and continue with the next file.
            print("Exception: ", e)

    docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
    print(len(docs_processed))
    # Guard the sample print: indexing [0] on an empty result would raise.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed