NSE-stock-picker / document_preprocessor.py
kailashsp's picture
add: initial commit
5001332
raw
history blame
659 Bytes
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.docstore.document import Document
from unstructured.cleaners.core import remove_punctuation,clean,clean_extra_whitespace
def generate_document(url: str) -> Document:
    """Fetch *url* and return its text as a single langchain ``Document``.

    The page is loaded with ``UnstructuredURLLoader`` in ``"elements"`` mode,
    so the loader returns one item per parsed element. The ``page_content``
    of every element is then joined with single spaces into one string.

    Args:
        url: Address of the page to load.

    Returns:
        A ``Document`` whose ``page_content`` is the space-joined text of all
        loaded elements and whose metadata is ``{"source": url}``.
    """
    # NOTE(review): confirm that this loader class actually applies
    # ``post_processors``; if it only forwards unknown keyword arguments to
    # the underlying partition call, the cleaners may be silently ignored.
    loader = UnstructuredURLLoader(
        urls=[url],
        mode="elements",
        post_processors=[clean, remove_punctuation, clean_extra_whitespace],
    )
    elements = loader.load()

    # Join the elements directly; the former identity comprehension only
    # copied the list without filtering anything.
    full_clean = " ".join(element.page_content for element in elements)
    return Document(page_content=full_clean, metadata={"source": url})