Spaces:
Running
Running
File size: 659 Bytes
5001332 |
1 2 3 4 5 6 7 8 9 10 11 12 13 |
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.docstore.document import Document
from unstructured.cleaners.core import remove_punctuation,clean,clean_extra_whitespace
def generate_document(url: str) -> "Document":
    """Load the page at *url* and return it as a single LangChain Document.

    The URL is loaded with ``UnstructuredURLLoader`` in ``"elements"`` mode,
    and the ``page_content`` of every returned element is joined with single
    spaces into one string.

    Args:
        url: Address of the page to load.

    Returns:
        A ``Document`` whose ``page_content`` is the joined element text and
        whose metadata stores the original URL under the ``"source"`` key.
    """
    # NOTE(review): the cleaners are handed to the loader as
    # ``post_processors``; confirm that UnstructuredURLLoader actually applies
    # them rather than forwarding them as an ignored partition kwarg.
    loader = UnstructuredURLLoader(
        urls=[url],
        mode="elements",
        post_processors=[clean, remove_punctuation, clean_extra_whitespace],
    )
    elements = loader.load()
    # Join straight from the loaded elements: no filtering is performed, so an
    # intermediate copy of the list would add nothing.
    full_clean = " ".join(element.page_content for element in elements)
    return Document(page_content=full_clean, metadata={"source": url})