from langchain.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import requests


class DocumentsLoader:
    def __init__(self) -> None:
        pass

    def load_urls_from_csv(self, url_path: str, column: str = "url") -> list:
        df = pd.read_csv(url_path)
        doc_urls = df[column].to_list()
        return doc_urls

    # TODO: complete Notion scraper
    def is_notion_url(self, url: str) -> bool:
        # Naive check: treat any URL containing "notion" as a Notion URL
        return "notion" in url

    def is_pdf_url(self, url: str) -> bool:
        # Define a list of common PDF file extensions
        pdf_extensions = [".pdf"]
        # Check if the URL ends with a PDF file extension
        for extension in pdf_extensions:
            if url.endswith(extension):
                return True
        return False

    def is_txt_url(self, url: str) -> bool:
        # Define a list of common plain-text file extensions
        txt_extensions = [".txt"]
        # Check if the URL ends with a text file extension
        for extension in txt_extensions:
            if url.endswith(extension):
                return True
        return False

    def is_valid_url(self, url: str) -> bool:
        # TODO: handle status codes other than 200 (redirects, etc.)
        try:
            response = requests.head(url)
            # A 200 status code indicates a reachable URL
            return response.status_code == 200
        except requests.RequestException:
            return False

    def load_docs(self, doc_urls: list) -> list:
        web_urls, pdf_urls, notion_urls, text_urls, docs = [], [], [], [], []
        if isinstance(doc_urls[0], list):
            doc_urls = [doc[0] for doc in doc_urls]
        # Split URLs by type: PDF, Notion, plain text, or generic web page
        print("docs urls: ", doc_urls)
        for url in doc_urls:
            if self.is_pdf_url(url):
                pdf_urls.append(url)
            elif self.is_notion_url(url):
                notion_urls.append(url)
            elif self.is_txt_url(url):
                text_urls.append(url)
            else:
                web_urls.append(url)
        # Load web URLs
        if len(web_urls) > 0:
            web_urls = [url for url in web_urls if self.is_valid_url(url)]
            for web_url in web_urls:
                try:
                    web_loader = WebBaseLoader(web_url)
                    web_docs = web_loader.load()
                    docs = docs + web_docs
                except Exception as e:
                    print(f"Error web loader, {web_url}: {str(e)}")
        # Load PDF URLs
        if len(pdf_urls) > 0:
            for pdf_url in pdf_urls:
                try:
                    pdf_loader = PyPDFLoader(pdf_url)
                    pdf_docs = pdf_loader.load()
                    docs = docs + pdf_docs
                except Exception as e:
                    print(f"Error pdf loader, {pdf_url}: {str(e)}")
        # Load plain-text URLs
        if len(text_urls) > 0:
            for txt_url in text_urls:
                try:
                    txt_loader = TextLoader(txt_url)
                    txt_docs = txt_loader.load()
                    docs = docs + txt_docs
                except Exception as e:
                    print(f"Error text loader, {txt_url}: {str(e)}")
        return docs

    def split_docs(self, docs, chunk_size=2000):
        # Separators are treated as literal strings by default,
        # so use ". " rather than the regex-style "\. "
        r_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=0,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        splits = r_splitter.split_documents(docs)
        return splits
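

# Illustrative usage sketch, not part of the original module. It assumes a local
# CSV file named "urls.csv" with a "url" column; both the filename and the column
# name are hypothetical placeholders.
if __name__ == "__main__":
    loader = DocumentsLoader()
    urls = loader.load_urls_from_csv("urls.csv")  # read the list of URLs from the CSV
    docs = loader.load_docs(urls)                 # load PDFs, text files, and web pages
    splits = loader.split_docs(docs)              # chunk the documents for downstream indexing
    print(f"Loaded {len(docs)} documents, split into {len(splits)} chunks")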