"""Gradio app that converts a PDF into a Hugging Face dataset.

Each page of the uploaded PDF is extracted to text, normalised with
:func:`clean`, and pushed to the Hub as one row of a ``text`` column.
"""

import os
import random
import re

import gradio as gr
import pandas as pd
from datasets import Dataset, DatasetDict
from pypdf import PdfReader

# Characters that are dropped from the extracted text.
# NOTE(review): the trailing entries appear as empty strings here; they may
# originally have been private-use glyphs lost in an encoding round-trip.
# Replacing "" with "" is a no-op, so they are harmless as written - confirm.
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]

# Characters that are rewritten to a plain equivalent.
to_be_replaced = {
    "½": "1/2",
    "–": "-",
    "‘": "'",
    "’": "'",
    "…": "...",
    "₋": "-",
    "−": "-",
    "⓫": "11.",
    "⓬": "12.",
    "⓭": "13.",
    "⓮": "14.",
    "◦": "°",
    "❶": "1.",
    "❷": "2.",
    "❸": "3.",
    "❹": "4.",
    "❺": "5.",
    "❻": "6.",
    "❼": "7.",
    "❽": "8.",
    "❾": "9.",
    "❿": "10.",
    "\n": " ",
}

# Compiled once at import time; `clean` runs once per PDF page.
_PERIOD_WITHOUT_SPACE = re.compile(r"\.([^ ])")
_LOWER_THEN_UPPER = re.compile(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])")
_SPACE_RUN = re.compile(r" {2,}")


def clean(text: str) -> str:
    """Normalise text extracted from a PDF page.

    Steps, in order: drop the characters in ``to_be_removed``, apply the
    substitutions in ``to_be_replaced`` (which also turns newlines into
    spaces), insert a space after any period not already followed by one,
    split a lowercase letter from a directly following uppercase letter,
    remove spaces before commas and periods and around hyphens, and
    finally collapse runs of spaces into a single space.

    Args:
        text: Raw text of one page.

    Returns:
        The normalised text.
    """
    for char in to_be_removed:
        text = text.replace(char, "")

    for char, replacement in to_be_replaced.items():
        text = text.replace(char, replacement)

    # Make sure that every "." is followed by a space.
    text = _PERIOD_WITHOUT_SPACE.sub(r". \1", text)

    # "aA" -> "a A" (accented letters included): words glued together by
    # the extraction are separated again.
    text = _LOWER_THEN_UPPER.sub(r"\1 \2", text)

    # No space before a comma or a period, and none around a hyphen.
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    text = text.replace(" -", "-")
    text = text.replace("- ", "-")

    # Collapse runs of spaces. Replacing a single space with a single space
    # in a `while " " in text` loop can never terminate, so match two or
    # more spaces explicitly.
    return _SPACE_RUN.sub(" ", text)


# `gr.Progress()` as a default argument is how Gradio injects its progress
# tracker; it is intentionally evaluated at definition time.
def pdf2dataset(file, _, progress=gr.Progress()):
    """Convert a PDF into a dataset and push it to the Hugging Face Hub.

    Args:
        file: The uploaded PDF, passed straight to ``PdfReader``.
        _: Value of the second input component (the caution notice); unused.
        progress: Gradio progress tracker.

    Returns:
        A ``(instructions, preview)`` pair: a Markdown string telling the
        user how to load the dataset, and a DataFrame holding the first
        10 rows of the ``text`` column.
    """
    progress(0, desc="Starting...")
    reader = PdfReader(file)
    num_pages = len(reader.pages)
    # Random 128-bit hex string used as the repository name.
    dataset_name = f"{random.getrandbits(128):x}"

    page_texts = [
        clean(page.extract_text())
        for page in progress.tqdm(
            reader.pages, total=num_pages, desc="Converting pages"
        )
    ]

    progress(0, desc="Uploading to Hugging Face...")
    dataset = Dataset.from_dict({"text": page_texts})
    dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("TOKEN"))
    progress(1, desc="Done!")

    # Built with explicit newlines: the fenced code block only renders as
    # code when the fence markers sit on lines of their own.
    instructions = (
        "\n"
        "Your dataset is now available on Hugging Face Datasets at "
        f"[pdf2dataset/{dataset_name}]"
        f"(https://huggingface.co/datasets/pdf2dataset/{dataset_name}).\n"
        "\n"
        "You can load the dataset using the following code:\n"
        "\n"
        "```python\n"
        "from datasets import load_dataset\n"
        "\n"
        f'dataset = load_dataset("pdf2dataset/{dataset_name}")\n'
        "```\n"
    )

    preview = pd.DataFrame(dataset["text"][:10], columns=["text"])
    return instructions, preview


demo = gr.Interface(
    title="PDF to 🤗 Dataset",
    fn=pdf2dataset,
    inputs=[
        gr.File(file_types=["pdf"]),
        # Shown among the inputs so the warning sits next to the upload box;
        # its value reaches `pdf2dataset` as the ignored `_` argument.
        gr.Markdown(
            "⚠️ Caution: This process will upload your data to a public Hugging Face repository. Do not upload sensitive information."
        ),
    ],
    outputs=[
        gr.Markdown(),
        gr.Dataframe(
            label="Preview (first 10 rows)",
            headers=["text"],
            datatype=["str"],
            row_count=10,
            wrap=True,
        ),
    ],
    submit_btn="Convert to dataset",
    allow_flagging="never",
)

demo.launch()