# pdf2dataset / app.py
# Quentin Gallouédec
# app
# 58e4b18
# raw
# history blame
# 3.26 kB
from pypdf import PdfReader
import re
import random
import gradio as gr
from datasets import Dataset, DatasetDict
import os
import pandas as pd
# Characters that are dropped outright (bullets, arrows, decoding artefacts).
# NOTE(review): the empty-string entries are no-ops for ``str.replace`` as written;
# presumably they were private-use glyphs lost in a copy -- confirm against the original file.
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]

# Typographic characters mapped to plain-text equivalents. Newlines are flattened
# to spaces here, so every cleaned page ends up as a single line of text.
to_be_replaced = {
    "½": "1/2",
    "–": "-",
    "‘": "'",
    "’": "'",
    "…": "...",
    "₋": "-",
    "−": "-",
    "⓫": "11.",
    "⓬": "12.",
    "⓭": "13.",
    "⓮": "14.",
    "◦": "°",
    "❶": "1.",
    "❷": "2.",
    "❸": "3.",
    "❹": "4.",
    "❺": "5.",
    "❻": "6.",
    "❼": "7.",
    "❽": "8.",
    "❾": "9.",
    "❿": "10.",
    "\n": " ",
}


def clean(text):
    """Normalize text extracted from a PDF page.

    The steps run in this order:

    1. Remove every character listed in ``to_be_removed``.
    2. Apply the substitutions in ``to_be_replaced`` (this also turns newlines into spaces).
    3. Insert a space after any period that is not already followed by one.
    4. Insert a space between a lowercase letter directly followed by an uppercase
       letter (accented Latin letters included), e.g. "aA" -> "a A".
    5. Drop the space before commas and periods, and the spaces around hyphens.
    6. Collapse every run of consecutive spaces into a single space.

    Args:
        text: Raw text of one page.

    Returns:
        The cleaned text.
    """
    # Remove all the unwanted characters
    for char in to_be_removed:
        text = text.replace(char, "")
    # Replace all the characters that need to be replaced
    for char, replacement in to_be_replaced.items():
        text = text.replace(char, replacement)
    # Make sure that every "." is followed by a space
    text = re.sub(r"\.([^ ])", r". \1", text)
    # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
    # Make sure that there is no space before a comma and a period
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    # Glue hyphens to their neighbours on both sides
    text = text.replace(" -", "-")
    text = text.replace("- ", "-")
    # Collapse runs of two or more spaces in one pass. An explicit quantifier is
    # used so the intent cannot be lost (a loop replacing " " by " " never ends).
    return re.sub(r" {2,}", " ", text)
def pdf2dataset(file, _, progress=gr.Progress()):
    """Convert a PDF into a one-row-per-page dataset and push it to the Hugging Face Hub.

    Each page's text is extracted, passed through ``clean`` and stored in a
    single ``"text"`` column. The dataset is pushed under the ``pdf2dataset``
    namespace with a random 128-bit hexadecimal name, using the token read from
    the ``TOKEN`` environment variable.

    Args:
        file: The uploaded PDF as provided by the ``gr.File`` input; handed to
            ``PdfReader`` unchanged.
        _: Value of the caution ``gr.Markdown`` input; ignored.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            intentional: Gradio injects the live tracker through this parameter.

    Returns:
        A tuple ``(instructions, preview)``: a Markdown string describing how to
        load the dataset, and a DataFrame with the first 10 rows of the
        ``"text"`` column.

    Raises:
        gr.Error: If no file was provided.
    """
    # Submitting the form without an upload passes None; fail with a readable
    # message instead of an opaque error from the PDF reader.
    if file is None:
        raise gr.Error("Please upload a PDF file first.")

    progress(0, desc="Starting...")
    reader = PdfReader(file)
    num_pages = len(reader.pages)
    dataset_name = f"{random.getrandbits(128):x}"

    page_texts = []
    for page in progress.tqdm(reader.pages, total=num_pages, desc="Converting pages"):
        page_texts.append(clean(page.extract_text()))

    progress(0, desc="Uploading to Hugging Face...")
    dataset = Dataset.from_dict({"text": page_texts})
    dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("TOKEN"))
    progress(1, desc="Done!")

    # The Markdown below is kept flush-left on purpose: indenting it would change
    # the rendered output (the fenced code block in particular).
    instructions = f"""
Your dataset is now available on Hugging Face Datasets at [pdf2dataset/{dataset_name}](https://huggingface.co/datasets/pdf2dataset/{dataset_name}).
You can load the dataset using the following code:
```python
from datasets import load_dataset
dataset = load_dataset("pdf2dataset/{dataset_name}")
```
"""
    preview = dataset["text"][:10]
    preview = pd.DataFrame(preview, columns=["text"])
    return instructions, preview
# Gradio UI: one PDF upload (plus a caution notice) in, loading instructions and
# a preview table of the first pages out.
demo = gr.Interface(
    title="PDF to 🤗 Dataset",
    fn=pdf2dataset,
    inputs=[
        # File extensions must be written with a leading dot; a bare "pdf" is
        # not a recognized extension filter for gr.File.
        gr.File(file_types=[".pdf"]),
        # Display-only notice; its value reaches pdf2dataset as the ignored
        # second argument.
        gr.Markdown(
            "⚠️ Caution: This process will upload your data to a public Hugging Face repository. Do not upload sensitive information."
        ),
    ],
    outputs=[
        gr.Markdown(),
        gr.Dataframe(
            label="Preview (first 10 rows)",
            headers=["text"],
            datatype=["str"],
            row_count=10,
            wrap=True,
        ),
    ],
    submit_btn="Convert to dataset",
    allow_flagging="never",
)
demo.launch()