pdf2dataset

Sleeping

pdf2dataset / app.py

Quentin Gallouédec

app

58e4b18 7 months ago

3.26 kB

	from pypdf import PdfReader
	import re
	import random
	import gradio as gr
	from datasets import Dataset, DatasetDict
	import os
	import pandas as pd

	# Characters that ``clean`` deletes outright from the extracted text.
	# NOTE(review): the trailing "" entries are no-ops for ``str.replace`` and
	# look like glyphs that were lost when this file was copied; confirm
	# against the original list before relying on them.
	to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
	# Single-character substitutions that ``clean`` applies after the removals:
	# typographic punctuation is folded to ASCII, circled-number glyphs become
	# "N." markers, and line breaks become spaces.
	to_be_replaced = {
	    "½": "1/2",
	    "–": "-",
	    "‘": "'",
	    "’": "'",
	    "…": "...",
	    "₋": "-",
	    "−": "-",
	    # ⓫ ⓬ ⓭ ⓮ (U+24EB..U+24EE) -> "11." .. "14."
	    **{chr(0x24EB + offset): f"{11 + offset}." for offset in range(4)},
	    "◦": "°",
	    # ❶ .. ❿ (U+2776..U+277F) -> "1." .. "10."
	    **{chr(0x2776 + offset): f"{1 + offset}." for offset in range(10)},
	    "\n": " ",
	}


	def clean(text):
	    """Normalize the text extracted from one PDF page.

	    The steps run in a fixed order:

	    1. delete every entry of ``to_be_removed``;
	    2. apply the ``to_be_replaced`` substitutions (this also turns every
	       newline into a space, so the result is a single line);
	    3. insert a space after any "." that is directly followed by a
	       non-space character;
	    4. insert a space between a lowercase letter and a directly following
	       uppercase letter (accented Latin letters included);
	    5. drop the space before "," and ".", and the spaces around "-";
	    6. collapse every run of spaces into a single space.

	    Args:
	        text: Raw text of a page.

	    Returns:
	        The cleaned text. Leading and trailing spaces are not stripped.
	    """
	    for char in to_be_removed:
	        text = text.replace(char, "")

	    for char, replacement in to_be_replaced.items():
	        text = text.replace(char, replacement)

	    # Every "." gets a following space. Runs of dots are split here and
	    # re-joined by the " ." rule below; note that decimals are split too
	    # ("3.14" becomes "3. 14").
	    text = re.sub(r"\.([^ ])", r". \1", text)

	    # "aA" -> "a A": words glued together by the PDF extraction.
	    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)

	    # No space before a comma or a period, and none around a hyphen.
	    text = text.replace(" ,", ",")
	    text = text.replace(" .", ".")
	    text = text.replace(" -", "-")
	    text = text.replace("- ", "-")

	    # Collapse runs of spaces in a single pass. Looping while a single space
	    # is present and replacing a space with a space never terminates.
	    text = re.sub(r" {2,}", " ", text)

	    return text


	def pdf2dataset(file, _, progress=gr.Progress()):
	    """Convert an uploaded PDF into a one-row-per-page dataset and push it to the Hub.

	    Each page's text is extracted, passed through ``clean`` and stored in a
	    single "text" column. The dataset is pushed to a randomly named
	    repository under the ``pdf2dataset`` namespace, using the token read
	    from the ``TOKEN`` environment variable.

	    Args:
	        file: The uploaded PDF; handed to ``PdfReader`` as is.
	        _: Value of the second input component (the warning Markdown); unused.
	        progress: Gradio progress tracker used to report status.

	    Returns:
	        A tuple ``(instructions, preview)``: a Markdown message with the
	        dataset link and a loading snippet, and a DataFrame with a single
	        "text" column holding at most the first ten pages.

	    Raises:
	        gr.Error: If no file was provided.
	    """
	    if file is None:
	        # Surface a readable message in the UI instead of an opaque failure
	        # from inside PdfReader.
	        raise gr.Error("Please upload a PDF file first.")

	    progress(0, desc="Starting...")
	    reader = PdfReader(file)
	    # Random 128-bit hex name, so each conversion gets its own repository.
	    dataset_name = f"{random.getrandbits(128):x}"
	    repo_id = f"pdf2dataset/{dataset_name}"

	    page_texts = [
	        clean(page.extract_text())
	        for page in progress.tqdm(
	            reader.pages, total=len(reader.pages), desc="Converting pages"
	        )
	    ]

	    progress(0, desc="Uploading to Hugging Face...")
	    dataset = Dataset.from_dict({"text": page_texts})
	    dataset.push_to_hub(repo_id, token=os.getenv("TOKEN"))
	    progress(1, desc="Done!")

	    # Built line by line so the Markdown never picks up source indentation
	    # (indented lines would render as a code block).
	    instructions = "\n".join(
	        [
	            "",
	            f"Your dataset is now available on Hugging Face Datasets at [{repo_id}](https://huggingface.co/datasets/{repo_id}).",
	            "",
	            "You can load the dataset using the following code:",
	            "",
	            "```python",
	            "from datasets import load_dataset",
	            "",
	            f'dataset = load_dataset("{repo_id}")',
	            "```",
	            "",
	        ]
	    )

	    # The preview shows the same strings that were uploaded.
	    preview = pd.DataFrame(page_texts[:10], columns=["text"])
	    return instructions, preview


	# Gradio UI: a PDF upload and a warning banner as inputs, the Markdown
	# instructions and a preview table as outputs.
	demo = gr.Interface(
	    title="PDF to 🤗 Dataset",
	    fn=pdf2dataset,
	    inputs=[
	        # A file-type filter is either a category name ("image", "audio", ...)
	        # or an extension including its leading dot, so PDFs are ".pdf".
	        gr.File(file_types=[".pdf"]),
	        # Rendered among the inputs; its value reaches ``pdf2dataset`` as the
	        # unused ``_`` argument.
	        gr.Markdown(
	            "⚠️ Caution: This process will upload your data to a public Hugging Face repository. Do not upload sensitive information."
	        ),
	    ],
	    outputs=[
	        gr.Markdown(),
	        gr.Dataframe(
	            label="Preview (first 10 rows)",
	            headers=["text"],
	            datatype=["str"],
	            row_count=10,
	            wrap=True,
	        ),
	    ],
	    submit_btn="Convert to dataset",
	    allow_flagging="never",
	)


	demo.launch()