from pypdf import PdfReader
import re
import random
import gradio as gr
from datasets import Dataset, DatasetDict
import os
import pandas as pd

# Characters that clean() deletes outright from the extracted text.
# NOTE(review): several entries show up as empty strings here; they may be
# private-use glyphs that do not render — verify against the original file.
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
# Substitutions that clean() applies with str.replace, one entry at a time,
# in the insertion order of this dict.
to_be_replaced = {
    "½": "1/2",
    "–": "-",
    "‘": "'",
    "’": "'",
    "…": "...",
    "₋": "-",
    "−": "-",
    # Circled numbers become "<number>." markers.
    "⓫": "11.",
    "⓬": "12.",
    "⓭": "13.",
    "⓮": "14.",
    "◦": "°",
    "❶": "1.",
    "❷": "2.",
    "❸": "3.",
    "❹": "4.",
    "❺": "5.",
    "❻": "6.",
    "❼": "7.",
    "❽": "8.",
    "❾": "9.",
    "❿": "10.",
    # Line breaks are turned into spaces, so each cleaned text is one line.
    "\n": " ",
}


def clean(text):
    """Normalize raw text extracted from a PDF page.

    Steps, in order:
      1. delete every character listed in ``to_be_removed``;
      2. apply the substitutions in ``to_be_replaced`` (this also turns
         newlines into spaces);
      3. insert a space after a period that is glued to the next character,
         except when the period sits between two digits ("3.14", "1.2.3");
      4. insert a space between a lowercase letter and an uppercase letter
         ("fooBar" -> "foo Bar"), accented Latin letters included;
      5. collapse runs of spaces into a single space;
      6. drop the space before "," and "." and the spaces around "-".

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Remove all the unwanted characters
    for char in to_be_removed:
        text = text.replace(char, "")

    # Replace all the characters that need to be replaced
    for char, replacement in to_be_replaced.items():
        text = text.replace(char, replacement)

    # Make sure that a "." is followed by a space. The negative lookahead
    # skips a period that sits between two digits so that decimals and
    # version numbers are not torn apart ("3.14" must not become "3. 14").
    text = re.sub(r"\.(?=[^ ])(?!(?<=\d\.)\d)", ". ", text)

    # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)

    # Collapse runs of spaces BEFORE tightening punctuation: each replace
    # below removes exactly one space, so "a  ,b" would otherwise end up as
    # "a ,b" and still carry a space before the comma.
    text = re.sub(r" {2,}", " ", text)

    # Make sure that there is no space before a comma or a period, and no
    # space on either side of a hyphen. These only remove spaces, so they
    # cannot reintroduce a double space.
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    text = text.replace(" -", "-")
    text = text.replace("- ", "-")

    return text


def pdf2dataset(file, _, progress=gr.Progress()):
    """Convert a PDF into a one-row-per-page text dataset and push it to the Hub.

    Each page's text is extracted, passed through ``clean`` and stored in a
    ``text`` column. The dataset is pushed to ``pdf2dataset/<random hex name>``
    using the token read from the ``TOKEN`` environment variable.

    Args:
        file: The uploaded PDF, as handed over by the ``gr.File`` input and
            passed straight to ``PdfReader``.
        _: Unused. The interface lists a caution ``gr.Markdown`` among its
            inputs, so a second positional value is received and ignored.
        progress: Gradio progress tracker used to report conversion and
            upload status.

    Returns:
        A ``(instructions, preview)`` tuple: a Markdown string explaining how
        to load the dataset, and a DataFrame with the first 10 page texts.

    Raises:
        gr.Error: If the form was submitted without a file.
    """
    # Without this guard, submitting an empty form passes None to PdfReader
    # and the user only sees an opaque failure.
    if not file:
        raise gr.Error("Please upload a PDF file first.")

    progress(0, desc="Starting...")
    reader = PdfReader(file)
    num_pages = len(reader.pages)
    dataset_name = f"{random.getrandbits(128):x}"

    page_texts = [
        clean(page.extract_text())
        for page in progress.tqdm(reader.pages, total=num_pages, desc="Converting pages")
    ]

    progress(0, desc="Uploading to Hugging Face...")
    dataset = Dataset.from_dict({"text": page_texts})
    dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("TOKEN"))
    progress(1, desc="Done!")

    instructions = f"""
Your dataset is now available on Hugging Face Datasets at [pdf2dataset/{dataset_name}](https://huggingface.co/datasets/pdf2dataset/{dataset_name}).

You can load the dataset using the following code:

```python
from datasets import load_dataset

dataset = load_dataset("pdf2dataset/{dataset_name}")
```
    """
    # The preview is built from the in-memory list; it holds the same values
    # as the dataset's "text" column.
    preview = pd.DataFrame(page_texts[:10], columns=["text"])
    return instructions, preview


# Gradio UI: one PDF upload in, Markdown instructions plus a preview table out.
demo = gr.Interface(
    title="PDF to 🤗 Dataset",
    fn=pdf2dataset,
    inputs=[
        # Extensions must carry the leading dot; a bare "pdf" is read as a
        # MIME category ("pdf/*") and does not match PDF files.
        gr.File(file_types=[".pdf"]),
        # Shown next to the upload widget as a warning; its value is passed
        # to pdf2dataset as the ignored second argument.
        gr.Markdown(
            "⚠️ Caution: This process will upload your data to a public Hugging Face repository. Do not upload sensitive information."
        ),
    ],
    outputs=[
        gr.Markdown(),
        gr.Dataframe(
            label="Preview (first 10 rows)",
            headers=["text"],
            datatype=["str"],
            row_count=10,
            wrap=True,
        ),
    ],
    submit_btn="Convert to dataset",
    allow_flagging="never",
)


demo.launch()