Quentin Gallouédec committed on
Commit
fc133fb
·
1 Parent(s): 0f0fa36
Files changed (1) hide show
  1. app.py +61 -29
app.py CHANGED
@@ -1,11 +1,14 @@
1
- from pypdf import PdfReader
2
- import re
3
  import random
 
 
4
  import gradio as gr
5
- from datasets import Dataset, DatasetDict
6
- import os
7
  import pandas as pd
8
-
 
 
 
 
9
  to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
10
  to_be_replaced = {
11
  "½": "1/2",
@@ -34,6 +37,8 @@ to_be_replaced = {
34
  }
35
 
36
 
 
 
37
  def clean(text):
38
  # Remove all the unwanted characters
39
  for char in to_be_removed:
@@ -64,7 +69,7 @@ def clean(text):
64
  return text
65
 
66
 
67
- def pdf2dataset(file, _, progress=gr.Progress()):
68
  progress(0, desc="Starting...")
69
  reader = PdfReader(file)
70
  num_pages = len(reader.pages)
@@ -81,35 +86,62 @@ def pdf2dataset(file, _, progress=gr.Progress()):
81
  dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("TOKEN"))
82
  progress(1, desc="Done!")
83
 
84
- instrctions = f"""
85
- Your dataset is now available on Hugging Face Datasets at [pdf2dataset/{dataset_name}](https://huggingface.co/datasets/pdf2dataset/{dataset_name}).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  You can load the dataset using the following code:
88
 
89
  ```python
90
  from datasets import load_dataset
91
 
92
- dataset = load_dataset("pdf2dataset/{dataset_name}")
93
  ```
94
- """
95
- preview = dataset["text"][:10]
96
- preview = pd.DataFrame(preview, columns=["text"])
97
- return instrctions, preview
98
-
99
-
100
- demo = gr.Interface(
101
- title="PDF to 🤗 Dataset",
102
- fn=pdf2dataset,
103
- inputs=[
104
- gr.File(file_types=["pdf"]),
105
- gr.Markdown(
106
- "⚠️ Caution: This process will upload your data to a public Hugging Face repository. Do not upload sensitive information."
107
- ),
108
- ],
109
- outputs=[gr.Markdown(), gr.Dataframe(label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True)],
110
- submit_btn="Convert to dataset",
111
- allow_flagging="never",
112
- )
113
-
114
 
115
  demo.launch()
 
1
+ import os
 
2
  import random
3
+ import re
4
+
5
  import gradio as gr
 
 
6
  import pandas as pd
7
+ from datasets import Dataset
8
+ from pypdf import PdfReader
9
+ from huggingface_hub import HfApi
10
+ # import template
11
+ from string import Template
12
  to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
13
  to_be_replaced = {
14
  "½": "1/2",
 
37
  }
38
 
39
 
40
+
41
+
42
  def clean(text):
43
  # Remove all the unwanted characters
44
  for char in to_be_removed:
 
69
  return text
70
 
71
 
72
+ def pdf2dataset(file, progress=gr.Progress()):
73
  progress(0, desc="Starting...")
74
  reader = PdfReader(file)
75
  num_pages = len(reader.pages)
 
86
  dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("TOKEN"))
87
  progress(1, desc="Done!")
88
 
89
+ instrctions = instructions_template.substitute(dataset_name=dataset_name)
90
+ preview = dataset["text"][:10]
91
+ preview = pd.DataFrame(preview, columns=["text"])
92
+ return instrctions, preview, dataset_name
93
+
94
+
95
def delete_dataset(dataset_name):
    """Delete a dataset repository from the ``pdf2dataset`` namespace.

    Args:
        dataset_name: Either a bare repository name (``"my_dataset"``) or a
            fully qualified id (``"pdf2dataset/my_dataset"``). Any namespace
            other than ``pdf2dataset`` is refused.

    Returns:
        A status string prefixed with ✅ on success or ❌ on failure. Errors
        are reported through the returned message rather than raised, because
        the caller displays the string directly in the UI.
    """
    allowed_namespace = "pdf2dataset"

    # Tolerate None / surrounding whitespace coming from the text box.
    dataset_name = (dataset_name or "").strip()
    if not dataset_name:
        return "❌ Please enter a dataset name."

    if "/" in dataset_name:
        # rpartition never raises, unlike unpacking the result of split("/")
        # when the input contains more than one slash.
        user_id, _, repo_name = dataset_name.rpartition("/")
    else:
        user_id, repo_name = allowed_namespace, dataset_name

    if user_id != allowed_namespace:
        return f"❌ Invalid namespace detected: {user_id}"
    if not repo_name:
        return "❌ Please enter a dataset name."

    repo_id = f"{user_id}/{repo_name}"
    # Authenticate with the same TOKEN environment variable that is used when
    # pushing datasets; a missing variable yields None, which makes HfApi fall
    # back to its default credential lookup.
    api = HfApi(token=os.getenv("TOKEN"))
    try:
        api.delete_repo(repo_id, repo_type="dataset")
    except Exception as e:
        # Top-level UI boundary: surface any failure as a message.
        return f"❌ Error deleting dataset: {e}"
    return "✅ Dataset deleted successfully."
109
+
110
+
111
# Markdown warning rendered in the UI (passed to gr.Markdown) before the
# convert button.
caution_text = """⚠️ Caution:
- This process will upload your data to a public Hugging Face repository. Do not upload sensitive information.
- Anyone (including you) will be able to delete the dataset once it is uploaded.
"""

# Markdown usage instructions. The `$dataset_name` placeholders are filled in
# with `instructions_template.substitute(dataset_name=...)`.
instructions_template = Template("""
Your dataset is now available on Hugging Face Datasets at [pdf2dataset/$dataset_name](https://huggingface.co/datasets/pdf2dataset/$dataset_name).

You can load the dataset using the following code:

```python
from datasets import load_dataset

dataset = load_dataset("pdf2dataset/$dataset_name")
```
""")
127
+
128
# Build the UI. Components are laid out in the order they are created inside
# the Blocks context, so statement order here is the page layout.
with gr.Blocks() as demo:
    # Convert a PDF to a dataset
    gr.Markdown("## Convert a PDF to a dataset")
    # Gradio's `file_types` takes extensions with a leading dot (or a category
    # such as "image"); a bare "pdf" is neither, so use ".pdf".
    file = gr.File(file_types=[".pdf"], height=50)
    gr.Markdown(caution_text)
    convert_button = gr.Button("🔄 Convert and upload")
    # Shown with a placeholder name until a conversion replaces it.
    instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
    preview = gr.Dataframe(
        label="Preview (first 10 rows)",
        headers=["text"],
        datatype=["str"],
        row_count=10,
        wrap=True,
        height=200,
    )

    # Delete a dataset
    gr.Markdown("### Delete a dataset")
    dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete")
    delete_button = gr.Button("🗑️ Delete dataset")

    # Define the actions
    # A successful conversion also pre-fills the delete box with the new name.
    convert_button.click(pdf2dataset, inputs=[file], outputs=[instructions, preview, dataset_name_to_delete])
    # The delete button's own label doubles as the status display.
    delete_button.click(delete_dataset, inputs=[dataset_name_to_delete], outputs=[delete_button])
    # Typing a new name resets the button label after a previous status message.
    dataset_name_to_delete.input(lambda: "🗑️ Delete dataset", outputs=[delete_button])

demo.launch()