Spaces:
Sleeping
Sleeping
Quentin Gallouédec
committed on
Commit
•
399084d
1
Parent(s):
ccba23d
log
Browse files
app.py
CHANGED
@@ -1,15 +1,14 @@
|
|
1 |
import os
|
2 |
import random
|
3 |
import re
|
|
|
4 |
|
5 |
import gradio as gr
|
6 |
import pandas as pd
|
7 |
from datasets import Dataset
|
8 |
-
from pypdf import PdfReader
|
9 |
from huggingface_hub import HfApi
|
|
|
10 |
|
11 |
-
# import template
|
12 |
-
from string import Template
|
13 |
|
14 |
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
|
15 |
to_be_replaced = {
|
@@ -57,7 +56,7 @@ def clean(text):
|
|
57 |
# Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
|
58 |
text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
|
59 |
|
60 |
-
# Make sure that there is no space before a comma
|
61 |
text = text.replace(" ,", ",")
|
62 |
text = text.replace(" .", ".")
|
63 |
text = text.replace(" -", "-")
|
@@ -89,6 +88,7 @@ def pdf2dataset(file, progress=gr.Progress()):
|
|
89 |
instrctions = instructions_template.substitute(dataset_name=dataset_name)
|
90 |
preview = dataset["text"][:10]
|
91 |
preview = pd.DataFrame(preview, columns=["text"])
|
|
|
92 |
return instrctions, preview, dataset_name
|
93 |
|
94 |
|
@@ -99,12 +99,15 @@ def delete_dataset(dataset_name):
|
|
99 |
else:
|
100 |
user_id = "pdf2dataset"
|
101 |
if not user_id == "pdf2dataset":
|
|
|
102 |
return f"❌ Invalid namespace deteced: {user_id}"
|
103 |
repo_id = f"{user_id}/{dataset_name}"
|
104 |
try:
|
105 |
api.delete_repo(repo_id, repo_type="dataset")
|
|
|
106 |
return "✅ Dataset deleted successfully."
|
107 |
except Exception as e:
|
|
|
108 |
return f"❌ Error deleting dataset: {e}"
|
109 |
|
110 |
|
@@ -132,7 +135,9 @@ with gr.Blocks() as demo:
|
|
132 |
gr.Markdown(caution_text)
|
133 |
gr.Markdown("## 2️⃣ Convert the PDF and upload")
|
134 |
convert_button = gr.Button("🔄 Convert and upload")
|
135 |
-
preview = gr.Dataframe(
|
|
|
|
|
136 |
gr.Markdown("## 3️⃣ Use the dataset in your code")
|
137 |
instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
|
138 |
gr.Markdown("## 4️⃣ Delete the (optional)")
|
|
|
1 |
import os
|
2 |
import random
|
3 |
import re
|
4 |
+
from string import Template
|
5 |
|
6 |
import gradio as gr
|
7 |
import pandas as pd
|
8 |
from datasets import Dataset
|
|
|
9 |
from huggingface_hub import HfApi
|
10 |
+
from pypdf import PdfReader
|
11 |
|
|
|
|
|
12 |
|
13 |
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
|
14 |
to_be_replaced = {
|
|
|
56 |
# Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
|
57 |
text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
|
58 |
|
59 |
+
# Make sure that there is no space before a comma, a period, or a hyphen
|
60 |
text = text.replace(" ,", ",")
|
61 |
text = text.replace(" .", ".")
|
62 |
text = text.replace(" -", "-")
|
|
|
88 |
instrctions = instructions_template.substitute(dataset_name=dataset_name)
|
89 |
preview = dataset["text"][:10]
|
90 |
preview = pd.DataFrame(preview, columns=["text"])
|
91 |
+
print(f"Dataset {dataset_name} uploaded successfully.")
|
92 |
return instrctions, preview, dataset_name
|
93 |
|
94 |
|
|
|
99 |
else:
|
100 |
user_id = "pdf2dataset"
|
101 |
if not user_id == "pdf2dataset":
|
102 |
+
print(f"Invalid namespace deteced in {dataset_name}.")
|
103 |
return f"❌ Invalid namespace deteced: {user_id}"
|
104 |
repo_id = f"{user_id}/{dataset_name}"
|
105 |
try:
|
106 |
api.delete_repo(repo_id, repo_type="dataset")
|
107 |
+
print(f"Dataset {dataset_name} deleted successfully.")
|
108 |
return "✅ Dataset deleted successfully."
|
109 |
except Exception as e:
|
110 |
+
print(f"Error deleting dataset{dataset_name}: {e}")
|
111 |
return f"❌ Error deleting dataset: {e}"
|
112 |
|
113 |
|
|
|
135 |
gr.Markdown(caution_text)
|
136 |
gr.Markdown("## 2️⃣ Convert the PDF and upload")
|
137 |
convert_button = gr.Button("🔄 Convert and upload")
|
138 |
+
preview = gr.Dataframe(
|
139 |
+
label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200
|
140 |
+
)
|
141 |
gr.Markdown("## 3️⃣ Use the dataset in your code")
|
142 |
instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
|
143 |
gr.Markdown("## 4️⃣ Delete the (optional)")
|