Quentin Gallouédec committed on
Commit
399084d
1 Parent(s): ccba23d
Files changed (1) hide show
  1. app.py +10 -5
app.py CHANGED
@@ -1,15 +1,14 @@
1
  import os
2
  import random
3
  import re
 
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  from datasets import Dataset
8
- from pypdf import PdfReader
9
  from huggingface_hub import HfApi
 
10
 
11
- # import template
12
- from string import Template
13
 
14
  to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
15
  to_be_replaced = {
@@ -57,7 +56,7 @@ def clean(text):
57
  # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
58
  text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
59
 
60
- # Make sure that there is no space before a comma and a period
61
  text = text.replace(" ,", ",")
62
  text = text.replace(" .", ".")
63
  text = text.replace(" -", "-")
@@ -89,6 +88,7 @@ def pdf2dataset(file, progress=gr.Progress()):
89
  instrctions = instructions_template.substitute(dataset_name=dataset_name)
90
  preview = dataset["text"][:10]
91
  preview = pd.DataFrame(preview, columns=["text"])
 
92
  return instrctions, preview, dataset_name
93
 
94
 
@@ -99,12 +99,15 @@ def delete_dataset(dataset_name):
99
  else:
100
  user_id = "pdf2dataset"
101
  if not user_id == "pdf2dataset":
 
102
  return f"❌ Invalid namespace deteced: {user_id}"
103
  repo_id = f"{user_id}/{dataset_name}"
104
  try:
105
  api.delete_repo(repo_id, repo_type="dataset")
 
106
  return "✅ Dataset deleted successfully."
107
  except Exception as e:
 
108
  return f"❌ Error deleting dataset: {e}"
109
 
110
 
@@ -132,7 +135,9 @@ with gr.Blocks() as demo:
132
  gr.Markdown(caution_text)
133
  gr.Markdown("## 2️⃣ Convert the PDF and upload")
134
  convert_button = gr.Button("🔄 Convert and upload")
135
- preview = gr.Dataframe(label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200)
 
 
136
  gr.Markdown("## 3️⃣ Use the dataset in your code")
137
  instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
138
  gr.Markdown("## 4️⃣ Delete the (optional)")
 
1
  import os
2
  import random
3
  import re
4
+ from string import Template
5
 
6
  import gradio as gr
7
  import pandas as pd
8
  from datasets import Dataset
 
9
  from huggingface_hub import HfApi
10
+ from pypdf import PdfReader
11
 
 
 
12
 
13
  to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
14
  to_be_replaced = {
 
56
  # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
57
  text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
58
 
59
+ # Make sure that there is no space before a comma, a period, or a hyphen
60
  text = text.replace(" ,", ",")
61
  text = text.replace(" .", ".")
62
  text = text.replace(" -", "-")
 
88
  instrctions = instructions_template.substitute(dataset_name=dataset_name)
89
  preview = dataset["text"][:10]
90
  preview = pd.DataFrame(preview, columns=["text"])
91
+ print(f"Dataset {dataset_name} uploaded successfully.")
92
  return instrctions, preview, dataset_name
93
 
94
 
 
99
  else:
100
  user_id = "pdf2dataset"
101
  if not user_id == "pdf2dataset":
102
+ print(f"Invalid namespace deteced in {dataset_name}.")
103
  return f"❌ Invalid namespace deteced: {user_id}"
104
  repo_id = f"{user_id}/{dataset_name}"
105
  try:
106
  api.delete_repo(repo_id, repo_type="dataset")
107
+ print(f"Dataset {dataset_name} deleted successfully.")
108
  return "✅ Dataset deleted successfully."
109
  except Exception as e:
110
+ print(f"Error deleting dataset{dataset_name}: {e}")
111
  return f"❌ Error deleting dataset: {e}"
112
 
113
 
 
135
  gr.Markdown(caution_text)
136
  gr.Markdown("## 2️⃣ Convert the PDF and upload")
137
  convert_button = gr.Button("🔄 Convert and upload")
138
+ preview = gr.Dataframe(
139
+ label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200
140
+ )
141
  gr.Markdown("## 3️⃣ Use the dataset in your code")
142
  instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
143
  gr.Markdown("## 4️⃣ Delete the (optional)")