patrickvonplaten committed on
Commit
7434749
1 Parent(s): 14aa4ce
Files changed (2) hide show
  1. app.py +19 -23
  2. requirements.txt +1 -1
app.py CHANGED
@@ -5,19 +5,15 @@ import shutil
5
  import os
6
  import tqdm
7
  from huggingface_hub import snapshot_download
8
- from huggingface_hub import HfApi, login
9
  import tempfile
10
  import re
11
- import pdfminer
12
  import time
13
 
14
- print("pdfminer", pdfminer.__version__)
15
- print("pandoc", pypandoc.__version__)
16
-
17
  HF_TOKEN = os.environ.get("HF_TOKEN")
18
 
19
  api = HfApi()
20
- login(HF_TOKEN)
21
 
22
 
23
  #from docx import Document
@@ -50,7 +46,7 @@ def retrieve_lines(filename):
50
  extension = filename.split(".")[-1]
51
 
52
  if extension == "pdf":
53
- text = pdfminer.high_level.extract_text(filename)
54
  lines = text.split("\n")
55
  elif extension in ["docx", "doc"]:
56
  with tempfile.TemporaryDirectory() as tmpdirname:
@@ -98,7 +94,7 @@ def main(filename, codewords_mapping):
98
  out += 25 * "="
99
  out += "\n\n"
100
 
101
- out += f"## Source: {filename}\n"
102
  out += 25 * "-"
103
  out += "\n"
104
  out += "\n".join([f'-{v}' for k,v in match.items()])
@@ -123,31 +119,31 @@ def convert(*keywords):
123
  os.makedirs(RESULTS_FOLDER)
124
 
125
  result_files = []
126
- for folder in tqdm.tqdm(glob.glob(os.path.join(DOC_FOLDER, "*"))):
127
- all_files = tqdm.tqdm(glob.glob(f"./{folder}/*"))
 
 
 
128
  num_files += len(all_files)
129
 
130
  for filename in all_files:
131
  try:
132
- result_files += main(filename)
133
  except Exception as e:
134
  print(f"{filename} not working because \n {e}")
135
 
136
- break
137
-
138
- print(f"Len {result_files}")
139
-
140
  result_files = list(set(result_files))
141
 
142
- for file in result_files:
143
- api.upload_file(
144
- path_or_fileobj=file,
145
- path_in_repo=file,
146
- repo_id="patrickvonplaten/atlas",
147
- repo_type="dataset",
148
- )
 
 
149
 
150
- return f"Done: {len(result_files)}"
151
 
152
  inputs = [gr.Textbox(label=f"Enter your keywords for {k}", max_lines=2, placeholder=CAT_TO_CODEWORDS[k], value=",".join(CAT_TO_CODEWORDS[k])) for k in CATEGORIES]
153
 
 
5
  import os
6
  import tqdm
7
  from huggingface_hub import snapshot_download
8
+ from huggingface_hub import HfApi
9
  import tempfile
10
  import re
11
+ from pdfminer.high_level import extract_text
12
  import time
13
 
 
 
 
14
  HF_TOKEN = os.environ.get("HF_TOKEN")
15
 
16
  api = HfApi()
 
17
 
18
 
19
  #from docx import Document
 
46
  extension = filename.split(".")[-1]
47
 
48
  if extension == "pdf":
49
+ text = extract_text(filename)
50
  lines = text.split("\n")
51
  elif extension in ["docx", "doc"]:
52
  with tempfile.TemporaryDirectory() as tmpdirname:
 
94
  out += 25 * "="
95
  out += "\n\n"
96
 
97
+ out += f"## Source: {'/'.join(filename.split('/')[-2:])}\n"
98
  out += 25 * "-"
99
  out += "\n"
100
  out += "\n".join([f'-{v}' for k,v in match.items()])
 
119
  os.makedirs(RESULTS_FOLDER)
120
 
121
  result_files = []
122
+ folders = glob.glob(os.path.join(DOC_FOLDER, "*"))
123
+
124
+
125
+ for folder in tqdm.tqdm(folders):
126
+ all_files = tqdm.tqdm(glob.glob(f"{folder}/*"))
127
  num_files += len(all_files)
128
 
129
  for filename in all_files:
130
  try:
131
+ result_files += main(filename, codewords_mapping)
132
  except Exception as e:
133
  print(f"{filename} not working because \n {e}")
134
 
 
 
 
 
135
  result_files = list(set(result_files))
136
 
137
+ api.upload_folder(
138
+ repo_id="patrickvonplaten/atlas",
139
+ folder_path=RESULTS_FOLDER,
140
+ path_in_repo=f"results_{time.time()}",
141
+ repo_type="dataset",
142
+ token=HF_TOKEN,
143
+ )
144
+
145
+ return f"Done. Processed {len(result_files)} files."
146
 
 
147
 
148
  inputs = [gr.Textbox(label=f"Enter your keywords for {k}", max_lines=2, placeholder=CAT_TO_CODEWORDS[k], value=",".join(CAT_TO_CODEWORDS[k])) for k in CATEGORIES]
149
 
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
  pypandoc
2
- pdfminer
 
1
  pypandoc
2
+ pdfminer.six