pszemraj committed on
Commit
73feb19
·
1 Parent(s): 77d5469

✨ easily customize app

Browse files

Signed-off-by: peter szemraj <[email protected]>

Files changed (1) hide show
  1. app.py +34 -9
app.py CHANGED
@@ -3,6 +3,13 @@ app.py - the main module for the gradio app
3
 
4
  Usage:
5
  python app.py
 
 
 
 
 
 
 
6
  """
7
  import contextlib
8
  import gc
@@ -14,9 +21,7 @@ import time
14
  from pathlib import Path
15
 
16
  os.environ["USE_TORCH"] = "1"
17
- os.environ[
18
- "TOKENIZERS_PARALLELISM"
19
- ] = "false" # parallelism on tokenizers is buggy with gradio
20
 
21
  logging.basicConfig(
22
  level=logging.INFO,
@@ -48,6 +53,10 @@ MODEL_OPTIONS = [
48
  "pszemraj/pegasus-x-large-book-summary",
49
  ] # models users can choose from
50
 
 
 
 
 
51
 
52
  def predict(
53
  input_text: str,
@@ -105,7 +114,11 @@ def proc_submission(
105
  length_penalty (float): the length penalty to use
106
  repetition_penalty (float): the repetition penalty to use
107
  no_repeat_ngram_size (int): the no repeat ngram size to use
108
- max_input_length (int, optional): the maximum input length to use. Defaults to 2048.
 
 
 
 
109
 
110
  Returns:
111
  str in HTML format, string of the summary, str of score
@@ -122,6 +135,9 @@ def proc_submission(
122
  "early_stopping": True,
123
  "do_sample": False,
124
  }
 
 
 
125
  st = time.perf_counter()
126
  history = {}
127
  clean_text = clean(input_text, lower=False)
@@ -186,7 +202,7 @@ def proc_submission(
186
 
187
  # save to file
188
  settings["model_name"] = model_name
189
- saved_file = saves_summary(_summaries, **settings)
190
 
191
  return html, sum_text_out, scores_out, saved_file
192
 
@@ -211,6 +227,8 @@ def load_single_example_text(
211
  text = clean(raw_text, lower=False)
212
  elif full_ex_path.suffix == ".pdf":
213
  logging.info(f"Loading PDF file {full_ex_path}")
 
 
214
  conversion_stats = convert_PDF_to_Text(
215
  full_ex_path,
216
  ocr_model=ocr_model,
@@ -241,12 +259,14 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
241
  file_path = Path(file_obj.name)
242
  try:
243
  logger.info(f"Loading file:\t{file_path}")
244
- if file_path.suffix == ".txt":
245
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
246
  raw_text = f.read()
247
  text = clean(raw_text, lower=lower)
248
  elif file_path.suffix == ".pdf":
249
  logger.info(f"loading as PDF file {file_path}")
 
 
250
  conversion_stats = convert_PDF_to_Text(
251
  file_path,
252
  ocr_model=ocr_model,
@@ -254,8 +274,8 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
254
  )
255
  text = conversion_stats["converted_text"]
256
  else:
257
- logger.error(f"Unknown file type {file_path.suffix}")
258
- text = "ERROR - check file - unknown file type"
259
 
260
  return text
261
  except Exception as e:
@@ -276,7 +296,8 @@ if __name__ == "__main__":
276
  )
277
  name_to_path = load_example_filenames(_here / "examples")
278
  logger.info(f"Loaded {len(name_to_path)} examples")
279
- demo = gr.Blocks()
 
280
  _examples = list(name_to_path.keys())
281
  with demo:
282
  gr.Markdown("# Document Summarization with Long-Document Transformers")
@@ -318,6 +339,7 @@ if __name__ == "__main__":
318
  with gr.Row():
319
  input_text = gr.Textbox(
320
  lines=4,
 
321
  label="Input Text (for summarization)",
322
  placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
323
  )
@@ -389,6 +411,9 @@ if __name__ == "__main__":
389
  gr.Markdown(
390
  "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
391
  )
 
 
 
392
  gr.Markdown("---")
393
 
394
  load_examples_button.click(
 
3
 
4
  Usage:
5
  python app.py
6
+
7
+ Environment Variables:
8
+ USE_TORCH (str): whether to use torch (1) or not (0)
9
+ TOKENIZERS_PARALLELISM (str): whether to use parallelism (true) or not (false)
10
+ Optional Environment Variables:
11
+ APP_MAX_WORDS (int): the maximum number of words to use for summarization
12
+ APP_OCR_MAX_PAGES (int): the maximum number of pages to use for OCR
13
  """
14
  import contextlib
15
  import gc
 
21
  from pathlib import Path
22
 
23
  os.environ["USE_TORCH"] = "1"
24
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
25
 
26
  logging.basicConfig(
27
  level=logging.INFO,
 
53
  "pszemraj/pegasus-x-large-book-summary",
54
  ] # models users can choose from
55
 
56
+ # if duplicating this space, uncomment these lines to adjust the max words and max OCR pages
57
+ # os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
58
+ # os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
59
+
60
 
61
  def predict(
62
  input_text: str,
 
114
  length_penalty (float): the length penalty to use
115
  repetition_penalty (float): the repetition penalty to use
116
  no_repeat_ngram_size (int): the no repeat ngram size to use
117
+ max_input_length (int, optional): the maximum input length to use. Defaults to 4096.
118
+
119
+ Note:
120
+ the max_input_length is set to 4096 by default, but can be changed by setting the
121
+ environment variable APP_MAX_WORDS to a different value.
122
 
123
  Returns:
124
  str in HTML format, string of the summary, str of score
 
135
  "early_stopping": True,
136
  "do_sample": False,
137
  }
138
+ max_input_length = int(os.environ.get("APP_MAX_WORDS", max_input_length))
139
+ logging.info(f"max_input_length set to: {max_input_length}")
140
+
141
  st = time.perf_counter()
142
  history = {}
143
  clean_text = clean(input_text, lower=False)
 
202
 
203
  # save to file
204
  settings["model_name"] = model_name
205
+ saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
206
 
207
  return html, sum_text_out, scores_out, saved_file
208
 
 
227
  text = clean(raw_text, lower=False)
228
  elif full_ex_path.suffix == ".pdf":
229
  logging.info(f"Loading PDF file {full_ex_path}")
230
+ max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
231
+ logging.info(f"max_pages set to: {max_pages}")
232
  conversion_stats = convert_PDF_to_Text(
233
  full_ex_path,
234
  ocr_model=ocr_model,
 
259
  file_path = Path(file_obj.name)
260
  try:
261
  logger.info(f"Loading file:\t{file_path}")
262
+ if file_path.suffix in [".txt", ".md"]:
263
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
264
  raw_text = f.read()
265
  text = clean(raw_text, lower=lower)
266
  elif file_path.suffix == ".pdf":
267
  logger.info(f"loading as PDF file {file_path}")
268
+ max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
269
+ logger.info(f"max_pages set to: {max_pages}")
270
  conversion_stats = convert_PDF_to_Text(
271
  file_path,
272
  ocr_model=ocr_model,
 
274
  )
275
  text = conversion_stats["converted_text"]
276
  else:
277
+ logger.error(f"Unknown file type:\t{file_path.suffix}")
278
+ text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
279
 
280
  return text
281
  except Exception as e:
 
296
  )
297
  name_to_path = load_example_filenames(_here / "examples")
298
  logger.info(f"Loaded {len(name_to_path)} examples")
299
+
300
+ demo = gr.Blocks(title="Document Summarization with Long-Document Transformers")
301
  _examples = list(name_to_path.keys())
302
  with demo:
303
  gr.Markdown("# Document Summarization with Long-Document Transformers")
 
339
  with gr.Row():
340
  input_text = gr.Textbox(
341
  lines=4,
342
+ max_lines=12,
343
  label="Input Text (for summarization)",
344
  placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
345
  )
 
411
  gr.Markdown(
412
  "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
413
  )
414
+ gr.Markdown(
415
+ "Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://huggingface.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
416
+ )
417
  gr.Markdown("---")
418
 
419
  load_examples_button.click(