pszemraj committed on
Commit
b8e1b99
·
1 Parent(s): 0fccfa2

✨ enable new checkpoints

Browse files

Signed-off-by: peter szemraj <[email protected]>

Files changed (1) hide show
  1. app.py +55 -36
app.py CHANGED
@@ -1,33 +1,70 @@
1
- import os
2
  import contextlib
3
  import logging
 
4
  import random
5
  import re
6
  import time
7
  from pathlib import Path
8
 
 
 
 
 
 
 
 
 
 
9
  import gradio as gr
10
  import nltk
 
11
  from cleantext import clean
12
  from doctr.io import DocumentFile
13
  from doctr.models import ocr_predictor
14
- from pdf2text import convert_PDF_to_Text
15
 
 
16
  from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
17
- from utils import load_example_filenames, truncate_word_count, saves_summary
18
 
19
  _here = Path(__file__).parent
20
 
21
  nltk.download("stopwords") # TODO=find where this requirement originates from
22
 
23
- logging.basicConfig(
24
- level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
25
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def proc_submission(
29
  input_text: str,
30
- model_size: str,
31
  num_beams,
32
  token_batch_length,
33
  length_penalty,
@@ -40,7 +77,7 @@ def proc_submission(
40
 
41
  Args:
42
  input_text (str): the input text to summarize
43
- model_size (str): the size of the model to use
44
  num_beams (int): the number of beams to use
45
  token_batch_length (int): the length of the token batches to use
46
  length_penalty (float): the length penalty to use
@@ -66,7 +103,7 @@ def proc_submission(
66
  st = time.perf_counter()
67
  history = {}
68
  clean_text = clean(input_text, lower=False)
69
- max_input_length = 2048 if "base" in model_size.lower() else max_input_length
70
  processed = truncate_word_count(clean_text, max_input_length)
71
 
72
  if processed["was_truncated"]:
@@ -100,14 +137,13 @@ def proc_submission(
100
 
101
  return msg, "", []
102
 
103
- _summaries = summarize_via_tokenbatches(
104
- tr_in,
105
- model_sm if "base" in model_size.lower() else model,
106
- tokenizer_sm if "base" in model_size.lower() else tokenizer,
107
- batch_length=token_batch_length,
108
  **settings,
109
  )
110
- sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
111
  sum_scores = [
112
  f" - Section {i}: {round(s['summary_score'],4)}"
113
  for i, s in enumerate(_summaries)
@@ -204,18 +240,6 @@ def load_uploaded_file(file_obj, max_pages=20):
204
 
205
  if __name__ == "__main__":
206
  logging.info("Starting app instance")
207
- os.environ[
208
- "TOKENIZERS_PARALLELISM"
209
- ] = "false" # parallelism on tokenizers is buggy with gradio
210
- logging.info("Loading summ models")
211
- with contextlib.redirect_stdout(None):
212
- model, tokenizer = load_model_and_tokenizer(
213
- "pszemraj/pegasus-x-large-book-summary"
214
- )
215
- model_sm, tokenizer_sm = load_model_and_tokenizer(
216
- "pszemraj/long-t5-tglobal-base-16384-book-summary"
217
- )
218
-
219
  logging.info("Loading OCR model")
220
  with contextlib.redirect_stdout(None):
221
  ocr_model = ocr_predictor(
@@ -229,24 +253,19 @@ if __name__ == "__main__":
229
  demo = gr.Blocks()
230
  _examples = list(name_to_path.keys())
231
  with demo:
232
-
233
  gr.Markdown("# Document Summarization with Long-Document Transformers")
234
  gr.Markdown(
235
  "This is an example use case for fine-tuned long document transformers. The model is trained on book summaries (via the BookSum dataset). The models in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
236
  )
237
  with gr.Column():
238
-
239
  gr.Markdown("## Load Inputs & Select Parameters")
240
  gr.Markdown(
241
  "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file. (`.txt` or `.pdf` - _[link to guide](https://i.imgur.com/c6Cs9ly.png)_)"
242
  )
243
  with gr.Row(variant="compact"):
244
  with gr.Column(scale=0.5, variant="compact"):
245
-
246
- model_size = gr.Radio(
247
- choices=["LongT5-base", "Pegasus-X-large"],
248
- label="Model Variant",
249
- value="LongT5-base",
250
  )
251
  num_beams = gr.Radio(
252
  choices=[2, 3, 4],
@@ -336,7 +355,7 @@ if __name__ == "__main__":
336
  value=3,
337
  )
338
  with gr.Column():
339
- gr.Markdown("### About the Model")
340
  gr.Markdown(
341
  "These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
342
  )
@@ -354,7 +373,7 @@ if __name__ == "__main__":
354
  fn=proc_submission,
355
  inputs=[
356
  input_text,
357
- model_size,
358
  num_beams,
359
  token_batch_length,
360
  length_penalty,
 
 
1
  import contextlib
2
  import logging
3
+ import os
4
  import random
5
  import re
6
  import time
7
  from pathlib import Path
8
 
9
+ os.environ["USE_TORCH"] = "1"
10
+ os.environ[
11
+ "TOKENIZERS_PARALLELISM"
12
+ ] = "false" # parallelism on tokenizers is buggy with gradio
13
+
14
+ logging.basicConfig(
15
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
16
+ )
17
+
18
  import gradio as gr
19
  import nltk
20
+ import torch
21
  from cleantext import clean
22
  from doctr.io import DocumentFile
23
  from doctr.models import ocr_predictor
 
24
 
25
+ from pdf2text import convert_PDF_to_Text
26
  from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
27
+ from utils import load_example_filenames, saves_summary, truncate_word_count
28
 
29
  _here = Path(__file__).parent
30
 
31
  nltk.download("stopwords") # TODO=find where this requirement originates from
32
 
33
+
34
+ MODEL_OPTIONS = [
35
+ "pszemraj/long-t5-tglobal-base-16384-book-summary",
36
+ "pszemraj/long-t5-tglobal-base-sci-simplify",
37
+ "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
38
+ "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
39
+ "pszemraj/pegasus-x-large-book-summary",
40
+ ]
41
+
42
+
43
+ def predict(
44
+ input_text: str,
45
+ model_name: str,
46
+ token_batch_length: int = 1024,
47
+ empty_cache: bool = True,
48
+ **settings,
49
+ ):
50
+ """helper fn to support multiple models at once"""
51
+ if torch.cuda.is_available() and empty_cache:
52
+ torch.cuda.empty_cache()
53
+
54
+ model, tokenizer = load_model_and_tokenizer(model_name)
55
+ summaries = summarize_via_tokenbatches(
56
+ input_text,
57
+ model,
58
+ tokenizer,
59
+ batch_length=token_batch_length,
60
+ **settings,
61
+ )
62
+ return summaries
63
 
64
 
65
  def proc_submission(
66
  input_text: str,
67
+ model_name: str,
68
  num_beams,
69
  token_batch_length,
70
  length_penalty,
 
77
 
78
  Args:
79
  input_text (str): the input text to summarize
80
+ model_name (str): the hf model tag of the model to use
81
  num_beams (int): the number of beams to use
82
  token_batch_length (int): the length of the token batches to use
83
  length_penalty (float): the length penalty to use
 
103
  st = time.perf_counter()
104
  history = {}
105
  clean_text = clean(input_text, lower=False)
106
+ max_input_length = 2048 if "base" in model_name.lower() else max_input_length
107
  processed = truncate_word_count(clean_text, max_input_length)
108
 
109
  if processed["was_truncated"]:
 
137
 
138
  return msg, "", []
139
 
140
+ _summaries = predict(
141
+ input_text=tr_in,
142
+ model_name=model_name,
143
+ token_batch_length=token_batch_length,
 
144
  **settings,
145
  )
146
+ sum_text = [f"Section {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries)]
147
  sum_scores = [
148
  f" - Section {i}: {round(s['summary_score'],4)}"
149
  for i, s in enumerate(_summaries)
 
240
 
241
  if __name__ == "__main__":
242
  logging.info("Starting app instance")
 
 
 
 
 
 
 
 
 
 
 
 
243
  logging.info("Loading OCR model")
244
  with contextlib.redirect_stdout(None):
245
  ocr_model = ocr_predictor(
 
253
  demo = gr.Blocks()
254
  _examples = list(name_to_path.keys())
255
  with demo:
 
256
  gr.Markdown("# Document Summarization with Long-Document Transformers")
257
  gr.Markdown(
258
  "This is an example use case for fine-tuned long document transformers. The model is trained on book summaries (via the BookSum dataset). The models in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
259
  )
260
  with gr.Column():
 
261
  gr.Markdown("## Load Inputs & Select Parameters")
262
  gr.Markdown(
263
  "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file. (`.txt` or `.pdf` - _[link to guide](https://i.imgur.com/c6Cs9ly.png)_)"
264
  )
265
  with gr.Row(variant="compact"):
266
  with gr.Column(scale=0.5, variant="compact"):
267
+ model_name = gr.Dropdown(
268
+ choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Model"
 
 
 
269
  )
270
  num_beams = gr.Radio(
271
  choices=[2, 3, 4],
 
355
  value=3,
356
  )
357
  with gr.Column():
358
+ gr.Markdown("### About")
359
  gr.Markdown(
360
  "These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
361
  )
 
373
  fn=proc_submission,
374
  inputs=[
375
  input_text,
376
+ model_name,
377
  num_beams,
378
  token_batch_length,
379
  length_penalty,