pszemraj committed on
Commit
77d5469
•
1 Parent(s): 716199b

♻️ 🔊

Browse files

Signed-off-by: peter szemraj <[email protected]>

Files changed (2) hide show
  1. summarize.py +12 -6
  2. utils.py +24 -21
summarize.py CHANGED
@@ -1,4 +1,8 @@
 
 
 
1
  import logging
 
2
 
3
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
4
 
@@ -98,6 +102,7 @@ def summarize_via_tokenbatches(
98
  tokenizer,
99
  batch_length=2048,
100
  batch_stride=16,
 
101
  **kwargs,
102
  ) -> list:
103
  """
@@ -116,14 +121,15 @@ def summarize_via_tokenbatches(
116
 
117
  logger = logging.getLogger(__name__)
118
  # log all input parameters
119
- if batch_length < 512:
120
- batch_length = 512
121
  logger.warning(
122
- f"batch_length must be at least 512. Setting batch_length to {batch_length}"
123
  )
124
- logger.info(
125
- f"input parameters: {kwargs}, batch_length={batch_length}, batch_stride={batch_stride}"
126
- )
 
 
127
  encoded_input = tokenizer(
128
  input_text,
129
  padding="max_length",
 
1
+ """
2
+ summarize - a module for summarizing text using a model from the Hugging Face model hub
3
+ """
4
  import logging
5
+ import pprint as pp
6
 
7
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
8
 
 
102
  tokenizer,
103
  batch_length=2048,
104
  batch_stride=16,
105
+ min_batch_length=512,
106
  **kwargs,
107
  ) -> list:
108
  """
 
121
 
122
  logger = logging.getLogger(__name__)
123
  # log all input parameters
124
+ if batch_length < min_batch_length:
 
125
  logger.warning(
126
+ f"batch_length must be at least {min_batch_length}. Setting batch_length to {min_batch_length}"
127
  )
128
+ batch_length = min_batch_length
129
+
130
+ logger.info(f"input parameters:\n{pp.pformat(kwargs)}")
131
+ logger.info(f"batch_length: {batch_length}, batch_stride: {batch_stride}")
132
+
133
  encoded_input = tokenizer(
134
  input_text,
135
  padding="max_length",
utils.py CHANGED
@@ -2,26 +2,27 @@
2
  utils.py - Utility functions for the project.
3
  """
4
 
 
5
  import re
6
  import subprocess
7
- from collections import defaultdict
8
  from datetime import datetime
9
- from itertools import combinations
10
  from pathlib import Path
11
  from typing import List
12
 
13
- import nltk
 
 
 
14
  import torch
15
  from natsort import natsorted
16
- from nltk.corpus import stopwords
17
- from nltk.tokenize import sent_tokenize, word_tokenize
18
  from rapidfuzz import fuzz
19
 
20
- import re
21
- from typing import List
22
- from itertools import islice
23
- from collections import defaultdict, deque
24
- from rapidfuzz import fuzz
25
 
26
 
27
  def validate_pytorch2(torch_version: str = None):
@@ -32,13 +33,17 @@ def validate_pytorch2(torch_version: str = None):
32
  return True if re.match(pattern, torch_version) else False
33
 
34
 
35
- def get_timestamp() -> str:
36
  """
37
  get_timestamp - get a timestamp for the current time
38
  Returns:
39
  str, the timestamp
40
  """
41
- return datetime.now().strftime("%Y%m%d_%H%M%S")
 
 
 
 
42
 
43
 
44
  def truncate_word_count(text, max_words=512):
@@ -115,16 +120,12 @@ def extract_keywords(
115
  Returns:
116
  A list of strings, where each string is a keyword extracted from the input text.
117
  """
118
- # Define stopwords
119
- stop_words = set(
120
- "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
121
- )
122
-
123
  # Remove stopwords and tokenize the text into words
124
  words = [
125
  word
126
  for word in re.findall(r"\b\w{3,}\b", text.lower())
127
- if word not in stop_words
128
  ]
129
 
130
  # Create a graph of word co-occurrences within a moving window of words
@@ -149,13 +150,13 @@ def extract_keywords(
149
 
150
  # Sort the words by score and return the top num_keywords keywords
151
  keywords = sorted(scores, key=scores.get, reverse=True)[:num_keywords]
152
-
153
  # Use fuzzy matching to remove similar keywords
154
  final_keywords = []
155
  for keyword in keywords:
156
  if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
157
  final_keywords.append(keyword)
158
-
159
  return final_keywords
160
 
161
 
@@ -170,18 +171,20 @@ def saves_summary(
170
  add_signature: whether to add a signature to the output file
171
  kwargs: additional keyword arguments to include in the output file
172
  """
 
173
  sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
174
  sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
175
  scores_text = "\n".join(sum_scores)
176
  full_summary = "\n".join(sum_text)
177
 
178
  keywords = "_".join(extract_keywords(full_summary))
 
179
  outpath = (
180
  Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
181
  if outpath is None
182
  else Path(outpath)
183
  )
184
-
185
  with open(
186
  outpath,
187
  "w",
 
2
  utils.py - Utility functions for the project.
3
  """
4
 
5
+ import logging
6
  import re
7
  import subprocess
8
+ from collections import defaultdict, deque
9
  from datetime import datetime
10
+ from itertools import combinations, islice
11
  from pathlib import Path
12
  from typing import List
13
 
14
+ logging.basicConfig(
15
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
16
+ level=logging.INFO,
17
+ )
18
  import torch
19
  from natsort import natsorted
 
 
20
  from rapidfuzz import fuzz
21
 
22
+ # Define stopwords
23
+ STOPWORDS = set(
24
+ "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
25
+ )
 
26
 
27
 
28
  def validate_pytorch2(torch_version: str = None):
 
33
  return True if re.match(pattern, torch_version) else False
34
 
35
 
36
+ def get_timestamp(detailed=False) -> str:
37
  """
38
  get_timestamp - get a timestamp for the current time
39
  Returns:
40
  str, the timestamp
41
  """
42
+ return (
43
+ datetime.now().strftime("%b%d%Y_%H%M%S%f")
44
+ if detailed
45
+ else datetime.now().strftime("%b%d%Y_%H")
46
+ )
47
 
48
 
49
  def truncate_word_count(text, max_words=512):
 
120
  Returns:
121
  A list of strings, where each string is a keyword extracted from the input text.
122
  """
123
+ logger = logging.getLogger(__name__)
 
 
 
 
124
  # Remove stopwords and tokenize the text into words
125
  words = [
126
  word
127
  for word in re.findall(r"\b\w{3,}\b", text.lower())
128
+ if word not in STOPWORDS
129
  ]
130
 
131
  # Create a graph of word co-occurrences within a moving window of words
 
150
 
151
  # Sort the words by score and return the top num_keywords keywords
152
  keywords = sorted(scores, key=scores.get, reverse=True)[:num_keywords]
153
+ logger.debug(f"All keywords: {keywords}")
154
  # Use fuzzy matching to remove similar keywords
155
  final_keywords = []
156
  for keyword in keywords:
157
  if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
158
  final_keywords.append(keyword)
159
+ logger.info(f"Keywords (final):\t{final_keywords}")
160
  return final_keywords
161
 
162
 
 
171
  add_signature: whether to add a signature to the output file
172
  kwargs: additional keyword arguments to include in the output file
173
  """
174
+ logger = logging.getLogger(__name__)
175
  sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
176
  sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
177
  scores_text = "\n".join(sum_scores)
178
  full_summary = "\n".join(sum_text)
179
 
180
  keywords = "_".join(extract_keywords(full_summary))
181
+ logger.info(f"kw:\t{keywords}")
182
  outpath = (
183
  Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
184
  if outpath is None
185
  else Path(outpath)
186
  )
187
+ logger.info(f"Saving summary to:\t{outpath.name}")
188
  with open(
189
  outpath,
190
  "w",