♻️
Signed-off-by: peter szemraj <[email protected]>
- summarize.py +12 -6
- utils.py +24 -21
summarize.py
CHANGED
@@ -1,4 +1,8 @@
+"""
+summarize - a module for summarizing text using a model from the Hugging Face model hub
+"""
 import logging
+import pprint as pp
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
 
@@ -98,6 +102,7 @@ def summarize_via_tokenbatches(
     tokenizer,
     batch_length=2048,
     batch_stride=16,
+    min_batch_length=512,
     **kwargs,
 ) -> list:
     """
@@ -116,14 +121,15 @@ def summarize_via_tokenbatches(
 
     logger = logging.getLogger(__name__)
    # log all input parameters
-    if batch_length < 512:
-        batch_length = 512
+    if batch_length < min_batch_length:
         logger.warning(
-            f"batch_length must be at least 512. Setting batch_length to 512"
+            f"batch_length must be at least {min_batch_length}. Setting batch_length to {min_batch_length}"
         )
-
-
-    )
+        batch_length = min_batch_length
+
+    logger.info(f"input parameters:\n{pp.pformat(kwargs)}")
+    logger.info(f"batch_length: {batch_length}, batch_stride: {batch_stride}")
+
     encoded_input = tokenizer(
         input_text,
         padding="max_length",
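For reference, here is a minimal runnable sketch of the guard these hunks add to summarize_via_tokenbatches. The clamp_batch_length helper is hypothetical (the Space inlines this check inside the function); the default of 512 and the warning text come from the diff.

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)


def clamp_batch_length(batch_length: int, min_batch_length: int = 512) -> int:
    # hypothetical helper mirroring the inline guard added above
    if batch_length < min_batch_length:
        logger.warning(
            f"batch_length must be at least {min_batch_length}. Setting batch_length to {min_batch_length}"
        )
        batch_length = min_batch_length
    return batch_length


print(clamp_batch_length(256))   # warns, then prints 512
print(clamp_batch_length(4096))  # within bounds, prints 4096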
utils.py
CHANGED
@@ -2,26 +2,27 @@
 utils.py - Utility functions for the project.
 """
 
+import logging
 import re
 import subprocess
-from collections import defaultdict
+from collections import defaultdict, deque
 from datetime import datetime
-from itertools import combinations
+from itertools import combinations, islice
 from pathlib import Path
 from typing import List
 
-
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    level=logging.INFO,
+)
 import torch
 from natsort import natsorted
-from nltk.corpus import stopwords
-from nltk.tokenize import sent_tokenize, word_tokenize
 from rapidfuzz import fuzz
 
-
-
-from
-
-from rapidfuzz import fuzz
+# Define stopwords
+STOPWORDS = set(
+    "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
+)
 
 
 def validate_pytorch2(torch_version: str = None):
@@ -32,13 +33,17 @@ def validate_pytorch2(torch_version: str = None):
     return True if re.match(pattern, torch_version) else False
 
 
-def get_timestamp() -> str:
+def get_timestamp(detailed=False) -> str:
     """
     get_timestamp - get a timestamp for the current time
     Returns:
         str, the timestamp
     """
-    return
+    return (
+        datetime.now().strftime("%b%d%Y_%H%M%S%f")
+        if detailed
+        else datetime.now().strftime("%b%d%Y_%H")
+    )
 
 
 def truncate_word_count(text, max_words=512):
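The reworked get_timestamp picks between two strftime formats; a quick illustration (actual output depends on the current time):

from datetime import datetime

# default (detailed=False): hour-level stamp, e.g. "Jul042023_16"
print(datetime.now().strftime("%b%d%Y_%H"))
# detailed=True: adds minutes, seconds, and microseconds, e.g. "Jul042023_163027123456"
print(datetime.now().strftime("%b%d%Y_%H%M%S%f"))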
@@ -115,16 +120,12 @@ def extract_keywords(
     Returns:
         A list of strings, where each string is a keyword extracted from the input text.
     """
-
-    stop_words = set(
-        "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
-    )
-
+    logger = logging.getLogger(__name__)
     # Remove stopwords and tokenize the text into words
     words = [
         word
         for word in re.findall(r"\b\w{3,}\b", text.lower())
-        if word not in stop_words
+        if word not in STOPWORDS
     ]
 
     # Create a graph of word co-occurrences within a moving window of words
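A small sketch of the tokenize-and-filter step above, using an abbreviated stand-in for the new module-level STOPWORDS (the diff defines the full list):

import re

# abbreviated stand-in for the STOPWORDS set defined in utils.py
STOPWORDS = set("a about the and of is are".split())

text = "The model produces a summary of the input text."
# keep words of 3+ characters that are not stopwords, as in extract_keywords
words = [
    word
    for word in re.findall(r"\b\w{3,}\b", text.lower())
    if word not in STOPWORDS
]
print(words)  # ['model', 'produces', 'summary', 'input', 'text']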
@@ -149,13 +150,13 @@ def extract_keywords(
 
     # Sort the words by score and return the top num_keywords keywords
     keywords = sorted(scores, key=scores.get, reverse=True)[:num_keywords]
-
+    logger.debug(f"All keywords: {keywords}")
     # Use fuzzy matching to remove similar keywords
     final_keywords = []
     for keyword in keywords:
         if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
             final_keywords.append(keyword)
-
+    logger.info(f"Keywords (final):\t{final_keywords}")
     return final_keywords
 
 
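The fuzzy de-duplication pass in isolation, with the same rapidfuzz threshold of 70 used above; the sample keywords are made up:

from rapidfuzz import fuzz

keywords = ["summarization", "summarize", "tokenizer", "token", "model"]
final_keywords = []
for keyword in keywords:
    # drop a keyword when it is more than 70% similar to one already kept
    if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
        final_keywords.append(keyword)
print(final_keywords)  # ['summarization', 'tokenizer', 'model']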
@@ -170,18 +171,20 @@ def saves_summary(
         add_signature: whether to add a signature to the output file
         kwargs: additional keyword arguments to include in the output file
     """
+    logger = logging.getLogger(__name__)
     sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
     sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
     scores_text = "\n".join(sum_scores)
     full_summary = "\n".join(sum_text)
 
     keywords = "_".join(extract_keywords(full_summary))
+    logger.info(f"kw:\t{keywords}")
     outpath = (
         Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
         if outpath is None
         else Path(outpath)
     )
-
+    logger.info(f"Saving summary to:\t{outpath.name}")
     with open(
         outpath,
         "w",
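A hypothetical call to saves_summary, assuming utils.py is importable and the default outpath=None, so the filename is built from get_timestamp() and the extracted keywords:

from utils import saves_summary

# minimal stand-in for real summarizer output, matching the structure saves_summary reads
summarize_output = [
    {"summary": ["A short example summary."], "summary_score": 0.1234},
]
saves_summary(summarize_output)
# writes ./document_summary_<timestamp>_<keywords>.txt and logs the target filename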
|