pszemraj committed on
Commit
62a2921
•
1 Parent(s): af3f9ae

🚧 add stopword removal fn

Browse files

Signed-off-by: peter szemraj <[email protected]>

Files changed (1) hide show
  1. utils.py +24 -0
utils.py CHANGED
@@ -15,8 +15,10 @@ logging.basicConfig(
15
  format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
16
  level=logging.INFO,
17
  )
 
18
  import torch
19
  from natsort import natsorted
 
20
  from rapidfuzz import fuzz
21
 
22
  # Define stopwords
@@ -25,6 +27,28 @@ STOPWORDS = set(
25
  )
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def remove_stagnant_files(
29
  freq: str = "hourly",
30
  search_path: str = ".",
 
15
  format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
16
  level=logging.INFO,
17
  )
18
+
19
  import torch
20
  from natsort import natsorted
21
+ from nltk.tokenize import word_tokenize
22
  from rapidfuzz import fuzz
23
 
24
  # Define stopwords
 
27
  )
28
 
29
 
30
def remove_stopwords(text: str, stopwords: list = STOPWORDS) -> str:
    """
    remove_stopwords - Remove stopwords (and stand-alone punctuation) from a string.

    The text is split with ``word_tokenize``; each token has leading/trailing
    punctuation stripped and is dropped when its lowercase form is in
    ``stopwords``. The remaining tokens are joined with single spaces.

    :param str text: text to remove stopwords from
    :param list stopwords: collection of lowercase stopwords to remove, defaults to
        STOPWORDS (any container supporting ``in`` works; a set gives the fastest lookup)
    :return str: text with stopwords removed
    """
    # Imported locally so the function does not rely on a module-level
    # `import string`, which is not visible in this part of the file.
    import string

    filtered_words = []

    for token in word_tokenize(text):
        word = token.strip(string.punctuation)  # remove surrounding punctuation

        # Tokens that were pure punctuation (",", ".", "!") strip down to "".
        # Skip them: appending "" would leave stray double/trailing spaces
        # in the joined result.
        if not word:
            continue

        if word.lower() not in stopwords:
            filtered_words.append(word)

    return " ".join(filtered_words)
50
+
51
+
52
  def remove_stagnant_files(
53
  freq: str = "hourly",
54
  search_path: str = ".",