nickmuchi committed on
Commit
9a368e2
1 Parent(s): 3cf0631

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +7 -1
functions.py CHANGED
@@ -106,10 +106,16 @@ def preprocess_plain_text(text,window_size=3):
106
  return passages
107
 
108
  @st.experimental_memo(suppress_st_warning=True)
109
- def chunk_clean_text(text):
110
 
111
  """Chunk text longer than 500 tokens"""
112
 
 
 
 
 
 
 
113
  article = nlp(text)
114
  sentences = [i.text for i in list(article.sents)]
115
 
 
106
  return passages
107
 
108
  @st.experimental_memo(suppress_st_warning=True)
109
+ def chunk_and_process_text(text):
110
 
111
  """Chunk text longer than 500 tokens"""
112
 
113
+ text = text.encode("ascii", "ignore").decode() # unicode
114
+ text = re.sub(r"https*\S+", " ", text) # url
115
+ text = re.sub(r"@\S+", " ", text) # mentions
116
+ text = re.sub(r"#\S+", " ", text) # hashtags
117
+ text = re.sub(r"\s{2,}", " ", text) # over spaces
118
+
119
  article = nlp(text)
120
  sentences = [i.text for i in list(article.sents)]
121