thak123 committed on
Commit 7e51e2d
1 Parent(s): 6e501e3

Update app.py

Files changed (1)
  1. app.py +26 -23
app.py CHANGED
@@ -20,40 +20,43 @@ model.to(device)
 
 
 
-text_processor = TextPreProcessor(
-    # terms that will be normalized
-    normalize=['url', 'email', 'percent', 'money', 'phone', 'user'],
-    # terms that will be annotated
-    annotate={},
-    fix_html=True,  # fix HTML tokens
+# text_processor = TextPreProcessor(
+#     # terms that will be normalized
+#     normalize=['url', 'email', 'percent', 'money', 'phone', 'user'],
+#     # terms that will be annotated
+#     annotate={},
+#     fix_html=True,  # fix HTML tokens
 
-    # corpus from which the word statistics are going to be used
-    # for word segmentation
-    segmenter="twitter",
+#     # corpus from which the word statistics are going to be used
+#     # for word segmentation
+#     segmenter="twitter",
 
-    # corpus from which the word statistics are going to be used
-    # for spell correction
-    corrector="twitter",
+#     # corpus from which the word statistics are going to be used
+#     # for spell correction
+#     corrector="twitter",
 
-    unpack_hashtags=False,  # perform word segmentation on hashtags
-    unpack_contractions=False,  # Unpack contractions (can't -> can not)
-    spell_correct_elong=False,  # spell correction for elongated words
+#     unpack_hashtags=False,  # perform word segmentation on hashtags
+#     unpack_contractions=False,  # Unpack contractions (can't -> can not)
+#     spell_correct_elong=False,  # spell correction for elongated words
 
-    # select a tokenizer. You can use SocialTokenizer, or pass your own
-    # the tokenizer, should take as input a string and return a list of tokens
-    tokenizer=SocialTokenizer(lowercase=True).tokenize,
+#     # select a tokenizer. You can use SocialTokenizer, or pass your own
+#     # the tokenizer, should take as input a string and return a list of tokens
+#     tokenizer=SocialTokenizer(lowercase=True).tokenize,
 
-    # list of dictionaries, for replacing tokens extracted from the text,
-    # with other expressions. You can pass more than one dictionaries.
-    dicts=[]
-)
+#     # list of dictionaries, for replacing tokens extracted from the text,
+#     # with other expressions. You can pass more than one dictionaries.
+#     dicts=[]
+# )
 
 # T = tokenizer.TweetTokenizer(
 #     preserve_handles=True, preserve_hashes=True, preserve_case=False, preserve_url=False)
+social_tokenizer = SocialTokenizer(lowercase=True).tokenize
 
 def preprocess(text):
     # tokens = T.tokenize(text)
-    tokens = text_processor.pre_process_docs(text)
+    # tokens = text_processor.pre_process_docs(text)
+
+    tokens = social_tokenizer(text)
     print(tokens, file=sys.stderr)
     ptokens = []
     for index, token in enumerate(tokens):
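
For reference, a minimal standalone sketch of the new tokenization path, assuming the ekphrasis package (which provides SocialTokenizer and TextPreProcessor) is installed; the sample tweet and variable names are illustrative only:

from ekphrasis.classes.tokenizer import SocialTokenizer

# Build the tokenizer once, as the updated app.py does at module level.
social_tokenizer = SocialTokenizer(lowercase=True).tokenize

# Illustrative input; any short social-media string works.
sample = "@user loving the new model!! https://example.com #nlp"
tokens = social_tokenizer(sample)
print(tokens)  # a list of lowercased tokens, e.g. ['@user', 'loving', ...]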