thak123 committed on
Commit 7e51e2d
1 Parent(s): 6e501e3

Update app.py

Files changed (1)
  1. app.py +26 -23
app.py CHANGED
@@ -20,40 +20,43 @@ model.to(device)
 
 
 
-text_processor = TextPreProcessor(
-    # terms that will be normalized
-    normalize=['url', 'email', 'percent', 'money', 'phone', 'user'],
-    # terms that will be annotated
-    annotate={},
-    fix_html=True,  # fix HTML tokens
+# text_processor = TextPreProcessor(
+#     # terms that will be normalized
+#     normalize=['url', 'email', 'percent', 'money', 'phone', 'user'],
+#     # terms that will be annotated
+#     annotate={},
+#     fix_html=True,  # fix HTML tokens
 
-    # corpus from which the word statistics are going to be used
-    # for word segmentation
-    segmenter="twitter",
+#     # corpus from which the word statistics are going to be used
+#     # for word segmentation
+#     segmenter="twitter",
 
-    # corpus from which the word statistics are going to be used
-    # for spell correction
-    corrector="twitter",
+#     # corpus from which the word statistics are going to be used
+#     # for spell correction
+#     corrector="twitter",
 
-    unpack_hashtags=False,  # perform word segmentation on hashtags
-    unpack_contractions=False,  # Unpack contractions (can't -> can not)
-    spell_correct_elong=False,  # spell correction for elongated words
+#     unpack_hashtags=False,  # perform word segmentation on hashtags
+#     unpack_contractions=False,  # Unpack contractions (can't -> can not)
+#     spell_correct_elong=False,  # spell correction for elongated words
 
-    # select a tokenizer. You can use SocialTokenizer, or pass your own
-    # the tokenizer, should take as input a string and return a list of tokens
-    tokenizer=SocialTokenizer(lowercase=True).tokenize,
+#     # select a tokenizer. You can use SocialTokenizer, or pass your own
+#     # the tokenizer, should take as input a string and return a list of tokens
+#     tokenizer=SocialTokenizer(lowercase=True).tokenize,
 
-    # list of dictionaries, for replacing tokens extracted from the text,
-    # with other expressions. You can pass more than one dictionaries.
-    dicts=[]
-)
+#     # list of dictionaries, for replacing tokens extracted from the text,
+#     # with other expressions. You can pass more than one dictionaries.
+#     dicts=[]
+# )
 
 # T = tokenizer.TweetTokenizer(
 #     preserve_handles=True, preserve_hashes=True, preserve_case=False, preserve_url=False)
+social_tokenizer = SocialTokenizer(lowercase=True).tokenize
 
 def preprocess(text):
     # tokens = T.tokenize(text)
-    tokens = text_processor.pre_process_docs(text)
+    # tokens = text_processor.pre_process_docs(text)
+
+    tokens = social_tokenizer(text)
     print(tokens, file=sys.stderr)
     ptokens = []
     for index, token in enumerate(tokens):
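
For reference, a minimal standalone sketch of the new tokenization path, assuming the ekphrasis package (which provides SocialTokenizer and TextPreProcessor) is installed; the sample tweet and variable names are illustrative only:

from ekphrasis.classes.tokenizer import SocialTokenizer

# Build the tokenizer once, as the updated app.py does at module level.
social_tokenizer = SocialTokenizer(lowercase=True).tokenize

# Illustrative input; any short social-media string works.
sample = "@user loving the new model!! https://example.com #nlp"
tokens = social_tokenizer(sample)
print(tokens)  # a list of lowercased tokens, e.g. ['@user', 'loving', ...]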