Upload 6 files
- lr_classifier_default.pkl +3 -0
- requirements.txt +7 -0
- rf_classifier_param.pkl +3 -0
- text_preprocessing.py +55 -0
- tfidf_vectorizer.pkl +3 -0
- utils_models.py +27 -0
lr_classifier_default.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9afdd0d293944860b9b83ce7667c220312e48269d2c123882f734545bb0a565d
+size 1607150
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+streamlit
+pandas
+scikit-learn==1.2.2
+stanza
+nltk
+transformers
+torch
rf_classifier_param.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10f6b3cfbd97c6867ac246e12c7ced59074b4a73bef553303839f3ed6019ee39
+size 9135666
text_preprocessing.py
ADDED
@@ -0,0 +1,55 @@
+from nltk.tokenize import TweetTokenizer
+import stanza
+import re
+
+tk = TweetTokenizer()
+uk_nlp = stanza.Pipeline(lang='uk', verbose=False)
+
+def substitute_user_mentions_and_links(text):
+    # Regular expression to match user mentions (e.g., @username)
+    user_mention_pattern = r'@\w+'
+
+    # Regular expression to match links (e.g., http://example.com)
+    link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+
+    # Substitute user mentions
+    text = re.sub(user_mention_pattern, '', text)
+
+    # Substitute links
+    text = re.sub(link_pattern, '', text)
+
+    # Substitute latin chars
+    text = re.sub(r'[a-zA-Z]+', '', text)
+
+    return text.lower()
+
+def remove_some_punc_numbers(text):
+    chars_to_remove = r'[\#\$\%\&\*\+\,\-\/\:\;\<\=\>\@\[\\\]\^\_\{\|\}\~\d\.\–]'
+
+    result = re.sub(chars_to_remove, '', ' '.join(text))
+
+    return result.lower()
+
+pattern = r'\b(\w+)\s*\'\s*(\w+)\b'
+
+# Define a function to join words separated by single quotes
+def join_words(match):
+    return match.group(1) + "'" + match.group(2)
+
+def lemmatize(text):
+    lemmas_st = []
+    for sent in uk_nlp(text).sentences:
+        for word in sent.words:
+            lemmas_st.append(word.lemma)
+    return lemmas_st
+
+def preprocess_text(input_text):
+
+    text_mod = substitute_user_mentions_and_links(input_text)
+    tokenized = tk.tokenize(text_mod)
+    spec_char_remv = remove_some_punc_numbers(tokenized)
+    apostrophe_fixed = re.sub(pattern, join_words, spec_char_remv)
+    spaces_fixed = re.sub(r'\s+', ' ', apostrophe_fixed)
+    lemmatized = lemmatize(spaces_fixed)
+
+    return text_mod, lemmatized
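
For reference, a minimal sketch of how the module above could be called (not part of this upload; it assumes the stanza Ukrainian model has been fetched once with stanza.download('uk'), and the example tweet is invented):

import stanza

stanza.download('uk')  # one-time model download; text_preprocessing builds the 'uk' pipeline at import time

from text_preprocessing import preprocess_text

raw = "@user Дуже 'цікава' стаття, дякую! http://example.com"  # "a very 'interesting' article, thanks"
cleaned, lemmas = preprocess_text(raw)
print(cleaned)  # mentions, links and latin characters stripped, lowercased
print(lemmas)   # lemma list produced by the stanza Ukrainian pipeline
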
tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:311d11b3cebc097ade6884e27c9e9841e68edfad6294610c52816677ed4173df
+size 9719891
utils_models.py
ADDED
@@ -0,0 +1,27 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+def map_num_to_label(num):
+    return "сарказм" if num==1 else "не сарказм"
+
+def load_roberta():
+    model_ckpt = "ukr-roberta-base-finetuned-sarc"
+    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+    id2label = {1: "sarcastic", 0: "not_sarcastic"}
+    label2id = {"sarcastic": 1, "not_sarcastic": 0}
+    hf_model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2, label2id=label2id, id2label=id2label)
+    return hf_model, tokenizer
+
+def predict_roberta(model, tokenizer, text):
+    tokenized_input = tokenizer(text, return_tensors="pt")
+    predictions = model(**tokenized_input)
+    prediction = predictions.logits.argmax().item()
+    return map_num_to_label(prediction)
+
+def identity_tokenizer(text):
+    return text
+
+def predict_lr_rf(model, vectorizer, text):
+    prediction = model.predict(vectorizer.transform([text]))
+    return map_num_to_label(prediction)
+
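
And a minimal sketch of how these helpers and the pickled artefacts might be wired together (a sketch only, not code from this commit: the pickle-loading calls, and the guess that the TF-IDF vectorizer was fitted on pre-tokenized lemma lists via identity_tokenizer, are assumptions based on the file names):

import pickle
from text_preprocessing import preprocess_text
from utils_models import load_roberta, predict_roberta, predict_lr_rf, identity_tokenizer

# If the vectorizer was pickled with tokenizer=identity_tokenizer, that
# function must be importable from the same module path at load time.
with open("tfidf_vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)
with open("lr_classifier_default.pkl", "rb") as f:
    lr_model = pickle.load(f)

cleaned, lemmas = preprocess_text("@user Дуже 'цікава' стаття http://example.com")

# Classical model: assumed to score the lemmatized tokens
# (or ' '.join(lemmas), depending on how the vectorizer was fitted).
print(predict_lr_rf(lr_model, vectorizer, lemmas))

# Fine-tuned RoBERTa checkpoint: scores the cleaned text directly.
hf_model, tokenizer = load_roberta()
print(predict_roberta(hf_model, tokenizer, cleaned))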