cardiffnlp commited on
Commit
2508de5
โ€ข
1 Parent(s): 1565dd4

Adding twitter-xlm sentiment classifiers

Browse files
Files changed (1) hide show
  1. README.md +95 -0
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # twitter-XLM-roBERTa-base for Sentiment Analysis
2
+
3
+
4
+
5
+ TODO: create model card
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+ This is a XLM-roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis with the TweetEval benchmark.
14
+
15
+ - Paper: [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf).
16
+ - Git Repo: [Tweeteval official repository](https://github.com/cardiffnlp/tweeteval).
17
+
18
+ ## Example of classification
19
+
20
+ ```python
21
+ from transformers import AutoModelForSequenceClassification
22
+ from transformers import TFAutoModelForSequenceClassification
23
+ from transformers import AutoTokenizer
24
+ import numpy as np
25
+ from scipy.special import softmax
26
+ import csv
27
+ import urllib.request
28
+
29
def preprocess(text):
    """Replace @mentions with '@user' and URLs with 'http' placeholders.

    Splits on single spaces only, so original spacing between tokens
    is preserved in the joined result.
    """
    tokens = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            # a lone "@" is kept as-is; anything longer is anonymized
            tokens.append('@user')
        elif token.startswith('http'):
            tokens.append('http')
        else:
            tokens.append(token)
    return " ".join(tokens)
39
+
40
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task = 'sentiment'
# NOTE(review): this card is titled twitter-XLM-roBERTa-base, but the example
# loads the English-only roberta model — confirm whether this should be
# f"cardiffnlp/twitter-xlm-roberta-base-{task}".
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping (index -> label name) for the chosen task.
labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
# fix: delimiter was '\\t' (a two-char literal backslash-t); csv.reader needs
# the actual TAB character to split "index<TAB>label" rows correctly.
csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# logits for the single input sentence -> probabilities
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

# Print labels ranked from most to least probable.
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")
85
+
86
+ ```
87
+
88
+ Output:
89
+
90
+ ```
91
+ 1) positive 0.8466
92
+ 2) neutral 0.1458
93
+ 3) negative 0.0076
94
+ ```
95
+