luisespinosa committed · Commit 7a0cada · Parent(s): 1f8684f

Update README.md

README.md:

---
language: multilingual
widget:
- text: "T'estimo!"
- text: "I love you!"
- text: "I hate you"
- text: "Mahal kita!"
- text: "사랑해!"
- text: "난 너가 싫어"
---

# twitter-XLM-roBERTa-base for Sentiment Analysis

This is an XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis in multiple languages.

- Paper: [XLM-T: A Multilingual Language Model Toolkit for Twitter](https://...).
- Git Repo: [Tweeteval official repository](https://github.com/cardiffnlp/xlm-t).

## Example Pipeline
```python
from transformers import pipeline
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

sentiment_task("T'estimo!")
```
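
The snippet above scores one string at a time. As an additional illustrative sketch (not part of the original card), the same `sentiment_task` pipeline also accepts a list of texts and returns one label/score dictionary per input, which is handy for trying the widget examples in one call:

```python
from transformers import pipeline

model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

# Each result is a dict like {'label': ..., 'score': ...}
texts = ["T'estimo!", "I love you!", "I hate you", "Mahal kita!"]
for text, result in zip(texts, sentiment_task(texts)):
    print(f"{text} -> {result['label']} ({result['score']:.4f})")
```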

## Full classification example

```python
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
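# Example (illustrative): preprocess("@cardiffnlp nice! https://huggingface.co")
# -> "@user nice! http"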

MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
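# config.id2label maps class indices to label names (Negative / Neutral / Positive here)
# and is used below to print human-readable labels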

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

```

Output:

```
1) Positive 0.7673
2) Neutral 0.2015
3) Negative 0.0313
```
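
As a small follow-up sketch (not from the original card), the ranked printout can also be collected into a dictionary by reusing the `scores` array and `config` object from the full example above:

```python
# Map each label to its probability, then sort from most to least likely
label_scores = {config.id2label[i]: float(scores[i]) for i in range(scores.shape[0])}
for rank, (label, score) in enumerate(
        sorted(label_scores.items(), key=lambda kv: kv[1], reverse=True), start=1):
    print(f"{rank}) {label} {round(score, 4)}")
```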