maxpe committed
Commit
48a683b
·
1 Parent(s): 1119202

added tokenizer

Files changed (5)
  1. README.md.bkp +149 -0
  2. merges.txt +0 -0
  3. special_tokens_map.json +1 -0
  4. tokenizer_config.json +1 -0
  5. vocab.json +0 -0
README.md.bkp ADDED
@@ -0,0 +1,149 @@
# Twitter-roBERTa-base

This is a Twitter-roBERTa-base model fine-tuned on ~7000 tweets annotated for 11 emotion categories in [SemEval-2018 Task 1: Affect in Tweets, SubTask 5: Emotion Classification](https://competitions.codalab.org/competitions/17751).

The underlying language model was trained on ~58M tweets and is described and evaluated in the [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). To evaluate this and other language models on Twitter-specific data, please refer to the [TweetEval official repository](https://github.com/cardiffnlp/tweeteval).
## Preprocess Text

Replace usernames and links with the placeholders "@user" and "http":

```python
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
```
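Because this checkpoint was fine-tuned for the multi-label SemEval-2018 emotion task described above, a short classification example may be useful alongside the base-model examples that follow. The snippet below is a minimal sketch rather than this card's own recipe: the model id is a placeholder for this repository, and it assumes the fine-tuned classification head and its `id2label` mapping can be loaded with `AutoModelForSequenceClassification`.

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL = "<this-repository-id>"  # placeholder: replace with this repository's model id

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

text = preprocess("I can't wait for the weekend 😊")
encoded_input = tokenizer(text, return_tensors='pt')

with torch.no_grad():
    logits = model(**encoded_input).logits[0]

# SemEval-2018 Task 1 (E-c) is multi-label, so score each of the 11 emotions
# independently with a sigmoid instead of a softmax over all labels.
scores = torch.sigmoid(logits)
for i, score in enumerate(scores):
    label = model.config.id2label.get(i, f"label_{i}")  # label names depend on the saved config
    print(f"{label}: {score.item():.4f}")
```

Thresholding the per-label scores (for example at 0.5) then yields the predicted emotion tags.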
## Example Masked Language Model

This example uses the underlying base model, `cardiffnlp/twitter-roberta-base` (as do the embedding and feature-extraction examples below).

```python
from transformers import pipeline, AutoTokenizer
import numpy as np

MODEL = "cardiffnlp/twitter-roberta-base"
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def print_candidates():
    # Print the top five fill-mask candidates with their rounded scores
    for i in range(5):
        token = tokenizer.decode(candidates[i]['token'])
        score = np.round(candidates[i]['score'], 4)
        print(f"{i+1}) {token} {score}")

texts = [
    "I am so <mask> 😊",
    "I am so <mask> 😒"
]
for text in texts:
    t = preprocess(text)
    print(f"{'-'*30}\n{t}")
    candidates = fill_mask(t)
    print_candidates()
```

Output:

```
------------------------------
I am so <mask> 😊
1) happy 0.402
2) excited 0.1441
3) proud 0.143
4) grateful 0.0669
5) blessed 0.0334
------------------------------
I am so <mask> 😒
1) sad 0.2641
2) sorry 0.1605
3) tired 0.138
4) sick 0.0278
5) hungry 0.0232
```
## Example Tweet Embeddings

```python
from transformers import AutoTokenizer, AutoModel
import numpy as np
from scipy.spatial.distance import cosine
from collections import defaultdict

MODEL = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

def get_embedding(text):
    # Mean-pool the last hidden states over all tokens
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    features = model(**encoded_input)
    features = features[0].detach().cpu().numpy()
    features_mean = np.mean(features[0], axis=0)
    return features_mean

query = "The book was awesome"

tweets = ["I just ordered fried chicken 🐣",
          "The movie was great",
          "What time is the next game?",
          "Just finished reading 'Embeddings in NLP'"]

d = defaultdict(int)
for tweet in tweets:
    # Cosine similarity between the query embedding and each tweet embedding
    sim = 1 - cosine(get_embedding(query), get_embedding(tweet))
    d[tweet] = sim

print('Most similar to: ', query)
print('----------------------------------------')
for idx, x in enumerate(sorted(d.items(), key=lambda x: x[1], reverse=True)):
    print(idx + 1, x[0])
```

Output:

```
Most similar to: The book was awesome
----------------------------------------
1 The movie was great
2 Just finished reading 'Embeddings in NLP'
3 I just ordered fried chicken 🐣
4 What time is the next game?
```
## Example Feature Extraction

```python
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np

MODEL = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)

# PyTorch
model = AutoModel.from_pretrained(MODEL)
encoded_input = tokenizer(text, return_tensors='pt')
features = model(**encoded_input)
features = features[0].detach().cpu().numpy()
features_mean = np.mean(features[0], axis=0)
#features_max = np.max(features[0], axis=0)

# # TensorFlow
# model = TFAutoModel.from_pretrained(MODEL)
# encoded_input = tokenizer(text, return_tensors='tf')
# features = model(encoded_input)
# features = features[0].numpy()
# features_mean = np.mean(features[0], axis=0)
# #features_max = np.max(features[0], axis=0)
```
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "cardiffnlp/twitter-roberta-base"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff
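Taken together, merges.txt, vocab.json, special_tokens_map.json, and tokenizer_config.json are the standard files a RoBERTa-style byte-level BPE tokenizer needs, so after this commit the tokenizer can be loaded directly with `AutoTokenizer`. The snippet below is a minimal sketch; the model id is a placeholder for this repository.

```python
from transformers import AutoTokenizer

MODEL = "<this-repository-id>"  # placeholder: replace with this repository's model id
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Special tokens as declared in special_tokens_map.json
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.sep_token,
      tokenizer.pad_token, tokenizer.mask_token)

# model_max_length comes from tokenizer_config.json (512 here)
print(tokenizer.model_max_length)

# Round-trip a tweet through the byte-level BPE vocabulary (merges.txt + vocab.json)
ids = tokenizer("I am so <mask> 😊")["input_ids"]
print(ids)
print(tokenizer.decode(ids))
```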