import torch
import transformers

MAX_LEN = 150  # maximum token sequence length (alternative: 256)
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5

# Folder containing all the datasets
DATASET_LOCATION = ""

# Fine-tuned checkpoint on the Hugging Face Hub. Note: this is the "blob"
# (web page) URL; a direct file download requires the "resolve" form of the
# URL, or a local copy of model.bin.
MODEL_PATH = "https://huggingface.co/FFZG-cleopatra/lv-cros-sentimentor/blob/main/model.bin"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# mBERT, raw version
# BERT_PATH = "bert-base-multilingual-cased"

# 7-epoch version
BERT_PATH = "FFZG-cleopatra/bert-emoji-latvian-twitter"

# 7-epoch version + emoticons
# BERT_PATH = "bert-twitter-language-pretraining/models/LatvianTwittermBERT-v2/checkpoint-106000"

# TODO: check whether lower-casing is required for this model
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)
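
# Illustrative usage (not part of the original config): encoding one tweet
# with the tokenizer above, padded/truncated to MAX_LEN as configured here.
# The sample sentence is an invented placeholder.
# encoded = TOKENIZER.encode_plus(
#     "Šodien ir lieliska diena! 😄",
#     max_length=MAX_LEN,
#     padding="max_length",
#     truncation=True,
#     return_tensors="pt",
# )
# encoded["input_ids"].shape  # -> torch.Size([1, 150])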

#####################################################################
# ELECTRA alternative
# Step 1: model path
# BERT_PATH = "lmtuners/experiments/disc_lm_small/electra-small/discriminator/final"
# # "lmtuners/experiments/disc_lm_small/albert-small/final"

# Step 2: vocab file and lower-casing setting
# TOKENIZER = transformers.BertTokenizer.from_pretrained(
#     "lmtuners/experiments/disc_lm_small/lvtwitterbwpt-vocab-lower_accent.txt",
#     # "lmtuners/experiments/disc_lm_small/bert-base-multilingual-cased-vocab.txt",
#     do_lower_case=True
# )

# ALBERT_CONFIG = transformers.AlbertConfig(
#     vocab_size=len(TOKENIZER),  # alternatively TOKENIZER.get_vocab_size()
#     hidden_size=256,
#     embedding_size=128,
#     num_hidden_layers=12,
#     num_attention_heads=4,
#     intermediate_size=1024,
#     max_position_embeddings=128,
# )
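
# Sketch (assumption, not part of the original config): one way the
# fine-tuned checkpoint might be loaded for inference. The model class and
# num_labels=3 are guesses; the project may define its own model wrapper.
# MODEL_PATH above is a web page URL, so the weights would first need to be
# downloaded locally (e.g. as "model.bin").
# model = transformers.BertForSequenceClassification.from_pretrained(
#     BERT_PATH,
#     num_labels=3,  # assumption: positive / neutral / negative
# )
# model.load_state_dict(torch.load("model.bin", map_location=device))
# model = model.to(device).eval()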