import os

import torch
import transformers
MAX_LEN = 150  # 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5

# Folder that contains all the datasets
DATASET_LOCATION = ""
MODEL_PATH = "model.bin"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
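# Illustrative sketch (not from the original file): the batch-size constants
# above would typically feed torch DataLoaders; `train_dataset` and
# `valid_dataset` are hypothetical placeholders.
# from torch.utils.data import DataLoader
# train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)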
# mBERT, raw version
# BERT_PATH = "bert-base-multilingual-cased"

# 7-epoch version
BERT_PATH = "FFZG-cleopatra/bert-emoji-latvian-twitter"

# 7-epoch version + emoticons
# BERT_PATH = "bert-twitter-language-pretraining/models/LatvianTwittermBERT-v2/checkpoint-106000"

# TODO: check whether lower-casing is required here
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)
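# A minimal usage sketch (added illustration, not part of the original
# config): how TOKENIZER, MAX_LEN and device combine to build model inputs.
# The sample tweet is hypothetical.
if __name__ == "__main__":
    example = TOKENIZER.encode_plus(
        "Labdien, Rīga! :)",          # hypothetical Latvian tweet
        add_special_tokens=True,      # add [CLS] and [SEP]
        max_length=MAX_LEN,           # truncate/pad to the configured length
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = example["input_ids"].to(device)
    attention_mask = example["attention_mask"].to(device)
    print(input_ids.shape, attention_mask.shape)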
###############################################################################
# Electra / ALBERT variant (disabled)
# Step 1: model path
# BERT_PATH = "lmtuners/experiments/disc_lm_small/electra-small/discriminator/final"
# BERT_PATH = "lmtuners/experiments/disc_lm_small/albert-small/final"
#
# Step 2: vocab file and lower-casing setting
# TOKENIZER = transformers.BertTokenizer.from_pretrained(
#     "lmtuners/experiments/disc_lm_small/lvtwitterbwpt-vocab-lower_accent.txt",
#     # "lmtuners/experiments/disc_lm_small/bert-base-multilingual-cased-vocab.txt",
#     do_lower_case=True
# )
#
# ALBERT_CONFIG = transformers.AlbertConfig(
#     vocab_size=len(TOKENIZER),  # .get_vocab_size()
#     hidden_size=256,
#     embedding_size=128,
#     num_hidden_layers=12,
#     num_attention_heads=4,
#     intermediate_size=1024,
#     max_position_embeddings=128,
# )
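# If the ALBERT variant above is re-enabled, the config would typically be
# consumed like this (illustrative sketch, not from the original file):
# ALBERT_MODEL = transformers.AlbertModel(ALBERT_CONFIG)  # randomly initialised weights
# ALBERT_MODEL.to(device)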