# lv-sentiment-demo / config.py
import os

import torch
import transformers
MAX_LEN = 150  # maximum sequence length in tokens (previously 256)
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
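
# Sketch (not in the original file): these constants typically feed the
# DataLoaders in the training script. `train_dataset` / `valid_dataset` are
# hypothetical stand-ins for the project's real Dataset objects.
#
#   from torch.utils.data import DataLoader
#   train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
#   valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)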
# Folder containing all the datasets (empty string = paths are resolved
# relative to the working directory)
DATASET_LOCATION = ""
MODEL_PATH = "model.bin"  # where the fine-tuned weights are checkpointed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
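
# Illustrative usage (an assumption, not shown in this file): the training
# loop would move the model to `device` and checkpoint it to MODEL_PATH:
#
#   model = model.to(device)
#   torch.save(model.state_dict(), MODEL_PATH)
#   model.load_state_dict(torch.load(MODEL_PATH, map_location=device))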
# mBERT, raw version:
# BERT_PATH = "bert-base-multilingual-cased"
# 7-epoch version:
BERT_PATH = "FFZG-cleopatra/bert-emoji-latvian-twitter"
# 7-epoch version + emoticons:
# BERT_PATH = "bert-twitter-language-pretraining/models/LatvianTwittermBERT-v2/checkpoint-106000"
# BertTokenizer for the checkpoint above.
# TODO: check whether lower-casing is actually required here.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
BERT_PATH,
do_lower_case=True
)
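
# Sketch of how TOKENIZER and MAX_LEN combine downstream (the sample tweet is
# made up). `encode_plus` pads/truncates every input to MAX_LEN tokens:
#
#   enc = TOKENIZER.encode_plus(
#       "šodien ir jauka diena :)",
#       max_length=MAX_LEN,
#       padding="max_length",
#       truncation=True,
#       return_tensors="pt",
#   )
#   # enc["input_ids"].shape == (1, MAX_LEN)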
###############################################################################
# ELECTRA alternative
# Step 1: model path
# BERT_PATH = "lmtuners/experiments/disc_lm_small/electra-small/discriminator/final"
# BERT_PATH = "lmtuners/experiments/disc_lm_small/albert-small/final"
# Step 2: vocab file and lower-case setting
# TOKENIZER = transformers.BertTokenizer.from_pretrained(
#     "lmtuners/experiments/disc_lm_small/lvtwitterbwpt-vocab-lower_accent.txt",
#     # "lmtuners/experiments/disc_lm_small/bert-base-multilingual-cased-vocab.txt",
#     do_lower_case=True,
# )
# ALBERT_CONFIG = transformers.AlbertConfig(
#     vocab_size=len(TOKENIZER),  # or TOKENIZER.get_vocab_size() on a raw tokenizers object
#     hidden_size=256,
#     embedding_size=128,
#     num_hidden_layers=12,
#     num_attention_heads=4,
#     intermediate_size=1024,
#     max_position_embeddings=128,  # NB: smaller than MAX_LEN (150) above
# )
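# If the ALBERT path is re-enabled, the config above would be consumed
# roughly like this (an assumption; the model-building code lives elsewhere):
# MODEL = transformers.AlbertModel(ALBERT_CONFIG)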