import transformers
import os
import torch
MAX_LEN = 150  # maximum token sequence length; alternative value: 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
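# Hypothetical sketch (not part of the original file) of where the batch-size
# and epoch constants are consumed; `train_dataset` / `valid_dataset` stand in
# for the dataset objects built in the training script:
# train_loader = torch.utils.data.DataLoader(
#     train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# valid_loader = torch.utils.data.DataLoader(
#     valid_dataset, batch_size=VALID_BATCH_SIZE)
# for epoch in range(EPOCHS):
#     ...  # one training pass over train_loader, then evaluate on valid_loader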
# Folder to contain all the datasets
DATASET_LOCATION = ""
MODEL_PATH = "https://huggingface.co/FFZG-cleopatra/lv-cros-sentimentor/blob/main/model.bin"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# MBERT Raw Version
# BERT_PATH = "bert-base-multilingual-cased"
# 7 EPOCH Version
BERT_PATH = "FFZG-cleopatra/bert-emoji-latvian-twitter"
# 7 EPOCH Version + emoticons
# BERT_PATH = "bert-twitter-language-pretraining/models/LatvianTwittermBERT-v2/checkpoint-106000"
# TODO check if lower casing is required
# BertTokenizer
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)
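
# A minimal usage sketch (an assumption, not code from this repo) showing how
# TOKENIZER and MAX_LEN are typically combined when encoding a single tweet;
# the sample text is purely illustrative:
# enc = TOKENIZER.encode_plus(
#     "Šodien ir lieliska diena!",  # hypothetical Latvian tweet
#     max_length=MAX_LEN,
#     padding="max_length",
#     truncation=True,
#     return_tensors="pt",
# )
# enc["input_ids"].shape  # -> torch.Size([1, 150])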
#####################################################################
# Electra
# Step 1: Model path
# BERT_PATH = "lmtuners/experiments/disc_lm_small/electra-small/discriminator/final"
# #"lmtuners/experiments/disc_lm_small/albert-small/final"
# # Step 2: Vocab and Lowercase setting
# TOKENIZER = transformers.BertTokenizer.from_pretrained(
#     "lmtuners/experiments/disc_lm_small/lvtwitterbwpt-vocab-lower_accent.txt",
#     # "lmtuners/experiments/disc_lm_small/bert-base-multilingual-cased-vocab.txt",
#     do_lower_case=True
# )
# ALBERT_CONFIG = transformers.AlbertConfig(
#     vocab_size=len(TOKENIZER),  # .get_vocab_size()
#     hidden_size=256,
#     embedding_size=128,
#     num_hidden_layers=12,
#     num_attention_heads=4,
#     intermediate_size=1024,
#     max_position_embeddings=128)
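
# A hedged sketch (assumption, not from this repo) of how MODEL_PATH can be
# loaded onto `device`; torch.hub.load_state_dict_from_url needs the raw-file
# (/resolve/) form of the URL, and `model` stands in for the task model
# defined elsewhere in the codebase:
# state_dict = torch.hub.load_state_dict_from_url(
#     MODEL_PATH.replace("/blob/", "/resolve/"),
#     map_location=device,
# )
# model.load_state_dict(state_dict)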