File size: 997 Bytes
12f548d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import wandb
import shutil
from transformers import BartTokenizer
from idiomify.fetchers import fetch_config
from idiomify.paths import ROOT_DIR
def main():
    """Build the idiomify BART tokenizer and upload it as a W&B artifact.

    Loads the base BART tokenizer named in the config, registers the
    ``<idiom>`` / ``</idiom>`` special tokens, saves it locally, logs the
    directory as a ``tokenizer`` artifact, then deletes the local copy.
    """
    config = fetch_config()['tokenizer']
    tokenizer = BartTokenizer.from_pretrained(config['bart'])
    tokenizer.add_special_tokens({
        "additional_special_tokens": ["<idiom>", "</idiom>"],  # beginning and end of an idiom
    })
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        # the path to write the tokenizer files in before uploading
        tok_dir = ROOT_DIR / "tokenizer"
        tokenizer.save_pretrained(tok_dir)
        try:
            artifact = wandb.Artifact(name="tokenizer", type="other",
                                      description=config['description'],
                                      metadata=config)
            artifact.add_dir(tok_dir)
            # then, we just log them here.
            run.log_artifact(artifact, aliases=["latest", config['ver']])
        finally:
            # always remove the local copy — previously a failed upload
            # would leave the directory behind
            shutil.rmtree(tok_dir)
# Script entry point: run the upload only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|