import shutil

import wandb
from transformers import BartTokenizer

from idiomify.fetchers import fetch_config
from idiomify.paths import ROOT_DIR


def main():
    config = fetch_config()['tokenizer']
    tokenizer = BartTokenizer.from_pretrained(config['bart'])
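    # register the idiom boundary markers as special tokens so that
    # the tokenizer never splits them into subwords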
    tokenizer.add_special_tokens({
        "additional_special_tokens": ["<idiom>", "</idiom>"],  # beginning and end of an idiom
    })
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        # the directory to save the tokenizer to
        tok_dir = ROOT_DIR / "tokenizer"
        tokenizer.save_pretrained(tok_dir)
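        # package the saved tokenizer files as a versioned W&B artifact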
        artifact = wandb.Artifact(name="tokenizer", type="other",
                                  description=config['description'],
                                  metadata=config)
        artifact.add_dir(str(tok_dir))
        # log the artifact under both the "latest" alias and its versioned alias
        run.log_artifact(artifact, aliases=["latest", config['ver']])
        # clean up the local copy now that it has been uploaded
        shutil.rmtree(tok_dir)


if __name__ == '__main__':
    main()