File size: 1,769 Bytes
e9d1a5a e3c7b5a e9d1a5a e3c7b5a fca50f9 e9d1a5a e3c7b5a 642d911 e3c7b5a fca50f9 e3c7b5a 370afc1 e3c7b5a e9d1a5a e3c7b5a e9d1a5a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
"""
literal2idiomatic ver: d-1-2
"""
import os
from idiomify.paths import ROOT_DIR
from idiomify.fetchers import fetch_pie, fetch_config
from idiomify.preprocess import upsample, cleanse, stratified_split, annotate
import wandb
def main():
    """
    Build the literal2idiomatic train & test splits and log them to wandb.

    The PIE dataframe is piped through cleanse -> upsample -> annotate -> stratified_split,
    using the 'literal2idiomatic' section of the config for the seed, the boi/eoi tokens and
    the train ratio. Both splits are written as temporary tsv files under ROOT_DIR, added to a
    wandb artifact named "literal2idiomatic", and logged with the aliases "latest" and
    config['ver']. The temporary tsv files are removed afterwards, even if writing or
    logging fails.
    """
    # here, we use all of the PIE data; it is split into train & test at the end of the pipe
    pie_df = fetch_pie()
    config = fetch_config()['literal2idiomatic']
    train_df, test_df = pie_df.pipe(cleanse)\
        .pipe(upsample, seed=config['seed'])\
        .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
        .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
    # keep only the columns that go into the dataset; selecting them on the dataframe
    # is enough, no need for the csv library
    columns = ["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]
    train_df = train_df[columns]
    test_df = test_df[columns]
    dfs = (train_df, test_df)
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        # the paths to write the datasets in
        train_path = ROOT_DIR / "train.tsv"
        test_path = ROOT_DIR / "test.tsv"
        paths = (train_path, test_path)
        artifact = wandb.Artifact(name="literal2idiomatic", type="dataset", description=config['description'],
                                  metadata=config)
        try:
            for tsv_path, df in zip(paths, dfs):
                df.to_csv(tsv_path, sep="\t")
                artifact.add_file(tsv_path)
            # then, we just log them here.
            run.log_artifact(artifact, aliases=["latest", config['ver']])
        finally:
            # don't forget to remove them - this must happen on failure as well,
            # otherwise stale tsv files are left behind in ROOT_DIR
            for tsv_path in paths:
                try:
                    os.remove(tsv_path)
                except FileNotFoundError:
                    # this file was never written because an earlier step failed
                    pass
if __name__ == "__main__":
    # run the dataset build only when this file is executed as a script
    main()
|