File size: 1,581 Bytes
486585a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
from pathlib import Path
import pandas as pd
# Directory that is searched recursively for the per-split TSV files.
DATA_DIR = Path("/nfsmounts/datastore/langid2")


def _load_split(pattern, data_dir=DATA_DIR):
    """Load, clean and shuffle every TSV file under *data_dir* matching *pattern*.

    Each file is read as a header-less, tab-separated table whose two columns
    are named ``_text`` and ``lang``. The returned frame has the columns
    ``lang`` and ``text`` and a fresh 0..n-1 index.

    Args:
        pattern: Glob pattern passed to ``Path.rglob`` (e.g. ``"*_train.tsv"``).
        data_dir: Root directory to search; defaults to ``DATA_DIR``.

    Returns:
        A shuffled ``pandas.DataFrame`` with columns ``lang`` and ``text``.

    Raises:
        ValueError: Propagated from ``pd.concat`` when no file matches.
    """
    frames = [
        pd.read_csv(path, sep="\t", names=["_text", "lang"])
        for path in Path(data_dir).rglob(pattern)
    ]
    return (
        pd.concat(frames)
        # Drops the first six characters of every raw text, then trims
        # surrounding whitespace. Presumably a fixed-width prefix in the
        # source files -- TODO confirm.
        .assign(text=lambda df: df["_text"].str[6:].str.strip())
        .drop("_text", axis=1)
        # NOTE(review): inside DataFrame.query, `in` / `not in` is a
        # membership test, not a per-row substring search. Confirm this
        # expression really removes the rows it is meant to; a substring
        # filter would be `~df["text"].str.contains("xxx", regex=False)`.
        .query("'xxx' not in text")
        # Unseeded shuffle: the row order differs from run to run.
        .sample(frac=1)
        .reset_index(drop=True)
    )


def _label_lines(frame):
    """Return one ``__label__<lang> <text>`` string per row of *frame*.

    Newlines inside a row are replaced by spaces so that every example stays
    on a single line. The layout looks like fastText's supervised-training
    format -- presumably the consumer; verify.
    """
    return [
        f"__label__{lang} {text}".replace("\n", " ")
        for lang, text in zip(frame["lang"], frame["text"])
    ]


train = _load_split("*_train.tsv")
validation = _load_split("*_dev.tsv")
test = _load_split("*_test.tsv")

_SPLITS = {"train": train, "validation": validation, "test": test}

# CSV dumps first, then the one-example-per-line text files.
for _name, _frame in _SPLITS.items():
    _frame.to_csv(f"{_name}.csv", index=False)

for _name, _frame in _SPLITS.items():
    # Explicit UTF-8: the default of write_text depends on the locale, which
    # is unsafe for multilingual text and inconsistent with to_csv (UTF-8).
    Path(f"{_name}.txt").write_text("\n".join(_label_lines(_frame)), encoding="utf-8")
|