File size: 2,734 Bytes
43a63e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"from datasets_common import write_dataset, train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Dataset root and the sub-directory that holds the per-part CSV files.\n",
"dataset_dir = Path('ru')\n",
"parts_dir = dataset_dir.joinpath('dataset_parts')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the parts in sorted order: glob() yields files in arbitrary order,\n",
"# which would make the row order (and any later split) irreproducible.\n",
"part_paths = sorted(parts_dir.glob(\"*.csv\"))\n",
"if not part_paths:\n",
"    # pd.concat([]) would otherwise fail with 'No objects to concatenate'.\n",
"    raise FileNotFoundError(f\"No CSV dataset parts found in {parts_dir}\")\n",
"dfs = [pd.read_csv(dataset_path) for dataset_path in part_paths]\n",
"# ignore_index=True replaces the per-file row labels (each part restarts at 0)\n",
"# with a single unique index over the combined frame.\n",
"df = pd.concat(dfs, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"label_name = 'category'\n",
"df = df.rename(columns={'categories': label_name})\n",
"\n",
"# Wrappers left around a cell by a stringified {'translation_text': ...} dict.\n",
"# The closing quote of the wrapper matches the opening one, so both the\n",
"# single-quoted and the double-quoted endings have to be recognised.\n",
"TRANSLATION_PREFIXES = (\"{'translation_text': '\", \"'translation_text':\", \"{'translation_text': \\\"\")\n",
"TRANSLATION_SUFFIXES = (\"'}\", \"\\\"}\")\n",
"\n",
"def transform_cell(value):\n",
"    \"\"\"Strip the translation-dict wrapper from a single cell value.\n",
"\n",
"    Every prefix in TRANSLATION_PREFIXES that the value starts with is\n",
"    removed in turn, then at most one suffix from TRANSLATION_SUFFIXES.\n",
"    Non-string values (e.g. NaN for an empty CSV field, or numbers) are\n",
"    returned unchanged instead of raising AttributeError.\n",
"    \"\"\"\n",
"    if not isinstance(value, str):\n",
"        return value\n",
"    for prefix in TRANSLATION_PREFIXES:\n",
"        if value.startswith(prefix):\n",
"            value = value[len(prefix):]\n",
"    for suffix in TRANSLATION_SUFFIXES:\n",
"        if value.endswith(suffix):\n",
"            # Only one closing wrapper is removed per cell.\n",
"            value = value[:-len(suffix)]\n",
"            break\n",
"    return value\n",
"\n",
"# The helper is defined once above rather than on every loop iteration.\n",
"for column in df.columns:\n",
"    df[column] = df[column].apply(transform_cell)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Separate the label column from the remaining columns.\n",
"X = df.drop(columns=[label_name])\n",
"y = df[label_name]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Split using the train_test_split helper imported from datasets_common.\n",
"split_parts = train_test_split(X, y)\n",
"X_train, X_test, y_train, y_test = split_parts"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"train_filename = \"arxiv_train.csv\"\n",
"test_filename = \"arxiv_test.csv\"\n",
"# Write the train split first, then the test split, both into dataset_dir.\n",
"for features, labels, filename in (\n",
"    (X_train, y_train, train_filename),\n",
"    (X_test, y_test, test_filename),\n",
"):\n",
"    write_dataset(dest_dir=dataset_dir, X=features, y=labels, filename=filename, to_json=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|