{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build train/test CSVs from dataset parts\n",
    "\n",
    "Reads every CSV in `ru/dataset_parts`, strips `translation_text` wrapper fragments from the cells, splits the rows with `datasets_common.train_test_split`, and hands both halves to `datasets_common.write_dataset` (destination directory `ru`, file names `arxiv_train.csv` and `arxiv_test.csv`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from datasets_common import write_dataset, train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_dir = Path('ru')\n",
    "parts_dir = dataset_dir / 'dataset_parts'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the dataset parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort the paths: glob() yields files in filesystem order, so without sorting\n",
    "# the row order of the combined frame could differ from one machine to another.\n",
    "part_paths = sorted(parts_dir.glob('*.csv'))\n",
    "if not part_paths:\n",
    "    # pd.concat on an empty list only says \"No objects to concatenate\".\n",
    "    raise FileNotFoundError(f'No CSV parts found in {parts_dir.resolve()}')\n",
    "\n",
    "# ignore_index=True gives the combined frame a unique 0..n-1 index instead of\n",
    "# repeating each part's own row labels.\n",
    "raw_df = pd.concat([pd.read_csv(path) for path in part_paths], ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean the cells"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fragments that may precede the text of a cell. They look like pieces of the\n",
    "# repr of a {'translation_text': ...} dict; presumably left over from a\n",
    "# translation step (TODO confirm against whatever produced the parts).\n",
    "WRAPPER_PREFIXES = (\n",
    "    \"{'translation_text': '\",\n",
    "    \"'translation_text':\",\n",
    "    \"{'translation_text': \\\"\",\n",
    ")\n",
    "\n",
    "\n",
    "def transform_cell(value):\n",
    "    \"\"\"Remove a leading translation_text wrapper and its closing quote and brace.\n",
    "\n",
    "    Non-string values (e.g. NaN for empty CSV cells, numbers) are returned\n",
    "    unchanged. A trailing single-quote-plus-brace is always removed; a trailing\n",
    "    double-quote-plus-brace is removed only when the double-quoted opening\n",
    "    wrapper was found, so ordinary text ending that way is left alone.\n",
    "    \"\"\"\n",
    "    if not isinstance(value, str):\n",
    "        return value\n",
    "\n",
    "    double_quoted = False\n",
    "    for prefix in WRAPPER_PREFIXES:\n",
    "        if value.startswith(prefix):\n",
    "            value = value[len(prefix):]\n",
    "            double_quoted = double_quoted or prefix.endswith('\"')\n",
    "\n",
    "    if value.endswith(\"'}\"):\n",
    "        return value[:-2]\n",
    "    if double_quoted and value.endswith('\"}'):\n",
    "        return value[:-2]\n",
    "    return value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_name = 'category'\n",
    "\n",
    "# Start from raw_df every time so this cell can be re-run safely.\n",
    "df = raw_df.rename(columns={'categories': label_name})\n",
    "for column in df.columns:\n",
    "    df[column] = df[column].apply(transform_cell)\n",
    "\n",
    "assert label_name in df.columns, f'expected a {label_name!r} column, found {list(df.columns)}'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split and write"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = df[label_name]\n",
    "X = df.drop(columns=label_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_filename = \"arxiv_train.csv\"\n",
    "test_filename = \"arxiv_test.csv\"\n",
    "write_dataset(dest_dir=dataset_dir, X=X_train, y=y_train, filename=train_filename, to_json=False)\n",
    "write_dataset(dest_dir=dataset_dir, X=X_test, y=y_test, filename=test_filename, to_json=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}