{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Lookup tables: image file -> object number, and object number -> object metadata.\n", "file2obj = pd.read_csv(\"../data/processed/OM_file_to_obj.csv\")\n", "\n", "# Could eventually do something with these columns, but need cleaning first\n", "obj2info = pd.read_csv(\"../data/processed/OM_obj_to_info.csv\").drop(\n", "    columns=[\"number_of_parts\", \"production.date.start\", \"production.date.end\", \"obj_num_old\"]\n", ")\n", "\n", "# Image path is root/file; zipping the two columns avoids a slow row-wise DataFrame.apply.\n", "file2obj[\"image\"] = [\n", "    os.path.join(root, file) for root, file in zip(file2obj[\"root\"], file2obj[\"file\"])\n", "]\n", "\n", "# Left join keeps every image row and attaches the metadata of its object.\n", "join_df = file2obj[[\"obj_num\", \"file\", \"image\", \"root\"]].merge(obj2info, on=\"obj_num\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 data/raw/images/fulling_mill/1985\n", "1 data/raw/images/fulling_mill/1985\n", "2 data/raw/images/fulling_mill/1985\n", "3 data/raw/images/fulling_mill/1985\n", "4 data/raw/images/fulling_mill/1985\n", " ... \n", "37300 data/raw/images/egyptian/2014\n", "37301 data/raw/images/egyptian/2014\n", "37302 data/raw/images/egyptian/2014\n", "37303 data/raw/images/egyptian/1963\n", "37304 data/raw/images/egyptian/1963\n", "Name: root, Length: 37305, dtype: object" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "file2obj[\"root\"]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
obj_numdescriptionobject_nameother_namematerialproduction.periodproduction.place
0eg3squat shouldered jar, no rimbowlsbowllimestone1st DynastyEgypt
1eg64axe-headaxes: woodworking toolsaxe-headgraniteNaNEgypt
2eg71the working end of a fish tail knife with pres...knivesknifeFlint/ChertNaqada IIEgypt
3eg75seated figure of priest holding unrolled papyr...Human Figurineimhotep figurinebronzeLate PeriodEgypt
4durom.1971.78seated woman, inset eyes (lost), headdress had...Human FigurineHathor figurinebronzeLate PeriodEgypt
........................
12349durma.2020.3.2562A silver Roman coin which is a part of the Pie...coinsNaNmetalRomanRome
12350durma.2020.3.2060A silver Roman coin which is a part of the Pie...coinsNaNmetalRomanNaN
12351durma.2020.3.1446A silver Roman coin which is a part of the Pie...coinsNaNmetalRomanRome
12352durma.2020.3.2042A silver Roman coin which is a part of the Pie...coinsNaNmetalRomanRome
12353durma.2020.3.2072A silver Roman coin which is a part of the Pie...coinsNaNmetalRomanRome
\n", "

11673 rows × 7 columns

\n", "
" ], "text/plain": [ " obj_num description \\\n", "0 eg3 squat shouldered jar, no rim \n", "1 eg64 axe-head \n", "2 eg71 the working end of a fish tail knife with pres... \n", "3 eg75 seated figure of priest holding unrolled papyr... \n", "4 durom.1971.78 seated woman, inset eyes (lost), headdress had... \n", "... ... ... \n", "12349 durma.2020.3.2562 A silver Roman coin which is a part of the Pie... \n", "12350 durma.2020.3.2060 A silver Roman coin which is a part of the Pie... \n", "12351 durma.2020.3.1446 A silver Roman coin which is a part of the Pie... \n", "12352 durma.2020.3.2042 A silver Roman coin which is a part of the Pie... \n", "12353 durma.2020.3.2072 A silver Roman coin which is a part of the Pie... \n", "\n", " object_name other_name material \\\n", "0 bowls bowl limestone \n", "1 axes: woodworking tools axe-head granite \n", "2 knives knife Flint/Chert \n", "3 Human Figurine imhotep figurine bronze \n", "4 Human Figurine Hathor figurine bronze \n", "... ... ... ... \n", "12349 coins NaN metal \n", "12350 coins NaN metal \n", "12351 coins NaN metal \n", "12352 coins NaN metal \n", "12353 coins NaN metal \n", "\n", " production.period production.place \n", "0 1st Dynasty Egypt \n", "1 NaN Egypt \n", "2 Naqada II Egypt \n", "3 Late Period Egypt \n", "4 Late Period Egypt \n", "... ... ... 
\n", "12349 Roman Rome \n", "12350 Roman NaN \n", "12351 Roman Rome \n", "12352 Roman Rome \n", "12353 Roman Rome \n", "\n", "[11673 rows x 7 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2info.dropna(subset=[\"material\", \"description\"])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "label_col = \"material\"\n", "lower_lim = 3\n", "\n", "# Keep only objects that have both a label and a description.\n", "o2i_lim = obj2info.dropna(subset=[label_col, \"description\"])\n", "\n", "# Keep only labels that occur more than lower_lim times.\n", "num_counts = o2i_lim[label_col].value_counts()\n", "frequent_labels = num_counts[num_counts > lower_lim].index\n", "o2i_lim = o2i_lim[o2i_lim[label_col].isin(frequent_labels)]\n", "\n", "# Hold out 40% of the rows; 20% of the held-out part becomes val and 80% becomes test.\n", "# NOTE(review): a later cell splits the held-out part 50/50 - confirm 0.8 is intended here.\n", "train, val_test = train_test_split(\n", "    o2i_lim, stratify=o2i_lim[label_col], test_size=0.4, random_state=42\n", ")\n", "val, test = train_test_split(\n", "    val_test, stratify=val_test[label_col], test_size=0.8, random_state=42\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from datasets import Dataset, DatasetDict" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Round-trip the joined frame through datasets.Dataset and back to pandas.\n", "ds = Dataset.from_pandas(join_df).to_pandas()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
obj_numfileimagerootdescriptionobject_nameother_namematerialproduction.periodproduction.place
0durma.1985.15.681985.15.68.jpgdata/raw/images/fulling_mill/1985/1985.15.68.jpgdata/raw/images/fulling_mill/19852 fragments of a bowl with open fret work at t...NoneRim SherdspotteryPost-MedievalNone
1durma.1985.52.371985.52.37.ff2.jpgdata/raw/images/fulling_mill/1985/1985.52.37.f...data/raw/images/fulling_mill/1985Reconstructed small vessel (many pieces with s...potteryPotterypotteryRomanNone
2durma.1985.81.44961985.81.4496 d2.jpgdata/raw/images/fulling_mill/1985/1985.81.4496...data/raw/images/fulling_mill/1985Fragment of a Samian beaker. Panell decoration...vesselspotterypotteryRomanNone
3durma.1985.9.11985.9.1.1-d4.jpgdata/raw/images/fulling_mill/1985/1985.9.1.1-d...data/raw/images/fulling_mill/19852 Fragmentary Saxon Cinerary Urns + 1 relative...NoneCinerary UrnspotterySaxonNone
4durma.1985.52.371985.52.37.sf2.jpgdata/raw/images/fulling_mill/1985/1985.52.37.s...data/raw/images/fulling_mill/1985Reconstructed small vessel (many pieces with s...potteryPotterypotteryRomanNone
.................................
37300durom.2014.1.22014.1.2 bb.jpgdata/raw/images/egyptian/2014/2014.1.2 bb.jpgdata/raw/images/egyptian/2014One of a collection of 162 flint tools. Brown,...bladesNoneFlint/ChertNeolithic PeriodEgypt
37301durom.2014.1.712014.1.71 ll.jpgdata/raw/images/egyptian/2014/2014.1.71 ll.jpgdata/raw/images/egyptian/2014One of a collection of 162 flint tools. Large,...axes: woodworking toolsNoneFlint/ChertNeolithic PeriodEgypt
37302durom.2014.1.22014.1.2 rr.jpgdata/raw/images/egyptian/2014/2014.1.2 rr.jpgdata/raw/images/egyptian/2014One of a collection of 162 flint tools. Brown,...bladesNoneFlint/ChertNeolithic PeriodEgypt
37303durom.1963.41963.4.jpgdata/raw/images/egyptian/1963/1963.4.jpgdata/raw/images/egyptian/1963The woman is dressed in Qing dynasty style and...figures牙雕母婴像ivorylate Qing dynastyChina
37304durom.1963.41963.4.2.jpgdata/raw/images/egyptian/1963/1963.4.2.jpgdata/raw/images/egyptian/1963The woman is dressed in Qing dynasty style and...figures牙雕母婴像ivorylate Qing dynastyChina
\n", "

37305 rows × 10 columns

\n", "
" ], "text/plain": [ " obj_num file \\\n", "0 durma.1985.15.68 1985.15.68.jpg \n", "1 durma.1985.52.37 1985.52.37.ff2.jpg \n", "2 durma.1985.81.4496 1985.81.4496 d2.jpg \n", "3 durma.1985.9.1 1985.9.1.1-d4.jpg \n", "4 durma.1985.52.37 1985.52.37.sf2.jpg \n", "... ... ... \n", "37300 durom.2014.1.2 2014.1.2 bb.jpg \n", "37301 durom.2014.1.71 2014.1.71 ll.jpg \n", "37302 durom.2014.1.2 2014.1.2 rr.jpg \n", "37303 durom.1963.4 1963.4.jpg \n", "37304 durom.1963.4 1963.4.2.jpg \n", "\n", " image \\\n", "0 data/raw/images/fulling_mill/1985/1985.15.68.jpg \n", "1 data/raw/images/fulling_mill/1985/1985.52.37.f... \n", "2 data/raw/images/fulling_mill/1985/1985.81.4496... \n", "3 data/raw/images/fulling_mill/1985/1985.9.1.1-d... \n", "4 data/raw/images/fulling_mill/1985/1985.52.37.s... \n", "... ... \n", "37300 data/raw/images/egyptian/2014/2014.1.2 bb.jpg \n", "37301 data/raw/images/egyptian/2014/2014.1.71 ll.jpg \n", "37302 data/raw/images/egyptian/2014/2014.1.2 rr.jpg \n", "37303 data/raw/images/egyptian/1963/1963.4.jpg \n", "37304 data/raw/images/egyptian/1963/1963.4.2.jpg \n", "\n", " root \\\n", "0 data/raw/images/fulling_mill/1985 \n", "1 data/raw/images/fulling_mill/1985 \n", "2 data/raw/images/fulling_mill/1985 \n", "3 data/raw/images/fulling_mill/1985 \n", "4 data/raw/images/fulling_mill/1985 \n", "... ... \n", "37300 data/raw/images/egyptian/2014 \n", "37301 data/raw/images/egyptian/2014 \n", "37302 data/raw/images/egyptian/2014 \n", "37303 data/raw/images/egyptian/1963 \n", "37304 data/raw/images/egyptian/1963 \n", "\n", " description \\\n", "0 2 fragments of a bowl with open fret work at t... \n", "1 Reconstructed small vessel (many pieces with s... \n", "2 Fragment of a Samian beaker. Panell decoration... \n", "3 2 Fragmentary Saxon Cinerary Urns + 1 relative... \n", "4 Reconstructed small vessel (many pieces with s... \n", "... ... \n", "37300 One of a collection of 162 flint tools. Brown,... \n", "37301 One of a collection of 162 flint tools. Large,... 
\n", "37302 One of a collection of 162 flint tools. Brown,... \n", "37303 The woman is dressed in Qing dynasty style and... \n", "37304 The woman is dressed in Qing dynasty style and... \n", "\n", " object_name other_name material production.period \\\n", "0 None Rim Sherds pottery Post-Medieval \n", "1 pottery Pottery pottery Roman \n", "2 vessels pottery pottery Roman \n", "3 None Cinerary Urns pottery Saxon \n", "4 pottery Pottery pottery Roman \n", "... ... ... ... ... \n", "37300 blades None Flint/Chert Neolithic Period \n", "37301 axes: woodworking tools None Flint/Chert Neolithic Period \n", "37302 blades None Flint/Chert Neolithic Period \n", "37303 figures 牙雕母婴像 ivory late Qing dynasty \n", "37304 figures 牙雕母婴像 ivory late Qing dynasty \n", "\n", " production.place \n", "0 None \n", "1 None \n", "2 None \n", "3 None \n", "4 None \n", "... ... \n", "37300 Egypt \n", "37301 Egypt \n", "37302 Egypt \n", "37303 China \n", "37304 China \n", "\n", "[37305 rows x 10 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(6819, 7) (2370, 7) (2370, 7) (11559, 7)\n", "(19246, 10) (6743, 10) (7078, 10) (37305, 10)\n" ] } ], "source": [
"index_col = \"obj_num\"\n", "# NOTE(review): text_col is the same column as index_col, so the dropna below only\n", "# requires obj_num and the label to be present, whereas an earlier cell filtered on\n", "# \"description\" - confirm this is intended.\n", "text_col = \"obj_num\"\n", "label_col = \"material\"\n", "lower_lim = 3\n", "problem_type = \"image\"\n", "image_cols = [\"root\", \"file\", \"image\"]\n", "\n", "# Collapse the per-image rows to one row per (object, label) and split on those rows;\n", "# the image files are attached again after the split.\n", "o2i_lim = (\n", "    ds.drop_duplicates(subset=[index_col, label_col])\n", "    .dropna(subset=[text_col, label_col])\n", "    .drop(columns=image_cols)\n", ")\n", "\n", "# Keep only labels that occur more than lower_lim times.\n", "num_counts = o2i_lim[label_col].value_counts()\n", "o2i_lim = o2i_lim[o2i_lim[label_col].isin(num_counts[num_counts > lower_lim].index)]\n", "\n", "# Hold out 41% of the rows, then halve the held-out part into val and test.\n", "train, val_test = train_test_split(\n", "    o2i_lim, stratify=o2i_lim[label_col], test_size=0.41, random_state=42\n", ")\n", "val, test = train_test_split(\n", "    val_test, stratify=val_test[label_col], test_size=0.5, random_state=42\n", ")\n", "print(train.shape, val.shape, test.shape, o2i_lim.shape)\n", "\n", "if problem_type == \"image\":\n", "    # Attach every image file of an object to whichever split that object landed in.\n", "    image_files = ds[[index_col] + image_cols]\n", "    train, val, test = (\n", "        split.merge(image_files, on=index_col, how=\"left\") for split in (train, val, test)\n", "    )\n", "    print(train.shape, val.shape, test.shape, ds.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
obj_numdescriptionobject_nameother_namematerialproduction.periodproduction.place
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [obj_num, description, object_name, other_name, material, production.period, production.place]\n", "Index: []" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "o2i_lim" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): col1..col3 look like placeholder names; with errors=\"ignore\" the drop\n", "# silently does nothing for any listed column that is not in ds.\n", "cols_to_drop = [\"col1\", \"col2\", \"col3\"]\n", "ds = ds.drop(columns=cols_to_drop, errors=\"ignore\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): ds_all and args are not defined in any visible cell of this notebook;\n", "# they must be bound before this cell can run on a fresh kernel.\n", "ds_lim = ds_all.dropna(subset=[\"image\", args.label_col])\n", "if \"3D\" in args.dataset:\n", "    # Filter the NaN-cleaned frame: indexing ds_all here would discard the dropna above.\n", "    ds_lim = ds_lim[ds_lim[\"original\"]]\n", "\n", "# Keep only labels that occur more than args.lower_lim times.\n", "num_counts = ds_lim[args.label_col].value_counts()\n", "ds_lim = ds_lim[ds_lim[args.label_col].isin(num_counts[num_counts > args.lower_lim].index)]\n", "\n", "# Hold out twice the test-set fraction, then halve the held-out part into val and test.\n", "train, val_test = train_test_split(\n", "    ds_lim,\n", "    stratify=ds_lim[args.label_col],\n", "    test_size=2 * args.testset_size,\n", "    random_state=42,\n", ")\n", "val, test = train_test_split(\n", "    val_test, stratify=val_test[args.label_col], test_size=0.5, random_state=42\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "ArtifactClassification", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }