{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import logging\n", "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "import click\n", "from dotenv import find_dotenv, load_dotenv\n", "import pandas as pd\n", "import time\n", "import requests\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "input_filepath = \"../data/raw/BM_csv_files/3000BC-AD500/\"\n", "\n", "\n", "csv_files = os.listdir(input_filepath)\n", "regions = [\"north_america\", \"asia\", \"europe\", \"africa\", \"south_america\"]\n", "all_dfs = {}\n", "for region in regions:\n", " region_csv_files = [file for file in csv_files if region in file.lower()]\n", " region_df = pd.concat(\n", " [pd.read_csv(os.path.join(input_filepath, file)) for file in region_csv_files]\n", " )\n", " region_df.drop_duplicates(inplace=True, ignore_index=True)\n", " all_dfs[region] = region_df\n", "\n", "# create a master df with a new column for the region\n", "world_df = pd.DataFrame()\n", "for region, df in all_dfs.items():\n", " df[\"region\"] = region\n", " world_df = pd.concat([world_df, df])\n", "\n", "world_df.drop_duplicates(subset=world_df.columns.difference([\"region\"]), inplace=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ImageObject typeMuseum numberTitleDenominationEscapementDescriptionProducer nameSchool/styleState...Acq notes (acq)Acq notes (exc)DeptBM/Big numberReg numberAdd idsCat noBanknote serial numberJoined objectsregion
0https://media.britishmuseum.org/media/Reposito...adzeNo: Am1994,09.1NaNNaNNaNAdze? of pecked and ground stone, grooved for ...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm1994,09.1NaNNaNNaNNaNnorth_america
1https://media.britishmuseum.org/media/Reposito...altarNo: Am,S.818NaNNaNNaNThree fragments of burnt clay that formed part...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm,S.818CDMS number: Am1931E1.818 (old CDMS no.); Prev...NaNNaNNaNnorth_america
2https://media.britishmuseum.org/media/Reposito...altarNo: Am,S.817NaNNaNNaNFragments of an altar or crematory basin made ...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm,S.817CDMS number: Am1931E1.817 (old CDMS no.); Prev...NaNNaNNaNnorth_america
3https://media.britishmuseum.org/media/Reposito...amulet; pendantNo: Am.9685NaNNaNNaNChalchihuitl, amulet, pendant made of amazonst...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm.9685CDMS number: Am1876C1.9685 (old CDMS no.); Mis...NaNNaNNaNnorth_america
4https://media.britishmuseum.org/media/Reposito...arrow; pointNo: Am,S.758.a-cNaNNaNNaNThree expanding stem arrow or spear points, ma...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm,S.758.a-cCDMS number: Am1931E1.758a-c (old CDMS no.); M...NaNNaNNaNnorth_america
..................................................................
719https://media.britishmuseum.org/media/Reposito...whistle; figurineNo: Am1954,05.196NaNNaNNaNAnthropomorphic whistle / whistle in the shape...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm1954,05.196NaNNaNNaNNaNsouth_america
720https://media.britishmuseum.org/media/Reposito...whistleNo: Am1954,05.669NaNNaNNaNWhistle made of pottery, possibly modelled and...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm1954,05.669NaNNaNNaNNaNsouth_america
721https://media.britishmuseum.org/media/Reposito...whistleNo: Am1954,05.194NaNNaNNaNAnthropomorphic whistle in the shape of the An...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm1954,05.194Previous owner/ex-collection number: 167687 (W...NaNNaNNaNsouth_america
722https://media.britishmuseum.org/media/Reposito...whistleNo: Am.6877NaNNaNNaNOvoid shaped whistle made of tumbaga by lost-w...NaNNaNNaN...NaNNaNAfrica, Oceania and the AmericasNaNAm.6877CDMS number: Am1870C1.6877 (old CDMS no.); Mis...NaNNaNNaNsouth_america
723https://media.britishmuseum.org/media/Reposito...whistling vessel; jarNo: Am1982,Q.944NaNNaNNaNRectangular spouted jar with bridge and (broke...NaNNaNNaN...Acquisition details unknown.NaNAfrica, Oceania and the AmericasNaNAm1982,Q.944NaNNaNNaNNaNsouth_america
\n", "

205721 rows × 48 columns

\n", "
" ], "text/plain": [ " Image Object type \\\n", "0 https://media.britishmuseum.org/media/Reposito... adze \n", "1 https://media.britishmuseum.org/media/Reposito... altar \n", "2 https://media.britishmuseum.org/media/Reposito... altar \n", "3 https://media.britishmuseum.org/media/Reposito... amulet; pendant \n", "4 https://media.britishmuseum.org/media/Reposito... arrow; point \n", ".. ... ... \n", "719 https://media.britishmuseum.org/media/Reposito... whistle; figurine \n", "720 https://media.britishmuseum.org/media/Reposito... whistle \n", "721 https://media.britishmuseum.org/media/Reposito... whistle \n", "722 https://media.britishmuseum.org/media/Reposito... whistle \n", "723 https://media.britishmuseum.org/media/Reposito... whistling vessel; jar \n", "\n", " Museum number Title Denomination Escapement \\\n", "0 No: Am1994,09.1 NaN NaN NaN \n", "1 No: Am,S.818 NaN NaN NaN \n", "2 No: Am,S.817 NaN NaN NaN \n", "3 No: Am.9685 NaN NaN NaN \n", "4 No: Am,S.758.a-c NaN NaN NaN \n", ".. ... ... ... ... \n", "719 No: Am1954,05.196 NaN NaN NaN \n", "720 No: Am1954,05.669 NaN NaN NaN \n", "721 No: Am1954,05.194 NaN NaN NaN \n", "722 No: Am.6877 NaN NaN NaN \n", "723 No: Am1982,Q.944 NaN NaN NaN \n", "\n", " Description Producer name \\\n", "0 Adze? of pecked and ground stone, grooved for ... NaN \n", "1 Three fragments of burnt clay that formed part... NaN \n", "2 Fragments of an altar or crematory basin made ... NaN \n", "3 Chalchihuitl, amulet, pendant made of amazonst... NaN \n", "4 Three expanding stem arrow or spear points, ma... NaN \n", ".. ... ... \n", "719 Anthropomorphic whistle / whistle in the shape... NaN \n", "720 Whistle made of pottery, possibly modelled and... NaN \n", "721 Anthropomorphic whistle in the shape of the An... NaN \n", "722 Ovoid shaped whistle made of tumbaga by lost-w... NaN \n", "723 Rectangular spouted jar with bridge and (broke... NaN \n", "\n", " School/style State ... Acq notes (acq) Acq notes (exc) \\\n", "0 NaN NaN ... NaN NaN \n", "1 NaN NaN ... NaN NaN \n", "2 NaN NaN ... NaN NaN \n", "3 NaN NaN ... NaN NaN \n", "4 NaN NaN ... NaN NaN \n", ".. ... ... ... ... ... \n", "719 NaN NaN ... NaN NaN \n", "720 NaN NaN ... NaN NaN \n", "721 NaN NaN ... NaN NaN \n", "722 NaN NaN ... NaN NaN \n", "723 NaN NaN ... Acquisition details unknown. NaN \n", "\n", " Dept BM/Big number Reg number \\\n", "0 Africa, Oceania and the Americas NaN Am1994,09.1 \n", "1 Africa, Oceania and the Americas NaN Am,S.818 \n", "2 Africa, Oceania and the Americas NaN Am,S.817 \n", "3 Africa, Oceania and the Americas NaN Am.9685 \n", "4 Africa, Oceania and the Americas NaN Am,S.758.a-c \n", ".. ... ... ... \n", "719 Africa, Oceania and the Americas NaN Am1954,05.196 \n", "720 Africa, Oceania and the Americas NaN Am1954,05.669 \n", "721 Africa, Oceania and the Americas NaN Am1954,05.194 \n", "722 Africa, Oceania and the Americas NaN Am.6877 \n", "723 Africa, Oceania and the Americas NaN Am1982,Q.944 \n", "\n", " Add ids Cat no \\\n", "0 NaN NaN \n", "1 CDMS number: Am1931E1.818 (old CDMS no.); Prev... NaN \n", "2 CDMS number: Am1931E1.817 (old CDMS no.); Prev... NaN \n", "3 CDMS number: Am1876C1.9685 (old CDMS no.); Mis... NaN \n", "4 CDMS number: Am1931E1.758a-c (old CDMS no.); M... NaN \n", ".. ... ... \n", "719 NaN NaN \n", "720 NaN NaN \n", "721 Previous owner/ex-collection number: 167687 (W... NaN \n", "722 CDMS number: Am1870C1.6877 (old CDMS no.); Mis... NaN \n", "723 NaN NaN \n", "\n", " Banknote serial number Joined objects region \n", "0 NaN NaN north_america \n", "1 NaN NaN north_america \n", "2 NaN NaN north_america \n", "3 NaN NaN north_america \n", "4 NaN NaN north_america \n", ".. ... ... ... \n", "719 NaN NaN south_america \n", "720 NaN NaN south_america \n", "721 NaN NaN south_america \n", "722 NaN NaN south_america \n", "723 NaN NaN south_america \n", "\n", "[205721 rows x 48 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "world_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Null values" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ColumnNull PercentageNon-Null Countunique_values
0Image0.00205721202525
6Description0.00205721107102
40Dept0.002057219
25Location0.002057211403
14Production date0.0020572112766
47region0.002057215
2Museum number0.00205721196970
1Object type0.002057184199
17Materials0.052056121856
42Reg number3.86197787196674
13Culture4.181971251738
21Dimensions4.52196413115526
15Production place17.441698533494
37Acq date20.69163156566
24Bib references21.6716113413399
33Acq name (acq)29.951441085215
28Subjects44.441143086779
16Find spot47.621077547860
10Authority49.921030301541
22Inscription53.739517847979
29Assoc name54.57934569520
4Denomination57.3987657636
9State62.187780250
20Technique70.98596954003
23Curators Comments74.305286533459
44Cat no78.424438944280
43Add ids84.813124526824
38Acq notes (acq)86.98267825853
36Acq name (previous)89.06224971501
39Acq notes (exc)90.70191321450
27Condition93.39135905438
19Type series94.52112713001
18Ware94.6011116351
41BM/Big number95.8685268474
26Exhibition history95.8785043688
34Acq name (finding)96.756689108
7Producer name98.2535961447
30Assoc place98.722638492
3Title99.002063313
31Assoc events99.88244113
32Assoc titles99.968938
11Ethnic name (made by)99.983218
12Ethnic name (assoc)100.0084
45Banknote serial number100.0011
35Acq name (excavator)100.0000
8School/style100.0000
46Joined objects100.0000
5Escapement100.0000
\n", "
" ], "text/plain": [ " Column Null Percentage Non-Null Count unique_values\n", "0 Image 0.00 205721 202525\n", "6 Description 0.00 205721 107102\n", "40 Dept 0.00 205721 9\n", "25 Location 0.00 205721 1403\n", "14 Production date 0.00 205721 12766\n", "47 region 0.00 205721 5\n", "2 Museum number 0.00 205721 196970\n", "1 Object type 0.00 205718 4199\n", "17 Materials 0.05 205612 1856\n", "42 Reg number 3.86 197787 196674\n", "13 Culture 4.18 197125 1738\n", "21 Dimensions 4.52 196413 115526\n", "15 Production place 17.44 169853 3494\n", "37 Acq date 20.69 163156 566\n", "24 Bib references 21.67 161134 13399\n", "33 Acq name (acq) 29.95 144108 5215\n", "28 Subjects 44.44 114308 6779\n", "16 Find spot 47.62 107754 7860\n", "10 Authority 49.92 103030 1541\n", "22 Inscription 53.73 95178 47979\n", "29 Assoc name 54.57 93456 9520\n", "4 Denomination 57.39 87657 636\n", "9 State 62.18 77802 50\n", "20 Technique 70.98 59695 4003\n", "23 Curators Comments 74.30 52865 33459\n", "44 Cat no 78.42 44389 44280\n", "43 Add ids 84.81 31245 26824\n", "38 Acq notes (acq) 86.98 26782 5853\n", "36 Acq name (previous) 89.06 22497 1501\n", "39 Acq notes (exc) 90.70 19132 1450\n", "27 Condition 93.39 13590 5438\n", "19 Type series 94.52 11271 3001\n", "18 Ware 94.60 11116 351\n", "41 BM/Big number 95.86 8526 8474\n", "26 Exhibition history 95.87 8504 3688\n", "34 Acq name (finding) 96.75 6689 108\n", "7 Producer name 98.25 3596 1447\n", "30 Assoc place 98.72 2638 492\n", "3 Title 99.00 2063 313\n", "31 Assoc events 99.88 244 113\n", "32 Assoc titles 99.96 89 38\n", "11 Ethnic name (made by) 99.98 32 18\n", "12 Ethnic name (assoc) 100.00 8 4\n", "45 Banknote serial number 100.00 1 1\n", "35 Acq name (excavator) 100.00 0 0\n", "8 School/style 100.00 0 0\n", "46 Joined objects 100.00 0 0\n", "5 Escapement 100.00 0 0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "null_percentage = (world_df.isnull().sum() / len(world_df)) * 100\n", "desc_df = pd.DataFrame(\n", " {\"Column\": null_percentage.index, \"Null Percentage\": null_percentage.values}\n", ")\n", "desc_df[\"Null Percentage\"] = desc_df[\"Null Percentage\"].round(2)\n", "desc_df[\"Non-Null Count\"] = world_df.count().values\n", "desc_df[\"unique_values\"] = world_df.nunique().values\n", "desc_df[\"Non-Null Count\"] = world_df.count().values\n", "desc_df = desc_df.sort_values(by=\"Non-Null Count\", ascending=False)\n", "desc_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There could be some interesting information in the more sparsely populated columns, for example the `Inscription` column could result in a cool OCR task, but we have to draw the line somewhere, hence I will drop all columns with more than 50% null values." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "threshold = len(world_df) * 0.5\n", "world_df.dropna(thresh=threshold, axis=1, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Analysing each column" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```python\n", "for column in world_df.columns:\n", " try:\n", " non_null_examples = world_df[column].dropna().sample(10)\n", " print(\n", " f\"Column: {column}, Non-null count: {desc_df[desc_df['Column'] == column]['Non-Null Count'].values[0]}, Unique values: {desc_df[desc_df['Column'] == column]['unique_values'].values[0]}\"\n", " )\n", " print(non_null_examples.values)\n", " except:\n", " print(f\"Column: {column}\")\n", " print(\"No non-null examples found\")\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### Column: Image, Non-null count: 205721, Unique values: 202525\n", "```\n", "['https://media.britishmuseum.org/media/Repository/Documents/2014_10/11_10/6acbde4c_52c8_482b_a420_a3c100ae572e/preview_00630152_001.jpg'\n", "...\n", " 'https://media.britishmuseum.org/media/Repository/Documents/2022_1/31_15/b9873170_4632_4d7c_92b4_ae2e00f9efa1/preview_ERC_Cab_118_OFFICIAL_01020.jpg']\n", " ```\n", "\n", "Could be the best option for an index column: **keep**\n", "### Column: Production date, Non-null count: 205721, Unique values: 12766\n", "```\n", "['3rdC-7thC' '253-260' '1700BC-1000BC' '400BC-300BC (?)' '550BC-510BC'\n", " '3rdC-7thC' '450-457' '190-230 (circa)' '238-244' '5thC BC-4thC BC']\n", "```\n", "I like this as a regression task, but I will have to do some cleaning first: **keep**\n", "### Column: Dept, Non-null count: 205721, Unique values: 9\n", "```\n", "['Britain, Europe and Prehistory' 'Britain, Europe and Prehistory'\n", " 'Middle East' 'Money and Medals' 'Greek and Roman' 'Greek and Roman'\n", " 'Greek and Roman' 'Greek and Roman' 'Greek and Roman' 'Money and Medals']\n", "```\n", "Too broad: **drop**\n", "### Column: Location, Non-null count: 205721, Unique values: 1403\n", "```\n", "['Not on display' 'Not on display' 'Not on display' 'Not on display'\n", " 'Not on display' 'Not on display' 'Not on display' 'Not on display'\n", " 'Not on display' 'Not on display']\n", "```\n", "\n", "Useless: **drop**\n", "### Column: region, Non-null count: 205721, Unique values: 5\n", "```\n", "['europe' 'asia' 'europe' 'europe' 'asia' 'asia' 'europe' 'europe' 'asia'\n", " 'asia']\n", "```\n", "The column I added, could be useful for splitting the data into smaller datasets if the world is too big: **keep**\n", "### Column: Description, Non-null count: 205721, Unique values: 107102\n", "```\n", "['Pottery: red-figured squat lekythos.\\r\\nAphrodite and a young hunter (Adonis?). In the centre, Aphrodite (flesh coloured white) is seated on raised ground to left, with a mantle over her legs, leaning on her left hand; her face and right arm are missing; she wears a radiated fillet, necklace of beads, and bracelets. Confronting her on left is seated a youth in a chlamys and a Phrygian cap, resting his left hand on the end of his club, the head of which rests on the ground beside him; his left elbow is placed in the hollow between the branch and trunk of a tree which springs from the ground on his left. His face, which is broken away, seems to be turned to the front. Behind Aphrodite, Eros flies towards her, and appears to be clasping her round the neck; his face is damaged; he wears a fillet. On the right is a group of two women: one is seated on raised ground, and holds out in her right hand to left a wreath of laurel (part missing), of which her left hand probably held the other end. Behind her stands a woman who rests one hand on either shoulder of the seated figure; her drapery flies back as if she were in quick movement, though her feet are together. On the left a third woman stands looking to right, leaning forward with left knee raised, and resting her left elbow in another hollow of the tree. All the women wear a long chiton tied, earrings, necklace, and bracelets, and their hair is knotted high with a fillet. In the ground, branches of laurel spring up.\\r\\nBelow, egg pattern; above, a kind of alternate palmette and lotus; round neck, tongue pattern. Below the handle, a pattern composed of four palmettes with tendrils.'\n", "\n", " 'Silver coin.; Head of Septimius Severus, laureate, right.; Victory standing right, left foot on globe, with right hand inscribing shield set on column and holding palm-branch in left hand over left shoulder.'\n", "\n", " 'Fragment of an iron arrow-head.' 'Alloy coin.'\n", "\n", " 'Silver coin.; Silphium.; Ammon Head.' 'Copper alloy coin.'\n", "\n", " 'Gold coin.; Blank.; Stylised horse, right.' 'Silver coin.'\n", "\n", " 'Bronze openwork bell-shaped horse-harness ornaments.'\n", " \n", " 'Fragment of Rhodian pottery transport amphora; neck and rising handle to sharpish turn; clay fine pink-buff with cream slip; rectangular stamp.']\n", "```\n", "This is the most interesting text column, with a lot of unique values. Just from the examples above, we can see that some of the descriptions are quite detailed, whereas others are shorter.\n", "\n", "Only half of the items have a unique description however, so I will need to explore further.\n", "\n", "**keep**\n", "\n", "\n", "### Column: Museum number, Non-null count: 205721, Unique values: 196970\n", "```\n", "['No: 1994,0915.680' 'No: 1933,0214.689' 'No: 1910,0610.21'\n", " 'No: 1909,0503.10' 'No: 1998,0118.2752' 'No: 1814,0704.1886'\n", " 'No: 1913,1119.15' 'No: 1864,1007.1907.+' 'No: IOLC.3415'\n", " 'No: 1981,1219.116']\n", "```\n", "Looks to be the same as the `Reg number` column, but just with a `No:` prefix.: **drop**\n", "\n", "### Column: Object type, Non-null count: 205718, Unique values: 4199\n", "```\n", "['coin' 'transport amphora' 'finger-ring' 'box' 'buckle' 'coin; forgery'\n", " 'panel' 'coin' 'coin' 'coin']\n", "```\n", "\n", "A good option for a high level classification task: **keep**\n", "### Column: Materials, Non-null count: 205612, Unique values: 1856\n", "```\n", "['copper alloy' 'copper alloy' 'copper alloy' 'silver' 'sandstone' 'alloy'\n", " 'alloy' 'copper alloy' 'silver' 'alloy' 'human tissue; linen; wood']\n", "```\n", "Good for a high level classification task: **keep**\n", "\n", "I have found at least one instance of a value with multiple materials, so I will have to think about whether this should be a multi-class classification task or leave it as is.\n", "\n", "Will need to do some analysis on the distribution of labels.\n", "### Column: Reg number, Non-null count: 197787, Unique values: 196674\n", "```\n", "['1887,0108.4' 'R.10443' '2002,0101.686' '1908,0110.1816' 'R.3790'\n", " '1969,0401.207' 'B.427' 'R.15402' '1992,0302.106' '1884,1213.21']\n", "```\n", "\n", "Not a good index column, but can be used to look up items on the collection website: **keep**\n", "### Column: Culture, Non-null count: 197125, Unique values: 1738\n", "```\n", "['Middle Helladic' 'Roman Imperial' 'Greco-Bactrian' 'Greek'\n", " 'Mesopotamian' 'Roman Republican' 'Greek' 'Roman Imperial'\n", " 'Roman Imperial' 'Greek']\n", "```\n", "Good option for a high level classification task: **keep**\n", "### Column: Dimensions, Non-null count: 196413, Unique values: 115526\n", "```\n", "[\"Die-axis: 6 o'clock; Weight: 8.019 grammes\" 'Height: 0.50 centimetres'\n", " \"Die-axis: 12 o'clock; Diameter: 20 millimetres; Weight: 4.29 grammes\"\n", " 'Length: 228.60 centimetres; Width: 157.48 centimetres; Depth: 15.24 centimetres'\n", " \"Die-axis: 12 o'clock; Diameter: 13 millimetres maximum ; Weight: 2.030 grammes\"\n", " 'Weight: 5.680 grammes'\n", " 'Diameter: 21.90 centimetres spout to handle ; Diameter: 10.10 centimetres; Height: 9.20 centimetres; Weight: 166 grammes'\n", " 'Weight: 7.500 grammes' \"Die-axis: 7 o'clock; Weight: 6.460 grammes\"\n", " 'Length: 25 inches']\n", "```\n", "Looks too irregular for a tabular prediction task (for example, different diameter values for spout to handle and just diameter), and i don't think it would be useful for a text prediction task either. I will **drop** this column.\n", "### Column: Production place, Non-null count: 169853, Unique values: 3494\n", "```\n", "['Minted in: Prusa ad Olympum; Associated with: Roman Empire'\n", " 'Minted in: Syracuse (historic - Sicily)' 'Made in: Rhodes'\n", " 'Minted in: Rome (city); Associated with: Roman Empire'\n", " 'Minted in: Gaul'\n", " 'Minted in: Rome (city); Associated with: Roman Empire'\n", " 'Minted in: Nicopolis (Epirus); Associated with: Roman Empire'\n", " 'Minted in: Gaul; Associated with: Roman Empire' 'Minted in: Andros'\n", " 'Minted in: Rome (city); Associated with: Roman Empire']\n", "```\n", "Decent option for a high level classification task, although more null values than others: **keep**\n", "\n", "### Column: Acq date, Non-null count: 163156, Unique values: 566\n", "```\n", "['1927' '1920' '1931' '1799' '1983' '2004' '1937' '1927' '1847'\n", " '21/3/2022']\n", "```\n", "\n", "Useless: **drop**\n", "### Column: Bib references, Non-null count: 161134, Unique values: 13399\n", "```\n", "['RE4 / Coins of the Roman Empire in the British Museum, vol.IV: Antoninus Pius to Commodus. Introduction, indexes and plates.; RIC3 / The Roman imperial coinage, vol. 3: Antoninus Pius to Commodus (type)'\n", " 'RIC8 / The Roman Imperial Coinage, vol. 8: the family of Constantine I, AD 337-364'\n", " 'Vase / Catalogue of Vases in the British Museum'\n", " 'Sculpture / Catalogue of Greek Sculpture in the British Museum; Jenkins 1994 / The Parthenon Frieze'\n", " 'Villing et al 2013-2015 / Naukratis: Greeks in Egypt (Phase 3); Petrie 1886 / Naukratis. Part I., 1884-85'\n", " 'RE3 / Coins of the Roman Empire in the British Museum, vol.III: Nerva to Hadrian; Strack (Hadrian) / Die Reichspraegung zur Zeit des Hadrian; RIC2 / The Roman imperial coinage, vol. 2: Vespasian to Hadrian (type); RIC2.3 / The Roman Imperial Coinage, vol. II - part 3 from AD 117-138 Hadrian'\n", " 'RRC / Roman Republican Coinage; RR1 / Coins of the Roman Republic in the British Museum: vol. 1 aes rude, aes signatum, aes grave, and coinage of Rome from B.C. 268.; Ghey, Leins & Crawford 2010 / A catalogue of the Roman Republican Coins in the British Museum, with descriptions and chronology based on M.H. Crawford, Roman Republican Coinage (1974)'\n", " 'RIC4 / The Roman imperial coinage, vol. 4: Pertinax - Uranius Antonius (type)'\n", " 'Price 1991 / The Coinage in the name of Alexander the Great and Philip Arridhaeus. A British Museum Catalogue.'\n", " 'Sellwood 1980 / An Introduction to the Coinage of Parthia']\n", "```\n", "\n", "Useless: **drop**\n", "### Column: Acq name (acq), Non-null count: 144108, Unique values: 5215\n", "```\n", "['Donated by: Ephesus Excavation Committee' 'Purchased from: R Teather'\n", " \"Purchased from: Sir Edward Herbert Bunbury, 9th Baronet (estate of); Purchased through: Sotheby's (December 1896); Purchased through: Rollin & Feuardent\"\n", " 'Treasure Trove: HM Treasury'\n", " \"Purchased from: Dr Nott; Purchased through: Sotheby's (30/5/1842)\"\n", " 'Transferred from: India Museum' 'Purchased from: James Millingen'\n", " 'Purchased from: Rollin & Feuardent'\n", " 'Purchased from: Peregrine Edward Towneley'\n", " 'Purchased from: Auguste Salzmann; Purchased from: Sir Alfred Biliotti']\n", "```\n", "\n", "Useless: **drop**\n", "### Column: Subjects, Non-null count: 114308, Unique values: 6779\n", "```\n", "['horse/ass; charioteer/chariot-racing; funeral/funerary rite'\n", " 'mammal; symbol' \n", " 'classical deity; mammal; triton'\n", " 'classical deity; cherub/cupid; mammal'\n", " 'emperor/empress; classical deity' \n", " 'emperor/empress; boat/ship'\n", " 'mammal; mythical figure/creature' \n", " 'mammal' \n", " 'classical deity; mammal'\n", " 'mammal']\n", "```\n", "Image to multi-class classification task might be interesting: **keep**\n", "\n", "Seems like entities are just separated by a semi colon\n", "### Column: Find spot, Non-null count: 107754, Unique values: 7860\n", "```\n", "['Excavated/Findspot: Naukratis'\n", " 'Excavated/Findspot: Stonea (Field Baulk hoard)'\n", " 'Found/Acquired: Netherhampton (Salisbury hoard)'\n", " 'Excavated/Findspot: Ur (historic - city)'\n", " 'Excavated/Findspot: Tell Taya' 'Excavated/Findspot: Enkomi'\n", " 'Excavated/Findspot: Susa' 'Excavated/Findspot: Enkomi'\n", " 'Excavated/Findspot: Corbridge'\n", " 'Excavated/Findspot: Sanctuary of Artemis Orthia (Lacedaemon)']\n", "```\n", "Too specific: **drop**\n", "### Column: Authority, Non-null count: 103030, Unique values: 1541\n", "```\n", "['Huvishka' 'Anonymous' 'N Fabius Pictor' 'Wima Takto' 'Titus' 'Huvishka'\n", " 'Hadrian' 'Tiberius' 'Maxentius'\n", " 'Augustus (Octavian); P Petronius Turpilianus']\n", "```\n", "\n", "I see a couple of Roman emperors in there, so it is an option for a high level classification task, but not as good as others. \n", "Also a lot of null values and could have a lot of 'Anonymous' values: **drop**\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "world_df.drop(\n", " [\n", " \"Museum number\",\n", " \"Dept\",\n", " \"Location\",\n", " \"Dimensions\",\n", " \"Acq date\",\n", " \"Bib references\",\n", " \"Acq name (acq)\",\n", " \"Find spot\",\n", " \"Authority\",\n", " ],\n", " axis=1,\n", " inplace=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Investigating images that appear multiple times" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Image\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/2_20/f3c887a6_eb2a_4ef6_abd4_a3b8014ab8b8/preview_00329027_001.jpg 40\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/2_16/8eacd3a4_b417_407d_a8ed_a3b80116ba24/preview_00198824_001.jpg 38\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/16_14/3b899983_22f5_43c2_8a40_a3c600e97162/preview_01016240_001.jpg 37\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/1_6/7b94705c_2aed_4c92_a14c_a3b700711952/preview_00033787_001.jpg 26\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/1_12/765e9aca_c6cd_4969_9009_a3b700cbda32/preview_00060253_001.jpg 21\n", " ..\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_11/4_22/9d0ed056_2b66_4b6c_a005_a3d9016f3fc6/preview_01114601_001.jpg 2\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/4_21/4cd34596_f7c0_4838_8c56_a3ba015ed5e4/preview_00263455_001.jpg 2\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/1_7/d2947a6a_4ba4_4faf_9867_a3b700744c3f/preview_00034574_001.jpg 2\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/6_8/a6dbc056_74e5_4473_ae8c_a3bc0093592e/preview_00396916_001.jpg 2\n", "https://media.britishmuseum.org/media/Repository/Documents/2014_10/2_21/716e0100_c932_4011_aceb_a3b801693f1d/preview_00337334_001.jpg 2\n", "Name: count, Length: 1406, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "non_unique_images = world_df[world_df.duplicated(subset=\"Image\", keep=False)]\n", "non_unique_images[\"Image\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are some images that contain multiple items, such as the image of the 5 flints below. In this case, the same image appears multiple times in the dataset, but with different metadata for each item.\n", "\n", "\n", "![](https://media.britishmuseum.org/media/Repository/Documents/2014_10/16_12/4b546ebc_0383_44c9_aa0b_a3c600d4c7b9/preview_01009877_001.jpg)\n", "\n", "More examples:\n", "\n", "![](https://media.britishmuseum.org/media/Repository/Documents/2014_10/1_6/7b94705c_2aed_4c92_a14c_a3b700711952/preview_00033787_001.jpg )\n", "![](https://media.britishmuseum.org/media/Repository/Documents/2014_10/16_14/3b899983_22f5_43c2_8a40_a3c600e97162/preview_01016240_001.jpg )\n", "![](https://media.britishmuseum.org/media/Repository/Documents/2014_10/2_21/716e0100_c932_4011_aceb_a3b801693f1d/preview_00337334_001.jpg )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I think it is best to drop these images. Not being unique means that I can't use them as an index column and also I wouldn't want the same image appearing in the training and test set." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "world_df = world_df[~world_df.duplicated(subset=\"Image\", keep=False)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Potential columns for Multi-label classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Materials" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looking at the examples below, as well as the overall small number of multi-label instances, it seems to be a case of a primary and secondary material, with the primary material being the first in the list. \n", "\n", "Taking the first material only seems like it would be a fine simplification." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of non-null instances: 201017\n", "Number of instances with multiple classes: 6065\n", "Number of unique class groups: 1541\n", "Number of instances with single class: 194952\n", "Number of unique classes from singles: 239\n" ] } ], "source": [ "print(\"Number of non-null instances:\", world_df[\"Materials\"].notnull().sum())\n", "\n", "materials_semicolon = world_df[world_df[\"Materials\"].str.contains(\";\", regex=False, na=False)]\n", "# materials_no_semicolon = world_df[~world_df['Materials'].str.contains(';')]\n", "num_instances = len(materials_semicolon)\n", "num_classes = len(materials_semicolon[\"Materials\"].unique())\n", "\n", "print(\"Number of instances with multiple classes:\", num_instances)\n", "print(\"Number of unique class groups:\", num_classes)\n", "\n", "materials_no_semicolon = world_df[\n", " ~world_df[\"Materials\"].str.contains(\";\", na=False) & world_df[\"Materials\"].notnull()\n", "]\n", "print(\"Number of instances with single class:\", len(materials_no_semicolon))\n", "print(\"Number of unique classes from singles:\", len(materials_no_semicolon[\"Materials\"].unique()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```python\n", "materials_semicolon['Materials'].sample(20).values.tolist()\n", "```\n", "```\n", "['copper; silver',\n", " 'steatite; glazed composition',\n", " 'iron; wood',\n", " 'fired clay; 陶器',\n", " 'hemp; 麻',\n", " 'iron; wood',\n", " 'cornelian; silver',\n", " 'silver; electrum',\n", " 'silver; gold',\n", " 'stucco; 灰泥',\n", " 'copper alloy; gold; garnet',\n", " 'lapis lazuli; gold',\n", " 'goat horn; sheep horn',\n", " 'pottery; stone; bitumen',\n", " 'silver; glass',\n", " 'stone; rhyolite',\n", " 'jasper; 碧玉',\n", " 'reed; fibre',\n", " 'iron; organic',\n", " 'stone; pottery']\n", "```\n", "\n", "We have some non-english characters in there.\n", "\n", "One option is to take the first material as the primary material, the other is to drop the rows with multiple materials." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of classes per instance: Materials\n", "2 4770\n", "3 954\n", "4 245\n", "5 59\n", "6 26\n", "8 5\n", "7 5\n", "9 1\n", "Name: count, dtype: int64\n" ] } ], "source": [ "classes_per_instance = materials_semicolon[\"Materials\"].str.split(\";\").apply(len)\n", "print(f\"Number of classes per instance: {classes_per_instance.value_counts()}\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Materials: silver alloy; gold\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Materials: pottery; ceramic; earthenware\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Materials: bronze; turquoise\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Materials: shell; lapis lazuli\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Materials: stucco; 灰泥\n", "------------------------\n" ] } ], "source": [ "from IPython.display import display, Image\n", "\n", "# Take 10 samples of 'Object type' and 'Image' columns\n", "samples = materials_semicolon[[\"Materials\", \"Image\"]].sample(5)\n", "\n", "# Display the images and their associated 'Object type'\n", "for index, row in samples.iterrows():\n", " display(Image(url=row[\"Image\"]))\n", " print(\"Materials:\", row[\"Materials\"])\n", " print(\"------------------------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Subjects" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Subjects seem to be looking at the people/animals/objects that appear in the artifact. \n", "\n", "A lot of coins are appearing in the examples of multi-subject images below, with the multi-subjects being the different animals or people that appear on the coin.\n", "\n", "A genuine multi-label classification task.\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of non-null instances: 112829\n", "Number of instances with multiple classes: 76222\n", "Number of unique class groups: 6120\n", "Number of instances with single class: 36607\n", "Number of unique classes from singles: 523\n", "Number of instances with single class, including first of multi-class: 112829\n", "Number of unique classes , including first of multi-class: 663\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425911/2410717160.py:17: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " subjects_semicolon[\"Subjects\"] = subjects_semicolon[\"Subjects\"].str.split(\";\").str[0]\n" ] } ], "source": [ "print(\"Number of non-null instances:\", world_df[\"Subjects\"].notnull().sum())\n", "\n", "subjects_semicolon = world_df[world_df[\"Subjects\"].str.contains(\";\", regex=False, na=False)]\n", "num_instances = len(subjects_semicolon)\n", "num_classes = len(subjects_semicolon[\"Subjects\"].unique())\n", "\n", "print(\"Number of instances with multiple classes:\", num_instances)\n", "print(\"Number of unique class groups:\", num_classes)\n", "\n", "subjects_no_semicolon = world_df[\n", " ~world_df[\"Subjects\"].str.contains(\";\", na=False) & world_df[\"Subjects\"].notnull()\n", "]\n", "print(\"Number of instances with single class:\", len(subjects_no_semicolon))\n", "print(\"Number of unique classes from singles:\", len(subjects_no_semicolon[\"Subjects\"].unique()))\n", "\n", "\n", "subjects_semicolon[\"Subjects\"] = subjects_semicolon[\"Subjects\"].str.split(\";\").str[0]\n", "subjects_no_semicolon = world_df[\n", " ~world_df[\"Subjects\"].str.contains(\";\", na=False) & world_df[\"Subjects\"].notnull()\n", "]\n", "subjects_no_semicolon = pd.concat([subjects_no_semicolon, subjects_semicolon])\n", "print(\n", " \"Number of instances with single class, including first of multi-class:\",\n", " len(subjects_no_semicolon),\n", ")\n", "print(\n", " \"Number of unique classes , including first of multi-class:\",\n", " len(subjects_no_semicolon[\"Subjects\"].unique()),\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of classes per instance: Subjects\n", "2 41673\n", "3 28095\n", "4 4920\n", "5 1077\n", "6 287\n", "7 94\n", "8 32\n", "9 20\n", "10 10\n", "13 5\n", "11 4\n", "16 2\n", "15 1\n", "14 1\n", "12 1\n", "Name: count, dtype: int64\n" ] } ], "source": [ "subjects_semicolon = world_df[world_df[\"Subjects\"].str.contains(\";\", regex=False, na=False)]\n", "classes_per_instance = subjects_semicolon[\"Subjects\"].str.split(\";\").apply(len)\n", "print(\"Number of classes per instance:\", classes_per_instance.value_counts())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Subjects: tree/bush; amphibian (frog)\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Subjects: royal/imperial; cart/wagon; equestrian\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Subjects: classical deity; allegory/personification; mythical figure/creature\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Subjects: deity; devotee/worshipper\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Subjects: king/queen; peacock\n", "------------------------\n" ] } ], "source": [ "from IPython.display import display, Image\n", "\n", "# Take 10 samples of 'Object type' and 'Image' columns\n", "samples = subjects_semicolon[[\"Subjects\", \"Image\"]].sample(5)\n", "\n", "# Display the images and their associated 'Object type'\n", "for index, row in samples.iterrows():\n", " display(Image(url=row[\"Image\"]))\n", " print(\"Subjects:\", row[\"Subjects\"])\n", " print(\"------------------------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Object type" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Only 11k/201k have multiple labels\n", "\n", "With single labels only, we have 1218 labels. Including the first label of the multi-labels, we have 1337 labels.\n", "\n", "Seems to be a primary and secondary description, so I think it is fair to drop only use the first label all items." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of non-null instances: 201116\n", "Number of instances with multiple classes: 12088\n", "Number of unique class groups: 2846\n", "Number of instances with single class: 189028\n", "Number of unique classes from singles: 1218\n", "Number of instances with single class, including first of multi-class: 201116\n", "Number of unique classes , including first of multi-class: 1337\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425911/1440965513.py:16: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " object_semicolon[\"Object type\"] = object_semicolon[\"Object type\"].str.split(\";\").str[0]\n" ] } ], "source": [ "print(\"Number of non-null instances:\", world_df[\"Object type\"].notnull().sum())\n", "\n", "object_semicolon = world_df[world_df[\"Object type\"].str.contains(\";\", regex=False, na=False)]\n", "num_instances = len(object_semicolon)\n", "num_classes = len(object_semicolon[\"Object type\"].unique())\n", "\n", "print(\"Number of instances with multiple classes:\", num_instances)\n", "print(\"Number of unique class groups:\", num_classes)\n", "\n", "object_no_semicolon = world_df[\n", " ~world_df[\"Object type\"].str.contains(\";\", na=False) & world_df[\"Object type\"].notnull()\n", "]\n", "print(\"Number of instances with single class:\", len(object_no_semicolon))\n", "print(\"Number of unique classes from singles:\", len(object_no_semicolon[\"Object type\"].unique()))\n", "\n", "object_semicolon[\"Object type\"] = object_semicolon[\"Object type\"].str.split(\";\").str[0]\n", "object_no_semicolon = world_df[\n", " ~world_df[\"Object type\"].str.contains(\";\", na=False) & world_df[\"Object type\"].notnull()\n", "]\n", "object_no_semicolon = pd.concat([object_no_semicolon, object_semicolon])\n", "print(\n", " \"Number of instances with single class, including first of multi-class:\",\n", " len(object_no_semicolon),\n", ")\n", "print(\n", " \"Number of unique classes , including first of multi-class:\",\n", " len(object_no_semicolon[\"Object type\"].unique()),\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of classes per instance: Object type\n", "1 12088\n", "Name: count, dtype: int64\n" ] } ], "source": [ "classes_per_instance = object_semicolon[\"Object type\"].str.split(\";\").apply(len)\n", "print(\"Number of classes per instance:\", classes_per_instance.value_counts())" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Object type: cup\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Object type: stemmed dish\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Object type: figure\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Object type: figure\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Object type: textile\n", "------------------------\n" ] } ], "source": [ "from IPython.display import display, Image\n", "\n", "# Take 10 samples of 'Object type' and 'Image' columns\n", "samples = object_semicolon[[\"Object type\", \"Image\"]].sample(5)\n", "\n", "# Display the images and their associated 'Object type'\n", "for index, row in samples.iterrows():\n", " display(Image(url=row[\"Image\"]))\n", " print(\"Object type:\", row[\"Object type\"])\n", " print(\"------------------------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Culture" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I think this is a genuine multi-label classification task: looking at the examples below I have seen 'Helenistic' appear before 'Classical Greek' or before 'Ptolemaic'\n", "\n", "Also there are examples of a spread of eras for the same item, perhaps for those where the date is unclear.\n", "\n", "However, there is a difficulty in that there are only 20k/193k which have multiple labels" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of non-null instances: 192729\n", "Number of instances with multiple classes: 20242\n", "Number of unique class groups: 1240\n", "Number of instances with single class: 172487\n", "Number of unique classes from singles: 464\n", "Number of instances with single class, including first of multi-class: 192729\n", "Number of unique classes , including first of multi-class: 508\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425911/1536071001.py:16: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " culture_semicolon[\"Culture\"] = culture_semicolon[\"Culture\"].str.split(\";\").str[0]\n" ] } ], "source": [ "print(\"Number of non-null instances:\", world_df[\"Culture\"].notnull().sum())\n", "\n", "culture_semicolon = world_df[world_df[\"Culture\"].str.contains(\";\", regex=False, na=False)]\n", "num_instances = len(culture_semicolon)\n", "num_classes = len(culture_semicolon[\"Culture\"].unique())\n", "\n", "print(\"Number of instances with multiple classes:\", num_instances)\n", "print(\"Number of unique class groups:\", num_classes)\n", "\n", "culture_no_semicolon = world_df[\n", " ~world_df[\"Culture\"].str.contains(\";\", na=False) & world_df[\"Culture\"].notnull()\n", "]\n", "print(\"Number of instances with single class:\", len(culture_no_semicolon))\n", "print(\"Number of unique classes from singles:\", len(culture_no_semicolon[\"Culture\"].unique()))\n", "\n", "culture_semicolon[\"Culture\"] = culture_semicolon[\"Culture\"].str.split(\";\").str[0]\n", "culture_no_semicolon = world_df[\n", " ~world_df[\"Culture\"].str.contains(\";\", na=False) & world_df[\"Culture\"].notnull()\n", "]\n", "culture_no_semicolon = pd.concat([culture_no_semicolon, culture_semicolon])\n", "print(\n", " \"Number of instances with single class, including first of multi-class:\",\n", " len(culture_no_semicolon),\n", ")\n", "print(\n", " \"Number of unique classes , including first of multi-class:\",\n", " len(culture_no_semicolon[\"Culture\"].unique()),\n", ")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of classes per instance: Culture\n", "2 18538\n", "3 1560\n", "4 101\n", "6 39\n", "5 3\n", "7 1\n", "Name: count, dtype: int64\n" ] } ], "source": [ "culture_semicolon = world_df[world_df[\"Culture\"].str.contains(\";\", regex=False, na=False)]\n", "classes_per_instance = culture_semicolon[\"Culture\"].str.split(\";\").apply(len)\n", "print(\"Number of classes per instance:\", classes_per_instance.value_counts())" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: East Greek; Archaic Greek\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Ptolemaic; Late Period\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Iron Age; Greek\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: East Greek; Archaic Greek\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Iron Age; Hallstatt D; La Tène I\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Western Greek; Etruscan\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Attic; Classical Greek\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Classical Greek; Hellenistic\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Roman; Hellenistic\n", "------------------------\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Culture: Old Babylonian; Isin-Larsa\n", "------------------------\n" ] } ], "source": [ "from IPython.display import display, Image\n", "\n", "# Take 10 samples of 'Culture' and 'Image' columns\n", "samples = culture_semicolon[[\"Culture\", \"Image\"]].sample(10)\n", "\n", "# Display the images and their associated 'Culture'\n", "for index, row in samples.iterrows():\n", " display(Image(url=row[\"Image\"]))\n", " print(\"Culture:\", row[\"Culture\"])\n", " print(\"------------------------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concluding thoughts on multi-label columns\n", "\n", "**Object type**: use the first label only, only 11k/201k have multiple labels\n", "\n", "**Culture**: could be a multi-label classification task, but only 20k/193k have multiple labels, so first label only\n", "\n", "**Materials**: use the first label only, seems to be primary and secondary materials, only 6k/201k have multiple labels\n", "\n", "**Subjects**: definitely a multi-label classification task, 76k/113k have multiple labels" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425911/3353630846.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " world_df[world_df[\"Object type\"].str.contains(\";\", regex=False, na=False)] = world_df[\n", "/tmp/ipykernel_425911/3353630846.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " world_df[world_df[\"Culture\"].str.contains(\";\", regex=False, na=False)] = world_df[\n", "/tmp/ipykernel_425911/3353630846.py:7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " world_df[world_df[\"Materials\"].str.contains(\";\", regex=False, na=False)] = world_df[\n" ] } ], "source": [ "world_df[world_df[\"Object type\"].str.contains(\";\", regex=False, na=False)] = world_df[\n", " world_df[\"Object type\"].str.contains(\";\", regex=False, na=False)\n", "].apply(lambda x: x.str.split(\";\").str[0])\n", "world_df[world_df[\"Culture\"].str.contains(\";\", regex=False, na=False)] = world_df[\n", " world_df[\"Culture\"].str.contains(\";\", regex=False, na=False)\n", "].apply(lambda x: x.str.split(\";\").str[0])\n", "world_df[world_df[\"Materials\"].str.contains(\";\", regex=False, na=False)] = world_df[\n", " world_df[\"Materials\"].str.contains(\";\", regex=False, na=False)\n", "].apply(lambda x: x.str.split(\";\").str[0])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['1-1600 (?)',\n", " '200BC - 400AD',\n", " '200BC - 400AD',\n", " '400 - 800',\n", " '1000BC - 400AD',\n", " '200BC - 400AD',\n", " '1500BC - 500BC',\n", " '200BC - 1000AD',\n", " '200BC - 400AD',\n", " '200BC - 400AD',\n", " '500 - 1000AD',\n", " '3000BC - 500BC',\n", " '11,000BC - 8000BC',\n", " '300 - 1200',\n", " '200BC-400',\n", " '6000BC - 1000BC',\n", " '6000BC - 1000BC',\n", " '6000BC - 1000BC',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '250AD - 900AD',\n", " '1-1800',\n", " '1300BC - 700BC',\n", " '200BC - 400AD',\n", " '200BC- 400 AD',\n", " '200BC - 400AD',\n", " '200BC-500',\n", " '200BC - 400AD',\n", " '200BC-400',\n", " '200BC - 400AD',\n", " '200BC- 400AD',\n", " '200BC - 400AD',\n", " '200BC-400',\n", " '200BC - 400AD',\n", " '200BC - 400AD',\n", " '200BC-400AD',\n", " '1700 BC - 150 BC',\n", " '400 BC-AD 400 (This is carved in Marpole Culture style, and it may be of this period and culture); 1900-1949 (Further work is required to date this bowl; it may have made by or for R A Brooks, from whom Inverarity acquired it in 1949.)',\n", " '200BC-500',\n", " '150 BC - 750',\n", " '1000BC - 200AD',\n", " '200BC - 400AD',\n", " '1200BC - 150AD',\n", " '200BC - 400AD',\n", " '200BC - 400AD (?)',\n", " '200BC-1000',\n", " '250AD - 900AD',\n", " '1000BC - 200AD (?)',\n", " '1000BC - 200BC',\n", " '200BC - 400AD',\n", " '200BC - 400AD',\n", " '1000BC - 200BC (?)',\n", " '1000BC - 200BC',\n", " '1000BC - 200BC',\n", " '1000BC - 400AD',\n", " '200BC - 400AD (?)',\n", " '200BC-1000 (?)',\n", " '1200 BC - 400 BC',\n", " '300-1200',\n", " '1-500',\n", " '300-1200',\n", " '200BC - 1520AD',\n", " '200BC - 1520AD',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '200BC-400',\n", " '1000BC - 200AD',\n", " '300-900',\n", " '1200 BC - 400 BC',\n", " '300 BC - AD 300',\n", " '400BC-AD400 (Marpole Culture)',\n", " '150 BC - AD 750',\n", " '300-1200',\n", " '100 BC - 750',\n", " '150 BC - AD 750',\n", " '1500 BC - 900 BC',\n", " '300BC-300AD',\n", " '300-1200',\n", " '200BC-500',\n", " '300 BC - AD 300',\n", " '200BC-500',\n", " '100 BC - AD 750',\n", " '100 BC-AD 750',\n", " '200BC-500',\n", " '150 BC - 750',\n", " '150 BC - AD 750',\n", " '300 BC - AD 300',\n", " '200BC-500',\n", " '200BC-500',\n", " '300 BC - AD 300',\n", " '200BC-500',\n", " '150BC - AD750',\n", " '150BC - AD750',\n", " '200BC-500',\n", " '1500 BC - 900 BC',\n", " '1500BC - 900BC',\n", " '300 BC - AD 300',\n", " '150BC - AD750',\n", " '150BC - AD750',\n", " '1700 BC - 150 BC',\n", " '250-900 (?)',\n", " '250-900',\n", " '700BC-1560s',\n", " '100 BC - AD 750',\n", " '1200BC-400BC (circa)',\n", " '250-900',\n", " '150BC - AD750',\n", " '100 BC-AD 750',\n", " '250-900',\n", " '100 BC - AD 750',\n", " '1200BC - 400BC',\n", " '150BC - 400AD',\n", " '100 BC - AD 750',\n", " '1500 BC - 900 BC',\n", " '200BC-500',\n", " '1500 BC - 900 BC',\n", " '1500 BC - 900 BC',\n", " '150BC - AD750',\n", " '150BC - AD750',\n", " '150 BC - AD 750',\n", " '1500BC - 900BC',\n", " '400BC-100BC',\n", " '1500BC - 900BC',\n", " '200BC - 1520AD',\n", " '200 BC - AD 800',\n", " '700BC-1560s',\n", " '700BC-1560s',\n", " '700BC-1560s',\n", " '700BC-1560s',\n", " '1000BC - 200AD']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "world_df[\"Production date\"].values[:150].tolist()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 adze\n", "1 altar\n", "2 altar\n", "3 amulet\n", "4 arrow\n", " ... \n", "719 whistle\n", "720 whistle\n", "721 whistle\n", "722 whistle\n", "723 whistling vessel\n", "Name: Object type, Length: 201119, dtype: object" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "world_df[\"Object type\"]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('600BC-575BC (circa (Möller 2000, 142, n. 414).)', -588),\n", " ('15 - 18 Aug, 1868', 'error'),\n", " ('550BC - 500BC (possibly 530 BC (S. Weber))', -525),\n", " ('480BC (about)', -480),\n", " ('1stC (circa)', 50),\n", " ('102BC', -102),\n", " ('350BC (circa)', -350),\n", " ('4thC BC', -350),\n", " ('1stC-3rdC', 150),\n", " ('182 BC (see N. Badoud, Le temps de Rhodes, 2015, 142.)', -182),\n", " ('87BC', -87),\n", " ('460BC (circa)', -460),\n", " ('138', 138),\n", " ('124 BC (see N. Badoud, Le temps de Rhodes, 2015, 143.)', -124),\n", " ('2ndC', 150),\n", " ('2ndC BC(mid)', -150),\n", " ('2ndC BC(late)', -150),\n", " ('1stC-2ndC', 100),\n", " ('2ndC', 150),\n", " ('6thC BC', -550),\n", " ('late 7thC BC - late 1stC BC', -350),\n", " ('271', 271),\n", " ('1stC-3rdC', 150),\n", " ('70', 70),\n", " ('192', 192),\n", " ('103BC', -103),\n", " ('85BC', -85),\n", " ('6thC', 550)]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "\n", "\n", "def convert_century_to_year(century):\n", " match = re.search(r\"(\\d+)(st|nd|rd|th|stC|ndC|rdC|thC)\", century)\n", " if match:\n", " century_number = int(match.group(1))\n", " year = (century_number - 1) * 100 + 50\n", " return str(year)\n", " else:\n", " return century\n", "\n", "\n", "def parse_date(raw_date):\n", " try:\n", " raw_date = raw_date.lower().strip()\n", " raw_date = re.sub(r\"\\(.*?\\)\", \"\", raw_date) # Remove anything in parentheses\n", " raw_date = re.sub(r\"[?,.!()]\", \"\", raw_date) # Remove punctuation\n", " if \";\" in raw_date:\n", " raw_date = raw_date.split(\";\")[0]\n", "\n", " # Check for ranges\n", " splitter = \"–\" if \"–\" in raw_date else \"-\"\n", " if splitter in raw_date:\n", " parts = raw_date.split(splitter)\n", " num_parts = raw_date.replace(\"bc\", \"\").replace(\"ad\", \"\").split(splitter)\n", " num_parts = [convert_century_to_year(part) for part in num_parts]\n", "\n", " if \"bc\" in raw_date and \"ad\" in raw_date: # Mixed ranges including both bc and ad\n", " start, end = int(num_parts[0].strip()), int(num_parts[1].strip())\n", " if \"bc\" in parts[0]:\n", " start *= -1\n", " # For mixed bc/ad, only the end will be ad, so no action needed\n", " elif \"bc\" in raw_date:\n", " start, end = (\n", " int(num_parts[0].strip()) * -1,\n", " int(num_parts[1].strip()) * -1,\n", " ) # Convert to negative for bc\n", " else:\n", " start, end = int(num_parts[0].strip()), int(num_parts[1].strip())\n", " # Calculate middle date and correct for bc/ad crossover\n", " middle_date = start + (end - start) // 2\n", " if middle_date > 2021:\n", " return \"error\"\n", " else:\n", " return middle_date\n", "\n", " else:\n", " num_date = raw_date.replace(\"bc\", \"\").replace(\"ad\", \"\")\n", " num_date = convert_century_to_year(num_date)\n", " if \"bc\" in raw_date:\n", " num_date = int(num_date) * -1\n", " if int(num_date) > 2021:\n", " return \"error\"\n", " else:\n", " return int(num_date)\n", "\n", " except Exception as e:\n", " # print(f\"Error parsing date {raw_date}: {e}\")\n", " return \"error\"\n", "\n", "\n", "# # for date_range in world_df['Production date'].values[:150].tolist(): # Iterate through your ranges\n", "# for date_range in (\n", "# world_df[\"Production date\"].sample(150).values.tolist()\n", "# ): # Iterate through your ranges\n", "# middle_date = parse_date(date_range)\n", "# print(f\"{date_range}: Middle Date = {middle_date}\")\n", "\n", "test_cases = [\n", " \"600BC-575BC (circa (Möller 2000, 142, n. 414).)\",\n", " \"15 - 18 Aug, 1868\",\n", " \"550BC - 500BC (possibly 530 BC (S. Weber))\",\n", " \"480BC (about)\",\n", " \"1stC (circa)\",\n", " \"102BC\",\n", " \"350BC (circa)\",\n", " \"4thC BC\",\n", " \"1stC-3rdC\",\n", " \"182 BC (see N. Badoud, Le temps de Rhodes, 2015, 142.)\",\n", " \"87BC\",\n", " \"460BC (circa)\",\n", " \"138\",\n", " \"124 BC (see N. Badoud, Le temps de Rhodes, 2015, 143.)\",\n", " \"2ndC\",\n", " \"2ndC BC(mid)\",\n", " \"2ndC BC(late)\",\n", " \"1stC-2ndC\",\n", " \"2ndC\",\n", " \"6thC BC\",\n", " \"late 7thC BC - late 1stC BC\",\n", " \"271\",\n", " \"1stC-3rdC\",\n", " \"70\",\n", " \"192\",\n", " \"103BC\",\n", " \"85BC\",\n", " \"6thC\",\n", "]\n", "\n", "[(date, parse_date(date)) for date in test_cases]\n", "# parse_date(\"600BC-575BC (circa (Möller 2000, 142, n. 414).)\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425911/1669513916.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " dates[\"correct\"] = dates.apply(lambda x: parse_date(x[\"Production date\"]), axis=1)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Production dateReg numbercorrect
129700BC-1560sAm1849,0629.23error
152700BC-1560sAm1940,02.45error
153700BC-1560sAm1946,19.6error
154700BC-1560sAm1943,04.3error
155700BC-1560sAm1946,19.7error
............
20519Third century BC (?)1925,0119.611error
2140019135Af1935,0205.2error
214056 April 29 AD1898,0315.342error
115C14-C15Am1842,1112.3error
225C212013,2014.1.a-berror
\n", "

385 rows × 3 columns

\n", "
" ], "text/plain": [ " Production date Reg number correct\n", "129 700BC-1560s Am1849,0629.23 error\n", "152 700BC-1560s Am1940,02.45 error\n", "153 700BC-1560s Am1946,19.6 error\n", "154 700BC-1560s Am1943,04.3 error\n", "155 700BC-1560s Am1946,19.7 error\n", "... ... ... ...\n", "20519 Third century BC (?) 1925,0119.611 error\n", "21400 19135 Af1935,0205.2 error\n", "21405 6 April 29 AD 1898,0315.342 error\n", "115 C14-C15 Am1842,1112.3 error\n", "225 C21 2013,2014.1.a-b error\n", "\n", "[385 rows x 3 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = world_df[[\"Production date\", \"Reg number\"]]\n", "dates[\"correct\"] = dates.apply(lambda x: parse_date(x[\"Production date\"]), axis=1)\n", "dates[dates[\"correct\"] == \"error\"]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425911/1922912582.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " d[\"correct\"] = d[\"correct\"].astype(int)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Production dateReg numbercorrect
01-1600 (?)Am1994,09.1800
1200BC - 400ADAm,S.818100
2200BC - 400ADAm,S.817100
3400 - 800Am.9685600
41000BC - 400ADAm,S.758.a-c-300
............
719100BC-600Am1954,05.196-350
720100BC-600Am1954,05.669-350
721100BC-650Am1954,05.194-375
722150BC-1600 (?)Am.6877-875
723100BC-600Am1982,Q.944-350
\n", "

200734 rows × 3 columns

\n", "
" ], "text/plain": [ " Production date Reg number correct\n", "0 1-1600 (?) Am1994,09.1 800\n", "1 200BC - 400AD Am,S.818 100\n", "2 200BC - 400AD Am,S.817 100\n", "3 400 - 800 Am.9685 600\n", "4 1000BC - 400AD Am,S.758.a-c -300\n", ".. ... ... ...\n", "719 100BC-600 Am1954,05.196 -350\n", "720 100BC-600 Am1954,05.669 -350\n", "721 100BC-650 Am1954,05.194 -375\n", "722 150BC-1600 (?) Am.6877 -875\n", "723 100BC-600 Am1982,Q.944 -350\n", "\n", "[200734 rows x 3 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = dates[dates[\"correct\"] != \"error\"]\n", "d[\"correct\"] = d[\"correct\"].astype(int)\n", "d" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "if dates[\"correct\"].notnull().any():\n", " filtered_dates = dates[dates[\"correct\"] != \"error\"]\n", " plt.boxplot(filtered_dates[\"correct\"], showfliers=False)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425911/3093622834.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", " equal_buckets = filtered_dates.groupby(\"bucket\").size().reset_index(name=\"count\")\n" ] } ], "source": [ "# Assuming you have a DataFrame called 'filtered_dates' with a column 'correct'\n", "filtered_dates = filtered_dates[filtered_dates[\"correct\"] >= -3000]\n", "filtered_dates[\"bucket\"] = pd.qcut(filtered_dates[\"correct\"], q=100, duplicates=\"drop\")\n", "equal_buckets = filtered_dates.groupby(\"bucket\").size().reset_index(name=\"count\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Production dateReg numbercorrectbucket
01-1600 (?)Am1994,09.1800(600.0, 1995.0]
1200BC - 400ADAm,S.818100(81.0, 100.0]
2200BC - 400ADAm,S.817100(81.0, 100.0]
3400 - 800Am.9685600(550.0, 600.0]
41000BC - 400ADAm,S.758.a-c-300(-303.0, -300.0]
...............
719100BC-600Am1954,05.196-350(-375.0, -350.0]
720100BC-600Am1954,05.669-350(-375.0, -350.0]
721100BC-650Am1954,05.194-375(-400.0, -375.0]
722150BC-1600 (?)Am.6877-875(-970.0, -800.0]
723100BC-600Am1982,Q.944-350(-375.0, -350.0]
\n", "

197667 rows × 4 columns

\n", "
" ], "text/plain": [ " Production date Reg number correct bucket\n", "0 1-1600 (?) Am1994,09.1 800 (600.0, 1995.0]\n", "1 200BC - 400AD Am,S.818 100 (81.0, 100.0]\n", "2 200BC - 400AD Am,S.817 100 (81.0, 100.0]\n", "3 400 - 800 Am.9685 600 (550.0, 600.0]\n", "4 1000BC - 400AD Am,S.758.a-c -300 (-303.0, -300.0]\n", ".. ... ... ... ...\n", "719 100BC-600 Am1954,05.196 -350 (-375.0, -350.0]\n", "720 100BC-600 Am1954,05.669 -350 (-375.0, -350.0]\n", "721 100BC-650 Am1954,05.194 -375 (-400.0, -375.0]\n", "722 150BC-1600 (?) Am.6877 -875 (-970.0, -800.0]\n", "723 100BC-600 Am1982,Q.944 -350 (-375.0, -350.0]\n", "\n", "[197667 rows x 4 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_dates" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAACVUAAAH5CAYAAACLEUz+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABFBElEQVR4nO3df3TV9Z0n/lcCIQExUJIh0SWxzLAD2No6pVuN/XG0paZd6NYtM9vpiqWtraOLPavwVcc9/u507Nqx1las7bYj3WNdq51p3YKjUqyiFfxBYWsxcGZWOpdWA3tRiEgSIvl8/5jJ7edCAgkk+dwkj8c5OeF+3u/7ua/P54b7Tj73ed/vsiRJkgAAAAAAAAAAACAiIsqzLgAAAAAAAAAAAKCUCFUBAAAAAAAAAACkCFUBAAAAAAAAAACkCFUBAAAAAAAAAACkCFUBAAAAAAAAAACkCFUBAAAAAAAAAACkCFUBAAAAAAAAAACkjM+6gKHS3d0dL7/8cpx44olRVlaWdTkAAAAAAAAAAEDGkiSJ119/PU4++eQoL+97PqpRG6p6+eWXo6GhIesyAAAAAAAAAACAErNjx46YMWNGn+2jNlR14oknRsS/nIDq6uqMqwEAAAAAAAAAALLW1tYWDQ0NhWxRX0ZtqKpnyb/q6mqhKgAAAAAAAAAAoKAnW9SXvhcGBAAAAAAAAAAAGIOEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFLGZ10AAAAAADD0crlc5PP5w7bX1tZGY2NjBhUBAAAAlC6hKgAAAAAY5XK5XMyeMzc62vcf1lY1cVJs29oiWAUAAACQIlQFAAAAAKNcPp+Pjvb9UbNweVTUNBS2d+3eEbtX3Rr5fF6oCgAAACBFqAoAAAAAxoiKmoaorJ+VdRkAAAAAJa886wIAAAAAAAAAAABKiVAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAyvisCwAAAAAAAIZeLpeLfD5fuF1bWxuNjY0ZVgQAAFC6hKoAAAAAAGCUy+VyMXvO3Oho31/YVjVxUmzb2iJYBQAA0AuhKgAAAAAAGOXy+Xx0tO+PmoXLo6KmIbp274jdq26NfD4vVAUAANALoSoAAAAAABgjKmoaorJ+VtZlAAAAlLzyrAsAAAAAAAAAAAAoJUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKeOzLgAAAAAAABh8uVwu8vl81NbWZl0KAADAiCNUBQAAAAAAo0wul4vZc+ZGR/v+qJo4KX70wP1ZlwQAADCiWP4PAAAAAABGmXw+Hx3t+2NK0yejo31/7NmzJ+uSAAAARhShKgAAAAAAGKXGTZmedQkAAAAjkuX/AAAAAACgn3K5XOTz+aJttbW10djYmFFFAAAADAWhKgAAAAAA6IdcLhez58yNjvb9RdurJk6KbVtbBKsAAABGkeNa/u8rX/lKlJWVxWWXXVbY1tHREUuXLo2ampqYPHlyLFq0KHbu3Fl0v1wuFwsWLIhJkybF9OnT44orrog333yzqM/jjz8e73rXu6KysjJmzZoVK1euPJ5SAQAAAADguOTz+eho3x81C5dH/ZKvR/2Sr0fNwuXR0b7/sNmrAAAAGNmOeaaq5557Lr797W/HO97xjqLtl19+eaxevToeeOCBmDJlSlx66aXxiU98In7xi19ERMTBgwdjwYIFUV9fH08//XS88sor8elPfzoqKirir//6ryMiYvv27bFgwYK4+OKL4wc/+EGsXbs2Pv/5z8dJJ50Uzc3Nx3G4AAAAAACDwzJwY1dFTUNU1s/KugwAAACG0DGFqvbt2xfnn39+/I//8T/ir/7qrwrb9+7dG9/73vfi3nvvjQ9+8IMREXH33XfH3LlzY8OGDXHmmWfGo48+Gi+++GL87Gc/i7q6ujj99NPjS1/6Ulx11VVxww03xIQJE+Kuu+6KmTNnxq233hoREXPnzo2nnnoqbrvtNqEqAAAAACBzloEDAACA0e2Ylv9bunRpLFiwIObPn1+0fePGjdHV1VW0fc6cOdHY2Bjr16+PiIj169fHaaedFnV1dYU+zc3N0dbWFlu2bCn0OXTfzc3NhX30prOzM9ra2oq+AAAAAACGgmXgAAAAYHQb8ExV9913X/zyl7+M55577rC21tbWmDBhQkydOrVoe11dXbS2thb6pANVPe09bUfq09bWFu3t7TFx4sTDHvvmm2+OG2+8caCHAwAAAEAJ6m1Ztb5Ybo0sWQYOAAAARqcBhap27NgR//W//tdYs2ZNVFVVDVVNx+Tqq6+OZcuWFW63tbVFQ0NDhhUBAAAAcCz6WlatL5ZbAwAAAGCwDShUtXHjxti1a1e8613vKmw7ePBgrFu3Lu6444545JFH4sCBA7Fnz56i2ap27twZ9fX1ERFRX18fzz77bNF+d+7cWWjr+d6zLd2nurq611mqIiIqKyujsrJyIIcDAAAAQAlKL6tWUXPkD8117d4Ru1fdGvl8XqgKAAAAgEEzoFDVhz70oXjhhReKtn32s5+NOXPmxFVXXRUNDQ1RUVERa9eujUWLFkVExLZt2yKXy0VTU1NERDQ1NcWXv/zl2LVrV0yfPj0iItasWRPV1dVx6qmnFvo89NBDRY+zZs2awj4AAAAAGP0sqwYAAABAVgYUqjrxxBPj7W9/e9G2E044IWpqagrbL7zwwli2bFlMmzYtqqur44tf/GI0NTXFmWeeGRER5557bpx66qlxwQUXxC233BKtra1xzTXXxNKlSwszTV188cVxxx13xJVXXhmf+9zn4rHHHov7778/Vq9ePRjHDAAAAAAAAAAA0KcBhar647bbbovy8vJYtGhRdHZ2RnNzc9x5552F9nHjxsWqVavikksuiaampjjhhBNiyZIlcdNNNxX6zJw5M1avXh2XX3553H777TFjxoz47ne/G83NzYNdLgAAAAAAAAAAQJHjDlU9/vjjRberqqpixYoVsWLFij7vc8oppxy2vN+hzj777Ni0adPxlgcAAAAAAAAAADAg5VkXAAAAAAAAAAAAUEqEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFKEqgAAAAAAAAAAAFLGZ10AAAAAADAy5XK5yOfz/epbW1sbjY2NQ1wRAAAAwOAQqgIAAAAABiyXy8XsOXOjo31/v/pXTZwU27a2CFYBAAAAI4JQFQAAAAAwYPl8Pjra90fNwuVRUdNwxL5du3fE7lW3Rj6fF6oCAAAARgShKgAAAADgmFXUNERl/aysywAAAAAYVOVZFwAAAAAAAAAAAFBKhKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABShKoAAAAAAAAAAABSxmddAAAAAACMZLlcLvL5/FH71dbWRmNj4zBUBAAAAMDxEqoCAAAAgGOUy+Vi9py50dG+/6h9qyZOim1bWwSrAAAAAEYAoSoAAAAAOEb5fD462vdHzcLlUVHT0Ge/rt07YveqWyOfzwtVAQAAAIwAQlUAAAAAcJwqahqisn5W1mUAAAAAMEjKsy4AAAAAAAAAAACglAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAhVAQAAAAAAAAAApAwoVPWtb30r3vGOd0R1dXVUV1dHU1NT/MM//EOhvaOjI5YuXRo1NTUxefLkWLRoUezcubNoH7lcLhYsWBCTJk2K6dOnxxVXXBFvvvlmUZ/HH3883vWud0VlZWXMmjUrVq5ceexHCAAAAAAAAAAAMAADClXNmDEjvvKVr8TGjRvj+eefjw9+8IPx8Y9/PLZs2RIREZdffnn89Kc/jQceeCCeeOKJePnll+MTn/hE4f4HDx6MBQsWxIEDB+Lpp5+O73//+7Fy5cq47rrrCn22b98eCxYsiHPOOSc2b94cl112WXz+85+PRx55ZJAOGQAAAAAAAAAAoG/jB9L5Yx/7WNHtL3/5y/Gtb30rNmzYEDNmzIjvfe97ce+998YHP/jBiIi4++67Y+7cubFhw4Y488wz49FHH40XX3wxfvazn0VdXV2cfvrp8aUvfSmuuuqquOGGG2LChAlx1113xcyZM+PWW2+NiIi5c+fGU089Fbfddls0NzcP0mEDAAAAAAAAAAD0bkAzVaUdPHgw7rvvvnjjjTeiqakpNm7cGF1dXTF//vxCnzlz5kRjY2OsX78+IiLWr18fp512WtTV1RX6NDc3R1tbW2G2q/Xr1xfto6dPzz760tnZGW1tbUVfAAAAAACMDLlcLn75y18e9pXL5bIuDQAAgDFoQDNVRUS88MIL0dTUFB0dHTF58uT48Y9/HKeeemps3rw5JkyYEFOnTi3qX1dXF62trRER0draWhSo6mnvaTtSn7a2tmhvb4+JEyf2WtfNN98cN95440APBwAAAACAjOVyuZg9Z250tO8/rK1q4qTYtrUlGhsbM6gMAACAsWrAoarZs2fH5s2bY+/evfGjH/0olixZEk888cRQ1DYgV199dSxbtqxwu62tLRoaGjKsCAAAAACA/sjn89HRvj9qFi6PiprfX9ft2r0jdq+6NfL5vFAVAAAAw2rAoaoJEybErFmzIiJi3rx58dxzz8Xtt98en/zkJ+PAgQOxZ8+eotmqdu7cGfX19RERUV9fH88++2zR/nbu3Flo6/nesy3dp7q6us9ZqiIiKisro7KycqCHAwAAAABAiaioaYjK+llZlwEAAABRfrw76O7ujs7Ozpg3b15UVFTE2rVrC23btm2LXC4XTU1NERHR1NQUL7zwQuzatavQZ82aNVFdXR2nnnpqoU96Hz19evYBAAAAAAAAAAAwlAY0U9XVV18dH/3oR6OxsTFef/31uPfee+Pxxx+PRx55JKZMmRIXXnhhLFu2LKZNmxbV1dXxxS9+MZqamuLMM8+MiIhzzz03Tj311LjgggvilltuidbW1rjmmmti6dKlhVmmLr744rjjjjviyiuvjM997nPx2GOPxf333x+rV68e/KMHAAAAACghuVwu8vl84XZtba1l7wAAACADAwpV7dq1Kz796U/HK6+8ElOmTIl3vOMd8cgjj8SHP/zhiIi47bbbory8PBYtWhSdnZ3R3Nwcd955Z+H+48aNi1WrVsUll1wSTU1NccIJJ8SSJUvipptuKvSZOXNmrF69Oi6//PK4/fbbY8aMGfHd7343mpubB+mQAQAAAABKTy6Xi9lz5kZH+/7CtqqJk2Lb1hbBKgAAABhmAwpVfe973ztie1VVVaxYsSJWrFjRZ59TTjklHnrooSPu5+yzz45NmzYNpDQAAAAAgBEtn89HR/v+qFm4PCpqGqJr947YverWyOfzQlUAAAAwzAYUqgIAAAAAYGhV1DREZf2srMsAAACAMa086wIAAAAAAAAAAABKiVAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAilAVAAAAAAAAAABAyvisCwAAAAAA6JHL5SKfzx+xT21tbTQ2Ng5TRQAAAMBYJFQFAAAAAJSEXC4Xs+fMjY72/UfsVzVxUmzb2iJYBQAAAAwZoSoAAAAAoCTk8/noaN8fNQuXR0VNQ699unbviN2rbo18Pi9UBQAAAAwZoSoAAAAAoKRU1DREZf2srMsAAAAAxrDyrAsAAAAAAAAAAAAoJUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKeOzLgAAAAAAABiZcrlc5PP5Xttqa2ujsbFxmCtiqBzpuQYA4Pf8Hjx6CFUBAAAAADDq9RUI8YbHscvlcjF7ztzoaN/fa3vVxEmxbWuL8zsKHO25BgDg9/wePHoIVQEAAAAAMKodKRDiDY/DA2f9DZrl8/noaN8fNQuXR0VNQ1Fb1+4dsXvVrZHP58f0uR0tjvRcAwDwe34PHl2EqgAAAAAAGNX6CoR4wyPilVdeife+7/1FgbOBBs0qahqisn7WUJVICfFcAwAwlghVAQAAAAAwJgiEHG7Pnj1FgTNBMwAAgH8hVAUAAAAAAGOcwBkAAEAxoSoAAAAAYNTI5XKRz+eLttXW1o7pGXdyuVxExJg+BzDW9PZaeDxaWloGbV8AADBSCFUBAAAAAKNCLpeL2XPmRkf7/qLtVRMnxbatLWMyVNRzTiJizJ4DGGv6ei0EAAAGRqgKAAAAgEFR6jMEHW3WjlKqlWOTz+ejo31/1CxcHhU1DRER0bV7R+xedWvk8/kx+fz2nJOef4/FcwBjTW+vhcer/aXnY++T9wzKvgAAYKQQqgIAAADguJX6DEH9mbWjVGrl+FXUNERl/aysywDI1GC+Fnbt3jEo+wEAgJFEqAoAAACA41bqMwQdbdaOUqoVAAAAgOwJVQEAAAAwaEp9hqBSrw8AAACA0lCedQEAAAAAAAAAAAClRKgKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgZXzWBQAAAJCdXC4X+Xw+6zIYArW1tdHY2Jh1GQAAAAAAI5JQFQAAwBiVy+Vi9py50dG+P+tSGAJVEyfFtq0tglUA0A8tLS2HbRNQBgAAGNuEqgAAAMaofD4fHe37o2bh8qioaci6HAZR1+4dsXvVrZHP570ZDABHcHDfaxFlZbF48eLD2gSUAQAAxjahKgBgWFheamTwSWwYmypqGqKyflbWZQAADLvuzn0RSXJYyFxAGQAAAKEqAGDIWV5q5PBJbAAAYCwSMqc/huoDYz7gBAAApUmoCgAYcpaXGhl8EhsAAAB6N5QfGPMBJwAAKE1CVQDAsPHJXwAAYKQ42ow0LS0tw1gNkLWh+sCYDzgBAEDpEqoCAAAAAEixhDnQFx8YAwCAsUOoCgAAAAAgpT8z0rS/9HzsffKeYa4MAAAAGC5CVQAAAAAAvTjSjDRdu3cMczUAAADAcCrPugAAAAAAAAAAAIBSIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQMj7rAgAAAAAAYKzL5XKRz+cjIqK2tjYaGxszrggAAGBsE6oCAAAAAIAM5XK5mD1nbnS074+IiKqJk2Lb1hbBKgAAgAwJVQEAAABjSnomkNHCjCYAI1s+n4+O9v1Rs3B5RETsXnVr5PN5r+0AAAAZEqoCAAAAxoxDZwIZLcxoAvRHS0tL4d/CmKWpoqYh6xIAAAD4V0JVAAAAwJiRnglktLxx3bV7hxlNgKMrK4vFixcXbgpjAgAAwJEJVQEAAABjTkVNQ1TWz8q6DIDhkySFQKkwJgAAABydUBUAAAAA0KtcLhf5fL7XtldeeWWYq+F4CZSWrv/3//5f1iUAAABwCKEqAAAAAOAwuVwuZs+ZGx3t+3ttn1BZNcwVMZa1tLRERERtbe2onF2rr/AiAAAA2RGqAgAAAAAOk8/no6N9f2HJuLSe5eMYHqM9UHQkB/e9FlFWFosXL46IiKqJk2Lb1paSPw89s7yNxecMAABgtBCqAgAAAAD6ZMm47IzUQNFg6u7cF5EkUbNweURE7F51a+Tz+ZI+B+lZ3obqOettaU4BLgAAgMElVAUAAAAAUIJGYqBoqBw6W1op65nlbUrTJ2Pv+h8O+nPW19KcYzF0BwAAMJSEqgAAAAAASthIChTxe+OmTB+S/fa2NGfPkpxjNXQHAAAwFISqAAAAAIBRr6WlJSIskcboYWlOAACAoSVUBQAAAACMWgf3vRZRVhaLFy+OCEukZSmXy0U+ny/cHisBt55AX2/GyjkAAAAYiYSqAAAAAIBRq7tzX0SSRM3C5RERlkjLSC6Xi9lz5kZH+/7CtpEScEuHwY4UkDrUwfbXiwJ9vek5BwAAAJQeoSoAAAAAYNSrqGnIuoQxLZ/PR0f7/qhZuDwqahqia/eOERFwe+WVV+K973t/URisv5ID7YVAX28/f+lzAAAAQOkRqgIAAAAgM4cuBxYxsJlgRqvezktvLB3GSFNR0xCV9bOyLqPf9uzZUxQGa3/p+dj75D0D2sdIO+YePa/Fpf4609/Xy76U+vEBAADZEaoCAKCINzHHJm8kAJCF3pYDY2DnZaQsnwYjXU8wqmv3jqxLGXIH971WtGxhKb/ODMY4UsrHBwAAZEuoCgCAiDj8wjljizcSAMjCocuB9TiWmWBGk77Oy6FGyvJpwMjS3bmvsGxhRJT060x/Xy/74nUUAAA4EqEqAAAiovjC+bFcjGbk8kYCAFk7dGmssTATTH+M1CXDgNGhr78L08vtlcpMx14vAQCAoTCgUNXNN98cf//3fx9bt26NiRMnxllnnRX//b//95g9e3ahT0dHRyxfvjzuu+++6OzsjObm5rjzzjujrq6u0CeXy8Ull1wSP//5z2Py5MmxZMmSuPnmm2P8+N+X8/jjj8eyZctiy5Yt0dDQENdcc0185jOfOf4jBgDgiFyMBgAASlk6yGMZ62I956a2tnZI9m/ZVgAAYCwpH0jnJ554IpYuXRobNmyINWvWRFdXV5x77rnxxhtvFPpcfvnl8dOf/jQeeOCBeOKJJ+Lll1+OT3ziE4X2gwcPxoIFC+LAgQPx9NNPx/e///1YuXJlXHfddYU+27dvjwULFsQ555wTmzdvjssuuyw+//nPxyOPPDIIhwwAAAAAwEiTXrJ83rx5MW/evJg9Z27kcrmsS8vcoedm9py58corrwz646SX26tf8vWY8n7LxwMAAKPXgGaqevjhh4tur1y5MqZPnx4bN26MD3zgA7F379743ve+F/fee2988IMfjIiIu+++O+bOnRsbNmyIM888Mx599NF48cUX42c/+1nU1dXF6aefHl/60pfiqquuihtuuCEmTJgQd911V8ycOTNuvfXWiIiYO3duPPXUU3HbbbdFc3PzIB06AAAAAMDgO9KSaMc7s1J6JqKhmqEpvbxbKc0EdeiS5Zax/r30uYmI2L3q1tizZ8+QPV7PDMeWawUAAEazAYWqDrV3796IiJg2bVpERGzcuDG6urpi/vz5hT5z5syJxsbGWL9+fZx55pmxfv36OO2004qWA2xubo5LLrkktmzZEn/yJ38S69evL9pHT5/LLrusz1o6Ozujs7OzcLutre14Dg0AAMacI735x+ArpTcoASCtJ1Djd4OBS88W1JeqiZNi29aWAf8ecOi+j3U/R3Po8m5D9TjHw5Llfauoaci6BAAAgFHjmENV3d3dcdlll8V73/veePvb3x4REa2trTFhwoSYOnVqUd+6urpobW0t9EkHqnrae9qO1KetrS3a29tj4sSJh9Vz8803x4033nishwMAAGNWf978Y/CV4huUAHBooIaBOXQmpUMdz8xKvc1ENBQzNKWXdxvKx+Ho0sFGgXwAAIDhd8yhqqVLl8avf/3reOqppwaznmN29dVXx7Jlywq329raoqHBp3IAAOBojvbmH4PPUjUAlKp0oObNvTtj75P3ZF3SiDSUMykN1+9rfi/MTm8fehDIBwAAGH7HFKq69NJLY9WqVbFu3bqYMWNGYXt9fX0cOHAg9uzZUzRb1c6dO6O+vr7Q59lnny3a386dOwttPd97tqX7VFdX9zpLVUREZWVlVFZWHsvhAAAAYRkVAOD3BGogO4d+6KEUAvktLS1RW1t7zPftD7NxAQAApWZAoaokSeKLX/xi/PjHP47HH388Zs6cWdQ+b968qKioiLVr18aiRYsiImLbtm2Ry+WiqakpIiKampriy1/+cuzatSumT58eERFr1qyJ6urqOPXUUwt9HnrooaJ9r1mzprAPAAAAAACGR39DMb0RlDl2pfChh/SsWVUTJ8WPHrj/mO7bH2bjAgAASs2AQlVLly6Ne++9Nx588ME48cQTo7W1NSIipkyZEhMnTowpU6bEhRdeGMuWLYtp06ZFdXV1fPGLX4ympqY488wzIyLi3HPPjVNPPTUuuOCCuOWWW6K1tTWuueaaWLp0aWGmqYsvvjjuuOOOuPLKK+Nzn/tcPPbYY3H//ffH6tWrB/nwAQAAAADozUBDMb3pCcowMvXMmjWl6ZOxd/0PY8+ePQO+b3+WGS+F2bgAAAAONaBQ1be+9a2IiDj77LOLtt99993xmc98JiIibrvttigvL49FixZFZ2dnNDc3x5133lnoO27cuFi1alVccskl0dTUFCeccEIsWbIkbrrppkKfmTNnxurVq+Pyyy+P22+/PWbMmBHf/e53o7m5+RgPEwAAAACAgRhIKKY36aAMI9u4KdOP+b6lMOMWAADAsRjw8n9HU1VVFStWrIgVK1b02eeUU045bHm/Q5199tmxadOmgZQHAAAAAMAgO95QzPEsH3i8+7f84NEJvQEAAPRuQKEqAAAAAADoj8FYPvB492/5wb71nL//74orsy4FAACgJAlVAQAAAIwCQz0TTA+zvgD9lV4+8M29O2Pvk/cM2f57W57Q8oNH1nP+3uw6kHUpAAAAJUmoCgAAAGAEG+qZYA7VM+uLYBXQX70FngZ7/8ezPCEAAAD0RqgKAAAAYAQ72kwtgyk964tQFYx8vc1wN1yz3gEAAECpE6oCAAAAGAXM1AL013DPcAf9NdBQnxAgAAAwlISqAAAAAADGkCPNcNf+0vOx98l7MqqMsUrQDwAAKEVCVQAAAAAwgvVnphazuYxOhz6vtbW1A1qas7cZ7rp27xiU2mAgjnUpWyFAAABgKAlVAQAAAMAIZGaXsauv575q4qTYtrVlQMEqKCUDXcpWCBAAABhKQlUAAAAAMAINZGYXs7mMLr099127d8TuVbdGPp8XqgIAAIBBIFQFAAAAACNYf2Z2MZvL6DTQWX0AAACA/ivPugAAAAAAAAAAAIBSIlQFAAAAAAAAAACQYvk/AOhDLpeLfD6fdRmjQktLS9YlAAAAAAAAAPSbUBUA9CKXy8XsOXOjo31/1qUAAAAAAAAAMMyEqgCgF/l8Pjra90fNwuVRUdOQdTkjXvtLz8feJ+/JugwAAAAAAACAfhGqAoAjqKhpiMr6WVmXMeJ17d6RdQkAAAAworW0tBR9BwAAYGgJVQEAAAAADKPeQjG1tbUZVMJIcHDfaxFlZbF48eIB3W/79u1DVBEAAMDYIFQFAAAAwJDqa1aVsTjbSn+OeSyel7HiSOGYqomT4kcP3J9BVZS67s59EUkSNQuXR0VNQ7S/9HzsffKePvv3/Jxde+21w1glAADA6CNUBQAAAMCQONbZVUYj54KIw8MxPbp274jdq26NPXv2ZFccJa+ipiEq62dF1+4dR+zX83M2+bQPx74X1gxTdQAAAKOPUBUAAAAAQ6KvAEmPo822Mpoc7VykjaXzMlb1hGMofSN5Cb3yydOyLgEAAGBEE6oCAAAAYEj1FSA52mwro1F/wjRj8bxAqTmWJfR6lu60hCcAAMDoIFQFAAAAAAApA1lCz/KeAAAAo5NQFQAAAAAA9KI/S+gdurynJTwBAABGB6EqAAAAGKUsP3Q454SjOdrPSG1tbTQ2Ng5TNcBI0rO8pyU8AQAARgehKgAAABhlLEMEA9ff/zdVEyfFtq0tglUwgrzyyitx0kknZV0GAAAAI4xQFQAAAIwyhy5DxO9Zkom+9Of/TdfuHbF71a2Rz+eFqmAE6AlLfmLRn8bf/92Psi4HAACAEUaoCgAAAEapnmWI+D1LMnE0/t/A6NETljzQ2RF79uzJuhwAAABGGKEqAAAAABjjWlpair4DAAAAjHVCVQAAAAAwRvUsj7Z48eKsSwEAAAAoKUJVAAAAADBG9SyPVrNweVTUNET7S8/H3ifvybosAAAAgMyVZ10AAAAAAJCtipqGqKyfFeOn1GVdCgAAAEBJMFMVmcvlcpHP57MuA6BIS0tL1iUAMAZkPd5k/fgAAAAAAFCqhKrIVC6Xi9lz5kZH+/6sSwEAgGFzcN9rEWVlsXjx4qxLATgmvYUyBTUBAAAAGE2EqshUPp+Pjvb9UbNweVTUNGRdDkBB+0vPx94n78m6DABGqe7OfRFJkvnvwcY7YKCEQgEAAAAYK4SqKAkVNQ1RWT8r6zIACrp278i6BADGgKx/DzbeAQN1pFCooCYAAAAAo4lQFQAAAAAD0lsoNOug5tGWH7Q8IQAAAAADIVQFAAAAwIhlSUIAAAAAhoJQFQAAAAAj1pGWJEyzPCEAAAAAAyFUBQAAAMCI19uShGlZL08IkLXt27dnXQIAAMCIIlQFAAAAAIwpLS0tx9UOI8nB9tcjysri2muvzboUAACAEUWoCgAAAAAYEw7uey2irCwWL16cdSkwbJID7RFJEpNP+3Dse2FN1uUAAACMGEJVAAAAAMCY0N25LyJJombh8qioaeizX/tLz8feJ+8Zxspg6JVPnpZ1CQAAACOKUBUAAAAAMKZU1DREZf2sPtu7du8YxmoAAACAUiRUBQAAAAAMi5aWluNqBwAAABguQlUAAAAAwJA6uO+1iLKyWLx4cdalAAAAAPSLUBUAAAAAMKS6O/dFJEnULFweFTUNffZrf+n52PvkPcNYGWSvpaUltm/fnnUZAAAAHEKoCgAAAAAYFhU1DVFZP6vP9q7dO4axGsiWGdwAAABKW3nWBQAAAAAAwFiTnsFt8p98NOtyAAAAOIRQFQAAAAAAZKSipiHGTa7JugwAAAAOIVQFAAAAAAAAAACQIlQFAAAAAAAAAACQMj7rAgAAAAAAGBwtLS1HvA0AAAD0j1AVAAAAAMAId3DfaxFlZbF48eKsS2EU2759e9YlAAAADBuhKgAAAACAEa67c19EkkTNwuVRUdNQ2N7+0vOx98l7MqyM0eBg++sRZWVx7bXXZl0KAADAsCnPugAAAAAAAAZHRU1DVNbPKnyNn1KXdUmMAsmB9ogkicmnfTjrUgAAAIaNUBUAAAAAAHBU5ZOnZV0CAADAsBGqAgAAAAAAAAAASBmfdQEAAAAAAAPV0tLSr20AAAAAx0KoCgAAAAAYMQ7uey2irCwWL16cdSkAAADAKCZUBQAAAACMGN2d+yKSJGoWLo+KmoaitvaXno+9T96TUWWUsu3bt2ddAgAAACOMUBUAAAAAMOJU1DREZf2som1du3dkVA0lrawsrr322qyrAAAAYIQpz7oAAAAAAAAYMkkSk0/7cNZVAAAAMMIIVQEAAAAAMKqVT56WdQkAAACMMEJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKUJVAAAAAAAAAAAAKeOzLgAAAAAAACArLS0tg9JnqGvIYl8AADCWCVUBAAAAAABjzsF9r0WUlcXixYvHdA0AAEDvhKoAAAAAAIAxp7tzX0SSRM3C5VFR03DEvu0vPR97n7wn0xr6a6hqBQCAsUaoCgAAAAAAGLMqahqisn7WEft07d6ReQ39NdS1AgDAWFGedQEAAAAAAAAAAAClRKgKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgZcChqnXr1sXHPvaxOPnkk6OsrCx+8pOfFLUnSRLXXXddnHTSSTFx4sSYP39+/OM//mNRn1dffTXOP//8qK6ujqlTp8aFF14Y+/btK+rzq1/9Kt7//vdHVVVVNDQ0xC233DLwowMAAAAAAAAAABigAYeq3njjjXjnO98ZK1as6LX9lltuiW984xtx1113xTPPPBMnnHBCNDc3R0dHR6HP+eefH1u2bIk1a9bEqlWrYt26dXHRRRcV2tva2uLcc8+NU045JTZu3Bhf/epX44YbbojvfOc7x3CIAAAAAAAAAAAA/Td+oHf46Ec/Gh/96Ed7bUuSJL7+9a/HNddcEx//+McjIuJ//s//GXV1dfGTn/wk/vzP/zxaWlri4Ycfjueeey7e/e53R0TEN7/5zfj3//7fx9/8zd/EySefHD/4wQ/iwIED8bd/+7cxYcKEeNvb3habN2+Or33ta0XhKwAAAAAAAAAAgME24JmqjmT79u3R2toa8+fPL2ybMmVKnHHGGbF+/fqIiFi/fn1MnTq1EKiKiJg/f36Ul5fHM888U+jzgQ98ICZMmFDo09zcHNu2bYvXXnut18fu7OyMtra2oi8AAAAAAAAAAICBGtRQVWtra0RE1NXVFW2vq6srtLW2tsb06dOL2sePHx/Tpk0r6tPbPtKPcaibb745pkyZUvhqaGg4/gMCAAAAAAAAAADGnEENVWXp6quvjr179xa+duzYkXVJAAAAAAAAAADACDSooar6+vqIiNi5c2fR9p07dxba6uvrY9euXUXtb775Zrz66qtFfXrbR/oxDlVZWRnV1dVFXwAAAAAAAAAAAAM1qKGqmTNnRn19faxdu7awra2tLZ555ploamqKiIimpqbYs2dPbNy4sdDnsccei+7u7jjjjDMKfdatWxddXV2FPmvWrInZs2fHW97ylsEsGQAAAAAAAAAAoMiAQ1X79u2LzZs3x+bNmyMiYvv27bF58+bI5XJRVlYWl112WfzVX/1V/O///b/jhRdeiE9/+tNx8sknx3nnnRcREXPnzo2PfOQj8YUvfCGeffbZ+MUvfhGXXnpp/Pmf/3mcfPLJERHxn//zf44JEybEhRdeGFu2bIkf/vCHcfvtt8eyZcsG7cABAAAAAAAAAAB6M36gd3j++efjnHPOKdzuCTotWbIkVq5cGVdeeWW88cYbcdFFF8WePXvife97Xzz88MNRVVVVuM8PfvCDuPTSS+NDH/pQlJeXx6JFi+Ib3/hGoX3KlCnx6KOPxtKlS2PevHlRW1sb1113XVx00UXHc6wAAAAAAAAAAABHNeBQ1dlnnx1JkvTZXlZWFjfddFPcdNNNffaZNm1a3HvvvUd8nHe84x3x5JNPDrQ8AAAAAAAAAACA4zLg5f8AAAAAAAAAAABGM6EqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAIASsH379qxLAAAAAP6VUBUAAAAAQIYO7nstoqwsrr322qxLAQAAAP6VUBUAAAAAQIa6O/dFJElMPu3DWZcCAAAA/CuhKgAAAACAElA+eVrWJQAAAAD/SqgKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgRagKAAAAAAAAAAAgZXzWBQAAAAAAMDa1tLQUfQcAAIBSIVQFAAAAAMCwOrjvtYiysli8eHHWpQAAAECvhKoAAAAAABhW3Z37IpIkahYuj4qahmh/6fnY++Q9WZcFAAAABeVZFwAAAAAAwNhUUdMQlfWzYvyUuqxLAQAAgCJCVQAAAAAAAAAAAClCVQAAAAAAAAAAAClCVQAAAAAAAAAAAClCVQAAAAAAAAAAACnjsy4AAAAAAEaSlpaWXv8NAAAAwOghVAUAAAAA/XBw32sRZWWxePHirEsBAAAAYIgJVQEAAABAP3R37otIkqhZuDwqahoiIqL9pedj75P3ZFwZAAAAAINNqAoAAAAABqCipiEq62dFRETX7h0ZVwMAAADAUCjPugAAAAAAAAAAAIBSIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAAAAACQIlQFAAAAAAzIwX2vZV0CAAAAwJASqgIAAAAABqS7c1/WJQAAAAAMKaEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAFKEqAAAAAAAAAACAlJIOVa1YsSLe+ta3RlVVVZxxxhnx7LPPZl0SAAAAAAAAAAAwypVsqOqHP/xhLFu2LK6//vr45S9/Ge985zujubk5du3alXVpAAAAAAAAAADAKDY+6wL68rWvfS2+8IUvxGc/+9mIiLjrrrti9erV8bd/+7fxl3/5l4f17+zsjM7OzsLtvXv3RkREW1vb8BTMMdm3b19ERHS2/lN0H+jIuBqA3+vavSMivD4NFudzZPA8jV2e++FXKue8VOpg8Hlu+zYaz81wHtORHqu3tqPV1lf7QI6pv32PtZaj9Rmsxx+KY+pP30O3p28fbNvV5/57+mVxTMfzc3ikYxqMOo73Z/7g3uL6+qq9t+M42r772u+RzmF/z1dfjrSvI/3s9fUzerR99PX4EYef2972P9DnpK9z39d9j3R+3tzbelznqrfz1PP4h34/dD89jvRzd7R99Pf/Y1+OdSw73jFwOGoczBpG2z4BAEajrld/GxH/koWQVyldPc9NkiRH7FeWHK1HBg4cOBCTJk2KH/3oR3HeeecVti9ZsiT27NkTDz744GH3ueGGG+LGG28cxioBAAAAAAAAAICRaMeOHTFjxow+20typqp8Ph8HDx6Murq6ou11dXWxdevWXu9z9dVXx7Jlywq3u7u749VXX42ampooKysb0no5dm1tbdHQ0BA7duyI6urqrMsBAGMTACXH2ARAqTE2AVBqjE0AlBpjU2lLkiRef/31OPnkk4/YryRDVceisrIyKisri7ZNnTo1m2IYsOrqai8kAJQUYxMApcbYBECpMTYBUGqMTQCUGmNT6ZoyZcpR+5QPQx0DVltbG+PGjYudO3cWbd+5c2fU19dnVBUAAAAAAAAAADAWlGSoasKECTFv3rxYu3ZtYVt3d3esXbs2mpqaMqwMAAAAAAAAAAAY7Up2+b9ly5bFkiVL4t3vfne85z3via9//evxxhtvxGc/+9msS2MQVVZWxvXXX3/Y0o0AkBVjEwClxtgEQKkxNgFQaoxNAJQaY9PoUJYkSZJ1EX2544474qtf/Wq0trbG6aefHt/4xjfijDPOyLosAAAAAAAAAABgFCvpUBUAAAAAAAAAAMBwK8+6AAAAAAAAAAAAgFIiVAUAAAAAAAAAAJAiVAUAAAAAAAAAAJAiVAUAAAAAAAAAAJAiVMWQ+A//4T9EY2NjVFVVxUknnRQXXHBBvPzyy0V9fvWrX8X73//+qKqqioaGhrjlllsO288DDzwQc+bMiaqqqjjttNPioYceKmpPkiSuu+66OOmkk2LixIkxf/78+Md//MchPTYARp7f/OY3ceGFF8bMmTNj4sSJ8Ud/9Edx/fXXx4EDB4r6GZsAGE5f/vKX46yzzopJkybF1KlTe+2Ty+ViwYIFMWnSpJg+fXpcccUV8eabbxb1efzxx+Nd73pXVFZWxqxZs2LlypWH7WfFihXx1re+NaqqquKMM86IZ599dgiOCICxwJgCwFBZt25dfOxjH4uTTz45ysrK4ic/+UlRe3+uu7366qtx/vnnR3V1dUydOjUuvPDC2LdvX1Gf/lwDBICIiJtvvjn+3b/7d3HiiSfG9OnT47zzzott27YV9eno6IilS5dGTU1NTJ48ORYtWhQ7d+4s6jNY1/gYfkJVDIlzzjkn7r///ti2bVv83d/9Xfzf//t/40//9E8L7W1tbXHuuefGKaecEhs3boyvfvWrccMNN8R3vvOdQp+nn346PvWpT8WFF14YmzZtivPOOy/OO++8+PWvf13oc8stt8Q3vvGNuOuuu+KZZ56JE044IZqbm6Ojo2NYjxeA0rZ169bo7u6Ob3/727Fly5a47bbb4q677or/9t/+W6GPsQmA4XbgwIH4sz/7s7jkkkt6bT948GAsWLAgDhw4EE8//XR8//vfj5UrV8Z1111X6LN9+/ZYsGBBnHPOObF58+a47LLL4vOf/3w88sgjhT4//OEPY9myZXH99dfHL3/5y3jnO98Zzc3NsWvXriE/RgBGF2MKAEPpjTfeiHe+852xYsWKXtv7c93t/PPPjy1btsSaNWti1apVsW7durjooosK7f25BggAPZ544olYunRpbNiwIdasWRNdXV1x7rnnxhtvvFHoc/nll8dPf/rTeOCBB+KJJ56Il19+OT7xiU8U2gfrGh8ZSWAYPPjgg0lZWVly4MCBJEmS5M4770ze8pa3JJ2dnYU+V111VTJ79uzC7f/0n/5TsmDBgqL9nHHGGclf/MVfJEmSJN3d3Ul9fX3y1a9+tdC+Z8+epLKyMvlf/+t/DeXhADAK3HLLLcnMmTMLt41NAGTl7rvvTqZMmXLY9oceeigpLy9PWltbC9u+9a1vJdXV1YXx6sorr0ze9ra3Fd3vk5/8ZNLc3Fy4/Z73vCdZunRp4fbBgweTk08+Obn55psH+UgAGO2MKQAMl4hIfvzjHxdu9+e624svvphERPLcc88V+vzDP/xDUlZWlvzud79LkqR/1wABoC+7du1KIiJ54oknkiT5l7GooqIieeCBBwp9WlpakohI1q9fnyTJ4F3jIxtmqmLIvfrqq/GDH/wgzjrrrKioqIiIiPXr18cHPvCBmDBhQqFfc3NzbNu2LV577bVCn/nz5xftq7m5OdavXx8R/5LWbG1tLeozZcqUOOOMMwp9AKAve/fujWnTphVuG5sAKDXr16+P0047Lerq6grbmpubo62tLbZs2VLoc6Sx6cCBA7Fx48aiPuXl5TF//nxjEwADYkwBIEv9ue62fv36mDp1arz73e8u9Jk/f36Ul5fHM888U+hztGuAANCXvXv3RkQU3l/auHFjdHV1FY1Pc+bMicbGxqLx6Xiv8ZEdoSqGzFVXXRUnnHBC1NTURC6XiwcffLDQ1traWvSiERGF262trUfsk25P36+3PgDQm3/6p3+Kb37zm/EXf/EXhW3GJgBKzfGMTW1tbdHe3h75fD4OHjxobALguBlTAMhSf667tba2xvTp04vax48fH9OmTTvq31DpxwCA3nR3d8dll10W733ve+Ptb397RPzL2DFhwoSYOnVqUd9Dx6fjvcZHdoSq6Le//Mu/jLKysiN+bd26tdD/iiuuiE2bNsWjjz4a48aNi09/+tORJEmGRwDAaDPQsSki4ne/+1185CMfiT/7sz+LL3zhCxlVDsBodSxjEwAAAABQ2pYuXRq//vWv47777su6FIbR+KwLYORYvnx5fOYznzlinz/8wz8s/Lu2tjZqa2vjj//4j2Pu3LnR0NAQGzZsiKampqivr4+dO3cW3bfndn19feF7b33S7T3bTjrppKI+p59++jEdIwAjy0DHppdffjnOOeecOOuss+I73/lOUT9jEwCDYaBj05HU19fHs88+W7Stv2NTdXV1TJw4McaNGxfjxo074vgFAP1RW1trTAEgM/257lZfXx+7du0qut+bb74Zr7766lH/hko/BgAc6tJLL41Vq1bFunXrYsaMGYXt9fX1ceDAgdizZ0/RbFWHvnd0vNf4yI6Zqui3P/iDP4g5c+Yc8Su9BnVad3d3RER0dnZGRERTU1OsW7cuurq6Cn3WrFkTs2fPjre85S2FPmvXri3az5o1a6KpqSkiImbOnBn19fVFfdra2uKZZ54p9AFgdBvI2PS73/0uzj777Jg3b17cfffdUV5e/GuQsQmAwXA8fzcdqqmpKV544YWiNwXWrFkT1dXVceqppxb6HGlsmjBhQsybN6+oT3d3d6xdu9bYBMCAGFMAyFJ/rrs1NTXFnj17YuPGjYU+jz32WHR3d8cZZ5xR6HO0a4AA0CNJkrj00kvjxz/+cTz22GMxc+bMovZ58+ZFRUVF0fi0bdu2yOVyRePT8V7jI0MJDLINGzYk3/zmN5NNmzYlv/nNb5K1a9cmZ511VvJHf/RHSUdHR5IkSbJnz56krq4uueCCC5Jf//rXyX333ZdMmjQp+fa3v13Yzy9+8Ytk/Pjxyd/8zd8kLS0tyfXXX59UVFQkL7zwQqHPV77ylWTq1KnJgw8+mPzqV79KPv7xjyczZ85M2tvbh/24AShdv/3tb5NZs2YlH/rQh5Lf/va3ySuvvFL46mFsAmC4/fM//3OyadOm5MYbb0wmT56cbNq0Kdm0aVPy+uuvJ0mSJG+++Wby9re/PTn33HOTzZs3Jw8//HDyB3/wB8nVV19d2MdLL72UTJo0KbniiiuSlpaWZMWKFcm4ceOShx9+uNDnvvvuSyorK5OVK1cmL774YnLRRRclU6dOTVpbW4f9mAEY2YwpAAyl119/vfB3UUQkX/va15JNmzYl//zP/5wkSf+uu33kIx9J/uRP/iR55plnkqeeeir5t//23yaf+tSnCu39uQYIAD0uueSSZMqUKcnjjz9e9N7S/v37C30uvvjipLGxMXnssceS559/PmlqakqampoK7YN1jY9sCFUx6H71q18l55xzTjJt2rSksrIyeetb35pcfPHFyW9/+9uifv/n//yf5H3ve19SWVmZ/Jt/82+Sr3zlK4ft6/7770/++I//OJkwYULytre9LVm9enVRe3d3d3LttdcmdXV1SWVlZfKhD30o2bZt25AeHwAjz913351ERK9facYmAIbTkiVLeh2bfv7znxf6/OY3v0k++tGPJhMnTkxqa2uT5cuXJ11dXUX7+fnPf56cfvrpyYQJE5I//MM/TO6+++7DHuub3/xm0tjYmEyYMCF5z3vek2zYsGGIjw6A0cqYAsBQ+fnPf97r30hLlixJkqR/1912796dfOpTn0omT56cVFdXJ5/97GcLH1zp0Z9rgACQJEmf7y2lr7+1t7cn/+W//JfkLW95SzJp0qTkP/7H/1j0of4kGbxrfAy/siRJkmGcGAsAAAAAAAAAAKCklWddAAAAAAAAAAAAQCkRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEgRqgIAAAAAAAAAAEj5/wEIGovPJP7+PQAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "fig, ax = plt.subplots(figsize=(30, 6)) # Set the figure size to 10x6\n", "bin_edges = [interval.left for interval in filtered_dates[\"bucket\"].cat.categories] + [\n", " filtered_dates[\"bucket\"].cat.categories[-1].right\n", "]\n", "plt.hist(filtered_dates[\"correct\"], bins=bin_edges, edgecolor=\"black\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "\n", "fig, ax = plt.subplots(figsize=(30, 6)) # Set the figure size to 10x6\n", "bin_edges = np.arange(filtered_dates[\"correct\"].min(), filtered_dates[\"correct\"].max() + 26, 25)\n", "plt.hist(filtered_dates[\"correct\"], bins=bin_edges, edgecolor=\"black\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "from collections import Counter\n", "from scipy.ndimage import convolve1d\n", "from scipy.ndimage import gaussian_filter1d\n", "from scipy.signal.windows import triang" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "def get_bin_idx(label, window=25):\n", " return int((label - (-3000)) // window)\n", "\n", "\n", "def get_lds_kernel_window(kernel, ks, sigma):\n", " assert kernel in [\"gaussian\", \"triang\", \"laplace\"]\n", " half_ks = (ks - 1) // 2\n", " if kernel == \"gaussian\":\n", " base_kernel = [0.0] * half_ks + [1.0] + [0.0] * half_ks\n", " kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(\n", " gaussian_filter1d(base_kernel, sigma=sigma)\n", " )\n", " elif kernel == \"triang\":\n", " kernel_window = triang(ks)\n", " else:\n", " laplace = lambda x: np.exp(-abs(x) / sigma) / (2.0 * sigma)\n", " kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(\n", " map(laplace, np.arange(-half_ks, half_ks + 1))\n", " )\n", "\n", " return kernel_window" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "bin_index_per_label = [get_bin_idx(label) for label in filtered_dates[\"correct\"]]\n", "\n", "Nb = max(bin_index_per_label) + 1\n", "\n", "num_samples_of_bins = dict(Counter(bin_index_per_label))\n", "emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]\n", "\n", "# lds_kernel_window: [ks,], here for example, we use gaussian, ks=5, sigma=2\n", "lds_kernel_window = get_lds_kernel_window(kernel=\"gaussian\", ks=5, sigma=2)\n", "# calculate effective label distribution: [Nb,]\n", "eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode=\"constant\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 113, 0, 35, 0, 36, 28, 1, 1, 70,\n", " 2, 87, 0, 193, 1, 16, 67, 287, 2,\n", " 52, 10, 443, 7, 85, 2, 45, 4, 14,\n", " 6, 1142, 20, 165, 13, 329, 1, 104, 27,\n", " 597, 23, 697, 98, 355, 44, 145, 80, 186,\n", " 136, 268, 52, 718, 99, 362, 79, 106, 58,\n", " 142, 24, 49, 104, 154, 150, 124, 125, 119,\n", " 107, 191, 113, 980, 214, 1154, 174, 827, 120,\n", " 408, 46, 223, 249, 122, 76, 69, 53, 134,\n", " 156, 252, 173, 198, 230, 627, 120, 373, 144,\n", " 848, 148, 705, 554, 814, 975, 4148, 3313, 3990,\n", " 1566, 3023, 1802, 3341, 1750, 2073, 1857, 6430, 4183,\n", " 3506, 1492, 2834, 3644, 3081, 2890, 5376, 3308, 7121,\n", " 3263, 5851, 3481, 1643, 4370, 5959, 3178, 6091, 5658,\n", " 7929, 3568, 7901, 6216, 13351, 7987, 4463, 866, 2411,\n", " 1642, 1168, 1217, 2032, 310, 1235, 15, 2443, 52,\n", " 1111, 0, 842, 0, 21, 0, 23, 6, 5,\n", " 0, 9, 0, 36, 5, 75, 0, 184, 1,\n", " 1, 0, 0, 0, 3, 0, 0, 0, 3,\n", " 0, 1, 0, 0, 0, 0, 0, 1, 0,\n", " 0, 0, 2, 0, 0, 0, 0, 0, 0,\n", " 0, 15, 1, 2, 0, 53, 0, 109, 21,\n", " 16, 14])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.array(emp_label_dist)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 143, 139, 162, 91, 93, 63, 119, 93, 148,\n", " 151, 314, 267, 282, 256, 492, 356, 410, 388,\n", " 689, 485, 576, 516, 512, 134, 135, 66, 1042,\n", " 1119, 1320, 1272, 1458, 498, 573, 444, 925, 710,\n", " 1331, 1354, 1628, 1150, 1211, 669, 726, 547, 744,\n", " 678, 1221, 1186, 1401, 1232, 1237, 656, 668, 380,\n", " 352, 343, 424, 445, 542, 609, 618, 575, 608,\n", " 604, 1342, 1496, 2443, 2478, 3071, 2334, 2445, 1476,\n", " 1466, 959, 956, 680, 680, 515, 410, 444, 607,\n", " 715, 848, 929, 1333, 1261, 1448, 1386, 1888, 1528,\n", " 2049, 2215, 2795, 2954, 6425, 8987, 12326, 13191, 14759,\n", " 12589, 12500, 10667, 11074, 10011, 13870, 14991, 16931, 16455,\n", " 16824, 14206, 13345, 12999, 16307, 16855, 19994, 20409, 22972,\n", " 21359, 19751, 17120, 19205, 17275, 19736, 23182, 26368, 24708,\n", " 28664, 28731, 35419, 36234, 37396, 30914, 26364, 15632, 9616,\n", " 6814, 7685, 5919, 5538, 4457, 5383, 3804, 4519, 3426,\n", " 3979, 1891, 1813, 816, 763, 46, 50, 32, 38,\n", " 18, 44, 46, 112, 109, 268, 250, 250, 175,\n", " 159, 1, 3, 2, 3, 2, 5, 2, 3,\n", " 3, 3, 0, 0, 0, 0, 0, 1, 0,\n", " 2, 1, 2, 1, 1, 0, 0, 0, 12,\n", " 15, 17, 17, 61, 52, 148, 171, 188, 151,\n", " 142, 47])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eff_label_dist" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# from loss import weighted_mse_loss\n", "\n", "# Use re-weighting based on effective label distribution, sample-wise weights: [Ns,]\n", "eff_num_per_label = [eff_label_dist[bin_idx] for bin_idx in bin_index_per_label]\n", "weights = [np.float32(1 / x) for x in eff_num_per_label]\n", "\n", "# calculate loss\n", "# loss = weighted_mse_loss(preds, labels, weights=weights)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(197667,)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.array(weights).shape" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "'ellipsis' object is not iterable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[40], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m preds, labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# assign each label to its corresponding bin (start from 0)\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# with your defined get_bin_idx(), return bin_index_per_label: [Ns,]\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m bin_index_per_label \u001b[38;5;241m=\u001b[39m [get_bin_idx(label) \u001b[38;5;28;01mfor\u001b[39;00m label \u001b[38;5;129;01min\u001b[39;00m labels]\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# calculate empirical (original) label distribution: [Nb,]\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# \"Nb\" is the number of bins\u001b[39;00m\n\u001b[1;32m 9\u001b[0m Nb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(bin_index_per_label) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n", "\u001b[0;31mTypeError\u001b[0m: 'ellipsis' object is not iterable" ] } ], "source": [ "# preds, labels: [Ns,], \"Ns\" is the number of total samples\n", "preds, labels = ..., ...\n", "# assign each label to its corresponding bin (start from 0)\n", "# with your defined get_bin_idx(), return bin_index_per_label: [Ns,]\n", "bin_index_per_label = [get_bin_idx(label) for label in labels]\n", "\n", "# calculate empirical (original) label distribution: [Nb,]\n", "# \"Nb\" is the number of bins\n", "Nb = max(bin_index_per_label) + 1\n", "num_samples_of_bins = dict(Counter(bin_index_per_label))\n", "emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]\n", "\n", "# lds_kernel_window: [ks,], here for example, we use gaussian, ks=5, sigma=2\n", "lds_kernel_window = get_lds_kernel_window(kernel=\"gaussian\", ks=5, sigma=2)\n", "# calculate effective label distribution: [Nb,]\n", "eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode=\"constant\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn.functional as F\n", "import numpy as np\n", "from scipy.ndimage import gaussian_filter1d\n", "from scipy.signal.windows import triang\n", "from scipy.ndimage import convolve1d\n", "\n", "\n", "def get_lds_kernel_window(kernel, ks, sigma):\n", " assert kernel in [\"gaussian\", \"triang\", \"laplace\"]\n", " half_ks = (ks - 1) // 2\n", "\n", " if kernel == \"gaussian\":\n", " base_kernel = [0.0] * half_ks + [1.0] + [0.0] * half_ks\n", " kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(\n", " gaussian_filter1d(base_kernel, sigma=sigma)\n", " )\n", " elif kernel == \"triang\":\n", " kernel_window = triang(ks)\n", " else:\n", " laplace = lambda x: np.exp(-abs(x) / sigma) / (2.0 * sigma)\n", " kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(\n", " map(laplace, np.arange(-half_ks, half_ks + 1))\n", " )\n", "\n", " return kernel_window\n", "\n", "\n", "def prepare_weights(labels, reweight, lds=True, lds_kernel=\"gaussian\", lds_ks=5, lds_sigma=2):\n", " assert reweight in {\"none\", \"inverse\", \"sqrt_inv\"}\n", " assert (\n", " reweight != \"none\" if lds else True\n", " ), \"Set reweight to 'sqrt_inv' (default) or 'inverse' when using LDS\"\n", "\n", " value_dict = {\n", " x: 0 for x in list(set(labels))\n", " } # initialize value dictionary with labels as keys\n", " for label in labels:\n", " value_dict[label] += 1 # increment counts of labels which occur multiple times\n", " if reweight == \"sqrt_inv\":\n", " value_dict = {k: np.sqrt(v) for k, v in value_dict.items()}\n", " elif reweight == \"inverse\":\n", " value_dict = {\n", " k: np.clip(v, 5, 1000) for k, v in value_dict.items()\n", " } # clip weights for inverse re-weight\n", " num_per_label = [value_dict[label] for label in labels]\n", " if not len(num_per_label) or reweight == \"none\":\n", " return None\n", " print(f\"Using re-weighting: [{reweight.upper()}]\")\n", "\n", " if lds:\n", " lds_kernel_window = get_lds_kernel_window(lds_kernel, lds_ks, lds_sigma)\n", " print(f\"Using LDS: [{lds_kernel.upper()}] ({lds_ks}/{lds_sigma})\")\n", " # apply kernel to the reweighted values\n", " smoothed_value = convolve1d(\n", " np.asarray([v for _, v in value_dict.items()]),\n", " weights=lds_kernel_window,\n", " mode=\"constant\",\n", " )\n", " value_dict_keys = list(value_dict.keys())\n", " num_per_label = [smoothed_value[value_dict_keys.index(label)] for label in labels]\n", "\n", " weights = [np.float32(1 / x) for x in num_per_label]\n", " scaling = len(weights) / np.sum(weights)\n", " weights = [scaling * x for x in weights]\n", "\n", " return torch.Tensor(weights)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using re-weighting: [SQRT_INV]\n", "Using LDS: [GAUSSIAN] (5/2)\n" ] }, { "data": { "text/plain": [ "tensor([3.7173, 0.5529, 0.5529, ..., 1.0680, 3.0975, 0.5203])" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prepare_weights(filtered_dates[\"correct\"], \"sqrt_inv\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using re-weighting: [SQRT_INV]\n", "Using LDS: [GAUSSIAN] (1000/1000)\n" ] } ], "source": [ "weights = np.array(\n", " prepare_weights(filtered_dates[\"correct\"], \"sqrt_inv\", lds_ks=1000, lds_sigma=1000)\n", ")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots(figsize=(30, 6)) # Set the figure size to 10x6\n", "bin_edges = np.arange(filtered_dates[\"correct\"].min(), filtered_dates[\"correct\"].max() + 51, 50)\n", "plt.hist(filtered_dates[\"correct\"] * weights, bins=bin_edges, edgecolor=\"black\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots(figsize=(30, 6)) # Set the figure size to 10x6\n", "bin_edges = np.arange(filtered_dates[\"correct\"].min(), filtered_dates[\"correct\"].max() + 51, 50)\n", "plt.hist(filtered_dates[\"correct\"], bins=bin_edges, edgecolor=\"black\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "from scipy.ndimage import gaussian_filter1d\n", "from scipy.signal.windows import triang\n", "from scipy.ndimage import convolve1d\n", "\n", "\n", "def get_lds_kernel_window(lds_kernel=\"gaussian\", lds_ks=9, lds_sigma=1):\n", " r\"\"\"Function to determine the label distribution smoothing kernel window\n", "\n", " lds_kernel (str): LDS kernel type\n", " lds_ks (int): LDS kernel size (should be an odd number).\n", " lds_sigma (float): LDS gaussian/laplace kernel sigma\n", " \"\"\"\n", "\n", " assert lds_kernel in [\"gaussian\", \"triang\", \"laplace\"]\n", " half_ks = (lds_ks - 1) // 2\n", "\n", " if lds_kernel == \"gaussian\":\n", " base_kernel = [0.0] * half_ks + [1.0] + [0.0] * half_ks\n", " kernel_window = gaussian_filter1d(base_kernel, sigma=lds_sigma) / max(\n", " gaussian_filter1d(base_kernel, sigma=lds_sigma)\n", " )\n", " elif lds_kernel == \"triang\":\n", " kernel_window = triang(lds_ks)\n", " else:\n", "\n", " def laplace(x):\n", " return np.exp(-abs(x) / lds_sigma) / (2.0 * lds_sigma)\n", "\n", " kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(\n", " map(laplace, np.arange(-half_ks, half_ks + 1))\n", " )\n", "\n", " return kernel_window\n", "\n", "\n", "def prepare_LDS_weights(\n", " labels,\n", " n_bins=None,\n", " label_range=None,\n", " reweight=\"inv\",\n", " lds_kernel=\"gaussian\",\n", " lds_ks=9,\n", " lds_sigma=1,\n", " max_rel_weight=None,\n", " show_plot=True,\n", "):\n", "\n", " assert reweight in {\"inv\", \"sqrt_inv\"}\n", " labels_shape = labels.shape\n", " if n_bins is None:\n", " labels = labels.astype(int)\n", " n_bins = np.max(labels) - np.min(labels)\n", " num_per_label, bin_edges = np.histogram(labels, bins=n_bins, range=label_range)\n", " new_labels = np.searchsorted(bin_edges, labels, side=\"left\")\n", " new_labels[new_labels == 0] = 1\n", " if reweight == \"sqrt_inv\":\n", " num_per_label = np.sqrt(num_per_label)\n", " lds_kernel_window = get_lds_kernel_window(\n", " lds_kernel=lds_kernel, lds_ks=lds_ks, lds_sigma=lds_sigma\n", " )\n", " smoothed_value = convolve1d(num_per_label, weights=lds_kernel_window, mode=\"constant\")\n", " if show_plot:\n", " plt.bar(\n", " bin_edges[:-1],\n", " num_per_label / num_per_label.sum(),\n", " width=(bin_edges[1] - bin_edges[0]),\n", " color=\"lime\",\n", " edgecolor=\"black\",\n", " label=\"original\",\n", " )\n", " plt.plot(\n", " bin_edges[:-1], smoothed_value / smoothed_value.sum(), color=\"red\", label=\"smoothed\"\n", " )\n", " plt.title(f\"Label distribution by bin (reweight={reweight})\")\n", " plt.legend(loc=\"best\")\n", " plt.show()\n", " num_per_label = smoothed_value[new_labels.flatten() - 1].reshape(*labels_shape)\n", " weights = 1 / num_per_label\n", " weights[num_per_label == 0] = 0\n", " if max_rel_weight is not None:\n", " weights = np.clip(weights, None, np.min(weights) * max_rel_weight)\n", " weights = weights / weights.sum() * len(labels)\n", " return torch.Tensor(weights)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "tensor([14.5464, 0.6404, 0.6404, ..., 0.8173, 2.5599, 0.8173])" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prepare_LDS_weights(\n", " filtered_dates[\"correct\"],\n", " n_bins=100,\n", " label_range=(-3000, 2021),\n", " reweight=\"sqrt_inv\",\n", " lds_ks=5,\n", " lds_sigma=2,\n", ")" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "w = prepare_LDS_weights(\n", " filtered_dates[\"correct\"],\n", " n_bins=500,\n", " label_range=(-3000, 2021),\n", " reweight=\"sqrt_inv\",\n", " lds_ks=20,\n", " lds_sigma=2,\n", ")" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([13.8139, 0.6039, 0.6039, ..., 0.8029, 2.6979, 0.7797])" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 800\n", "1 100\n", "2 100\n", "3 600\n", "4 -300\n", " ... \n", "719 -350\n", "720 -350\n", "721 -375\n", "722 -875\n", "723 -350\n", "Name: correct, Length: 197667, dtype: object" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_dates[\"correct\"]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ImageObject typeDescriptionCultureProduction dateProduction placeMaterialsSubjectsReg numberregion
0https://media.britishmuseum.org/media/Reposito...adzeAdze? of pecked and ground stone, grooved for ...Anasazi1-1600 (?)NaNstoneNaNAm1994,09.1north_america
1https://media.britishmuseum.org/media/Reposito...altarThree fragments of burnt clay that formed part...Middle Woodland Period200BC - 400ADNaNclayNaNAm,S.818north_america
2https://media.britishmuseum.org/media/Reposito...altarFragments of an altar or crematory basin made ...Middle Woodland Period200BC - 400ADNaNclayNaNAm,S.817north_america
3https://media.britishmuseum.org/media/Reposito...amuletChalchihuitl, amulet, pendant made of amazonst...Classic Maya400 - 800NaNamazoniteNaNAm.9685north_america
4https://media.britishmuseum.org/media/Reposito...arrowThree expanding stem arrow or spear points, ma...Early Woodland Period1000BC - 400ADNaNchertNaNAm,S.758.a-cnorth_america
.................................
719https://media.britishmuseum.org/media/Reposito...whistleAnthropomorphic whistle / whistle in the shape...Nasca100BC-600NaNpotterysociety/human lifeAm1954,05.196south_america
720https://media.britishmuseum.org/media/Reposito...whistleWhistle made of pottery, possibly modelled and...Nasca100BC-600NaNpotteryanimalAm1954,05.669south_america
721https://media.britishmuseum.org/media/Reposito...whistleAnthropomorphic whistle in the shape of the An...Nasca100BC-650NaNpotterysociety/human life; anthropomorphism; amphibia...Am1954,05.194south_america
722https://media.britishmuseum.org/media/Reposito...whistleOvoid shaped whistle made of tumbaga by lost-w...Zenu150BC-1600 (?)NaNtumbagaNaNAm.6877south_america
723https://media.britishmuseum.org/media/Reposito...whistling vesselRectangular spouted jar with bridge and (broke...Nasca100BC-600NaNpotterybirdAm1982,Q.944south_america
\n", "

201119 rows × 10 columns

\n", "
" ], "text/plain": [ " Image Object type \\\n", "0 https://media.britishmuseum.org/media/Reposito... adze \n", "1 https://media.britishmuseum.org/media/Reposito... altar \n", "2 https://media.britishmuseum.org/media/Reposito... altar \n", "3 https://media.britishmuseum.org/media/Reposito... amulet \n", "4 https://media.britishmuseum.org/media/Reposito... arrow \n", ".. ... ... \n", "719 https://media.britishmuseum.org/media/Reposito... whistle \n", "720 https://media.britishmuseum.org/media/Reposito... whistle \n", "721 https://media.britishmuseum.org/media/Reposito... whistle \n", "722 https://media.britishmuseum.org/media/Reposito... whistle \n", "723 https://media.britishmuseum.org/media/Reposito... whistling vessel \n", "\n", " Description \\\n", "0 Adze? of pecked and ground stone, grooved for ... \n", "1 Three fragments of burnt clay that formed part... \n", "2 Fragments of an altar or crematory basin made ... \n", "3 Chalchihuitl, amulet, pendant made of amazonst... \n", "4 Three expanding stem arrow or spear points, ma... \n", ".. ... \n", "719 Anthropomorphic whistle / whistle in the shape... \n", "720 Whistle made of pottery, possibly modelled and... \n", "721 Anthropomorphic whistle in the shape of the An... \n", "722 Ovoid shaped whistle made of tumbaga by lost-w... \n", "723 Rectangular spouted jar with bridge and (broke... \n", "\n", " Culture Production date Production place Materials \\\n", "0 Anasazi 1-1600 (?) NaN stone \n", "1 Middle Woodland Period 200BC - 400AD NaN clay \n", "2 Middle Woodland Period 200BC - 400AD NaN clay \n", "3 Classic Maya 400 - 800 NaN amazonite \n", "4 Early Woodland Period 1000BC - 400AD NaN chert \n", ".. ... ... ... ... \n", "719 Nasca 100BC-600 NaN pottery \n", "720 Nasca 100BC-600 NaN pottery \n", "721 Nasca 100BC-650 NaN pottery \n", "722 Zenu 150BC-1600 (?) NaN tumbaga \n", "723 Nasca 100BC-600 NaN pottery \n", "\n", " Subjects Reg number \\\n", "0 NaN Am1994,09.1 \n", "1 NaN Am,S.818 \n", "2 NaN Am,S.817 \n", "3 NaN Am.9685 \n", "4 NaN Am,S.758.a-c \n", ".. ... ... \n", "719 society/human life Am1954,05.196 \n", "720 animal Am1954,05.669 \n", "721 society/human life; anthropomorphism; amphibia... Am1954,05.194 \n", "722 NaN Am.6877 \n", "723 bird Am1982,Q.944 \n", "\n", " region \n", "0 north_america \n", "1 north_america \n", "2 north_america \n", "3 north_america \n", "4 north_america \n", ".. ... \n", "719 south_america \n", "720 south_america \n", "721 south_america \n", "722 south_america \n", "723 south_america \n", "\n", "[201119 rows x 10 columns]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "world_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "ArtifactClassification", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }