{ "cells": [ { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the two processed lookup tables.\n", "# NOTE(review): presumably object -> metadata and image file -> object; confirm the schemas.\n", "obj2info = pd.read_csv(\"../data/processed/OM_obj_to_info.csv\")\n", "file2obj = pd.read_csv(\"../data/processed/OM_file_to_obj.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Number of file2obj rows that reference each obj_num, most frequent first\n", "file_counts = file2obj[\"obj_num\"].value_counts()\n", "# NOTE(review): assumes one file2obj row per image file, so this is images per object - confirm" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "obj_num\n", "durom.1969.406 249\n", "durom.1973.47 191\n", "DUROM.1954.Spalding29.W 112\n", "durom.1960.2332 101\n", "durom.2014.1 76\n", " ... \n", "durom.2006.46.32 1\n", "durom.2006.44.16 1\n", "durom.2006.45.194 1\n", "durom.2006.46.13 1\n", "durom.1964.183 1\n", "Name: count, Length: 12642, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "file_counts" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Images per instanceNumber of instancesNumber of images
036962088
147032812
253601800
368535118
474713297
582231784
69110990
710+4567836
8Total387225725
\n", "
" ], "text/plain": [ " Images per instance Number of instances Number of images\n", "0 3 696 2088\n", "1 4 703 2812\n", "2 5 360 1800\n", "3 6 853 5118\n", "4 7 471 3297\n", "5 8 223 1784\n", "6 9 110 990\n", "7 10+ 456 7836\n", "8 Total 3872 25725" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# For each k: how many objects have exactly k images (index = k, value = number of objects)\n",
 "images_hist = file_counts.value_counts().sort_index()\n",
 "\n",
 "distribution_df = pd.DataFrame(\n",
 "    {\n",
 "        \"Images per instance\": images_hist.index,\n",
 "        \"Number of instances\": images_hist.values,\n",
 "    }\n",
 ")\n",
 "distribution_df[\"Number of images\"] = (\n",
 "    distribution_df[\"Images per instance\"] * distribution_df[\"Number of instances\"]\n",
 ")\n",
 "\n",
 "# Objects with 10 or more images are folded into a single '10+' row\n",
 "has_10plus = distribution_df[\"Images per instance\"] >= 10\n",
 "num_instances_10plus = distribution_df.loc[has_10plus, \"Number of instances\"].sum()\n",
 "num_images_10plus = distribution_df.loc[has_10plus, \"Number of images\"].sum()\n",
 "\n",
 "# Only objects with 3 to 9 images keep a row of their own\n",
 "distribution_df = distribution_df[\n",
 "    ~has_10plus & (distribution_df[\"Images per instance\"] > 2)\n",
 "]\n",
 "\n",
 "ten_plus_row = pd.DataFrame(\n",
 "    {\n",
 "        \"Images per instance\": [\"10+\"],\n",
 "        \"Number of instances\": [num_instances_10plus],\n",
 "        \"Number of images\": [num_images_10plus],\n",
 "    }\n",
 ")\n",
 "distribution_df = pd.concat([distribution_df, ten_plus_row], ignore_index=True)\n",
 "\n",
 "# The totals sum the rows shown above, so objects with 1 or 2 images are not included\n",
 "total_row = pd.DataFrame(\n",
 "    {\n",
 "        \"Images per instance\": [\"Total\"],\n",
 "        \"Number of instances\": [distribution_df[\"Number of instances\"].sum()],\n",
 "        \"Number of images\": [distribution_df[\"Number of images\"].sum()],\n",
 "    }\n",
 ")\n",
 "distribution_df = pd.concat([distribution_df, total_row], ignore_index=True)\n",
 "distribution_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"This distribution broadly follows that from Winterbottom's paper, with a few minor differences. \n", "\n", "I am not expecting it to be exactly the same, as winterbottom did not use the database at all, instead just looked at the images" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Assessing for alternative text labels" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "full_df = pd.read_excel(\"../data/raw/Durham_University_Museums_data.xlsx\")\n", "full_df = full_df.filter(regex=r\"^(?!Unnamed).*$\")\n", "full_df = full_df.dropna(subset=[\"description\"])" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ColumnNull Percentageunique_values
0object_number0.0053460
4description0.001191
6material4.436442
1object_name8.9626163
22alternative_number18.3046165
13production.place34.423234
12production.period40.90414
3reproduction.reference50.2376
11production.date.end50.906923
10production.date.start51.04127
2other_name58.72968
9number_of_parts62.08949
8physical_description73.54485
14field_coll.place77.88812
16field_coll.method83.38546
18content.subject87.251449
7technique87.5822
21association.subject88.35516
15field_coll.notes91.09773
5label.text91.6978
20association.person95.54289
17content.person.name95.89247
19association.period97.7036718
\n", "
" ], "text/plain": [ " Column Null Percentage unique_values\n", "0 object_number 0.00 53460\n", "4 description 0.00 1191\n", "6 material 4.43 6442\n", "1 object_name 8.96 26163\n", "22 alternative_number 18.30 46165\n", "13 production.place 34.42 3234\n", "12 production.period 40.90 414\n", "3 reproduction.reference 50.23 76\n", "11 production.date.end 50.90 6923\n", "10 production.date.start 51.04 127\n", "2 other_name 58.72 968\n", "9 number_of_parts 62.08 949\n", "8 physical_description 73.54 485\n", "14 field_coll.place 77.88 812\n", "16 field_coll.method 83.38 546\n", "18 content.subject 87.25 1449\n", "7 technique 87.58 22\n", "21 association.subject 88.35 516\n", "15 field_coll.notes 91.09 773\n", "5 label.text 91.69 78\n", "20 association.person 95.54 289\n", "17 content.person.name 95.89 247\n", "19 association.period 97.70 36718" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Per-column null percentage and cardinality of full_df.\n",
 "# Both Series are indexed by full_df's columns in the same order, so they are combined\n",
 "# BEFORE sorting; attaching nunique().values to an already sorted frame pairs every\n",
 "# count with the wrong column.\n",
 "null_percentage = (full_df.isnull().sum() / len(full_df)) * 100\n",
 "desc_df = pd.DataFrame(\n",
 "    {\n",
 "        \"Column\": null_percentage.index,\n",
 "        \"Null Percentage\": null_percentage.round(2).values,\n",
 "        \"unique_values\": full_df.nunique().values,\n",
 "    }\n",
 ").sort_values(by=\"Null Percentage\")\n",
 "desc_df" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [
 "def get_distribution(df, column, lower_bound=2):\n",
 "    \"\"\"Tabulate how many distinct values of `column` occur k times in `df`.\n",
 "\n",
 "    A 'class size' k is the number of rows sharing one value of `column`\n",
 "    (missing values are not counted, as with `Series.value_counts`).\n",
 "\n",
 "    Rows of the result:\n",
 "      * one row per class size k with lower_bound < k < 10;\n",
 "      * four pooled rows for the half-open ranges [10, 50), [50, 100),\n",
 "        [100, 1000) and [1000, inf), labelled '10-50', '50-100',\n",
 "        '100-1000' and '1000+' (always present, zero when empty);\n",
 "      * a final 'Total' row summing the rows above, so values occurring\n",
 "        `lower_bound` times or fewer are left out of the totals.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    df : pandas.DataFrame\n",
 "        Frame containing `column`.\n",
 "    column : str\n",
 "        Name of the column whose value frequencies are summarised.\n",
 "    lower_bound : int, default 2\n",
 "        Class sizes less than or equal to this get no row of their own.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    pandas.DataFrame\n",
 "        Columns '<column>s per instance', 'Number of instances' and\n",
 "        'Number of <column>s', with a fresh 0..n-1 index.\n",
 "    \"\"\"\n",
 "    size_col = f\"{column}s per instance\"\n",
 "    instances_col = \"Number of instances\"\n",
 "    rows_col = f\"Number of {column}s\"\n",
 "\n",
 "    # value_counts twice: rows per value, then number of values per row count\n",
 "    size_hist = df[column].value_counts().value_counts().sort_index()\n",
 "    distribution = pd.DataFrame(\n",
 "        {size_col: size_hist.index, instances_col: size_hist.values}\n",
 "    )\n",
 "    distribution[rows_col] = distribution[size_col] * distribution[instances_col]\n",
 "    sizes = distribution[size_col]\n",
 "\n",
 "    # (label, inclusive lower edge, exclusive upper edge; None = open-ended)\n",
 "    buckets = [\n",
 "        (\"10-50\", 10, 50),\n",
 "        (\"50-100\", 50, 100),\n",
 "        (\"100-1000\", 100, 1000),\n",
 "        (\"1000+\", 1000, None),\n",
 "    ]\n",
 "    bucket_rows = []\n",
 "    for label, low, high in buckets:\n",
 "        in_bucket = sizes >= low\n",
 "        if high is not None:\n",
 "            in_bucket = in_bucket & (sizes < high)\n",
 "        bucket_rows.append(\n",
 "            {\n",
 "                size_col: label,\n",
 "                instances_col: distribution.loc[in_bucket, instances_col].sum(),\n",
 "                rows_col: distribution.loc[in_bucket, rows_col].sum(),\n",
 "            }\n",
 "        )\n",
 "\n",
 "    # Small class sizes keep one row each; everything from 10 up is pooled\n",
 "    small_sizes = distribution[(sizes > lower_bound) & (sizes < 10)]\n",
 "    distribution = pd.concat(\n",
 "        [small_sizes, pd.DataFrame(bucket_rows)], ignore_index=True\n",
 "    )\n",
 "\n",
 "    total_row = pd.DataFrame(\n",
 "        {\n",
 "            size_col: [\"Total\"],\n",
 "            instances_col: [distribution[instances_col].sum()],\n",
 "            rows_col: [distribution[rows_col].sum()],\n",
 "        }\n",
 "    )\n",
 "    return pd.concat([distribution, total_row], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
object_names per instanceNumber of instancesNumber of object_names
0393279
1457228
2553265
3632192
4727189
5824192
6927243
710-502274921
850-100513683
9100-10006517027
101000+720758
11Total66347977
\n", "
" ], "text/plain": [ " object_names per instance Number of instances Number of object_names\n", "0 3 93 279\n", "1 4 57 228\n", "2 5 53 265\n", "3 6 32 192\n", "4 7 27 189\n", "5 8 24 192\n", "6 9 27 243\n", "7 10-50 227 4921\n", "8 50-100 51 3683\n", "9 100-1000 65 17027\n", "10 1000+ 7 20758\n", "11 Total 663 47977" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Class-size table for object_name; with the default lower_bound=2, names that occur only once or twice get no row\n", "get_distribution(full_df, \"object_name\")" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
materials per instanceNumber of instancesNumber of materials
033090
141664
251470
36954
471070
58648
69545
710-50881975
850-100211409
9100-10004313030
101000+1234036
11Total25450891
\n", "
" ], "text/plain": [ " materials per instance Number of instances Number of materials\n", "0 3 30 90\n", "1 4 16 64\n", "2 5 14 70\n", "3 6 9 54\n", "4 7 10 70\n", "5 8 6 48\n", "6 9 5 45\n", "7 10-50 88 1975\n", "8 50-100 21 1409\n", "9 100-1000 43 13030\n", "10 1000+ 12 34036\n", "11 Total 254 50891" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same class-size table for material (default lower_bound=2: materials seen once or twice get no row)\n", "get_distribution(full_df, \"material\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Production date could be used for a regression task, and the other fields could be used for a classification task." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Year" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
production.date.starts per instanceNumber of instancesNumber of production.date.starts
01275275
12129258
2375225
3472288
4545225
5632192
6720140
7816128
8921189
910-501994226
1050-100392661
11100-10004110259
121000+47110
13Total96826176
\n", "
" ], "text/plain": [ " production.date.starts per instance Number of instances \\\n", "0 1 275 \n", "1 2 129 \n", "2 3 75 \n", "3 4 72 \n", "4 5 45 \n", "5 6 32 \n", "6 7 20 \n", "7 8 16 \n", "8 9 21 \n", "9 10-50 199 \n", "10 50-100 39 \n", "11 100-1000 41 \n", "12 1000+ 4 \n", "13 Total 968 \n", "\n", " Number of production.date.starts \n", "0 275 \n", "1 258 \n", "2 225 \n", "3 288 \n", "4 225 \n", "5 192 \n", "6 140 \n", "7 128 \n", "8 189 \n", "9 4226 \n", "10 2661 \n", "11 10259 \n", "12 7110 \n", "13 26176 " ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# lower_bound=0 so that start dates shared by only one or two records are listed as well\n", "get_distribution(full_df, \"production.date.start\", lower_bound=0)" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
production.date.ends per instanceNumber of instancesNumber of production.date.ends
01285285
12120240
2363189
3446184
4532160
5637222
6726182
7820160
8919171
910-502104562
1050-100412588
11100-10004711609
121000+35696
13Total94926248
\n", "
" ], "text/plain": [ " production.date.ends per instance Number of instances \\\n", "0 1 285 \n", "1 2 120 \n", "2 3 63 \n", "3 4 46 \n", "4 5 32 \n", "5 6 37 \n", "6 7 26 \n", "7 8 20 \n", "8 9 19 \n", "9 10-50 210 \n", "10 50-100 41 \n", "11 100-1000 47 \n", "12 1000+ 3 \n", "13 Total 949 \n", "\n", " Number of production.date.ends \n", "0 285 \n", "1 240 \n", "2 189 \n", "3 184 \n", "4 160 \n", "5 222 \n", "6 182 \n", "7 160 \n", "8 171 \n", "9 4562 \n", "10 2588 \n", "11 11609 \n", "12 5696 \n", "13 26248 " ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# lower_bound=0 so that end dates shared by only one or two records are listed as well\n", "get_distribution(full_df, \"production.date.end\", lower_bound=0)" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_yearend_yearyear_diffmid_year
2-3000-30000-3000
142-600-332268-466
143-1069-716353-893
147-716-332384-524
148-716-332384-524
...............
600812182224220
600821996199601996
600832016201602016
600841996199601996
600851996199601996
\n", "

26016 rows × 4 columns

\n", "
" ], "text/plain": [ " start_year end_year year_diff mid_year\n", "2 -3000 -3000 0 -3000\n", "142 -600 -332 268 -466\n", "143 -1069 -716 353 -893\n", "147 -716 -332 384 -524\n", "148 -716 -332 384 -524\n", "... ... ... ... ...\n", "60081 218 222 4 220\n", "60082 1996 1996 0 1996\n", "60083 2016 2016 0 2016\n", "60084 1996 1996 0 1996\n", "60085 1996 1996 0 1996\n", "\n", "[26016 rows x 4 columns]" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Pair up the production start/end dates and drop rows where either one is missing\n",
 "year_df = pd.DataFrame(\n",
 "    {\n",
 "        \"start_year\": full_df[\"production.date.start\"],\n",
 "        \"end_year\": full_df[\"production.date.end\"],\n",
 "    }\n",
 ").dropna()\n",
 "\n",
 "# Coerce once; anything that is not a number becomes NaN\n",
 "start_numeric = pd.to_numeric(year_df[\"start_year\"], errors=\"coerce\")\n",
 "end_numeric = pd.to_numeric(year_df[\"end_year\"], errors=\"coerce\")\n",
 "is_numeric = start_numeric.notna() & end_numeric.notna()\n",
 "\n",
 "# Rows whose start or end date is not numeric, kept for inspection\n",
 "non_numeric_instances = year_df[~is_numeric]\n",
 "\n",
 "# Build a fresh frame from the coerced values instead of assigning into a filtered\n",
 "# slice: no SettingWithCopyWarning, and numeric-looking strings such as '1996.0'\n",
 "# no longer make the integer conversion fail.\n",
 "year_df = pd.DataFrame(\n",
 "    {\n",
 "        \"start_year\": start_numeric[is_numeric].astype(int),\n",
 "        \"end_year\": end_numeric[is_numeric].astype(int),\n",
 "    }\n",
 ")\n",
 "year_df[\"year_diff\"] = year_df[\"end_year\"] - year_df[\"start_year\"]\n",
 "# Floor division rounds towards -inf, i.e. the same result as\n",
 "# floor(start + diff / 2), including for negative years\n",
 "year_df[\"mid_year\"] = (year_df[\"start_year\"] + year_df[\"end_year\"]) // 2\n",
 "year_df" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "year_df[\"mid_year\"].hist(bins=50)\n", "plt.xlabel(\"Mid Year\")\n", "plt.ylabel(\"Frequency\")\n", "plt.title(\"Distribution of Mid Year\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Object name" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "object_name\n", "sherds 5068\n", "photographs 4729\n", "coins 4609\n", "amulets 2485\n", "Woodblock Print 1386\n", "figures 1316\n", "vessels 1165\n", "bowls 807\n", "Papercut 731\n", "pages 647\n", "Slide 633\n", "jars 566\n", "Seal 554\n", "postcards 541\n", "vases 518\n", "Name: count, dtype: int64" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The 15 most frequent object names\n", "full_df[\"object_name\"].value_counts().head(15)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Material" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "material\n", "pottery 8844\n", "paper 5822\n", "metal 3739\n", "photographic paper 3396\n", "faience 2961\n", " ... \n", "Shell (Ostrich Egg) 1\n", "serpentinite 1\n", "balsa 1\n", "maple 1\n", "fabric art 1\n", "Name: count, Length: 414, dtype: int64" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full_df[\"material\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "ArtifactClassification", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }