In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [2]:
# Load the two processed lookup tables: image file -> object, and object -> metadata.
file2obj = pd.read_csv("../data/processed/OM_file_to_obj.csv")
obj2info = pd.read_csv("../data/processed/OM_obj_to_info.csv")

# These metadata columns are set aside for now; they would need cleaning
# before anything useful could be done with them.
unused_cols = ["number_of_parts", "production.date.start", "production.date.end", "obj_num_old"]
obj2info = obj2info.drop(columns=unused_cols)

# Build each image path by joining the file's directory ("root") with its name.
file2obj["image"] = [
    os.path.join(root, name) for root, name in zip(file2obj["root"], file2obj["file"])
]
# file2obj.rename(columns={"obj_num": "label"}, inplace=True)

# Attach the object metadata to every image row. A left join keeps image rows
# even when no metadata row matches their obj_num.
join_df = file2obj[["obj_num", "file", "image", "root"]].merge(
    obj2info, on="obj_num", how="left"
)

In [3]:
# Preview the directory ("root") recorded for each image file.
file2obj["root"]

0        data/raw/images/fulling_mill/1985
1        data/raw/images/fulling_mill/1985
2        data/raw/images/fulling_mill/1985
3        data/raw/images/fulling_mill/1985
4        data/raw/images/fulling_mill/1985
                       ...                
37300        data/raw/images/egyptian/2014
37301        data/raw/images/egyptian/2014
37302        data/raw/images/egyptian/2014
37303        data/raw/images/egyptian/1963
37304        data/raw/images/egyptian/1963
Name: root, Length: 37305, dtype: object

In [4]:
# Preview the objects that have both a material and a description recorded.
# This returns a new frame; obj2info itself is not modified.
obj2info.dropna(subset=["material", "description"])

Unnamed: 0,obj_num,description,object_name,other_name,material,production.period,production.place
0,eg3,"squat shouldered jar, no rim",bowls,bowl,limestone,1st Dynasty,Egypt
1,eg64,axe-head,axes: woodworking tools,axe-head,granite,,Egypt
2,eg71,the working end of a fish tail knife with pres...,knives,knife,Flint/Chert,Naqada II,Egypt
3,eg75,seated figure of priest holding unrolled papyr...,Human Figurine,imhotep figurine,bronze,Late Period,Egypt
4,durom.1971.78,"seated woman, inset eyes (lost), headdress had...",Human Figurine,Hathor figurine,bronze,Late Period,Egypt
...,...,...,...,...,...,...,...
12349,durma.2020.3.2562,A silver Roman coin which is a part of the Pie...,coins,,metal,Roman,Rome
12350,durma.2020.3.2060,A silver Roman coin which is a part of the Pie...,coins,,metal,Roman,
12351,durma.2020.3.1446,A silver Roman coin which is a part of the Pie...,coins,,metal,Roman,Rome
12352,durma.2020.3.2042,A silver Roman coin which is a part of the Pie...,coins,,metal,Roman,Rome


In [5]:
label_col = "material"
lower_lim = 3  # a label must occur more than this many times to be kept

# Only objects with both a label and a description take part in the split.
labelled = obj2info.dropna(subset=[label_col, "description"])

# Drop rare labels: keep rows whose label count is strictly above lower_lim.
num_counts = labelled[label_col].value_counts()
frequent_labels = num_counts[num_counts > lower_lim].index
o2i_lim = labelled[labelled[label_col].isin(frequent_labels)]

# Stratified split in two steps: 40% is held out first, then 80% of that
# hold-out becomes the test set. Overall: 60% train / 8% val / 32% test.
train, val_test = train_test_split(
    o2i_lim,
    test_size=0.4,
    stratify=o2i_lim[label_col],
    random_state=42,
)
val, test = train_test_split(
    val_test,
    test_size=0.8,
    stratify=val_test[label_col],
    random_state=42,
)

In [6]:
from datasets import Dataset, DatasetDict

In [7]:
# Round-trip the joined frame through a `datasets.Dataset` and back to pandas;
# `ds` is the DataFrame that the later cells work on.
ds = Dataset.from_pandas(join_df).to_pandas()

In [9]:
# Display the round-tripped frame.
ds

Unnamed: 0,obj_num,file,image,root,description,object_name,other_name,material,production.period,production.place
0,durma.1985.15.68,1985.15.68.jpg,data/raw/images/fulling_mill/1985/1985.15.68.jpg,data/raw/images/fulling_mill/1985,2 fragments of a bowl with open fret work at t...,,Rim Sherds,pottery,Post-Medieval,
1,durma.1985.52.37,1985.52.37.ff2.jpg,data/raw/images/fulling_mill/1985/1985.52.37.f...,data/raw/images/fulling_mill/1985,Reconstructed small vessel (many pieces with s...,pottery,Pottery,pottery,Roman,
2,durma.1985.81.4496,1985.81.4496 d2.jpg,data/raw/images/fulling_mill/1985/1985.81.4496...,data/raw/images/fulling_mill/1985,Fragment of a Samian beaker. Panell decoration...,vessels,pottery,pottery,Roman,
3,durma.1985.9.1,1985.9.1.1-d4.jpg,data/raw/images/fulling_mill/1985/1985.9.1.1-d...,data/raw/images/fulling_mill/1985,2 Fragmentary Saxon Cinerary Urns + 1 relative...,,Cinerary Urns,pottery,Saxon,
4,durma.1985.52.37,1985.52.37.sf2.jpg,data/raw/images/fulling_mill/1985/1985.52.37.s...,data/raw/images/fulling_mill/1985,Reconstructed small vessel (many pieces with s...,pottery,Pottery,pottery,Roman,
...,...,...,...,...,...,...,...,...,...,...
37300,durom.2014.1.2,2014.1.2 bb.jpg,data/raw/images/egyptian/2014/2014.1.2 bb.jpg,data/raw/images/egyptian/2014,"One of a collection of 162 flint tools. Brown,...",blades,,Flint/Chert,Neolithic Period,Egypt
37301,durom.2014.1.71,2014.1.71 ll.jpg,data/raw/images/egyptian/2014/2014.1.71 ll.jpg,data/raw/images/egyptian/2014,"One of a collection of 162 flint tools. Large,...",axes: woodworking tools,,Flint/Chert,Neolithic Period,Egypt
37302,durom.2014.1.2,2014.1.2 rr.jpg,data/raw/images/egyptian/2014/2014.1.2 rr.jpg,data/raw/images/egyptian/2014,"One of a collection of 162 flint tools. Brown,...",blades,,Flint/Chert,Neolithic Period,Egypt
37303,durom.1963.4,1963.4.jpg,data/raw/images/egyptian/1963/1963.4.jpg,data/raw/images/egyptian/1963,The woman is dressed in Qing dynasty style and...,figures,牙雕母婴像,ivory,late Qing dynasty,China


In [10]:
index_col = "obj_num"
text_col = "obj_num"
label_col = "material"
lower_lim = 3
problem_type = "image"

# Columns that locate an image on disk. They vary per file rather than per
# object, so they are removed before the object-level split and re-attached
# afterwards.
image_cols = ["root", "file", "image"]


def _attach_image_paths(split_df, files_df, key, path_cols):
    """Left-join the image rows of `files_df` onto the objects in `split_df`.

    Parameters
    ----------
    split_df : pandas.DataFrame
        One split (train / val / test) holding one row per kept object.
    files_df : pandas.DataFrame
        Frame containing `key` plus the `path_cols` columns, one row per image.
    key : str
        Name of the column shared by both frames and used as the join key.
    path_cols : list of str
        Image-location columns to bring over from `files_df`.

    Returns
    -------
    pandas.DataFrame
        `split_df` with one output row per matching image row of `files_df`.
        Objects without any match are kept, with missing values in `path_cols`.
    """
    return split_df.merge(files_df[[key, *path_cols]], on=key, how="left")


# Collapse the per-image rows to one row per (object, label) pair, require the
# text and label columns to be present, and drop the image-location columns.
# NOTE(review): if one obj_num can carry several labels it will appear in more
# than one row here and could land in different splits -- confirm against the data.
o2i_lim = (
    ds.drop_duplicates(subset=[index_col, label_col])
    .dropna(subset=[text_col, label_col])
    .drop(columns=image_cols)
)

# Drop rare labels: keep rows whose label count is strictly above lower_lim.
num_counts = o2i_lim[label_col].value_counts()
o2i_lim = o2i_lim[o2i_lim[label_col].isin(num_counts[num_counts > lower_lim].index)]

# Stratified split: 41% is held out and then halved, i.e. roughly
# 59% train / 20.5% val / 20.5% test at object level.
train, val_test = train_test_split(
    o2i_lim, stratify=o2i_lim[label_col], test_size=0.41, random_state=42
)
val, test = train_test_split(
    val_test, stratify=val_test[label_col], test_size=0.5, random_state=42
)
print(train.shape, val.shape, test.shape, o2i_lim.shape)

if problem_type == "image":
    # Expand every split back to one row per image file.
    train, val, test = (
        _attach_image_paths(part, ds, index_col, image_cols) for part in (train, val, test)
    )
    print(train.shape, val.shape, test.shape, ds.shape)

# ds_dict = DatasetDict({"train": Dataset.from_pandas(train), "val": Dataset.from_pandas(val), "test": Dataset.from_pandas(test)})
# ds_dict

# if problem_type == "image":

# o2i_lim_ds = o2i_lim_ds.train_test_split(test_size=0.3, stratify_by_column=label_col, seed=42)
# o2i_lim_ds_valtest = o2i_lim_ds["test"].train_test_split(test_size=0.5, stratify_by_column=label_col, seed=42)
# o2i_lim_ds = DatasetDict({"train": o2i_lim_ds["train"], "val": o2i_lim_ds_valtest["train"], "test": o2i_lim_ds_valtest["test"]})

# if problem_type == "image":
#     file2obj = ds[["obj_num", "file", "image", "root"]].drop_duplicates(subset=["obj_num"], inplace=False)
#     train = o2i_lim_ds["train"].merge(file2obj, left_on="obj_num", right_on="obj_num", how="left")
#     val = o2i_lim_ds["val"].merge(file2obj, left_on="obj_num", right_on="obj_num", how="left")
#     test = o2i_lim_ds["test"].merge(file2obj, left_on="obj_num", right_on="obj_num", how="left")
#     o2i_lim_ds = DatasetDict({"train": train, "val": val, "test": test})
# o2i_lim_ds

(6819, 7) (2370, 7) (2370, 7) (11559, 7)
(19246, 10) (6743, 10) (7078, 10) (37305, 10)


In [None]:
# Display the filtered object-level table that was split above.
o2i_lim

Unnamed: 0,obj_num,description,object_name,other_name,material,production.period,production.place


In [None]:
# Remove the listed columns from `ds`. Names that are not present are skipped
# silently because of errors="ignore".
cols_to_drop = ["col1", "col2", "col3"]
ds = ds.drop(columns=cols_to_drop, errors="ignore")

In [None]:
# NOTE(review): `ds_all` and `args` are not defined in the cells visible here --
# confirm they exist in the kernel before this cell is run.

# Keep only rows that have both an image path and a label.
ds_lim = ds_all.dropna(subset=["image", args.label_col])
if "3D" in args.dataset:
    # For "3D" datasets additionally keep only the rows flagged in "original".
    # The mask is applied to the already-cleaned frame: filtering `ds_all`
    # here would silently undo the dropna above.
    ds_lim = ds_lim[ds_lim["original"]]

# Drop rare labels: keep rows whose label count is strictly above args.lower_lim.
num_counts = ds_lim[args.label_col].value_counts()
ds_lim = ds_lim[ds_lim[args.label_col].isin(num_counts[num_counts > args.lower_lim].index)]

# Hold out twice the requested test fraction, then split that hold-out evenly,
# so val and test each receive about args.testset_size of the data.
train, val_test = train_test_split(
    ds_lim,
    stratify=ds_lim[args.label_col],
    test_size=2 * args.testset_size,
    random_state=42,
)
val, test = train_test_split(
    val_test, stratify=val_test[args.label_col], test_size=0.5, random_state=42
)