In [1]:
import logging
import os
from pathlib import Path
import click
from dotenv import find_dotenv, load_dotenv

from datasets import load_dataset, ClassLabel
import numpy as np
import wandb
import yaml
from transformers.trainer_callback import EarlyStoppingCallback
from artifact_classification.utils import ConfigLoader
from torchvision.transforms import (
    Compose,
    Normalize,
    ToTensor,
    CenterCrop,
    Resize,
)
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    AutoTokenizer,
)
from sklearn.metrics import top_k_accuracy_score
import evaluate

In [2]:
config = "om3txt_name"

# Training args
args = ConfigLoader(config, "../configs/train_configs.yaml", "../configs/train_default.yaml")

# # Load dataset, filter out na inputs and labels and encode labels (as label column can change)
# dataset = load_dataset(args.dataset)  # , download_mode="force_redownload")
# dataset = dataset.filter(lambda example: example[args.label_column] is not None)
# if args.problem_type == "text":
#     dataset = dataset.filter(lambda example: example[args.text_column] is not None)
# dataset = dataset.rename_column(args.label_column, "label")
# if not isinstance(dataset["train"].features["label"], ClassLabel):
#     dataset = dataset.class_encode_column("label")

Updating with:
{'config': 'om3txt_name', 'dataset': 'james-burton/OrientalMuseum_min3-name-text', 'wandb_proj_name': 'OrientalMuesumText', 'model_base': 'microsoft/deberta-v3-base', 'problem_type': 'text'}


{'config': 'om3txt_name', 'fast_dev_run': False, 'do_train': True, 'do_predict': True, 'batch_size': 16, 'model_base': 'microsoft/deberta-v3-base', 'output_root': 'models/', 'num_epochs': 100, 'early_stopping_patience': 5, 'grad_accumulation_steps': 1, 'seed': 42, 'logging_steps': 10, 'lr_scheduler': 'linear', 'warmup_ratio': 0, 'weight_decay': 0, 'device': 'cuda', 'num_workers': 1, 'resume_from_checkpoint': False, 'predict_batch_size': 16, 'save_total_limit': 1, 'lr': 5e-05, 'pytorch2_0': False, 'max_length': 512, 'text_column': 'description', 'fp16': True, 'dataset': 'james-burton/OrientalMuseum_min3-name-text', 'wandb_proj_name': 'OrientalMuesumText', 'problem_type': 'text'}



In [2]:
import yaml

with open("../configs/train_configs.yaml", "r") as file:
    configs = list(yaml.safe_load_all(file))

config_names = " ".join([cfg["config"] for cfg in configs])
print(config_names)

testing om3_num om3_material om3_name om3txt_material om3txt_name om3-white_num om3-white_material om3-white_name om3-3Dwhite_num om3-3Dwhite_material om3-3Dwhite_name om3-3Dwhite-1frame_num om3-3Dwhite-1frame_material om3-3Dwhite-1frame_name om4_num om4_material om4_name om4txt_material om4txt_name om4-white_num om4-white_material om4-white_name om4-3Dwhite_num om4-3Dwhite_material om4-3Dwhite_name om4-3Dwhite-1frame_num om4-3Dwhite-1frame_material om4-3Dwhite-1frame_name om5_num om5_material om5_name om5txt_material om5txt_name om5-white_num om5-white_material om5-white_name om5-3Dwhite_num om5-3Dwhite_material om5-3Dwhite_name om5-3Dwhite-1frame_num om5-3Dwhite-1frame_material om5-3Dwhite-1frame_name om6_num om6_material om6_name om6txt_material om6txt_name om6-white_num om6-white_material om6-white_name om6-3Dwhite_num om6-3Dwhite_material om6-3Dwhite_name om6-3Dwhite-1frame_num om6-3Dwhite-1frame_material om6-3Dwhite-1frame_name om3-3DwhiteTVT_num om3-3DwhiteTVT_material om3-3Dwhi

In [4]:
" ".join(
    [cfg["config"] for cfg in configs if not ("txt" in cfg["config"] or "num" in cfg["config"])]
)
# " ".join([cfg["config"] for cfg in configs if "1frame" in cfg["config"]])

'testing om3_material om3_name om3-white_material om3-white_name om3-3Dwhite_material om3-3Dwhite_name om3-3Dwhite-1frame_material om3-3Dwhite-1frame_name om4_material om4_name om4-white_material om4-white_name om4-3Dwhite_material om4-3Dwhite_name om4-3Dwhite-1frame_material om4-3Dwhite-1frame_name om5_material om5_name om5-white_material om5-white_name om5-3Dwhite_material om5-3Dwhite_name om5-3Dwhite-1frame_material om5-3Dwhite-1frame_name om6_material om6_name om6-white_material om6-white_name om6-3Dwhite_material om6-3Dwhite_name om6-3Dwhite-1frame_material om6-3Dwhite-1frame_name om3-3DwhiteTVT_material om3-3DwhiteTVT_name'

In [37]:
l2i = {
    "Album Painting": 0,
    "Animal Figurine": 1,
    "Animal Mummy": 2,
    "Animal bone": 3,
    "Belt Hook": 4,
    "Blouse": 5,
    "Bolt": 6,
    "Box": 7,
    "Brush Pot": 8,
    "Cap": 9,
    "Case": 10,
    "Clay pipe (smoking)": 11,
    "Cosmetic and Medical Equipment and Implements": 12,
    "Cup And Saucer": 13,
    "DVDs": 14,
    "Dagger": 15,
    "Disc": 16,
    "Domestic Equipment and Utensils": 17,
    "Earring": 18,
    "Finger Ring": 19,
    "Funerary Cone": 20,
    "Funerary goods": 21,
    "Funerary money": 22,
    "Hanging": 23,
    "Heart Scarab": 24,
    "Human Figurine": 25,
    "Inkstick": 26,
    "Kite": 27,
    "Kohl Pot": 28,
    "Letter": 29,
    "Manuscript Page": 30,
    "Mat": 31,
    "Mica Painting": 32,
    "Miniature Painting": 33,
    "Mortar": 34,
    "Mummy Label": 35,
    "Oracle Bone": 36,
    "Ostraka": 37,
    "Palette": 38,
    "Panel": 39,
    "Part": 40,
    "Pendant": 41,
    "Pipe": 42,
    "Pith Painting": 43,
    "Plaque": 44,
    "Plate": 45,
    "Scarab Seal": 46,
    "Scarf": 47,
    "Screen": 48,
    "Seal": 49,
    "Slide": 50,
    "Stand": 51,
    "Thangka": 52,
    "Water Dropper": 53,
    "Water Pot": 54,
    "Woodblock Print": 55,
    "accessories": 56,
    "albums": 57,
    "amulets": 58,
    "animation cels": 59,
    "animation drawings": 60,
    "armor": 61,
    "arrowheads": 62,
    "axes: woodworking tools": 63,
    "badges": 64,
    "bags": 65,
    "bandages": 66,
    "baskets": 67,
    "beads": 68,
    "bells": 69,
    "belts": 70,
    "blades": 71,
    "books": 72,
    "bottles": 73,
    "bowls": 74,
    "boxes": 75,
    "bracelets": 76,
    "brick": 77,
    "brooches": 78,
    "brush washers": 79,
    "buckets": 80,
    "buckles": 81,
    "calligraphy": 82,
    "canopic jars": 83,
    "cards": 84,
    "carvings": 85,
    "chains": 86,
    "chessmen": 87,
    "chopsticks": 88,
    "claypipe": 89,
    "cloth": 90,
    "clothing": 91,
    "coats": 92,
    "coins": 93,
    "collar": 94,
    "compact discs": 95,
    "containers": 96,
    "coverings": 97,
    "covers": 98,
    "cups": 99,
    "deity figurine": 100,
    "diagrams": 101,
    "dishes": 102,
    "dolls": 103,
    "drawings": 104,
    "dresses": 105,
    "drums": 106,
    "earrings": 107,
    "embroidery": 108,
    "ensembles": 109,
    "envelopes": 110,
    "equipment for personal use: grooming, hygiene and health care": 111,
    "ewers": 112,
    "fans": 113,
    "figures": 114,
    "figurines": 115,
    "flags": 116,
    "flasks": 117,
    "furniture components": 118,
    "gaming counters": 119,
    "glassware": 120,
    "hairpins": 121,
    "handles": 122,
    "harnesses": 123,
    "hats": 124,
    "headdresses": 125,
    "heads": 126,
    "incense burners": 127,
    "inlays": 128,
    "jackets": 129,
    "jars": 130,
    "jewelry": 131,
    "juglets": 132,
    "jugs": 133,
    "keys": 134,
    "kimonos": 135,
    "knives": 136,
    "lamps": 137,
    "lanterns": 138,
    "lids": 139,
    "maces": 140,
    "masks": 141,
    "medals": 142,
    "mirrors": 143,
    "models": 144,
    "mounts": 145,
    "nails": 146,
    "necklaces": 147,
    "needles": 148,
    "netsukes": 149,
    "ornaments": 150,
    "pages": 151,
    "paintings": 152,
    "paper money": 153,
    "pendants": 154,
    "petticoats": 155,
    "photographs": 156,
    "pictures": 157,
    "pins": 158,
    "playing cards": 159,
    "poker": 160,
    "postage stamps": 161,
    "postcards": 162,
    "posters": 163,
    "pots": 164,
    "pottery": 165,
    "prints": 166,
    "puppets": 167,
    "purses": 168,
    "reliefs": 169,
    "rings": 170,
    "robes": 171,
    "rubbings": 172,
    "rugs": 173,
    "sandals": 174,
    "saris": 175,
    "sarongs": 176,
    "scabbards": 177,
    "scaraboids": 178,
    "scarabs": 179,
    "scrolls": 180,
    "seed": 181,
    "seppa": 182,
    "shadow puppets": 183,
    "shawls": 184,
    "shell": 185,
    "sherds": 186,
    "shields": 187,
    "shoes": 188,
    "sketches": 189,
    "skirts": 190,
    "snuff bottles": 191,
    "socks": 192,
    "spatulas": 193,
    "spoons": 194,
    "statues": 195,
    "statuettes": 196,
    "stelae": 197,
    "straps": 198,
    "studs": 199,
    "swords": 200,
    "tablets": 201,
    "tacks": 202,
    "tea bowls": 203,
    "teapots": 204,
    "tiles": 205,
    "tools": 206,
    "toys": 207,
    "trays": 208,
    "tubes": 209,
    "tweezers": 210,
    "underwear": 211,
    "unidentified": 212,
    "ushabti": 213,
    "utensils": 214,
    "vases": 215,
    "vessels": 216,
    "weight": 217,
    "weights": 218,
    "whorls": 219,
    "wood blocks": 220,
}

In [38]:
import json

In [39]:
# json dump
with open("l2i.json", "w") as f:
    json.dump({str(v): k for k, v in l2i.items()}, f)
# {str(v): k for k, v in l2i.items()}

In [7]:
from transformers import AutoConfig

In [45]:
config = AutoConfig.from_pretrained("james-burton/om6txt_name")

config.json:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

In [46]:
from PIL import Image

image_path = "../data/processed/OM_3Dimages_white/egyptian/1951/1951.42-tt_2.png"
image = Image.open(image_path)
image.show()

/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/james/snap/code/common/.cache/gio-modules/libgiolibproxy.so
eog: symbol lookup error: /snap/core20/current/lib/x86_64-linux-gnu/libpthread.so.0: undefined symbol: __libc_pthread_init, version GLIBC_PRIVATE
