{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "shadows", "1": "security", "2": "beagle", "3": "yes", "4": "brick", "5": "snowboarder", "6": "skiing", "7": "unknown", "8": "zoo", "9": "chopsticks", "10": "curtains", "11": "6", "12": "desert", "13": "snowboard", "14": "lanyard", "15": "woman", "16": "lying down", "17": "red and yellow", "18": "giraffe", "19": "church", "20": "2", "21": "down", "22": "tired", "23": "bicycle", "24": "birthday", "25": "jeep", "26": "purple", "27": "on road", "28": "pink", "29": "little girl", "30": "9:35", "31": "cup", "32": "cloudy", "33": "curtain", "34": "cross", "35": "out", "36": "door", "37": "stripes", "38": "sun", "39": "wedding", "40": "net", "41": "tower", "42": "doughnut", "43": "car", "44": "8:35", "45": "lady", "46": "on street", "47": "bus", "48": "smiling", "49": "blue and white", "50": "sleeping", "51": "big ben", "52": "plastic", "53": "2013", "54": "donut", "55": "no", "56": "happy", "57": "lg", "58": "chair", "59": "giraffes", "60": "bicycles", "61": "fashion", "62": "wine", "63": "black and white", "64": "tan", "65": "talking", "66": "suv", "67": "girl", "68": "tv", "69": "right", "70": "neon", "71": "clock", "72": "beige", "73": "nothing", "74": "women", "75": "smile", "76": "at table", "77": "8", "78": "calico", "79": "resting", "80": "talking on phone", "81": "windows", "82": "human", "83": "watching", "84": "necklace", "85": "hair", "86": "protection", "87": "large", "88": "natural", "89": "outside", "90": "low", "91": "hawaii", "92": "2010", "93": "table", "94": "snow", "95": "not there", "96": "small", "97": "shadow", "98": "wine tasting", "99": "tabby", "100": "clear", "101": "many", "102": "snowboarding", "103": "trees", "104": "backpack", "105": "park", "106": "style", "107": "walking", "108": "brown", "109": "bricks", "110": "in car", "111": "air", "112": "crossing", "113": "hat", "114": "solid", "115": "street", "116": "forest", "117": "exit", "118": "yellow", "119": "leather", "120": "gray", "121": "bedroom", "122": "they aren't", "123": "full", "124": "skateboarding", "125": "clock tower", "126": "camera", "127": "white", "128": "blue", "129": "0", "130": "plain", "131": "person", "132": "not sure", "133": "4", "134": "don't know", "135": "shrimp", "136": "shelter", "137": "canopy", "138": "7:35", "139": "red and blue", "140": "white and blue", "141": "platform", "142": "7:45", "143": "sky", "144": "roof", "145": "fence", "146": "king", "147": "skier", "148": "10", "149": "window", "150": "crown", "151": "sidewalk", "152": "double", "153": "train", "154": "soccer ball", "155": "green", "156": "dirt", "157": "gray and black", "158": "name tag", "159": "cat", "160": "bike rack", "161": "monitor", "162": "7", "163": "man", "164": "5", "165": "shade", "166": "queen", "167": "wall", "168": "stand", "169": "skateboard", "170": "cage", "171": "africa", "172": "3", "173": "french", "174": "arrow", "175": "soccer", "176": "woods", "177": "orange", "178": "laying down", "179": "ice cream", "180": "bikes", "181": "tent", "182": "plate", "183": "2000", "184": "ground", "185": "picnic table", "186": "boy", "187": "black", "188": "screen", "189": "dog", "190": "white and black", "191": "blonde", "192": "can't tell", "193": "ball", "194": "station", "195": "red", "196": "rack", "197": "1", "198": "photographer" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 129, "1": 197, "10": 148, "2": 20, "2000": 183, "2010": 92, "2013": 53, "3": 172, "4": 133, "5": 164, "6": 11, "7": 162, "7:35": 138, "7:45": 142, "8": 77, "8:35": 44, "9:35": 30, "africa": 171, "air": 111, "arrow": 174, "at table": 76, "backpack": 104, "ball": 193, "beagle": 2, "bedroom": 121, "beige": 72, "bicycle": 23, "bicycles": 60, "big ben": 51, "bike rack": 160, "bikes": 180, "birthday": 24, "black": 187, "black and white": 63, "blonde": 191, "blue": 128, "blue and white": 49, "boy": 186, "brick": 4, "bricks": 109, "brown": 108, "bus": 47, "cage": 170, "calico": 78, "camera": 126, "can't tell": 192, "canopy": 137, "car": 43, "cat": 159, "chair": 58, "chopsticks": 9, "church": 19, "clear": 100, "clock": 71, "clock tower": 125, "cloudy": 32, "cross": 34, "crossing": 112, "crown": 150, "cup": 31, "curtain": 33, "curtains": 10, "desert": 12, "dirt": 156, "dog": 189, "don't know": 134, "donut": 54, "door": 36, "double": 152, "doughnut": 42, "down": 21, "exit": 117, "fashion": 61, "fence": 145, "forest": 116, "french": 173, "full": 123, "giraffe": 18, "giraffes": 59, "girl": 67, "gray": 120, "gray and black": 157, "green": 155, "ground": 184, "hair": 85, "happy": 56, "hat": 113, "hawaii": 91, "human": 82, "ice cream": 179, "in car": 110, "jeep": 25, "king": 146, "lady": 45, "lanyard": 14, "large": 87, "laying down": 178, "leather": 119, "lg": 57, "little girl": 29, "low": 90, "lying down": 16, "man": 163, "many": 101, "monitor": 161, "name tag": 158, "natural": 88, "necklace": 84, "neon": 70, "net": 40, "no": 55, "not sure": 132, "not there": 95, "nothing": 73, "on road": 27, "on street": 46, "orange": 177, "out": 35, "outside": 89, "park": 105, "person": 131, "photographer": 198, "picnic table": 185, "pink": 28, "plain": 130, "plastic": 52, "plate": 182, "platform": 141, "protection": 86, "purple": 26, "queen": 166, "rack": 196, "red": 195, "red and blue": 139, "red and yellow": 17, "resting": 79, "right": 69, "roof": 144, "screen": 188, "security": 1, "shade": 165, "shadow": 97, "shadows": 0, "shelter": 136, "shrimp": 135, "sidewalk": 151, "skateboard": 169, "skateboarding": 124, "skier": 147, "skiing": 6, "sky": 143, "sleeping": 50, "small": 96, "smile": 75, "smiling": 48, "snow": 94, "snowboard": 13, "snowboarder": 5, "snowboarding": 102, "soccer": 175, "soccer ball": 154, "solid": 114, "stand": 168, "station": 194, "street": 115, "stripes": 37, "style": 106, "sun": 38, "suv": 66, "tabby": 99, "table": 93, "talking": 65, "talking on phone": 80, "tan": 64, "tent": 181, "they aren't": 122, "tired": 22, "tower": 41, "train": 153, "trees": 103, "tv": 68, "unknown": 7, "walking": 107, "wall": 167, "watching": 83, "wedding": 39, "white": 127, "white and black": 190, "white and blue": 140, "window": 149, "windows": 81, "wine": 62, "wine tasting": 98, "woman": 15, "women": 74, "woods": 176, "yellow": 118, "yes": 3, "zoo": 8 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.33.2", "type_vocab_size": 2, "vocab_size": 30522 }