{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "brick", "1": "many", "2": "cup", "3": "stripes", "4": "church", "5": "on street", "6": "shade", "7": "don't know", "8": "curtain", "9": "wall", "10": "clock tower", "11": "2000", "12": "plastic", "13": "9:35", "14": "hawaii", "15": "cloudy", "16": "snowboard", "17": "birthday", "18": "pink", "19": "snowboarder", "20": "plate", "21": "window", "22": "windows", "23": "security", "24": "crossing", "25": "sky", "26": "yes", "27": "outside", "28": "laying down", "29": "necklace", "30": "brown", "31": "full", "32": "donut", "33": "king", "34": "6", "35": "tower", "36": "street", "37": "lady", "38": "smiling", "39": "woods", "40": "happy", "41": "white and blue", "42": "bicycles", "43": "gray", "44": "girl", "45": "rack", "46": "french", "47": "africa", "48": "ground", "49": "big ben", "50": "shadows", "51": "purple", "52": "out", "53": "talking", "54": "7:35", "55": "0", "56": "lying down", "57": "blonde", "58": "person", "59": "monitor", "60": "skiing", "61": "bicycle", "62": "sun", "63": "chopsticks", "64": "tabby", "65": "natural", "66": "on road", "67": "watching", "68": "10", "69": "blue", "70": "exit", "71": "8", "72": "double", "73": "giraffe", "74": "2", "75": "shelter", "76": "smile", "77": "soccer", "78": "chair", "79": "not sure", "80": "photographer", "81": "backpack", "82": "door", "83": "1", "84": "green", "85": "cat", "86": "lg", "87": "table", "88": "air", "89": "orange", "90": "walking", "91": "snow", "92": "nothing", "93": "wine", "94": "clock", "95": "car", "96": "2013", "97": "park", "98": "dirt", "99": "white", "100": "red and blue", "101": "net", "102": "desert", "103": "right", "104": "gray and black", "105": "canopy", "106": "plain", "107": "cage", "108": "little girl", "109": "man", "110": "beige", "111": "hair",
"112": "soccer ball", "113": "fence", "114": "name tag", "115": "wedding", "116": "no", "117": "fashion", "118": "tv", "119": "7:45", "120": "cross", "121": "forest", "122": "bikes", "123": "sleeping", "124": "tired", "125": "leather", "126": "hat", "127": "suv", "128": "shrimp", "129": "crown", "130": "platform", "131": "curtains", "132": "not there", "133": "talking on phone", "134": "3", "135": "women", "136": "human", "137": "black and white", "138": "down", "139": "bus", "140": "style", "141": "doughnut", "142": "can't tell", "143": "zoo", "144": "clear", "145": "black", "146": "woman", "147": "large", "148": "resting", "149": "giraffes", "150": "bricks", "151": "yellow", "152": "screen", "153": "skateboarding", "154": "skateboard", "155": "solid", "156": "in car", "157": "ice cream", "158": "red and yellow", "159": "neon", "160": "blue and white", "161": "boy", "162": "bedroom", "163": "train", "164": "station", "165": "8:35", "166": "low", "167": "trees", "168": "7", "169": "roof", "170": "beagle", "171": "4", "172": "at table", "173": "bike rack", "174": "they aren't", "175": "sidewalk", "176": "tan", "177": "skier", "178": "small", "179": "stand", "180": "lanyard", "181": "queen", "182": "shadow", "183": "calico", "184": "arrow", "185": "dog", "186": "jeep", "187": "camera", "188": "red", "189": "5", "190": "picnic table", "191": "tent", "192": "protection", "193": "unknown", "194": "white and black", "195": "wine tasting", "196": "2010", "197": "ball", "198": "snowboarding" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 55, "1": 83, "10": 68, "2": 74, "2000": 11, "2010": 196, "2013": 96, "3": 134, "4": 171, "5": 189, "6": 34, "7": 168, "7:35": 54, "7:45": 119, "8": 71, "8:35": 165, "9:35": 13, "africa": 47, "air": 88, "arrow": 184, "at table": 172, "backpack": 81, "ball": 197, "beagle": 170, "bedroom": 162, "beige": 110, "bicycle": 61, "bicycles": 42, "big ben": 49, "bike rack": 173, "bikes": 122, "birthday": 17, "black":
145, "black and white": 137, "blonde": 57, "blue": 69, "blue and white": 160, "boy": 161, "brick": 0, "bricks": 150, "brown": 30, "bus": 139, "cage": 107, "calico": 183, "camera": 187, "can't tell": 142, "canopy": 105, "car": 95, "cat": 85, "chair": 78, "chopsticks": 63, "church": 4, "clear": 144, "clock": 94, "clock tower": 10, "cloudy": 15, "cross": 120, "crossing": 24, "crown": 129, "cup": 2, "curtain": 8, "curtains": 131, "desert": 102, "dirt": 98, "dog": 185, "don't know": 7, "donut": 32, "door": 82, "double": 72, "doughnut": 141, "down": 138, "exit": 70, "fashion": 117, "fence": 113, "forest": 121, "french": 46, "full": 31, "giraffe": 73, "giraffes": 149, "girl": 44, "gray": 43, "gray and black": 104, "green": 84, "ground": 48, "hair": 111, "happy": 40, "hat": 126, "hawaii": 14, "human": 136, "ice cream": 157, "in car": 156, "jeep": 186, "king": 33, "lady": 37, "lanyard": 180, "large": 147, "laying down": 28, "leather": 125, "lg": 86, "little girl": 108, "low": 166, "lying down": 56, "man": 109, "many": 1, "monitor": 59, "name tag": 114, "natural": 65, "necklace": 29, "neon": 159, "net": 101, "no": 116, "not sure": 79, "not there": 132, "nothing": 92, "on road": 66, "on street": 5, "orange": 89, "out": 52, "outside": 27, "park": 97, "person": 58, "photographer": 80, "picnic table": 190, "pink": 18, "plain": 106, "plastic": 12, "plate": 20, "platform": 130, "protection": 192, "purple": 51, "queen": 181, "rack": 45, "red": 188, "red and blue": 100, "red and yellow": 158, "resting": 148, "right": 103, "roof": 169, "screen": 152, "security": 23, "shade": 6, "shadow": 182, "shadows": 50, "shelter": 75, "shrimp": 128, "sidewalk": 175, "skateboard": 154, "skateboarding": 153, "skier": 177, "skiing": 60, "sky": 25, "sleeping": 123, "small": 178, "smile": 76, "smiling": 38, "snow": 91, "snowboard": 16, "snowboarder": 19, "snowboarding": 198, "soccer": 77, "soccer ball": 112, "solid": 155, "stand": 179, "station": 164, "street": 36, "stripes": 3, "style": 140, "sun": 
62, "suv": 127, "tabby": 64, "table": 87, "talking": 53, "talking on phone": 133, "tan": 176, "tent": 191, "they aren't": 174, "tired": 124, "tower": 35, "train": 163, "trees": 167, "tv": 118, "unknown": 193, "walking": 90, "wall": 9, "watching": 67, "wedding": 115, "white": 99, "white and black": 194, "white and blue": 41, "window": 21, "windows": 22, "wine": 93, "wine tasting": 195, "woman": 146, "women": 135, "woods": 39, "yellow": 151, "yes": 26, "zoo": 143 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.46.2", "type_vocab_size": 2, "vocab_size": 30522 }