{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "cross", "1": "white and blue", "2": "girl", "3": "wedding", "4": "woman", "5": "clear", "6": "tent", "7": "plain", "8": "king", "9": "arrow", "10": "they aren't", "11": "walking", "12": "4", "13": "plastic", "14": "outside", "15": "trees", "16": "7:45", "17": "africa", "18": "blue and white", "19": "crown", "20": "lg", "21": "yes", "22": "tower", "23": "at table", "24": "giraffes", "25": "security", "26": "full", "27": "in car", "28": "don't know", "29": "street", "30": "2013", "31": "dog", "32": "red", "33": "person", "34": "6", "35": "white", "36": "on street", "37": "many", "38": "human", "39": "snowboard", "40": "skateboarding", "41": "blonde", "42": "birthday", "43": "bikes", "44": "car", "45": "monitor", "46": "small", "47": "natural", "48": "9:35", "49": "net", "50": "tabby", "51": "jeep", "52": "roof", "53": "tv", "54": "exit", "55": "skiing", "56": "brown", "57": "church", "58": "smiling", "59": "clock", "60": "brick", "61": "zoo", "62": "sleeping", "63": "not sure", "64": "red and yellow", "65": "neon", "66": "man", "67": "rack", "68": "black", "69": "7", "70": "ice cream", "71": "talking on phone", "72": "photographer", "73": "camera", "74": "1", "75": "hair", "76": "white and black", "77": "0", "78": "snow", "79": "tired", "80": "wine", "81": "shelter", "82": "screen", "83": "cup", "84": "7:35", "85": "door", "86": "large", "87": "bicycles", "88": "beige", "89": "shadow", "90": "shadows", "91": "sidewalk", "92": "skateboard", "93": "bus", "94": "hat", "95": "watching", "96": "blue", "97": "fashion", "98": "hawaii", "99": "cloudy", "100": "crossing", "101": "curtain", "102": "picnic table", "103": "train", "104": "big ben", "105": "shrimp", "106": "giraffe", "107": "bedroom", "108": "8", "109": "talking", "110": "dirt", "111": "soccer ball",
"112": "sky", "113": "3", "114": "lanyard", "115": "calico", "116": "backpack", "117": "no", "118": "purple", "119": "on road", "120": "10", "121": "snowboarding", "122": "bicycle", "123": "platform", "124": "women", "125": "air", "126": "lady", "127": "chair", "128": "out", "129": "can't tell", "130": "leather", "131": "beagle", "132": "unknown", "133": "desert", "134": "boy", "135": "curtains", "136": "station", "137": "nothing", "138": "doughnut", "139": "yellow", "140": "clock tower", "141": "ball", "142": "suv", "143": "little girl", "144": "lying down", "145": "stand", "146": "bricks", "147": "black and white", "148": "resting", "149": "not there", "150": "laying down", "151": "2000", "152": "sun", "153": "skier", "154": "woods", "155": "down", "156": "red and blue", "157": "necklace", "158": "smile", "159": "green", "160": "donut", "161": "fence", "162": "canopy", "163": "orange", "164": "queen", "165": "cage", "166": "style", "167": "cat", "168": "gray and black", "169": "plate", "170": "protection", "171": "2", "172": "right", "173": "happy", "174": "2010", "175": "windows", "176": "ground", "177": "gray", "178": "solid", "179": "window", "180": "table", "181": "wall", "182": "chopsticks", "183": "double", "184": "8:35", "185": "tan", "186": "soccer", "187": "snowboarder", "188": "name tag", "189": "bike rack", "190": "wine tasting", "191": "shade", "192": "low", "193": "park", "194": "stripes", "195": "pink", "196": "forest", "197": "french", "198": "5" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 77, "1": 74, "10": 120, "2": 171, "2000": 151, "2010": 174, "2013": 30, "3": 113, "4": 12, "5": 198, "6": 34, "7": 69, "7:35": 84, "7:45": 16, "8": 108, "8:35": 184, "9:35": 48, "africa": 17, "air": 125, "arrow": 9, "at table": 23, "backpack": 116, "ball": 141, "beagle": 131, "bedroom": 107, "beige": 88, "bicycle": 122, "bicycles": 87, "big ben": 104, "bike rack": 189, "bikes": 43, "birthday": 42, "black": 
68, "black and white": 147, "blonde": 41, "blue": 96, "blue and white": 18, "boy": 134, "brick": 60, "bricks": 146, "brown": 56, "bus": 93, "cage": 165, "calico": 115, "camera": 73, "can't tell": 129, "canopy": 162, "car": 44, "cat": 167, "chair": 127, "chopsticks": 182, "church": 57, "clear": 5, "clock": 59, "clock tower": 140, "cloudy": 99, "cross": 0, "crossing": 100, "crown": 19, "cup": 83, "curtain": 101, "curtains": 135, "desert": 133, "dirt": 110, "dog": 31, "don't know": 28, "donut": 160, "door": 85, "double": 183, "doughnut": 138, "down": 155, "exit": 54, "fashion": 97, "fence": 161, "forest": 196, "french": 197, "full": 26, "giraffe": 106, "giraffes": 24, "girl": 2, "gray": 177, "gray and black": 168, "green": 159, "ground": 176, "hair": 75, "happy": 173, "hat": 94, "hawaii": 98, "human": 38, "ice cream": 70, "in car": 27, "jeep": 51, "king": 8, "lady": 126, "lanyard": 114, "large": 86, "laying down": 150, "leather": 130, "lg": 20, "little girl": 143, "low": 192, "lying down": 144, "man": 66, "many": 37, "monitor": 45, "name tag": 188, "natural": 47, "necklace": 157, "neon": 65, "net": 49, "no": 117, "not sure": 63, "not there": 149, "nothing": 137, "on road": 119, "on street": 36, "orange": 163, "out": 128, "outside": 14, "park": 193, "person": 33, "photographer": 72, "picnic table": 102, "pink": 195, "plain": 7, "plastic": 13, "plate": 169, "platform": 123, "protection": 170, "purple": 118, "queen": 164, "rack": 67, "red": 32, "red and blue": 156, "red and yellow": 64, "resting": 148, "right": 172, "roof": 52, "screen": 82, "security": 25, "shade": 191, "shadow": 89, "shadows": 90, "shelter": 81, "shrimp": 105, "sidewalk": 91, "skateboard": 92, "skateboarding": 40, "skier": 153, "skiing": 55, "sky": 112, "sleeping": 62, "small": 46, "smile": 158, "smiling": 58, "snow": 78, "snowboard": 39, "snowboarder": 187, "snowboarding": 121, "soccer": 186, "soccer ball": 111, "solid": 178, "stand": 145, "station": 136, "street": 29, "stripes": 194, "style": 166, 
"sun": 152, "suv": 142, "tabby": 50, "table": 180, "talking": 109, "talking on phone": 71, "tan": 185, "tent": 6, "they aren't": 10, "tired": 79, "tower": 22, "train": 103, "trees": 15, "tv": 53, "unknown": 132, "walking": 11, "wall": 181, "watching": 95, "wedding": 3, "white": 35, "white and black": 76, "white and blue": 1, "window": 179, "windows": 175, "wine": 80, "wine tasting": 190, "woman": 4, "women": 124, "woods": 154, "yellow": 139, "yes": 21, "zoo": 61 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.46.2", "type_vocab_size": 2, "vocab_size": 30522 }