{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "small", "1": "talking on phone", "2": "large", "3": "stand", "4": "lying down", "5": "red", "6": "hat", "7": "3", "8": "cloudy", "9": "at table", "10": "pink", "11": "yellow", "12": "7:35", "13": "0", "14": "church", "15": "on street", "16": "little girl", "17": "they aren't", "18": "woman", "19": "white", "20": "king", "21": "8:35", "22": "bedroom", "23": "5", "24": "skier", "25": "name tag", "26": "door", "27": "can't tell", "28": "chair", "29": "yes", "30": "blue and white", "31": "wine", "32": "photographer", "33": "queen", "34": "suv", "35": "beige", "36": "shadow", "37": "forest", "38": "calico", "39": "resting", "40": "right", "41": "wedding", "42": "stripes", "43": "plate", "44": "woods", "45": "zoo", "46": "4", "47": "full", "48": "rack", "49": "tan", "50": "many", "51": "snow", "52": "big ben", "53": "don't know", "54": "chopsticks", "55": "gray", "56": "white and blue", "57": "tv", "58": "curtain", "59": "beagle", "60": "8", "61": "not sure", "62": "bikes", "63": "birthday", "64": "wall", "65": "wine tasting", "66": "shade", "67": "tent", "68": "7", "69": "tower", "70": "giraffe", "71": "necklace", "72": "ground", "73": "trees", "74": "2", "75": "hawaii", "76": "sidewalk", "77": "lanyard", "78": "bus", "79": "blue", "80": "human", "81": "boy", "82": "table", "83": "picnic table", "84": "ball", "85": "car", "86": "bicycles", "87": "7:45", "88": "french", "89": "shrimp", "90": "snowboarding", "91": "clock tower", "92": "not there", "93": "net", "94": "roof", "95": "fence", "96": "women", "97": "orange", "98": "cup", "99": "out", "100": "no", "101": "screen", "102": "on road", "103": "nothing", "104": "curtains", "105": "low", "106": "cross", "107": "outside", "108": "talking", "109": "hair", "110": "dirt", "111": "brick", "112": "1", "113": "canopy", "114": "arrow", "115": "soccer ball", "116": "solid", "117": "windows", "118": "street", "119": "2000", "120": "ice cream", "121": "double", "122": "snowboard", "123": "shelter", "124": "tired", "125": "protection", "126": "platform", "127": "sky", "128": "lady", "129": "blonde", "130": "donut", "131": "girl", "132": "station", "133": "purple", "134": "skateboard", "135": "soccer", "136": "brown", "137": "desert", "138": "doughnut", "139": "crown", "140": "cat", "141": "red and yellow", "142": "walking", "143": "clock", "144": "tabby", "145": "watching", "146": "sleeping", "147": "dog", "148": "lg", "149": "leather", "150": "in car", "151": "exit", "152": "bike rack", "153": "backpack", "154": "man", "155": "style", "156": "person", "157": "clear", "158": "smile", "159": "crossing", "160": "camera", "161": "black and white", "162": "down", "163": "gray and black", "164": "plastic", "165": "bicycle", "166": "fashion", "167": "unknown", "168": "bricks", "169": "green", "170": "monitor", "171": "sun", "172": "9:35", "173": "2013", "174": "natural", "175": "snowboarder", "176": "africa", "177": "neon", "178": "smiling", "179": "cage", "180": "black", "181": "giraffes", "182": "air", "183": "train", "184": "skateboarding", "185": "6", "186": "10", "187": "shadows", "188": "red and blue", "189": "window", "190": "security", "191": "white and black", "192": "skiing", "193": "2010", "194": "plain", "195": "laying down", "196": "jeep", "197": "happy", "198": "park" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 13, "1": 112, "10": 186, "2": 74, "2000": 119, "2010": 193, "2013": 173, "3": 7, "4": 46, "5": 23, "6": 185, "7": 68, "7:35": 12, "7:45": 87, "8": 60, "8:35": 21, "9:35": 172, "africa": 176, "air": 182, "arrow": 114, "at table": 9, "backpack": 153, "ball": 84, "beagle": 59, "bedroom": 22, "beige": 35, "bicycle": 165, "bicycles": 86, "big ben": 52, "bike rack": 152, "bikes": 62, "birthday": 63, "black": 180, "black and white": 161, "blonde": 129, "blue": 79, "blue and white": 30, "boy": 81, "brick": 111, "bricks": 168, "brown": 136, "bus": 78, "cage": 179, "calico": 38, "camera": 160, "can't tell": 27, "canopy": 113, "car": 85, "cat": 140, "chair": 28, "chopsticks": 54, "church": 14, "clear": 157, "clock": 143, "clock tower": 91, "cloudy": 8, "cross": 106, "crossing": 159, "crown": 139, "cup": 98, "curtain": 58, "curtains": 104, "desert": 137, "dirt": 110, "dog": 147, "don't know": 53, "donut": 130, "door": 26, "double": 121, "doughnut": 138, "down": 162, "exit": 151, "fashion": 166, "fence": 95, "forest": 37, "french": 88, "full": 47, "giraffe": 70, "giraffes": 181, "girl": 131, "gray": 55, "gray and black": 163, "green": 169, "ground": 72, "hair": 109, "happy": 197, "hat": 6, "hawaii": 75, "human": 80, "ice cream": 120, "in car": 150, "jeep": 196, "king": 20, "lady": 128, "lanyard": 77, "large": 2, "laying down": 195, "leather": 149, "lg": 148, "little girl": 16, "low": 105, "lying down": 4, "man": 154, "many": 50, "monitor": 170, "name tag": 25, "natural": 174, "necklace": 71, "neon": 177, "net": 93, "no": 100, "not sure": 61, "not there": 92, "nothing": 103, "on road": 102, "on street": 15, "orange": 97, "out": 99, "outside": 107, "park": 198, "person": 156, "photographer": 32, "picnic table": 83, "pink": 10, "plain": 194, "plastic": 164, "plate": 43, "platform": 126, "protection": 125, "purple": 133, "queen": 33, "rack": 48, "red": 5, "red and blue": 188, "red and yellow": 141, "resting": 39, "right": 40, "roof": 94, "screen": 101, "security": 190, "shade": 66, "shadow": 36, "shadows": 187, "shelter": 123, "shrimp": 89, "sidewalk": 76, "skateboard": 134, "skateboarding": 184, "skier": 24, "skiing": 192, "sky": 127, "sleeping": 146, "small": 0, "smile": 158, "smiling": 178, "snow": 51, "snowboard": 122, "snowboarder": 175, "snowboarding": 90, "soccer": 135, "soccer ball": 115, "solid": 116, "stand": 3, "station": 132, "street": 118, "stripes": 42, "style": 155, "sun": 171, "suv": 34, "tabby": 144, "table": 82, "talking": 108, "talking on phone": 1, "tan": 49, "tent": 67, "they aren't": 17, "tired": 124, "tower": 69, "train": 183, "trees": 73, "tv": 57, "unknown": 167, "walking": 142, "wall": 64, "watching": 145, "wedding": 41, "white": 19, "white and black": 191, "white and blue": 56, "window": 189, "windows": 117, "wine": 31, "wine tasting": 65, "woman": 18, "women": 96, "woods": 44, "yellow": 11, "yes": 29, "zoo": 45 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.40.1", "type_vocab_size": 2, "vocab_size": 30522 }