Transformers
Safetensors
English
German
qwen2_vl
multimodal_embedding
text-generation-inference
Inference Endpoints
tattrongvu commited on
Commit
0c8113b
·
verified ·
1 Parent(s): cfa62cb

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "vidore/colqwen2-base",
3
+ "architectures": [
4
+ "ColQwen2"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "image_token_id": 151655,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8960,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen2_vl",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 2,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "mrope_section": [
23
+ 16,
24
+ 24,
25
+ 24
26
+ ],
27
+ "rope_type": "default",
28
+ "type": "default"
29
+ },
30
+ "rope_theta": 1000000.0,
31
+ "sliding_window": 32768,
32
+ "tie_word_embeddings": true,
33
+ "torch_dtype": "bfloat16",
34
+ "transformers_version": "4.46.3",
35
+ "use_cache": true,
36
+ "use_sliding_window": false,
37
+ "video_token_id": 151656,
38
+ "vision_config": {
39
+ "hidden_size": 1536,
40
+ "in_chans": 3,
41
+ "model_type": "qwen2_vl",
42
+ "spatial_patch_size": 14
43
+ },
44
+ "vision_end_token_id": 151653,
45
+ "vision_start_token_id": 151652,
46
+ "vision_token_id": 151654,
47
+ "vocab_size": 151936
48
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.01,
10
+ "top_k": 1,
11
+ "top_p": 0.001,
12
+ "transformers_version": "4.46.3"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7633f4be5163e68d13cfc54d01be80b18c9ec4795f505255436940265df282cb
3
+ size 4418444496
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "ColQwen2Processor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 12845056,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
results.json ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vidore/arxivqa_test_subsampled": {
3
+ "ndcg_at_1": 0.846,
4
+ "ndcg_at_3": 0.89433,
5
+ "ndcg_at_5": 0.8976,
6
+ "ndcg_at_10": 0.90642,
7
+ "ndcg_at_20": 0.91194,
8
+ "ndcg_at_50": 0.91484,
9
+ "ndcg_at_100": 0.91516,
10
+ "map_at_1": 0.846,
11
+ "map_at_3": 0.88267,
12
+ "map_at_5": 0.88447,
13
+ "map_at_10": 0.88834,
14
+ "map_at_20": 0.88984,
15
+ "map_at_50": 0.89035,
16
+ "map_at_100": 0.89038,
17
+ "recall_at_1": 0.846,
18
+ "recall_at_3": 0.928,
19
+ "recall_at_5": 0.936,
20
+ "recall_at_10": 0.962,
21
+ "recall_at_20": 0.984,
22
+ "recall_at_50": 0.998,
23
+ "recall_at_100": 1.0,
24
+ "precision_at_1": 0.846,
25
+ "precision_at_3": 0.30933,
26
+ "precision_at_5": 0.1872,
27
+ "precision_at_10": 0.0962,
28
+ "precision_at_20": 0.0492,
29
+ "precision_at_50": 0.01996,
30
+ "precision_at_100": 0.01,
31
+ "mrr_at_1": 0.846,
32
+ "mrr_at_3": 0.8823333333333332,
33
+ "mrr_at_5": 0.8850333333333331,
34
+ "mrr_at_10": 0.8883349206349205,
35
+ "mrr_at_20": 0.889943956653941,
36
+ "mrr_at_50": 0.8903646887891893,
37
+ "mrr_at_100": 0.8903913554558559,
38
+ "naucs_at_1_max": 0.8081512829311069,
39
+ "naucs_at_1_std": -0.0772872266066822,
40
+ "naucs_at_1_diff1": 0.9432244208064874,
41
+ "naucs_at_3_max": 0.8414124909222956,
42
+ "naucs_at_3_std": 0.04741155721548222,
43
+ "naucs_at_3_diff1": 0.9146695715323183,
44
+ "naucs_at_5_max": 0.8297589869281041,
45
+ "naucs_at_5_std": 0.02895950046684961,
46
+ "naucs_at_5_diff1": 0.9121732026143777,
47
+ "naucs_at_10_max": 0.8905351614329942,
48
+ "naucs_at_10_std": 0.0927072583419271,
49
+ "naucs_at_10_diff1": 0.9217406260749892,
50
+ "naucs_at_20_max": 0.8768674136321192,
51
+ "naucs_at_20_std": 0.14495798319327652,
52
+ "naucs_at_20_diff1": 0.8631535947712419,
53
+ "naucs_at_50_max": 0.86928104575168,
54
+ "naucs_at_50_std": -1.739962651727529,
55
+ "naucs_at_50_diff1": 1.0,
56
+ "naucs_at_100_max": 1.0,
57
+ "naucs_at_100_std": 1.0,
58
+ "naucs_at_100_diff1": 1.0
59
+ },
60
+ "vidore/docvqa_test_subsampled": {
61
+ "ndcg_at_1": 0.5122,
62
+ "ndcg_at_3": 0.58642,
63
+ "ndcg_at_5": 0.60675,
64
+ "ndcg_at_10": 0.62745,
65
+ "ndcg_at_20": 0.6406,
66
+ "ndcg_at_50": 0.65097,
67
+ "ndcg_at_100": 0.65752,
68
+ "map_at_1": 0.5122,
69
+ "map_at_3": 0.56837,
70
+ "map_at_5": 0.57979,
71
+ "map_at_10": 0.58828,
72
+ "map_at_20": 0.59201,
73
+ "map_at_50": 0.59359,
74
+ "map_at_100": 0.59419,
75
+ "recall_at_1": 0.5122,
76
+ "recall_at_3": 0.63858,
77
+ "recall_at_5": 0.68736,
78
+ "recall_at_10": 0.75166,
79
+ "recall_at_20": 0.80266,
80
+ "recall_at_50": 0.85588,
81
+ "recall_at_100": 0.89579,
82
+ "precision_at_1": 0.5122,
83
+ "precision_at_3": 0.21286,
84
+ "precision_at_5": 0.13747,
85
+ "precision_at_10": 0.07517,
86
+ "precision_at_20": 0.04013,
87
+ "precision_at_50": 0.01712,
88
+ "precision_at_100": 0.00896,
89
+ "mrr_at_1": 0.5077605321507761,
90
+ "mrr_at_3": 0.5661492978566152,
91
+ "mrr_at_5": 0.5760162601626018,
92
+ "mrr_at_10": 0.5855110336817655,
93
+ "mrr_at_20": 0.5891507477844685,
94
+ "mrr_at_50": 0.5906413959072078,
95
+ "mrr_at_100": 0.5912222399166979,
96
+ "naucs_at_1_max": 0.1805386230325004,
97
+ "naucs_at_1_std": 0.2083777775408145,
98
+ "naucs_at_1_diff1": 0.8899357497181541,
99
+ "naucs_at_3_max": 0.036250343926384106,
100
+ "naucs_at_3_std": 0.2729014248258626,
101
+ "naucs_at_3_diff1": 0.8128872229558094,
102
+ "naucs_at_5_max": -0.06393649520768908,
103
+ "naucs_at_5_std": 0.35537819721239466,
104
+ "naucs_at_5_diff1": 0.7893105740940741,
105
+ "naucs_at_10_max": -0.2303383632743928,
106
+ "naucs_at_10_std": 0.4494027929577441,
107
+ "naucs_at_10_diff1": 0.7907508093846765,
108
+ "naucs_at_20_max": -0.4436196516643613,
109
+ "naucs_at_20_std": 0.4884483476927145,
110
+ "naucs_at_20_diff1": 0.8109543227071737,
111
+ "naucs_at_50_max": -0.6245429545199258,
112
+ "naucs_at_50_std": 0.7349758004656695,
113
+ "naucs_at_50_diff1": 0.8508679773914936,
114
+ "naucs_at_100_max": -0.8863218597112107,
115
+ "naucs_at_100_std": 0.9554198503123303,
116
+ "naucs_at_100_diff1": 0.8643881531474712
117
+ },
118
+ "vidore/infovqa_test_subsampled": {
119
+ "ndcg_at_1": 0.86842,
120
+ "ndcg_at_3": 0.91276,
121
+ "ndcg_at_5": 0.9179,
122
+ "ndcg_at_10": 0.92323,
123
+ "ndcg_at_20": 0.92567,
124
+ "ndcg_at_50": 0.92726,
125
+ "ndcg_at_100": 0.92757,
126
+ "map_at_1": 0.86842,
127
+ "map_at_3": 0.90216,
128
+ "map_at_5": 0.90509,
129
+ "map_at_10": 0.90734,
130
+ "map_at_20": 0.90794,
131
+ "map_at_50": 0.90819,
132
+ "map_at_100": 0.90821,
133
+ "recall_at_1": 0.86842,
134
+ "recall_at_3": 0.94332,
135
+ "recall_at_5": 0.95547,
136
+ "recall_at_10": 0.97166,
137
+ "recall_at_20": 0.98178,
138
+ "recall_at_50": 0.98988,
139
+ "recall_at_100": 0.9919,
140
+ "precision_at_1": 0.86842,
141
+ "precision_at_3": 0.31444,
142
+ "precision_at_5": 0.19109,
143
+ "precision_at_10": 0.09717,
144
+ "precision_at_20": 0.04909,
145
+ "precision_at_50": 0.0198,
146
+ "precision_at_100": 0.00992,
147
+ "mrr_at_1": 0.8663967611336032,
148
+ "mrr_at_3": 0.9018218623481781,
149
+ "mrr_at_5": 0.90414979757085,
150
+ "mrr_at_10": 0.9064793393740761,
151
+ "mrr_at_20": 0.9070689308111907,
152
+ "mrr_at_50": 0.907317794217949,
153
+ "mrr_at_100": 0.9073397973864051,
154
+ "naucs_at_1_max": 0.5849894597264937,
155
+ "naucs_at_1_std": -0.09180599537618413,
156
+ "naucs_at_1_diff1": 0.9539022482093868,
157
+ "naucs_at_3_max": 0.653300720791228,
158
+ "naucs_at_3_std": -0.21668629871058337,
159
+ "naucs_at_3_diff1": 0.9135264836941877,
160
+ "naucs_at_5_max": 0.7483953850859967,
161
+ "naucs_at_5_std": 0.034054461502585474,
162
+ "naucs_at_5_diff1": 0.9018156568894024,
163
+ "naucs_at_10_max": 0.8978323372990794,
164
+ "naucs_at_10_std": 0.4924193687602959,
165
+ "naucs_at_10_diff1": 0.9055324021231301,
166
+ "naucs_at_20_max": 0.9546293120196974,
167
+ "naucs_at_20_std": 0.7223350051556908,
168
+ "naucs_at_20_diff1": 0.9419549093045442,
169
+ "naucs_at_50_max": 0.9183327616354471,
170
+ "naucs_at_50_std": 0.6841596004695476,
171
+ "naucs_at_50_diff1": 0.9216391275611305,
172
+ "naucs_at_100_max": 0.8979159520443043,
173
+ "naucs_at_100_std": 0.6051995005869166,
174
+ "naucs_at_100_diff1": 0.9346992729676393
175
+ },
176
+ "vidore/tabfquad_test_subsampled": {
177
+ "ndcg_at_1": 0.90357,
178
+ "ndcg_at_3": 0.94366,
179
+ "ndcg_at_5": 0.94781,
180
+ "ndcg_at_10": 0.95262,
181
+ "ndcg_at_20": 0.95361,
182
+ "ndcg_at_50": 0.95431,
183
+ "ndcg_at_100": 0.95431,
184
+ "map_at_1": 0.90357,
185
+ "map_at_3": 0.93512,
186
+ "map_at_5": 0.93726,
187
+ "map_at_10": 0.93936,
188
+ "map_at_20": 0.93968,
189
+ "map_at_50": 0.93979,
190
+ "map_at_100": 0.93979,
191
+ "recall_at_1": 0.90357,
192
+ "recall_at_3": 0.96786,
193
+ "recall_at_5": 0.97857,
194
+ "recall_at_10": 0.99286,
195
+ "recall_at_20": 0.99643,
196
+ "recall_at_50": 1.0,
197
+ "recall_at_100": 1.0,
198
+ "precision_at_1": 0.90357,
199
+ "precision_at_3": 0.32262,
200
+ "precision_at_5": 0.19571,
201
+ "precision_at_10": 0.09929,
202
+ "precision_at_20": 0.04982,
203
+ "precision_at_50": 0.02,
204
+ "precision_at_100": 0.01,
205
+ "mrr_at_1": 0.9,
206
+ "mrr_at_3": 0.9333333333333332,
207
+ "mrr_at_5": 0.9358333333333332,
208
+ "mrr_at_10": 0.9382879818594103,
209
+ "mrr_at_20": 0.9382879818594103,
210
+ "mrr_at_50": 0.938390022675737,
211
+ "mrr_at_100": 0.938390022675737,
212
+ "naucs_at_1_max": 0.6358197600027652,
213
+ "naucs_at_1_std": 0.28033682608845983,
214
+ "naucs_at_1_diff1": 0.9296261714562363,
215
+ "naucs_at_3_max": 0.9709513435003594,
216
+ "naucs_at_3_std": 0.7201473181865297,
217
+ "naucs_at_3_diff1": 0.9400871459694935,
218
+ "naucs_at_5_max": 0.9782135076252753,
219
+ "naucs_at_5_std": 0.7226112667289174,
220
+ "naucs_at_5_diff1": 0.910130718954251,
221
+ "naucs_at_10_max": 1.0,
222
+ "naucs_at_10_std": 1.0,
223
+ "naucs_at_10_diff1": 0.7957516339869304,
224
+ "naucs_at_20_max": 1.0,
225
+ "naucs_at_20_std": 1.0,
226
+ "naucs_at_20_diff1": 0.72222222222224,
227
+ "naucs_at_50_max": 1.0,
228
+ "naucs_at_50_std": 1.0,
229
+ "naucs_at_50_diff1": 1.0,
230
+ "naucs_at_100_max": 1.0,
231
+ "naucs_at_100_std": 1.0,
232
+ "naucs_at_100_diff1": 1.0
233
+ },
234
+ "vidore/tatdqa_test": {
235
+ "ndcg_at_1": 0.67922,
236
+ "ndcg_at_3": 0.77244,
237
+ "ndcg_at_5": 0.79694,
238
+ "ndcg_at_10": 0.8121,
239
+ "ndcg_at_20": 0.8175,
240
+ "ndcg_at_50": 0.82077,
241
+ "ndcg_at_100": 0.82237,
242
+ "map_at_1": 0.67922,
243
+ "map_at_3": 0.75,
244
+ "map_at_5": 0.76373,
245
+ "map_at_10": 0.77012,
246
+ "map_at_20": 0.7717,
247
+ "map_at_50": 0.77223,
248
+ "map_at_100": 0.77238,
249
+ "recall_at_1": 0.67922,
250
+ "recall_at_3": 0.83718,
251
+ "recall_at_5": 0.89611,
252
+ "recall_at_10": 0.94228,
253
+ "recall_at_20": 0.96294,
254
+ "recall_at_50": 0.97934,
255
+ "recall_at_100": 0.98906,
256
+ "precision_at_1": 0.67922,
257
+ "precision_at_3": 0.27906,
258
+ "precision_at_5": 0.17922,
259
+ "precision_at_10": 0.09423,
260
+ "precision_at_20": 0.04815,
261
+ "precision_at_50": 0.01959,
262
+ "precision_at_100": 0.00989,
263
+ "mrr_at_1": 0.6798298906439855,
264
+ "mrr_at_3": 0.7503037667071691,
265
+ "mrr_at_5": 0.7639732685297701,
266
+ "mrr_at_10": 0.7703048756967354,
267
+ "mrr_at_20": 0.771781703893678,
268
+ "mrr_at_50": 0.7722855522833878,
269
+ "mrr_at_100": 0.7724409847415249,
270
+ "naucs_at_1_max": 0.16344622447696044,
271
+ "naucs_at_1_std": -0.2550595413752476,
272
+ "naucs_at_1_diff1": 0.798163282282663,
273
+ "naucs_at_3_max": 0.25140875244161615,
274
+ "naucs_at_3_std": -0.19148632297929294,
275
+ "naucs_at_3_diff1": 0.735908847266564,
276
+ "naucs_at_5_max": 0.36331170472018576,
277
+ "naucs_at_5_std": -0.025518611627873482,
278
+ "naucs_at_5_diff1": 0.7030926615249569,
279
+ "naucs_at_10_max": 0.3881662571474966,
280
+ "naucs_at_10_std": 0.19492404683272815,
281
+ "naucs_at_10_diff1": 0.7210851978418036,
282
+ "naucs_at_20_max": 0.47757976788208983,
283
+ "naucs_at_20_std": 0.4380179245317548,
284
+ "naucs_at_20_diff1": 0.6933989034189479,
285
+ "naucs_at_50_max": 0.6469748935781172,
286
+ "naucs_at_50_std": 0.6850989095376581,
287
+ "naucs_at_50_diff1": 0.7523078962936631,
288
+ "naucs_at_100_max": 0.6351305556652902,
289
+ "naucs_at_100_std": 0.667464964759476,
290
+ "naucs_at_100_diff1": 0.7283558047703392
291
+ },
292
+ "vidore/shiftproject_test": {
293
+ "ndcg_at_1": 0.78,
294
+ "ndcg_at_3": 0.87071,
295
+ "ndcg_at_5": 0.89137,
296
+ "ndcg_at_10": 0.89493,
297
+ "ndcg_at_20": 0.89493,
298
+ "ndcg_at_50": 0.89493,
299
+ "ndcg_at_100": 0.89655,
300
+ "map_at_1": 0.78,
301
+ "map_at_3": 0.85,
302
+ "map_at_5": 0.8615,
303
+ "map_at_10": 0.86317,
304
+ "map_at_20": 0.86317,
305
+ "map_at_50": 0.86317,
306
+ "map_at_100": 0.86331,
307
+ "recall_at_1": 0.78,
308
+ "recall_at_3": 0.93,
309
+ "recall_at_5": 0.98,
310
+ "recall_at_10": 0.99,
311
+ "recall_at_20": 0.99,
312
+ "recall_at_50": 0.99,
313
+ "recall_at_100": 1.0,
314
+ "precision_at_1": 0.78,
315
+ "precision_at_3": 0.31,
316
+ "precision_at_5": 0.196,
317
+ "precision_at_10": 0.099,
318
+ "precision_at_20": 0.0495,
319
+ "precision_at_50": 0.0198,
320
+ "precision_at_100": 0.01,
321
+ "mrr_at_1": 0.8,
322
+ "mrr_at_3": 0.8683333333333334,
323
+ "mrr_at_5": 0.8758333333333335,
324
+ "mrr_at_10": 0.8772619047619048,
325
+ "mrr_at_20": 0.8772619047619048,
326
+ "mrr_at_50": 0.8772619047619048,
327
+ "mrr_at_100": 0.8774047619047619,
328
+ "naucs_at_1_max": 0.17508230166458016,
329
+ "naucs_at_1_std": -0.3999629062920201,
330
+ "naucs_at_1_diff1": 0.8873278620114059,
331
+ "naucs_at_3_max": 0.32412965186074477,
332
+ "naucs_at_3_std": -0.44864612511671287,
333
+ "naucs_at_3_diff1": 0.8382686407896491,
334
+ "naucs_at_5_max": 0.6136788048552745,
335
+ "naucs_at_5_std": 0.35807656395892185,
336
+ "naucs_at_5_diff1": 0.7117180205415541,
337
+ "naucs_at_10_max": 0.35807656395891135,
338
+ "naucs_at_10_std": 0.35807656395891135,
339
+ "naucs_at_10_diff1": 0.5541549953314738,
340
+ "naucs_at_20_max": 0.35807656395891135,
341
+ "naucs_at_20_std": 0.35807656395891135,
342
+ "naucs_at_20_diff1": 0.5541549953314738,
343
+ "naucs_at_50_max": 0.35807656395892007,
344
+ "naucs_at_50_std": 0.35807656395892007,
345
+ "naucs_at_50_diff1": 0.554154995331464,
346
+ "naucs_at_100_max": null,
347
+ "naucs_at_100_std": null,
348
+ "naucs_at_100_diff1": null
349
+ },
350
+ "vidore/syntheticDocQA_artificial_intelligence_test": {
351
+ "ndcg_at_1": 0.99,
352
+ "ndcg_at_3": 0.99631,
353
+ "ndcg_at_5": 0.99631,
354
+ "ndcg_at_10": 0.99631,
355
+ "ndcg_at_20": 0.99631,
356
+ "ndcg_at_50": 0.99631,
357
+ "ndcg_at_100": 0.99631,
358
+ "map_at_1": 0.99,
359
+ "map_at_3": 0.995,
360
+ "map_at_5": 0.995,
361
+ "map_at_10": 0.995,
362
+ "map_at_20": 0.995,
363
+ "map_at_50": 0.995,
364
+ "map_at_100": 0.995,
365
+ "recall_at_1": 0.99,
366
+ "recall_at_3": 1.0,
367
+ "recall_at_5": 1.0,
368
+ "recall_at_10": 1.0,
369
+ "recall_at_20": 1.0,
370
+ "recall_at_50": 1.0,
371
+ "recall_at_100": 1.0,
372
+ "precision_at_1": 0.99,
373
+ "precision_at_3": 0.33333,
374
+ "precision_at_5": 0.2,
375
+ "precision_at_10": 0.1,
376
+ "precision_at_20": 0.05,
377
+ "precision_at_50": 0.02,
378
+ "precision_at_100": 0.01,
379
+ "mrr_at_1": 0.99,
380
+ "mrr_at_3": 0.995,
381
+ "mrr_at_5": 0.995,
382
+ "mrr_at_10": 0.995,
383
+ "mrr_at_20": 0.995,
384
+ "mrr_at_50": 0.995,
385
+ "mrr_at_100": 0.995,
386
+ "naucs_at_1_max": 0.7222222222222201,
387
+ "naucs_at_1_std": 0.7222222222222201,
388
+ "naucs_at_1_diff1": 1.0,
389
+ "naucs_at_3_max": 1.0,
390
+ "naucs_at_3_std": 1.0,
391
+ "naucs_at_3_diff1": 1.0,
392
+ "naucs_at_5_max": 1.0,
393
+ "naucs_at_5_std": 1.0,
394
+ "naucs_at_5_diff1": 1.0,
395
+ "naucs_at_10_max": 1.0,
396
+ "naucs_at_10_std": 1.0,
397
+ "naucs_at_10_diff1": 1.0,
398
+ "naucs_at_20_max": 1.0,
399
+ "naucs_at_20_std": 1.0,
400
+ "naucs_at_20_diff1": 1.0,
401
+ "naucs_at_50_max": null,
402
+ "naucs_at_50_std": null,
403
+ "naucs_at_50_diff1": null,
404
+ "naucs_at_100_max": null,
405
+ "naucs_at_100_std": null,
406
+ "naucs_at_100_diff1": null
407
+ },
408
+ "vidore/syntheticDocQA_energy_test": {
409
+ "ndcg_at_1": 0.95,
410
+ "ndcg_at_3": 0.96131,
411
+ "ndcg_at_5": 0.96562,
412
+ "ndcg_at_10": 0.97184,
413
+ "ndcg_at_20": 0.97184,
414
+ "ndcg_at_50": 0.97184,
415
+ "ndcg_at_100": 0.97184,
416
+ "map_at_1": 0.95,
417
+ "map_at_3": 0.95833,
418
+ "map_at_5": 0.96083,
419
+ "map_at_10": 0.96326,
420
+ "map_at_20": 0.96326,
421
+ "map_at_50": 0.96326,
422
+ "map_at_100": 0.96326,
423
+ "recall_at_1": 0.95,
424
+ "recall_at_3": 0.97,
425
+ "recall_at_5": 0.98,
426
+ "recall_at_10": 1.0,
427
+ "recall_at_20": 1.0,
428
+ "recall_at_50": 1.0,
429
+ "recall_at_100": 1.0,
430
+ "precision_at_1": 0.95,
431
+ "precision_at_3": 0.32333,
432
+ "precision_at_5": 0.196,
433
+ "precision_at_10": 0.1,
434
+ "precision_at_20": 0.05,
435
+ "precision_at_50": 0.02,
436
+ "precision_at_100": 0.01,
437
+ "mrr_at_1": 0.95,
438
+ "mrr_at_3": 0.9583333333333333,
439
+ "mrr_at_5": 0.9608333333333333,
440
+ "mrr_at_10": 0.9633730158730158,
441
+ "mrr_at_20": 0.9633730158730158,
442
+ "mrr_at_50": 0.9633730158730158,
443
+ "mrr_at_100": 0.9633730158730158,
444
+ "naucs_at_1_max": 0.5883286647992536,
445
+ "naucs_at_1_std": -0.408309990662932,
446
+ "naucs_at_1_diff1": 0.9738562091503253,
447
+ "naucs_at_3_max": 0.5714285714285686,
448
+ "naucs_at_3_std": -1.040616246498596,
449
+ "naucs_at_3_diff1": 1.0,
450
+ "naucs_at_5_max": 0.4960317460317504,
451
+ "naucs_at_5_std": -1.7399626517273414,
452
+ "naucs_at_5_diff1": 1.0,
453
+ "naucs_at_10_max": 1.0,
454
+ "naucs_at_10_std": 1.0,
455
+ "naucs_at_10_diff1": 1.0,
456
+ "naucs_at_20_max": 1.0,
457
+ "naucs_at_20_std": 1.0,
458
+ "naucs_at_20_diff1": 1.0,
459
+ "naucs_at_50_max": null,
460
+ "naucs_at_50_std": null,
461
+ "naucs_at_50_diff1": null,
462
+ "naucs_at_100_max": null,
463
+ "naucs_at_100_std": null,
464
+ "naucs_at_100_diff1": null
465
+ },
466
+ "vidore/syntheticDocQA_government_reports_test": {
467
+ "ndcg_at_1": 0.89,
468
+ "ndcg_at_3": 0.95178,
469
+ "ndcg_at_5": 0.95609,
470
+ "ndcg_at_10": 0.95609,
471
+ "ndcg_at_20": 0.95609,
472
+ "ndcg_at_50": 0.95609,
473
+ "ndcg_at_100": 0.95609,
474
+ "map_at_1": 0.89,
475
+ "map_at_3": 0.93833,
476
+ "map_at_5": 0.94083,
477
+ "map_at_10": 0.94083,
478
+ "map_at_20": 0.94083,
479
+ "map_at_50": 0.94083,
480
+ "map_at_100": 0.94083,
481
+ "recall_at_1": 0.89,
482
+ "recall_at_3": 0.99,
483
+ "recall_at_5": 1.0,
484
+ "recall_at_10": 1.0,
485
+ "recall_at_20": 1.0,
486
+ "recall_at_50": 1.0,
487
+ "recall_at_100": 1.0,
488
+ "precision_at_1": 0.89,
489
+ "precision_at_3": 0.33,
490
+ "precision_at_5": 0.2,
491
+ "precision_at_10": 0.1,
492
+ "precision_at_20": 0.05,
493
+ "precision_at_50": 0.02,
494
+ "precision_at_100": 0.01,
495
+ "mrr_at_1": 0.89,
496
+ "mrr_at_3": 0.94,
497
+ "mrr_at_5": 0.9425,
498
+ "mrr_at_10": 0.9425,
499
+ "mrr_at_20": 0.9425,
500
+ "mrr_at_50": 0.9425,
501
+ "mrr_at_100": 0.9425,
502
+ "naucs_at_1_max": 0.4527531998969162,
503
+ "naucs_at_1_std": 0.3915041663087361,
504
+ "naucs_at_1_diff1": 0.8127308650459574,
505
+ "naucs_at_3_max": 1.0,
506
+ "naucs_at_3_std": 0.5541549953314585,
507
+ "naucs_at_3_diff1": 0.8692810457516356,
508
+ "naucs_at_5_max": 1.0,
509
+ "naucs_at_5_std": 1.0,
510
+ "naucs_at_5_diff1": 1.0,
511
+ "naucs_at_10_max": 1.0,
512
+ "naucs_at_10_std": 1.0,
513
+ "naucs_at_10_diff1": 1.0,
514
+ "naucs_at_20_max": 1.0,
515
+ "naucs_at_20_std": 1.0,
516
+ "naucs_at_20_diff1": 1.0,
517
+ "naucs_at_50_max": null,
518
+ "naucs_at_50_std": null,
519
+ "naucs_at_50_diff1": null,
520
+ "naucs_at_100_max": null,
521
+ "naucs_at_100_std": null,
522
+ "naucs_at_100_diff1": null
523
+ },
524
+ "vidore/syntheticDocQA_healthcare_industry_test": {
525
+ "ndcg_at_1": 0.97,
526
+ "ndcg_at_3": 0.98262,
527
+ "ndcg_at_5": 0.98693,
528
+ "ndcg_at_10": 0.98693,
529
+ "ndcg_at_20": 0.98693,
530
+ "ndcg_at_50": 0.98693,
531
+ "ndcg_at_100": 0.98693,
532
+ "map_at_1": 0.97,
533
+ "map_at_3": 0.98,
534
+ "map_at_5": 0.9825,
535
+ "map_at_10": 0.9825,
536
+ "map_at_20": 0.9825,
537
+ "map_at_50": 0.9825,
538
+ "map_at_100": 0.9825,
539
+ "recall_at_1": 0.97,
540
+ "recall_at_3": 0.99,
541
+ "recall_at_5": 1.0,
542
+ "recall_at_10": 1.0,
543
+ "recall_at_20": 1.0,
544
+ "recall_at_50": 1.0,
545
+ "recall_at_100": 1.0,
546
+ "precision_at_1": 0.97,
547
+ "precision_at_3": 0.33,
548
+ "precision_at_5": 0.2,
549
+ "precision_at_10": 0.1,
550
+ "precision_at_20": 0.05,
551
+ "precision_at_50": 0.02,
552
+ "precision_at_100": 0.01,
553
+ "mrr_at_1": 0.97,
554
+ "mrr_at_3": 0.98,
555
+ "mrr_at_5": 0.9825,
556
+ "mrr_at_10": 0.9825,
557
+ "mrr_at_20": 0.9825,
558
+ "mrr_at_50": 0.9825,
559
+ "mrr_at_100": 0.9825,
560
+ "naucs_at_1_max": 0.5448179271708694,
561
+ "naucs_at_1_std": 0.044817927170871553,
562
+ "naucs_at_1_diff1": 0.9564270152505465,
563
+ "naucs_at_3_max": 0.7222222222222157,
564
+ "naucs_at_3_std": -0.5634920634921204,
565
+ "naucs_at_3_diff1": 1.0,
566
+ "naucs_at_5_max": 1.0,
567
+ "naucs_at_5_std": 1.0,
568
+ "naucs_at_5_diff1": 1.0,
569
+ "naucs_at_10_max": 1.0,
570
+ "naucs_at_10_std": 1.0,
571
+ "naucs_at_10_diff1": 1.0,
572
+ "naucs_at_20_max": 1.0,
573
+ "naucs_at_20_std": 1.0,
574
+ "naucs_at_20_diff1": 1.0,
575
+ "naucs_at_50_max": null,
576
+ "naucs_at_50_std": null,
577
+ "naucs_at_50_diff1": null,
578
+ "naucs_at_100_max": null,
579
+ "naucs_at_100_std": null,
580
+ "naucs_at_100_diff1": null
581
+ }
582
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
3
+ size 11420371
tokenizer_config.json ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
134
+ "clean_up_tokenization_spaces": false,
135
+ "eos_token": "<|im_end|>",
136
+ "errors": "replace",
137
+ "model_max_length": 32768,
138
+ "pad_token": "<|endoftext|>",
139
+ "padding_side": "left",
140
+ "processor_class": "ColQwen2Processor",
141
+ "split_special_tokens": false,
142
+ "tokenizer_class": "Qwen2Tokenizer",
143
+ "unk_token": null
144
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff