SagiPolaczek
commited on
Commit
•
07042f2
1
Parent(s):
25d3466
Push model using huggingface_hub.
Browse files
tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -86,7 +86,7 @@
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
-
"content": "<
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2852,10 +2852,10 @@
|
|
2852 |
"<EOS>": 5,
|
2853 |
"<MOLECULAR_ENTITY>": 6,
|
2854 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2855 |
-
"<
|
2856 |
-
"<
|
2857 |
-
"<
|
2858 |
-
"<
|
2859 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2860 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2861 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3101,9 +3101,9 @@
|
|
3101 |
"<SENTINEL_ID_197>": 254,
|
3102 |
"<SENTINEL_ID_198>": 255,
|
3103 |
"<SENTINEL_ID_199>": 256,
|
3104 |
-
"<
|
3105 |
-
"<
|
3106 |
-
"<
|
3107 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3108 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3109 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3112,12 +3112,12 @@
|
|
3112 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3113 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3114 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3115 |
-
"<
|
3116 |
-
"<
|
3117 |
-
"<
|
3118 |
-
"<
|
3119 |
-
"<
|
3120 |
-
"<
|
3121 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3122 |
"<TIMESTEP>": 275,
|
3123 |
"<DIFFUSION>": 276,
|
@@ -3129,7 +3129,7 @@
|
|
3129 |
"<BACKSPACE>": 282,
|
3130 |
"<SEQUENCE_NATURAL_START>": 283,
|
3131 |
"<NOOP>": 284,
|
3132 |
-
"<
|
3133 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3134 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3135 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3140,8 +3140,8 @@
|
|
3140 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3141 |
"<COMPLEX_ENTITY>": 294,
|
3142 |
"<ALTERNATIVE>": 295,
|
3143 |
-
"<
|
3144 |
-
"<
|
3145 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3146 |
"<MUTATED>": 299,
|
3147 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
+
"content": "<INTERNAL_1>",
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2852 |
"<EOS>": 5,
|
2853 |
"<MOLECULAR_ENTITY>": 6,
|
2854 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2855 |
+
"<INTERNAL_0>": 8,
|
2856 |
+
"<INTERNAL_1>": 9,
|
2857 |
+
"<INTERNAL_2>": 10,
|
2858 |
+
"<INTERNAL_3>": 11,
|
2859 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2860 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2861 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3101 |
"<SENTINEL_ID_197>": 254,
|
3102 |
"<SENTINEL_ID_198>": 255,
|
3103 |
"<SENTINEL_ID_199>": 256,
|
3104 |
+
"<INTERNAL_17>": 257,
|
3105 |
+
"<INTERNAL_15>": 258,
|
3106 |
+
"<INTERNAL_16>": 259,
|
3107 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3108 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3109 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3112 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3113 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3114 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3115 |
+
"<INTERNAL_7>": 268,
|
3116 |
+
"<INTERNAL_6>": 269,
|
3117 |
+
"<INTERNAL_9>": 270,
|
3118 |
+
"<INTERNAL_5>": 271,
|
3119 |
+
"<INTERNAL_8>": 272,
|
3120 |
+
"<INTERNAL_4>": 273,
|
3121 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3122 |
"<TIMESTEP>": 275,
|
3123 |
"<DIFFUSION>": 276,
|
|
|
3129 |
"<BACKSPACE>": 282,
|
3130 |
"<SEQUENCE_NATURAL_START>": 283,
|
3131 |
"<NOOP>": 284,
|
3132 |
+
"<INTERNAL_14>": 285,
|
3133 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3134 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3135 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3140 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3141 |
"<COMPLEX_ENTITY>": 294,
|
3142 |
"<ALTERNATIVE>": 295,
|
3143 |
+
"<INTERNAL_13>": 296,
|
3144 |
+
"<INTERNAL_12>": 297,
|
3145 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3146 |
"<MUTATED>": 299,
|
3147 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
tokenizer/cell_attributes_tokenizer.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -86,7 +86,7 @@
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
-
"content": "<
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2857,10 +2857,10 @@
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
-
"<
|
2861 |
-
"<
|
2862 |
-
"<
|
2863 |
-
"<
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3106,9 +3106,9 @@
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
-
"<
|
3110 |
-
"<
|
3111 |
-
"<
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3117,12 +3117,12 @@
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
-
"<
|
3121 |
-
"<
|
3122 |
-
"<
|
3123 |
-
"<
|
3124 |
-
"<
|
3125 |
-
"<
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
@@ -3134,7 +3134,7 @@
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
-
"<
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3145,8 +3145,8 @@
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
-
"<
|
3149 |
-
"<
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
+
"content": "<INTERNAL_1>",
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
+
"<INTERNAL_0>": 8,
|
2861 |
+
"<INTERNAL_1>": 9,
|
2862 |
+
"<INTERNAL_2>": 10,
|
2863 |
+
"<INTERNAL_3>": 11,
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
+
"<INTERNAL_17>": 257,
|
3110 |
+
"<INTERNAL_15>": 258,
|
3111 |
+
"<INTERNAL_16>": 259,
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
+
"<INTERNAL_7>": 268,
|
3121 |
+
"<INTERNAL_6>": 269,
|
3122 |
+
"<INTERNAL_9>": 270,
|
3123 |
+
"<INTERNAL_5>": 271,
|
3124 |
+
"<INTERNAL_8>": 272,
|
3125 |
+
"<INTERNAL_4>": 273,
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
+
"<INTERNAL_14>": 285,
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
+
"<INTERNAL_13>": 296,
|
3149 |
+
"<INTERNAL_12>": 297,
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
tokenizer/gene_tokenizer.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -86,7 +86,7 @@
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
-
"content": "<
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2857,10 +2857,10 @@
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
-
"<
|
2861 |
-
"<
|
2862 |
-
"<
|
2863 |
-
"<
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3106,9 +3106,9 @@
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
-
"<
|
3110 |
-
"<
|
3111 |
-
"<
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3117,12 +3117,12 @@
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
-
"<
|
3121 |
-
"<
|
3122 |
-
"<
|
3123 |
-
"<
|
3124 |
-
"<
|
3125 |
-
"<
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
@@ -3134,7 +3134,7 @@
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
-
"<
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3145,8 +3145,8 @@
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
-
"<
|
3149 |
-
"<
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
+
"content": "<INTERNAL_1>",
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
+
"<INTERNAL_0>": 8,
|
2861 |
+
"<INTERNAL_1>": 9,
|
2862 |
+
"<INTERNAL_2>": 10,
|
2863 |
+
"<INTERNAL_3>": 11,
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
+
"<INTERNAL_17>": 257,
|
3110 |
+
"<INTERNAL_15>": 258,
|
3111 |
+
"<INTERNAL_16>": 259,
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
+
"<INTERNAL_7>": 268,
|
3121 |
+
"<INTERNAL_6>": 269,
|
3122 |
+
"<INTERNAL_9>": 270,
|
3123 |
+
"<INTERNAL_5>": 271,
|
3124 |
+
"<INTERNAL_8>": 272,
|
3125 |
+
"<INTERNAL_4>": 273,
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
+
"<INTERNAL_14>": 285,
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
+
"<INTERNAL_13>": 296,
|
3149 |
+
"<INTERNAL_12>": 297,
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
tokenizer/t5_tokenizer_AA_special.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -86,7 +86,7 @@
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
-
"content": "<
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2857,10 +2857,10 @@
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
-
"<
|
2861 |
-
"<
|
2862 |
-
"<
|
2863 |
-
"<
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3106,9 +3106,9 @@
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
-
"<
|
3110 |
-
"<
|
3111 |
-
"<
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3117,12 +3117,12 @@
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
-
"<
|
3121 |
-
"<
|
3122 |
-
"<
|
3123 |
-
"<
|
3124 |
-
"<
|
3125 |
-
"<
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
@@ -3134,7 +3134,7 @@
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
-
"<
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3145,8 +3145,8 @@
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
-
"<
|
3149 |
-
"<
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
86 |
},
|
87 |
{
|
88 |
"id": 9,
|
89 |
+
"content": "<INTERNAL_1>",
|
90 |
"single_word": false,
|
91 |
"lstrip": false,
|
92 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
+
"<INTERNAL_0>": 8,
|
2861 |
+
"<INTERNAL_1>": 9,
|
2862 |
+
"<INTERNAL_2>": 10,
|
2863 |
+
"<INTERNAL_3>": 11,
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
+
"<INTERNAL_17>": 257,
|
3110 |
+
"<INTERNAL_15>": 258,
|
3111 |
+
"<INTERNAL_16>": 259,
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
+
"<INTERNAL_7>": 268,
|
3121 |
+
"<INTERNAL_6>": 269,
|
3122 |
+
"<INTERNAL_9>": 270,
|
3123 |
+
"<INTERNAL_5>": 271,
|
3124 |
+
"<INTERNAL_8>": 272,
|
3125 |
+
"<INTERNAL_4>": 273,
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
+
"<INTERNAL_14>": 285,
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
+
"<INTERNAL_13>": 296,
|
3149 |
+
"<INTERNAL_12>": 297,
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|