SagiPolaczek commited on
Commit
07042f2
1 Parent(s): 25d3466

Push model using huggingface_hub.

Browse files
tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -86,7 +86,7 @@
86
  },
87
  {
88
  "id": 9,
89
- "content": "<MOLECULAR_ENTITY_EPITOPE>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2852,10 +2852,10 @@
2852
  "<EOS>": 5,
2853
  "<MOLECULAR_ENTITY>": 6,
2854
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2855
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2856
- "<MOLECULAR_ENTITY_EPITOPE>": 9,
2857
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2858
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2859
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2860
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2861
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3101,9 +3101,9 @@
3101
  "<SENTINEL_ID_197>": 254,
3102
  "<SENTINEL_ID_198>": 255,
3103
  "<SENTINEL_ID_199>": 256,
3104
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3105
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3106
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3107
  "<ATTRIBUTE_ORGANISM>": 260,
3108
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3109
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3112,12 +3112,12 @@
3112
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3113
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3114
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3115
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3116
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3117
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3118
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3119
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3120
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3121
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3122
  "<TIMESTEP>": 275,
3123
  "<DIFFUSION>": 276,
@@ -3129,7 +3129,7 @@
3129
  "<BACKSPACE>": 282,
3130
  "<SEQUENCE_NATURAL_START>": 283,
3131
  "<NOOP>": 284,
3132
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3133
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3134
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3135
  "<CELL_TYPE_CLASS>": 288,
@@ -3140,8 +3140,8 @@
3140
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3141
  "<COMPLEX_ENTITY>": 294,
3142
  "<ALTERNATIVE>": 295,
3143
- "<CDR3_REGION>": 296,
3144
- "<GENERAL_CHAIN>": 297,
3145
  "<SUBMOLECULAR_ENTITY>": 298,
3146
  "<MUTATED>": 299,
3147
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
86
  },
87
  {
88
  "id": 9,
89
+ "content": "<INTERNAL_1>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2852
  "<EOS>": 5,
2853
  "<MOLECULAR_ENTITY>": 6,
2854
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2855
+ "<INTERNAL_0>": 8,
2856
+ "<INTERNAL_1>": 9,
2857
+ "<INTERNAL_2>": 10,
2858
+ "<INTERNAL_3>": 11,
2859
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2860
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2861
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3101
  "<SENTINEL_ID_197>": 254,
3102
  "<SENTINEL_ID_198>": 255,
3103
  "<SENTINEL_ID_199>": 256,
3104
+ "<INTERNAL_17>": 257,
3105
+ "<INTERNAL_15>": 258,
3106
+ "<INTERNAL_16>": 259,
3107
  "<ATTRIBUTE_ORGANISM>": 260,
3108
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3109
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3112
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3113
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3114
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3115
+ "<INTERNAL_7>": 268,
3116
+ "<INTERNAL_6>": 269,
3117
+ "<INTERNAL_9>": 270,
3118
+ "<INTERNAL_5>": 271,
3119
+ "<INTERNAL_8>": 272,
3120
+ "<INTERNAL_4>": 273,
3121
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3122
  "<TIMESTEP>": 275,
3123
  "<DIFFUSION>": 276,
 
3129
  "<BACKSPACE>": 282,
3130
  "<SEQUENCE_NATURAL_START>": 283,
3131
  "<NOOP>": 284,
3132
+ "<INTERNAL_14>": 285,
3133
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3134
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3135
  "<CELL_TYPE_CLASS>": 288,
 
3140
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3141
  "<COMPLEX_ENTITY>": 294,
3142
  "<ALTERNATIVE>": 295,
3143
+ "<INTERNAL_13>": 296,
3144
+ "<INTERNAL_12>": 297,
3145
  "<SUBMOLECULAR_ENTITY>": 298,
3146
  "<MUTATED>": 299,
3147
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
tokenizer/cell_attributes_tokenizer.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -86,7 +86,7 @@
86
  },
87
  {
88
  "id": 9,
89
- "content": "<MOLECULAR_ENTITY_EPITOPE>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2857,10 +2857,10 @@
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2861
- "<MOLECULAR_ENTITY_EPITOPE>": 9,
2862
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2863
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3106,9 +3106,9 @@
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3110
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3111
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3117,12 +3117,12 @@
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3121
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3122
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3123
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3124
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3125
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
@@ -3134,7 +3134,7 @@
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
@@ -3145,8 +3145,8 @@
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
- "<CDR3_REGION>": 296,
3149
- "<GENERAL_CHAIN>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
86
  },
87
  {
88
  "id": 9,
89
+ "content": "<INTERNAL_1>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
+ "<INTERNAL_0>": 8,
2861
+ "<INTERNAL_1>": 9,
2862
+ "<INTERNAL_2>": 10,
2863
+ "<INTERNAL_3>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
+ "<INTERNAL_17>": 257,
3110
+ "<INTERNAL_15>": 258,
3111
+ "<INTERNAL_16>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
+ "<INTERNAL_7>": 268,
3121
+ "<INTERNAL_6>": 269,
3122
+ "<INTERNAL_9>": 270,
3123
+ "<INTERNAL_5>": 271,
3124
+ "<INTERNAL_8>": 272,
3125
+ "<INTERNAL_4>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
 
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
+ "<INTERNAL_14>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
 
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
+ "<INTERNAL_13>": 296,
3149
+ "<INTERNAL_12>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
tokenizer/gene_tokenizer.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -86,7 +86,7 @@
86
  },
87
  {
88
  "id": 9,
89
- "content": "<MOLECULAR_ENTITY_EPITOPE>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2857,10 +2857,10 @@
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2861
- "<MOLECULAR_ENTITY_EPITOPE>": 9,
2862
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2863
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3106,9 +3106,9 @@
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3110
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3111
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3117,12 +3117,12 @@
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3121
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3122
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3123
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3124
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3125
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
@@ -3134,7 +3134,7 @@
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
@@ -3145,8 +3145,8 @@
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
- "<CDR3_REGION>": 296,
3149
- "<GENERAL_CHAIN>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
86
  },
87
  {
88
  "id": 9,
89
+ "content": "<INTERNAL_1>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
+ "<INTERNAL_0>": 8,
2861
+ "<INTERNAL_1>": 9,
2862
+ "<INTERNAL_2>": 10,
2863
+ "<INTERNAL_3>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
+ "<INTERNAL_17>": 257,
3110
+ "<INTERNAL_15>": 258,
3111
+ "<INTERNAL_16>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
+ "<INTERNAL_7>": 268,
3121
+ "<INTERNAL_6>": 269,
3122
+ "<INTERNAL_9>": 270,
3123
+ "<INTERNAL_5>": 271,
3124
+ "<INTERNAL_8>": 272,
3125
+ "<INTERNAL_4>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
 
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
+ "<INTERNAL_14>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
 
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
+ "<INTERNAL_13>": 296,
3149
+ "<INTERNAL_12>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
tokenizer/t5_tokenizer_AA_special.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -86,7 +86,7 @@
86
  },
87
  {
88
  "id": 9,
89
- "content": "<MOLECULAR_ENTITY_EPITOPE>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2857,10 +2857,10 @@
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2861
- "<MOLECULAR_ENTITY_EPITOPE>": 9,
2862
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2863
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3106,9 +3106,9 @@
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3110
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3111
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3117,12 +3117,12 @@
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3121
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3122
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3123
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3124
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3125
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
@@ -3134,7 +3134,7 @@
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
@@ -3145,8 +3145,8 @@
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
- "<CDR3_REGION>": 296,
3149
- "<GENERAL_CHAIN>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
86
  },
87
  {
88
  "id": 9,
89
+ "content": "<INTERNAL_1>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
+ "<INTERNAL_0>": 8,
2861
+ "<INTERNAL_1>": 9,
2862
+ "<INTERNAL_2>": 10,
2863
+ "<INTERNAL_3>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
+ "<INTERNAL_17>": 257,
3110
+ "<INTERNAL_15>": 258,
3111
+ "<INTERNAL_16>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
+ "<INTERNAL_7>": 268,
3121
+ "<INTERNAL_6>": 269,
3122
+ "<INTERNAL_9>": 270,
3123
+ "<INTERNAL_5>": 271,
3124
+ "<INTERNAL_8>": 272,
3125
+ "<INTERNAL_4>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
 
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
+ "<INTERNAL_14>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
 
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
+ "<INTERNAL_13>": 296,
3149
+ "<INTERNAL_12>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,