Johannes committed on
Commit
690bf20
·
1 Parent(s): 7c2db66

speciesLM k6, metazoa, upstream

Browse files
added_tokens.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "acanthaster_planci_gca001949145v1": 4101,
3
+ "acanthochromis_polyacanthus": 4102,
4
+ "accipiter_nisus": 4103,
5
+ "acromyrmex_echinatior_gca000204515v1rs": 4104,
6
+ "acropora_millepora_gca013753865v1": 4105,
7
+ "actinia_equina_gca011057435": 4106,
8
+ "actinia_tenebrosa_gca009602425v1": 4107,
9
+ "acyrthosiphon_pisum": 4108,
10
+ "adineta_vaga": 4109,
11
+ "aedes_aegypti_lvpagwg": 4110,
12
+ "aedes_albopictus": 4111,
13
+ "agrilus_planipennis_gca000699045v2": 4112,
14
+ "ailuropoda_melanoleuca": 4113,
15
+ "amazona_collaria": 4114,
16
+ "amphibalanus_amphitrite_gca019059575v1": 4115,
17
+ "amphilophus_citrinellus": 4116,
18
+ "amphimedon_queenslandica": 4117,
19
+ "amphiprion_ocellaris": 4118,
20
+ "amphiprion_percula": 4119,
21
+ "amyelois_transitella_gca001186105v1rs": 4120,
22
+ "anabas_testudineus": 4121,
23
+ "anas_platyrhynchos": 4122,
24
+ "anas_zonorhyncha": 4123,
25
+ "anneissia_japonica_gca011630105v1": 4124,
26
+ "anolis_carolinensis": 4125,
27
+ "anopheles_albimanus": 4126,
28
+ "anopheles_arabiensis": 4127,
29
+ "anopheles_atroparvus_gca914969975": 4128,
30
+ "anopheles_christyi": 4129,
31
+ "anopheles_coluzzii_ngousso": 4130,
32
+ "anopheles_culicifacies": 4131,
33
+ "anopheles_darlingi": 4132,
34
+ "anopheles_dirus": 4133,
35
+ "anopheles_epiroticus": 4134,
36
+ "anopheles_farauti": 4135,
37
+ "anopheles_funestus": 4136,
38
+ "anopheles_gambiae": 4137,
39
+ "anopheles_maculatus": 4138,
40
+ "anopheles_melas": 4139,
41
+ "anopheles_merus": 4140,
42
+ "anopheles_minimus": 4141,
43
+ "anopheles_quadriannulatus": 4142,
44
+ "anopheles_sinensis_china": 4143,
45
+ "anopheles_stephensi": 4144,
46
+ "anoplophora_glabripennis": 4145,
47
+ "anser_brachyrhynchus": 4146,
48
+ "anser_cygnoides": 4147,
49
+ "anthonomus_grandis_gca022605725v3rs": 4148,
50
+ "aotus_nancymaae": 4149,
51
+ "aphidius_gifuensis_gca014905175v1": 4150,
52
+ "apis_dorsata_gca000469605v1rs": 4151,
53
+ "apis_florea_gca000184785v2rs": 4152,
54
+ "apis_mellifera": 4153,
55
+ "aplysia_californica_gca000002075v2": 4154,
56
+ "apteryx_haastii": 4155,
57
+ "apteryx_owenii": 4156,
58
+ "apteryx_rowi": 4157,
59
+ "aquila_chrysaetos_chrysaetos": 4158,
60
+ "ascaris_suum": 4159,
61
+ "astatotilapia_calliptera": 4160,
62
+ "asterias_rubens_gca902459465v3": 4161,
63
+ "astyanax_mexicanus": 4162,
64
+ "athalia_rosae_gca917208135v1": 4163,
65
+ "athene_cunicularia": 4164,
66
+ "atta_cephalotes": 4165,
67
+ "bactrocera_dorsalis_gca000789215v2": 4166,
68
+ "bactrocera_latifrons_gca001853355v1": 4167,
69
+ "bactrocera_tryoni_gca016617805v2": 4168,
70
+ "balaenoptera_musculus": 4169,
71
+ "belgica_antarctica": 4170,
72
+ "bemisia_tabaci_ssa3nig": 4171,
73
+ "betta_splendens": 4172,
74
+ "bicyclus_anynana_gca900239965v1rs": 4173,
75
+ "biomphalaria_glabrata": 4174,
76
+ "bison_bison_bison": 4175,
77
+ "bombus_impatiens": 4176,
78
+ "bombus_terrestris_gca910591885v2": 4177,
79
+ "bombyx_mandarina_gca003987935v1rs": 4178,
80
+ "bombyx_mori": 4179,
81
+ "bos_grunniens": 4180,
82
+ "bos_indicus_hybrid": 4181,
83
+ "bos_mutus": 4182,
84
+ "bos_taurus_hybrid": 4183,
85
+ "branchiostoma_lanceolatum": 4184,
86
+ "brugia_malayi": 4185,
87
+ "bubo_bubo": 4186,
88
+ "buteo_japonicus": 4187,
89
+ "caenorhabditis_brenneri": 4188,
90
+ "caenorhabditis_briggsae": 4189,
91
+ "caenorhabditis_elegans": 4190,
92
+ "caenorhabditis_japonica": 4191,
93
+ "caenorhabditis_remanei": 4192,
94
+ "cairina_moschata_domestica": 4193,
95
+ "calidris_pugnax": 4194,
96
+ "calidris_pygmaea": 4195,
97
+ "callithrix_jacchus": 4196,
98
+ "callorhinchus_milii": 4197,
99
+ "camarhynchus_parvulus": 4198,
100
+ "camelus_dromedarius": 4199,
101
+ "camponotus_floridanus_gca003227725v1rs": 4200,
102
+ "canis_lupus_familiarisgsd": 4201,
103
+ "capitella_teleta": 4202,
104
+ "capra_hircus": 4203,
105
+ "carassius_auratus": 4204,
106
+ "carlito_syrichta": 4205,
107
+ "castor_canadensis": 4206,
108
+ "catagonus_wagneri": 4207,
109
+ "catharus_ustulatus": 4208,
110
+ "cavia_aperea": 4209,
111
+ "cavia_porcellus": 4210,
112
+ "cebus_imitator": 4211,
113
+ "centruroides_sculpturatus_gca000671375v2": 4212,
114
+ "ceratitis_capitata_gca000347755v4": 4213,
115
+ "cercocebus_atys": 4214,
116
+ "cervus_hanglu_yarkandensis": 4215,
117
+ "chelonoidis_abingdonii": 4216,
118
+ "chelonus_insularis_gca013357705v1rs": 4217,
119
+ "chelydra_serpentina": 4218,
120
+ "chinchilla_lanigera": 4219,
121
+ "chlorocebus_sabaeus": 4220,
122
+ "choloepus_hoffmanni": 4221,
123
+ "chrysemys_picta_bellii": 4222,
124
+ "chrysolophus_pictus": 4223,
125
+ "cimex_lectularius": 4224,
126
+ "ciona_intestinalis": 4225,
127
+ "ciona_savignyi": 4226,
128
+ "clupea_harengus": 4227,
129
+ "clytia_hemisphaerica_gca902728285": 4228,
130
+ "colobus_angolensis_palliatus": 4229,
131
+ "copidosoma_floridanum_gca000648655v2": 4230,
132
+ "corvus_moneduloides": 4231,
133
+ "cotesia_glomerata_gca020080835v1": 4232,
134
+ "cottoperca_gobio": 4233,
135
+ "coturnix_japonica": 4234,
136
+ "crassostrea_gigas": 4235,
137
+ "crassostrea_virginica_gca002022765v4": 4236,
138
+ "cricetulus_griseus_picr": 4237,
139
+ "crocodylus_porosus": 4238,
140
+ "culex_quinquefasciatus_gca015732765v1": 4239,
141
+ "culicoides_sonorensis": 4240,
142
+ "cyanistes_caeruleus": 4241,
143
+ "cyclopterus_lumpus": 4242,
144
+ "cynoglossus_semilaevis": 4243,
145
+ "cyprinodon_variegatus": 4244,
146
+ "cyprinus_carpio_carpio": 4245,
147
+ "danaus_plexippus": 4246,
148
+ "danio_rerio": 4247,
149
+ "daphnia_magna_gca020631705v2": 4248,
150
+ "daphnia_pulex": 4249,
151
+ "daphnia_pulicaria_gca021234035v2rs": 4250,
152
+ "dasypus_novemcinctus": 4251,
153
+ "delphinapterus_leucas": 4252,
154
+ "dendroctonus_ponderosae_gca000355655v1": 4253,
155
+ "dendronephthya_gigantea_gca004324835v1": 4254,
156
+ "denticeps_clupeoides": 4255,
157
+ "dermacentor_andersoni_gca023375885v2rs": 4256,
158
+ "dermacentor_silvarum_gca013339745v1": 4257,
159
+ "dermatophagoides_pteronyssinus_gca001901225v2": 4258,
160
+ "diabrotica_virgifera_gca917563875v2rs": 4259,
161
+ "dicentrarchus_labrax": 4260,
162
+ "dimorphilus_gyrociliatus_gca904063045v1": 4261,
163
+ "dinothrombium_tinctorium": 4262,
164
+ "dipodomys_ordii": 4263,
165
+ "diuraphis_noxia_gca001186385v1": 4264,
166
+ "dromaius_novaehollandiae": 4265,
167
+ "drosophila_ananassae": 4266,
168
+ "drosophila_erecta": 4267,
169
+ "drosophila_grimshawi": 4268,
170
+ "drosophila_melanogaster": 4269,
171
+ "drosophila_mojavensis": 4270,
172
+ "drosophila_persimilis": 4271,
173
+ "drosophila_pseudoobscura": 4272,
174
+ "drosophila_sechellia": 4273,
175
+ "drosophila_simulans": 4274,
176
+ "drosophila_virilis": 4275,
177
+ "drosophila_willistoni": 4276,
178
+ "drosophila_yakuba": 4277,
179
+ "dufourea_novaeangliae_gca001272555v1rs": 4278,
180
+ "echeneis_naucrates": 4279,
181
+ "echinococcus_granulosus_gca000524195v1rs": 4280,
182
+ "echinops_telfairi": 4281,
183
+ "electrophorus_electricus": 4282,
184
+ "eptatretus_burgeri": 4283,
185
+ "equus_asinus": 4284,
186
+ "equus_caballus": 4285,
187
+ "erinaceus_europaeus": 4286,
188
+ "erpetoichthys_calabaricus": 4287,
189
+ "erythrura_gouldiae": 4288,
190
+ "esox_lucius": 4289,
191
+ "eufriesea_mexicana_gca001483705v1rs": 4290,
192
+ "eurytemora_affinis_gca000591075v2": 4291,
193
+ "exaiptasia_diaphana_gca001417965v1": 4292,
194
+ "falco_tinnunculus": 4293,
195
+ "felis_catus": 4294,
196
+ "ficedula_albicollis": 4295,
197
+ "folsomia_candida": 4296,
198
+ "fukomys_damarensis": 4297,
199
+ "fundulus_heteroclitus": 4298,
200
+ "gadus_morhua": 4299,
201
+ "galendromus_occidentalis_gca000255335v2rs": 4300,
202
+ "galleria_mellonella_gca003640425v2rs": 4301,
203
+ "gallus_gallus_gca000002315v5": 4302,
204
+ "gambusia_affinis": 4303,
205
+ "gasterosteus_aculeatus": 4304,
206
+ "geospiza_fortis": 4305,
207
+ "gigantopelta_aegis_gca016097555v1": 4306,
208
+ "glossina_austeni": 4307,
209
+ "glossina_brevipalpis": 4308,
210
+ "glossina_fuscipes": 4309,
211
+ "glossina_morsitans": 4310,
212
+ "glossina_pallidipes": 4311,
213
+ "glossina_palpalis": 4312,
214
+ "gopherus_agassizii": 4313,
215
+ "gopherus_evgoodei": 4314,
216
+ "gorilla_gorilla": 4315,
217
+ "gouania_willdenowi": 4316,
218
+ "habropoda_laboriosa_gca001263275v1rs": 4317,
219
+ "haemaphysalis_longicornis_gca013339765v1": 4318,
220
+ "haliotis_rubra_gca003918875v1rs": 4319,
221
+ "haliotis_rufescens_gca023055435v1rs": 4320,
222
+ "haplochromis_burtoni": 4321,
223
+ "harpegnathos_saltator_gca003227715v2rs": 4322,
224
+ "heliconius_melpomene": 4323,
225
+ "helicoverpa_armigera_gca023701775v1rs": 4324,
226
+ "helicoverpa_zea_gca022581195v1rs": 4325,
227
+ "helobdella_robusta": 4326,
228
+ "hermetia_illucens_gca905115235v1": 4327,
229
+ "heterocephalus_glaber_male": 4328,
230
+ "hippocampus_comes": 4329,
231
+ "hofstenia_miamia": 4330,
232
+ "homalodisca_vitripennis_gca021130785v2rs": 4331,
233
+ "homarus_americanus_gca018991925v1": 4332,
234
+ "homo_sapiens": 4333,
235
+ "hucho_hucho": 4334,
236
+ "hyalella_azteca_gca000764305v2": 4335,
237
+ "hyalomma_asiaticum_gca013339685v1": 4336,
238
+ "hydra_vulgaris_gca022113875v1rs": 4337,
239
+ "hymenolepis_microstoma": 4338,
240
+ "hypsibius_exemplaris_gca002082055v1": 4339,
241
+ "ictalurus_punctatus": 4340,
242
+ "ictidomys_tridecemlineatus": 4341,
243
+ "ixodes_persulcatus_gca013358835v1": 4342,
244
+ "ixodes_scapularis_gca016920785v2": 4343,
245
+ "jaculus_jaculus": 4344,
246
+ "junco_hyemalis": 4345,
247
+ "kryptolebias_marmoratus": 4346,
248
+ "labrus_bergylta": 4347,
249
+ "larimichthys_crocea": 4348,
250
+ "lates_calcarifer": 4349,
251
+ "laticauda_laticaudata": 4350,
252
+ "latimeria_chalumnae": 4351,
253
+ "leguminivora_glycinivorella_gca023078275v1rs": 4352,
254
+ "lepeophtheirus_salmonis_gca016086655v3rs": 4353,
255
+ "lepidothrix_coronata": 4354,
256
+ "lepisosteus_oculatus": 4355,
257
+ "leptinotarsa_decemlineata_gca000500325v2": 4356,
258
+ "leptobrachium_leishanense": 4357,
259
+ "leptotrombidium_deliense": 4358,
260
+ "limulus_polyphemus_gca000517525v1": 4359,
261
+ "linepithema_humile_gca000217595v1rs": 4360,
262
+ "lingula_anatina": 4361,
263
+ "loa_loa": 4362,
264
+ "lonchura_striata_domestica": 4363,
265
+ "lottia_gigantea": 4364,
266
+ "loxodonta_africana": 4365,
267
+ "lucilia_cuprina_gca022045245v1rs": 4366,
268
+ "lutzomyia_longipalpis": 4367,
269
+ "lynx_canadensis": 4368,
270
+ "lytechinus_variegatus_gca018143015v1": 4369,
271
+ "macaca_fascicularis": 4370,
272
+ "macaca_mulatta": 4371,
273
+ "macaca_nemestrina": 4372,
274
+ "malurus_cyaneus_samueli": 4373,
275
+ "manacus_vitellinus": 4374,
276
+ "mandrillus_leucophaeus": 4375,
277
+ "manduca_sexta_gca014839805v1rs": 4376,
278
+ "marmota_marmota_marmota": 4377,
279
+ "mastacembelus_armatus": 4378,
280
+ "mayetiola_destructor": 4379,
281
+ "maylandia_zebra": 4380,
282
+ "megachile_rotundata_gca000220905v1rs": 4381,
283
+ "megaselia_scalaris": 4382,
284
+ "meleagris_gallopavo": 4383,
285
+ "melitaea_cinxia_gca905220565v1": 4384,
286
+ "melopsittacus_undulatus": 4385,
287
+ "mercenaria_mercenaria_gca014805675v2": 4386,
288
+ "meriones_unguiculatus": 4387,
289
+ "mesocricetus_auratus": 4388,
290
+ "microcebus_murinus": 4389,
291
+ "microtus_ochrogaster": 4390,
292
+ "mizuhopecten_yessoensis_gca002113885v2": 4391,
293
+ "mnemiopsis_leidyi": 4392,
294
+ "mola_mola": 4393,
295
+ "monodelphis_domestica": 4394,
296
+ "monodon_monoceros": 4395,
297
+ "monomorium_pharaonis_gca013373865v2": 4396,
298
+ "monopterus_albus": 4397,
299
+ "moschus_moschiferus": 4398,
300
+ "mus_caroli": 4399,
301
+ "mus_musculus": 4400,
302
+ "mus_pahari": 4401,
303
+ "mus_spicilegus": 4402,
304
+ "mus_spretus": 4403,
305
+ "musca_domestica": 4404,
306
+ "mustela_putorius_furo": 4405,
307
+ "myotis_lucifugus": 4406,
308
+ "myripristis_murdjan": 4407,
309
+ "naja_naja": 4408,
310
+ "nannospalax_galili": 4409,
311
+ "nasonia_vitripennis": 4410,
312
+ "necator_americanus": 4411,
313
+ "nematostella_vectensis": 4412,
314
+ "neodiprion_lecontei_gca021901455v1rs": 4413,
315
+ "neodiprion_pinetum_gca021155775v1rs": 4414,
316
+ "neogobius_melanostomus": 4415,
317
+ "neolamprologus_brichardi": 4416,
318
+ "neovison_vison": 4417,
319
+ "nilaparvata_lugens_gca014356525v1rs": 4418,
320
+ "nomascus_leucogenys": 4419,
321
+ "notamacropus_eugenii": 4420,
322
+ "notechis_scutatus": 4421,
323
+ "nothobranchius_furzeri": 4422,
324
+ "nothoprocta_perdicaria": 4423,
325
+ "numida_meleagris": 4424,
326
+ "ochotona_princeps": 4425,
327
+ "octodon_degus": 4426,
328
+ "octopus_bimaculoides": 4427,
329
+ "octopus_sinensis_gca006345805v1": 4428,
330
+ "onchocerca_volvulus": 4429,
331
+ "oncorhynchus_kisutch": 4430,
332
+ "oncorhynchus_mykiss": 4431,
333
+ "oncorhynchus_tshawytscha": 4432,
334
+ "onthophagus_taurus_gca000648695v2": 4433,
335
+ "ooceraea_biroi_gca003672135v1": 4434,
336
+ "orbicella_faveolata_gca002042975v1": 4435,
337
+ "orchesella_cincta": 4436,
338
+ "oreochromis_aureus": 4437,
339
+ "oreochromis_niloticus": 4438,
340
+ "ornithorhynchus_anatinus": 4439,
341
+ "orussus_abietinus_gca000612105v2": 4440,
342
+ "oryctolagus_cuniculus": 4441,
343
+ "oryzias_javanicus": 4442,
344
+ "oryzias_latipes": 4443,
345
+ "oryzias_melastigma": 4444,
346
+ "oryzias_sinensis": 4445,
347
+ "otolemur_garnettii": 4446,
348
+ "otus_sunia": 4447,
349
+ "ovis_aries_rambouillet": 4448,
350
+ "owenia_fusiformis_gca903813345v1": 4449,
351
+ "pan_paniscus": 4450,
352
+ "pan_troglodytes": 4451,
353
+ "panthera_leo": 4452,
354
+ "panthera_pardus": 4453,
355
+ "panthera_tigris_altaica": 4454,
356
+ "papio_anubis": 4455,
357
+ "parambassis_ranga": 4456,
358
+ "paramormyrops_kingsleyae": 4457,
359
+ "parasteatoda_tepidariorum_gca000365465v3": 4458,
360
+ "parus_major": 4459,
361
+ "patiria_miniata_gca015706575v1": 4460,
362
+ "pavo_cristatus": 4461,
363
+ "pectinophora_gossypiella_gca024362695v1rs": 4462,
364
+ "pediculus_humanus": 4463,
365
+ "pelodiscus_sinensis": 4464,
366
+ "pelusios_castaneus": 4465,
367
+ "penaeus_chinensis_gca019202785v2rs": 4466,
368
+ "penaeus_japonicus_gca017312705v1": 4467,
369
+ "penaeus_monodon_gca015228065v1": 4468,
370
+ "penaeus_vannamei_gca003789085v1": 4469,
371
+ "periophthalmus_magnuspinnatus": 4470,
372
+ "peromyscus_maniculatus_bairdii": 4471,
373
+ "petromyzon_marinus": 4472,
374
+ "phascolarctos_cinereus": 4473,
375
+ "phasianus_colchicus": 4474,
376
+ "phlebotomus_papatasi": 4475,
377
+ "phlebotomus_perniciosus_gca918844115v2": 4476,
378
+ "phocoena_sinus": 4477,
379
+ "physeter_catodon": 4478,
380
+ "piliocolobus_tephrosceles": 4479,
381
+ "pocillopora_damicornis_gca003704095v1": 4480,
382
+ "podarcis_muralis": 4481,
383
+ "poecilia_formosa": 4482,
384
+ "poecilia_latipinna": 4483,
385
+ "poecilia_mexicana": 4484,
386
+ "poecilia_reticulata": 4485,
387
+ "pogona_vitticeps": 4486,
388
+ "pogonomyrmex_barbatus_gca000187915v1rs": 4487,
389
+ "polistes_canadensis_gca001313835v1rs": 4488,
390
+ "polistes_dominula_gca001465965v1rs": 4489,
391
+ "polistes_fuscatus_gca010416935v1rs": 4490,
392
+ "pollicipes_pollicipes_gca011947565v2": 4491,
393
+ "pomacea_canaliculata_gca003073045v1": 4492,
394
+ "pomphorhynchus_laevis_gca012934845v2gb": 4493,
395
+ "pongo_abelii": 4494,
396
+ "portunus_trituberculatus_gca017591435v1": 4495,
397
+ "priapulus_caudatus_gca000485595v2": 4496,
398
+ "pristionchus_pacificus": 4497,
399
+ "procambarus_clarkii_gca020424385v2": 4498,
400
+ "procavia_capensis": 4499,
401
+ "prolemur_simus": 4500,
402
+ "propithecus_coquereli": 4501,
403
+ "pseudonaja_textilis": 4502,
404
+ "pteropus_vampyrus": 4503,
405
+ "pundamilia_nyererei": 4504,
406
+ "pygocentrus_nattereri": 4505,
407
+ "rattus_norvegicus_wkybbb": 4506,
408
+ "rhagoletis_pomonella_gca013731165v1": 4507,
409
+ "rhinolophus_ferrumequinum": 4508,
410
+ "rhinopithecus_bieti": 4509,
411
+ "rhinopithecus_roxellana": 4510,
412
+ "rhipicephalus_microplus_gca013339725v1": 4511,
413
+ "rhipicephalus_sanguineus_gca013339695v1": 4512,
414
+ "rhodnius_prolixus": 4513,
415
+ "rhopalosiphum_maidis_gca003676215v3": 4514,
416
+ "saccharomyces_cerevisiae": 4515,
417
+ "saccoglossus_kowalevskii_gca000003605v1": 4516,
418
+ "saimiri_boliviensis_boliviensis": 4517,
419
+ "salarias_fasciatus": 4518,
420
+ "salmo_salar": 4519,
421
+ "salmo_trutta": 4520,
422
+ "salvator_merianae": 4521,
423
+ "sander_lucioperca": 4522,
424
+ "sarcophilus_harrisii": 4523,
425
+ "sarcoptes_scabiei": 4524,
426
+ "schistocerca_americana_gca021461395v2rs": 4525,
427
+ "schistosoma_haematobium_gca000699445v2rs": 4526,
428
+ "schistosoma_mansoni": 4527,
429
+ "sciurus_vulgaris": 4528,
430
+ "scleropages_formosus": 4529,
431
+ "scophthalmus_maximus": 4530,
432
+ "serinus_canaria": 4531,
433
+ "seriola_dumerili": 4532,
434
+ "seriola_lalandi_dorsalis": 4533,
435
+ "sinocyclocheilus_anshuiensis": 4534,
436
+ "sinocyclocheilus_grahami": 4535,
437
+ "sinocyclocheilus_rhinocerous": 4536,
438
+ "sipha_flava_gca003268045v1": 4537,
439
+ "sitophilus_oryzae_gca002938485v2rs": 4538,
440
+ "solenopsis_invicta": 4539,
441
+ "sorex_araneus": 4540,
442
+ "sparus_aurata": 4541,
443
+ "spermophilus_dauricus": 4542,
444
+ "sphaeramia_orbicularis": 4543,
445
+ "sphenodon_punctatus": 4544,
446
+ "stachyris_ruficeps": 4545,
447
+ "stegastes_partitus": 4546,
448
+ "stegodyphus_dumicola_gca010614865v2rs": 4547,
449
+ "stegodyphus_mimosarum": 4548,
450
+ "stomoxys_calcitrans": 4549,
451
+ "strigamia_maritima": 4550,
452
+ "strigops_habroptila": 4551,
453
+ "strix_occidentalis_caurina": 4552,
454
+ "strongylocentrotus_purpuratus": 4553,
455
+ "strongyloides_ratti": 4554,
456
+ "struthio_camelus_australis": 4555,
457
+ "stylophora_pistillata_gca002571385v1": 4556,
458
+ "suricata_suricatta": 4557,
459
+ "sus_scrofa_usmarc": 4558,
460
+ "taeniopygia_guttata": 4559,
461
+ "takifugu_rubripes": 4560,
462
+ "teleopsis_dalmanni": 4561,
463
+ "terrapene_carolina_triunguis": 4562,
464
+ "tetranychus_urticae": 4563,
465
+ "tetraodon_nigroviridis": 4564,
466
+ "thelohanellus_kitauei": 4565,
467
+ "theropithecus_gelada": 4566,
468
+ "thrips_palmi_gca012932325v1rs": 4567,
469
+ "tigriopus_californicus_gca007210705": 4568,
470
+ "trialeurodes_vaporariorum_gca011764245": 4569,
471
+ "tribolium_castaneum": 4570,
472
+ "trichinella_spiralis": 4571,
473
+ "trichogramma_pretiosum_gca000599845v3": 4572,
474
+ "trichoplax_adhaerens": 4573,
475
+ "trichuris_muris": 4574,
476
+ "tupaia_belangeri": 4575,
477
+ "tursiops_truncatus": 4576,
478
+ "urocitellus_parryii": 4577,
479
+ "ursus_americanus": 4578,
480
+ "ursus_maritimus": 4579,
481
+ "ursus_thibetanus_thibetanus": 4580,
482
+ "varanus_komodoensis": 4581,
483
+ "varroa_destructor_gca002443255": 4582,
484
+ "venturia_canescens_gca019457755v1rs": 4583,
485
+ "vicugna_pacos": 4584,
486
+ "vombatus_ursinus": 4585,
487
+ "vulpes_vulpes": 4586,
488
+ "xenopus_tropicalis": 4587,
489
+ "xiphophorus_couchianus": 4588,
490
+ "xiphophorus_maculatus": 4589,
491
+ "zalophus_californianus": 4590,
492
+ "zerene_cesonia_gca012273895v2rs": 4591,
493
+ "zonotrichia_albicollis": 4592,
494
+ "zootermopsis_nevadensis": 4593,
495
+ "zosterops_lateralis_melanops": 4594
496
+ }
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/s/project/denovo-prosit/JohannesHingerl/BERTADN/final_models/huggingface_compatible/rotary_bert_huggingface",
3
+ "auto_map":
4
+ {"AutoModel": "modeling_rotarybert.RotaryBertModel",
5
+ "AutoModelForMaskedLM": "modeling_rotarybert.RotaryBertForMaskedLM"
6
+ },
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "classifier_dropout": null,
9
+ "hidden_act": "gelu_new",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.45.2",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 4608
26
+ }
generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.45.2"
5
+ }
modeling_rotarybert.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from einops import rearrange, repeat
3
+ from transformers.models.bert.modeling_bert import BertSelfAttention, BertAttention, BertLayer, BertEncoder, BertModel, BertForMaskedLM
4
+ from typing import List, Optional, Tuple, Union
5
+ from packaging import version
6
+ import torch
7
+ import torch.nn as nn
8
+
9
def rotate_half(x, interleaved=False):
    """Return ``x`` with each feature pair ``(a, b)`` along the last axis mapped to ``(-b, a)``.

    Args:
        x: tensor whose last dimension has even length.
        interleaved: if False, the pairs are (first half, second half) of the
            last axis; if True, the pairs are adjacent (even index, odd index)
            entries.

    Returns:
        A tensor with the same shape as ``x``.
    """
    if not interleaved:
        # Pair element i of the first half with element i of the second half.
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    # Pair each even-indexed element with the odd-indexed element that follows it,
    # then interleave the rotated pairs back into a single axis.
    x1, x2 = x[..., ::2], x[..., 1::2]
    return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
16
+
17
+
18
def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
    """
    Apply a rotary embedding to the leading ``rotary_dim`` features of ``x``.

    x: (batch_size, seqlen, nheads, headdim)
    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)

    The first ``rotary_dim = 2 * cos.shape[-1]`` features of the last axis are
    replaced by ``x * cos + rotate_half(x) * sin``; any remaining features are
    passed through unchanged. ``interleaved`` selects the pairing convention
    and is forwarded to ``rotate_half``.
    """
    ro_dim = cos.shape[-1] * 2
    assert ro_dim <= x.shape[-1], "rotary dimension exceeds the last dimension of x"
    # Duplicate each cos/sin entry so it covers both members of a feature pair,
    # and insert a singleton axis that broadcasts over the heads axis of x.
    pattern = "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
    cos = repeat(cos, pattern)
    sin = repeat(sin, pattern)
    rotated = x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin
    return torch.cat([rotated, x[..., ro_dim:]], dim=-1)
31
+
32
+
33
def generate_cos_sin(seqlen, rotary_dim, device, dtype, base=10000.0):
    """Build the cosine and sine tables used by the rotary embedding.

    Args:
        seqlen: number of positions; angles are generated for 0 .. seqlen - 1.
        rotary_dim: number of features being rotated; one frequency is
            produced per pair, i.e. for every second index in ``range(rotary_dim)``.
        device: device on which the tables are created.
        dtype: dtype of the returned tables. The angles themselves are always
            computed in float32 and only cast at the end.
        base: base of the geometric frequency progression (defaults to the
            previously hard-coded 10000.0).

    Returns:
        ``(cos, sin)``, each with one row per position and one column per
        frequency.
    """
    exponents = torch.arange(0, rotary_dim, 2, device=device, dtype=torch.float32) / rotary_dim
    inv_freq = 1.0 / (base ** exponents)
    positions = torch.arange(seqlen, device=device, dtype=torch.float32)
    # angle[p, i] = position p * inverse frequency i
    freqs = torch.outer(positions, inv_freq)
    cos = torch.cos(freqs).to(dtype)
    sin = torch.sin(freqs).to(dtype)
    return cos, sin
43
+
44
+ # from transformers.models.roformer import RoFormerSinusoidalPositionalEmbedding
45
+
46
+
47
class RotaryBertSdpaSelfAttention(BertSelfAttention):
    """BERT self-attention that rotates queries and keys with a rotary embedding.

    The query/key/value projections and ``transpose_for_scores`` are inherited
    from ``BertSelfAttention``. Attention is computed with
    ``torch.nn.functional.scaled_dot_product_attention``; when attention
    probabilities or a head mask are requested, an explicit
    softmax(Q K^T / sqrt(d)) V path is used instead, because the fused kernel
    can neither return probabilities nor apply a head mask.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__(config, position_embedding_type=position_embedding_type)
        self.dropout_prob = config.attention_probs_dropout_prob
        self.require_contiguous_qkv = False
        # self.rotary_sinuses = RoFormerSinusoidalPositionalEmbedding(config.max_position_embeddings)

    def _manual_attention(self, query_layer, key_layer, value_layer, attention_mask, head_mask, is_causal):
        """Explicit attention used when probabilities or a head mask are needed.

        Inputs are laid out with the sequence axis second-to-last and the head
        dimension last (the layout passed to SDPA in ``forward``).
        Returns ``(context, attention_probs)``.
        """
        scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        # Same default scaling as scaled_dot_product_attention: 1 / sqrt(head_dim).
        scores = scores / math.sqrt(query_layer.shape[-1])
        min_value = torch.finfo(scores.dtype).min

        if is_causal:
            q_len, k_len = scores.shape[-2], scores.shape[-1]
            allowed = torch.ones(q_len, k_len, dtype=torch.bool, device=scores.device).tril()
            scores = scores.masked_fill(~allowed, min_value)

        if attention_mask is not None:
            if attention_mask.dtype == torch.bool:
                # Boolean masks mark the positions that may be attended to.
                scores = scores.masked_fill(~attention_mask, min_value)
            else:
                # Floating-point masks are additive.
                scores = scores + attention_mask

        attention_probs = nn.functional.softmax(scores, dim=-1)
        attention_probs = nn.functional.dropout(attention_probs, p=self.dropout_prob, training=self.training)
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        return torch.matmul(attention_probs, value_layer), attention_probs

    # Adapted from BertSelfAttention
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ) -> Tuple[torch.Tensor]:
        """Compute rotary self-attention (or cross-attention) for ``hidden_states``.

        Args:
            hidden_states: (batch, tgt_len, hidden) input to the query projection.
            attention_mask: mask handed to the attention computation; replaced
                by ``encoder_attention_mask`` for cross-attention.
            head_mask: optional multiplicative mask on the attention
                probabilities; forces the manual attention path.
            encoder_hidden_states: if given, keys and values are projected from
                these states instead of ``hidden_states``.
            encoder_attention_mask: mask used for cross-attention.
            past_key_value: not used in the computation; echoed back in the
                output tuple when the module is a decoder.
            output_attentions: if True, the attention probabilities are
                returned as the second tuple element (manual attention path).

        Returns:
            ``(context,)`` or ``(context, attention_probs)``, followed by
            ``past_key_value`` when ``self.is_decoder`` is set. ``context`` has
            shape (batch, tgt_len, all_head_size).
        """
        use_manual_attention = output_attentions or head_mask is not None
        if use_manual_attention:
            # The module never defined `logger`; fetch one locally so this branch
            # cannot raise NameError.
            from transformers.utils import logging as hf_logging

            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
            hf_logging.get_logger(__name__).warning_once(
                "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
                "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to "
                "the manual attention implementation, but specifying the manual implementation will be required from "
                "Transformers version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )

        bsz, tgt_len, _ = hidden_states.size()

        query_layer = self.query(hidden_states)

        is_cross_attention = encoder_hidden_states is not None

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        attention_mask = encoder_attention_mask if is_cross_attention else attention_mask

        key_layer = self.key(current_states)
        value_layer = self.value(current_states)

        query_layer = self.transpose_for_scores(query_layer)
        key_layer = self.transpose_for_scores(key_layer)
        value_layer = self.transpose_for_scores(value_layer)

        # The rotary helper expects the sequence axis at position 1
        # (batch, seqlen, nheads, headdim), so swap axes 1 and 2 for the rotation.
        query_layer = query_layer.permute(0, 2, 1, 3)
        key_layer = key_layer.permute(0, 2, 1, 3)

        head_dim = query_layer.shape[-1]
        q_cos, q_sin = generate_cos_sin(
            query_layer.shape[1], head_dim, device=query_layer.device, dtype=torch.float32
        )
        if key_layer.shape[1] == query_layer.shape[1]:
            k_cos, k_sin = q_cos, q_sin
        else:
            # Cross-attention with a different source length needs tables sized
            # for the keys, otherwise the rotation cannot broadcast.
            k_cos, k_sin = generate_cos_sin(
                key_layer.shape[1], head_dim, device=key_layer.device, dtype=torch.float32
            )

        # The rotation runs in float32 (the tables are float32). Cast back to the
        # dtype of the values so query, key and value agree for half-precision
        # weights; for float32 models the cast is a no-op.
        query_layer = apply_rotary_emb_torch(query_layer, q_cos, q_sin).to(value_layer.dtype)
        key_layer = apply_rotary_emb_torch(key_layer, k_cos, k_sin).to(value_layer.dtype)

        # Restore the layout used before the rotation.
        query_layer = query_layer.permute(0, 2, 1, 3)
        key_layer = key_layer.permute(0, 2, 1, 3)

        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
        # Reference: https://github.com/pytorch/pytorch/issues/112577
        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
            query_layer = query_layer.contiguous()
            key_layer = key_layer.contiguous()
            value_layer = value_layer.contiguous()

        is_causal = bool(
            self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1
        )

        attention_probs = None
        if use_manual_attention:
            attn_output, attention_probs = self._manual_attention(
                query_layer, key_layer, value_layer, attention_mask, head_mask, is_causal
            )
        else:
            attn_output = torch.nn.functional.scaled_dot_product_attention(
                query_layer,
                key_layer,
                value_layer,
                attn_mask=attention_mask,
                dropout_p=self.dropout_prob if self.training else 0.0,
                is_causal=is_causal,
            )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)

        outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)
        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
137
+
138
+
139
class RotaryBertAttention(BertAttention):
    """``BertAttention`` whose ``self`` sub-module is ``RotaryBertSdpaSelfAttention``."""

    def __init__(self, config):
        super().__init__(config)
        # Replace the self-attention built by the parent constructor with the rotary variant.
        self.self = RotaryBertSdpaSelfAttention(config)
143
+
144
class RotaryBertLayer(BertLayer):
    """``BertLayer`` whose ``attention`` sub-module is ``RotaryBertAttention``."""

    def __init__(self, config):
        super().__init__(config)
        # Replace the attention block built by the parent constructor with the rotary variant.
        self.attention = RotaryBertAttention(config)
148
+
149
class RotaryBertEncoder(BertEncoder):
    """``BertEncoder`` whose layer stack is made of ``RotaryBertLayer`` modules."""

    def __init__(self, config):
        super().__init__(config)
        # Rebuild the stack with one rotary layer per configured hidden layer.
        self.layer = nn.ModuleList([RotaryBertLayer(config) for _ in range(config.num_hidden_layers)])
153
+
154
class RotaryBertModel(BertModel):
    """``BertModel`` whose ``encoder`` is a ``RotaryBertEncoder``.

    Args:
        config: model configuration, forwarded to ``BertModel``.
        add_pooling_layer: forwarded to ``BertModel``; defaults to True, which
            matches the previous behaviour of calling ``BertModel(config)``.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config, add_pooling_layer=add_pooling_layer)
        # Replace the encoder built by the parent constructor with the rotary variant.
        self.encoder = RotaryBertEncoder(config)
158
+
159
class RotaryBertForMaskedLM(BertForMaskedLM):
    """``BertForMaskedLM`` whose ``bert`` backbone is a ``RotaryBertModel``."""

    def __init__(self, config):
        super().__init__(config)
        # Replace the backbone built by the parent constructor with the rotary variant.
        # It is constructed with RotaryBertModel's defaults, exactly as before.
        self.bert = RotaryBertModel(config)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78bb502f89c7f13649c5787f672f045618affb36ec0d70bdca90c972877ba64c
3
+ size 398592581
special_tokens_map.json ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "acanthaster_planci_gca001949145v1",
4
+ "acanthochromis_polyacanthus",
5
+ "accipiter_nisus",
6
+ "acromyrmex_echinatior_gca000204515v1rs",
7
+ "acropora_millepora_gca013753865v1",
8
+ "actinia_equina_gca011057435",
9
+ "actinia_tenebrosa_gca009602425v1",
10
+ "acyrthosiphon_pisum",
11
+ "adineta_vaga",
12
+ "aedes_aegypti_lvpagwg",
13
+ "aedes_albopictus",
14
+ "agrilus_planipennis_gca000699045v2",
15
+ "ailuropoda_melanoleuca",
16
+ "amazona_collaria",
17
+ "amphibalanus_amphitrite_gca019059575v1",
18
+ "amphilophus_citrinellus",
19
+ "amphimedon_queenslandica",
20
+ "amphiprion_ocellaris",
21
+ "amphiprion_percula",
22
+ "amyelois_transitella_gca001186105v1rs",
23
+ "anabas_testudineus",
24
+ "anas_platyrhynchos",
25
+ "anas_zonorhyncha",
26
+ "anneissia_japonica_gca011630105v1",
27
+ "anolis_carolinensis",
28
+ "anopheles_albimanus",
29
+ "anopheles_arabiensis",
30
+ "anopheles_atroparvus_gca914969975",
31
+ "anopheles_christyi",
32
+ "anopheles_coluzzii_ngousso",
33
+ "anopheles_culicifacies",
34
+ "anopheles_darlingi",
35
+ "anopheles_dirus",
36
+ "anopheles_epiroticus",
37
+ "anopheles_farauti",
38
+ "anopheles_funestus",
39
+ "anopheles_gambiae",
40
+ "anopheles_maculatus",
41
+ "anopheles_melas",
42
+ "anopheles_merus",
43
+ "anopheles_minimus",
44
+ "anopheles_quadriannulatus",
45
+ "anopheles_sinensis_china",
46
+ "anopheles_stephensi",
47
+ "anoplophora_glabripennis",
48
+ "anser_brachyrhynchus",
49
+ "anser_cygnoides",
50
+ "anthonomus_grandis_gca022605725v3rs",
51
+ "aotus_nancymaae",
52
+ "aphidius_gifuensis_gca014905175v1",
53
+ "apis_dorsata_gca000469605v1rs",
54
+ "apis_florea_gca000184785v2rs",
55
+ "apis_mellifera",
56
+ "aplysia_californica_gca000002075v2",
57
+ "apteryx_haastii",
58
+ "apteryx_owenii",
59
+ "apteryx_rowi",
60
+ "aquila_chrysaetos_chrysaetos",
61
+ "ascaris_suum",
62
+ "astatotilapia_calliptera",
63
+ "asterias_rubens_gca902459465v3",
64
+ "astyanax_mexicanus",
65
+ "athalia_rosae_gca917208135v1",
66
+ "athene_cunicularia",
67
+ "atta_cephalotes",
68
+ "bactrocera_dorsalis_gca000789215v2",
69
+ "bactrocera_latifrons_gca001853355v1",
70
+ "bactrocera_tryoni_gca016617805v2",
71
+ "balaenoptera_musculus",
72
+ "belgica_antarctica",
73
+ "bemisia_tabaci_ssa3nig",
74
+ "betta_splendens",
75
+ "bicyclus_anynana_gca900239965v1rs",
76
+ "biomphalaria_glabrata",
77
+ "bison_bison_bison",
78
+ "bombus_impatiens",
79
+ "bombus_terrestris_gca910591885v2",
80
+ "bombyx_mandarina_gca003987935v1rs",
81
+ "bombyx_mori",
82
+ "bos_grunniens",
83
+ "bos_indicus_hybrid",
84
+ "bos_mutus",
85
+ "bos_taurus_hybrid",
86
+ "branchiostoma_lanceolatum",
87
+ "brugia_malayi",
88
+ "bubo_bubo",
89
+ "buteo_japonicus",
90
+ "caenorhabditis_brenneri",
91
+ "caenorhabditis_briggsae",
92
+ "caenorhabditis_elegans",
93
+ "caenorhabditis_japonica",
94
+ "caenorhabditis_remanei",
95
+ "cairina_moschata_domestica",
96
+ "calidris_pugnax",
97
+ "calidris_pygmaea",
98
+ "callithrix_jacchus",
99
+ "callorhinchus_milii",
100
+ "camarhynchus_parvulus",
101
+ "camelus_dromedarius",
102
+ "camponotus_floridanus_gca003227725v1rs",
103
+ "canis_lupus_familiarisgsd",
104
+ "capitella_teleta",
105
+ "capra_hircus",
106
+ "carassius_auratus",
107
+ "carlito_syrichta",
108
+ "castor_canadensis",
109
+ "catagonus_wagneri",
110
+ "catharus_ustulatus",
111
+ "cavia_aperea",
112
+ "cavia_porcellus",
113
+ "cebus_imitator",
114
+ "centruroides_sculpturatus_gca000671375v2",
115
+ "ceratitis_capitata_gca000347755v4",
116
+ "cercocebus_atys",
117
+ "cervus_hanglu_yarkandensis",
118
+ "chelonoidis_abingdonii",
119
+ "chelonus_insularis_gca013357705v1rs",
120
+ "chelydra_serpentina",
121
+ "chinchilla_lanigera",
122
+ "chlorocebus_sabaeus",
123
+ "choloepus_hoffmanni",
124
+ "chrysemys_picta_bellii",
125
+ "chrysolophus_pictus",
126
+ "cimex_lectularius",
127
+ "ciona_intestinalis",
128
+ "ciona_savignyi",
129
+ "clupea_harengus",
130
+ "clytia_hemisphaerica_gca902728285",
131
+ "colobus_angolensis_palliatus",
132
+ "copidosoma_floridanum_gca000648655v2",
133
+ "corvus_moneduloides",
134
+ "cotesia_glomerata_gca020080835v1",
135
+ "cottoperca_gobio",
136
+ "coturnix_japonica",
137
+ "crassostrea_gigas",
138
+ "crassostrea_virginica_gca002022765v4",
139
+ "cricetulus_griseus_picr",
140
+ "crocodylus_porosus",
141
+ "culex_quinquefasciatus_gca015732765v1",
142
+ "culicoides_sonorensis",
143
+ "cyanistes_caeruleus",
144
+ "cyclopterus_lumpus",
145
+ "cynoglossus_semilaevis",
146
+ "cyprinodon_variegatus",
147
+ "cyprinus_carpio_carpio",
148
+ "danaus_plexippus",
149
+ "danio_rerio",
150
+ "daphnia_magna_gca020631705v2",
151
+ "daphnia_pulex",
152
+ "daphnia_pulicaria_gca021234035v2rs",
153
+ "dasypus_novemcinctus",
154
+ "delphinapterus_leucas",
155
+ "dendroctonus_ponderosae_gca000355655v1",
156
+ "dendronephthya_gigantea_gca004324835v1",
157
+ "denticeps_clupeoides",
158
+ "dermacentor_andersoni_gca023375885v2rs",
159
+ "dermacentor_silvarum_gca013339745v1",
160
+ "dermatophagoides_pteronyssinus_gca001901225v2",
161
+ "diabrotica_virgifera_gca917563875v2rs",
162
+ "dicentrarchus_labrax",
163
+ "dimorphilus_gyrociliatus_gca904063045v1",
164
+ "dinothrombium_tinctorium",
165
+ "dipodomys_ordii",
166
+ "diuraphis_noxia_gca001186385v1",
167
+ "dromaius_novaehollandiae",
168
+ "drosophila_ananassae",
169
+ "drosophila_erecta",
170
+ "drosophila_grimshawi",
171
+ "drosophila_melanogaster",
172
+ "drosophila_mojavensis",
173
+ "drosophila_persimilis",
174
+ "drosophila_pseudoobscura",
175
+ "drosophila_sechellia",
176
+ "drosophila_simulans",
177
+ "drosophila_virilis",
178
+ "drosophila_willistoni",
179
+ "drosophila_yakuba",
180
+ "dufourea_novaeangliae_gca001272555v1rs",
181
+ "echeneis_naucrates",
182
+ "echinococcus_granulosus_gca000524195v1rs",
183
+ "echinops_telfairi",
184
+ "electrophorus_electricus",
185
+ "eptatretus_burgeri",
186
+ "equus_asinus",
187
+ "equus_caballus",
188
+ "erinaceus_europaeus",
189
+ "erpetoichthys_calabaricus",
190
+ "erythrura_gouldiae",
191
+ "esox_lucius",
192
+ "eufriesea_mexicana_gca001483705v1rs",
193
+ "eurytemora_affinis_gca000591075v2",
194
+ "exaiptasia_diaphana_gca001417965v1",
195
+ "falco_tinnunculus",
196
+ "felis_catus",
197
+ "ficedula_albicollis",
198
+ "folsomia_candida",
199
+ "fukomys_damarensis",
200
+ "fundulus_heteroclitus",
201
+ "gadus_morhua",
202
+ "galendromus_occidentalis_gca000255335v2rs",
203
+ "galleria_mellonella_gca003640425v2rs",
204
+ "gallus_gallus_gca000002315v5",
205
+ "gambusia_affinis",
206
+ "gasterosteus_aculeatus",
207
+ "geospiza_fortis",
208
+ "gigantopelta_aegis_gca016097555v1",
209
+ "glossina_austeni",
210
+ "glossina_brevipalpis",
211
+ "glossina_fuscipes",
212
+ "glossina_morsitans",
213
+ "glossina_pallidipes",
214
+ "glossina_palpalis",
215
+ "gopherus_agassizii",
216
+ "gopherus_evgoodei",
217
+ "gorilla_gorilla",
218
+ "gouania_willdenowi",
219
+ "habropoda_laboriosa_gca001263275v1rs",
220
+ "haemaphysalis_longicornis_gca013339765v1",
221
+ "haliotis_rubra_gca003918875v1rs",
222
+ "haliotis_rufescens_gca023055435v1rs",
223
+ "haplochromis_burtoni",
224
+ "harpegnathos_saltator_gca003227715v2rs",
225
+ "heliconius_melpomene",
226
+ "helicoverpa_armigera_gca023701775v1rs",
227
+ "helicoverpa_zea_gca022581195v1rs",
228
+ "helobdella_robusta",
229
+ "hermetia_illucens_gca905115235v1",
230
+ "heterocephalus_glaber_male",
231
+ "hippocampus_comes",
232
+ "hofstenia_miamia",
233
+ "homalodisca_vitripennis_gca021130785v2rs",
234
+ "homarus_americanus_gca018991925v1",
235
+ "homo_sapiens",
236
+ "hucho_hucho",
237
+ "hyalella_azteca_gca000764305v2",
238
+ "hyalomma_asiaticum_gca013339685v1",
239
+ "hydra_vulgaris_gca022113875v1rs",
240
+ "hymenolepis_microstoma",
241
+ "hypsibius_exemplaris_gca002082055v1",
242
+ "ictalurus_punctatus",
243
+ "ictidomys_tridecemlineatus",
244
+ "ixodes_persulcatus_gca013358835v1",
245
+ "ixodes_scapularis_gca016920785v2",
246
+ "jaculus_jaculus",
247
+ "junco_hyemalis",
248
+ "kryptolebias_marmoratus",
249
+ "labrus_bergylta",
250
+ "larimichthys_crocea",
251
+ "lates_calcarifer",
252
+ "laticauda_laticaudata",
253
+ "latimeria_chalumnae",
254
+ "leguminivora_glycinivorella_gca023078275v1rs",
255
+ "lepeophtheirus_salmonis_gca016086655v3rs",
256
+ "lepidothrix_coronata",
257
+ "lepisosteus_oculatus",
258
+ "leptinotarsa_decemlineata_gca000500325v2",
259
+ "leptobrachium_leishanense",
260
+ "leptotrombidium_deliense",
261
+ "limulus_polyphemus_gca000517525v1",
262
+ "linepithema_humile_gca000217595v1rs",
263
+ "lingula_anatina",
264
+ "loa_loa",
265
+ "lonchura_striata_domestica",
266
+ "lottia_gigantea",
267
+ "loxodonta_africana",
268
+ "lucilia_cuprina_gca022045245v1rs",
269
+ "lutzomyia_longipalpis",
270
+ "lynx_canadensis",
271
+ "lytechinus_variegatus_gca018143015v1",
272
+ "macaca_fascicularis",
273
+ "macaca_mulatta",
274
+ "macaca_nemestrina",
275
+ "malurus_cyaneus_samueli",
276
+ "manacus_vitellinus",
277
+ "mandrillus_leucophaeus",
278
+ "manduca_sexta_gca014839805v1rs",
279
+ "marmota_marmota_marmota",
280
+ "mastacembelus_armatus",
281
+ "mayetiola_destructor",
282
+ "maylandia_zebra",
283
+ "megachile_rotundata_gca000220905v1rs",
284
+ "megaselia_scalaris",
285
+ "meleagris_gallopavo",
286
+ "melitaea_cinxia_gca905220565v1",
287
+ "melopsittacus_undulatus",
288
+ "mercenaria_mercenaria_gca014805675v2",
289
+ "meriones_unguiculatus",
290
+ "mesocricetus_auratus",
291
+ "microcebus_murinus",
292
+ "microtus_ochrogaster",
293
+ "mizuhopecten_yessoensis_gca002113885v2",
294
+ "mnemiopsis_leidyi",
295
+ "mola_mola",
296
+ "monodelphis_domestica",
297
+ "monodon_monoceros",
298
+ "monomorium_pharaonis_gca013373865v2",
299
+ "monopterus_albus",
300
+ "moschus_moschiferus",
301
+ "mus_caroli",
302
+ "mus_musculus",
303
+ "mus_pahari",
304
+ "mus_spicilegus",
305
+ "mus_spretus",
306
+ "musca_domestica",
307
+ "mustela_putorius_furo",
308
+ "myotis_lucifugus",
309
+ "myripristis_murdjan",
310
+ "naja_naja",
311
+ "nannospalax_galili",
312
+ "nasonia_vitripennis",
313
+ "necator_americanus",
314
+ "nematostella_vectensis",
315
+ "neodiprion_lecontei_gca021901455v1rs",
316
+ "neodiprion_pinetum_gca021155775v1rs",
317
+ "neogobius_melanostomus",
318
+ "neolamprologus_brichardi",
319
+ "neovison_vison",
320
+ "nilaparvata_lugens_gca014356525v1rs",
321
+ "nomascus_leucogenys",
322
+ "notamacropus_eugenii",
323
+ "notechis_scutatus",
324
+ "nothobranchius_furzeri",
325
+ "nothoprocta_perdicaria",
326
+ "numida_meleagris",
327
+ "ochotona_princeps",
328
+ "octodon_degus",
329
+ "octopus_bimaculoides",
330
+ "octopus_sinensis_gca006345805v1",
331
+ "onchocerca_volvulus",
332
+ "oncorhynchus_kisutch",
333
+ "oncorhynchus_mykiss",
334
+ "oncorhynchus_tshawytscha",
335
+ "onthophagus_taurus_gca000648695v2",
336
+ "ooceraea_biroi_gca003672135v1",
337
+ "orbicella_faveolata_gca002042975v1",
338
+ "orchesella_cincta",
339
+ "oreochromis_aureus",
340
+ "oreochromis_niloticus",
341
+ "ornithorhynchus_anatinus",
342
+ "orussus_abietinus_gca000612105v2",
343
+ "oryctolagus_cuniculus",
344
+ "oryzias_javanicus",
345
+ "oryzias_latipes",
346
+ "oryzias_melastigma",
347
+ "oryzias_sinensis",
348
+ "otolemur_garnettii",
349
+ "otus_sunia",
350
+ "ovis_aries_rambouillet",
351
+ "owenia_fusiformis_gca903813345v1",
352
+ "pan_paniscus",
353
+ "pan_troglodytes",
354
+ "panthera_leo",
355
+ "panthera_pardus",
356
+ "panthera_tigris_altaica",
357
+ "papio_anubis",
358
+ "parambassis_ranga",
359
+ "paramormyrops_kingsleyae",
360
+ "parasteatoda_tepidariorum_gca000365465v3",
361
+ "parus_major",
362
+ "patiria_miniata_gca015706575v1",
363
+ "pavo_cristatus",
364
+ "pectinophora_gossypiella_gca024362695v1rs",
365
+ "pediculus_humanus",
366
+ "pelodiscus_sinensis",
367
+ "pelusios_castaneus",
368
+ "penaeus_chinensis_gca019202785v2rs",
369
+ "penaeus_japonicus_gca017312705v1",
370
+ "penaeus_monodon_gca015228065v1",
371
+ "penaeus_vannamei_gca003789085v1",
372
+ "periophthalmus_magnuspinnatus",
373
+ "peromyscus_maniculatus_bairdii",
374
+ "petromyzon_marinus",
375
+ "phascolarctos_cinereus",
376
+ "phasianus_colchicus",
377
+ "phlebotomus_papatasi",
378
+ "phlebotomus_perniciosus_gca918844115v2",
379
+ "phocoena_sinus",
380
+ "physeter_catodon",
381
+ "piliocolobus_tephrosceles",
382
+ "pocillopora_damicornis_gca003704095v1",
383
+ "podarcis_muralis",
384
+ "poecilia_formosa",
385
+ "poecilia_latipinna",
386
+ "poecilia_mexicana",
387
+ "poecilia_reticulata",
388
+ "pogona_vitticeps",
389
+ "pogonomyrmex_barbatus_gca000187915v1rs",
390
+ "polistes_canadensis_gca001313835v1rs",
391
+ "polistes_dominula_gca001465965v1rs",
392
+ "polistes_fuscatus_gca010416935v1rs",
393
+ "pollicipes_pollicipes_gca011947565v2",
394
+ "pomacea_canaliculata_gca003073045v1",
395
+ "pomphorhynchus_laevis_gca012934845v2gb",
396
+ "pongo_abelii",
397
+ "portunus_trituberculatus_gca017591435v1",
398
+ "priapulus_caudatus_gca000485595v2",
399
+ "pristionchus_pacificus",
400
+ "procambarus_clarkii_gca020424385v2",
401
+ "procavia_capensis",
402
+ "prolemur_simus",
403
+ "propithecus_coquereli",
404
+ "pseudonaja_textilis",
405
+ "pteropus_vampyrus",
406
+ "pundamilia_nyererei",
407
+ "pygocentrus_nattereri",
408
+ "rattus_norvegicus_wkybbb",
409
+ "rhagoletis_pomonella_gca013731165v1",
410
+ "rhinolophus_ferrumequinum",
411
+ "rhinopithecus_bieti",
412
+ "rhinopithecus_roxellana",
413
+ "rhipicephalus_microplus_gca013339725v1",
414
+ "rhipicephalus_sanguineus_gca013339695v1",
415
+ "rhodnius_prolixus",
416
+ "rhopalosiphum_maidis_gca003676215v3",
417
+ "saccharomyces_cerevisiae",
418
+ "saccoglossus_kowalevskii_gca000003605v1",
419
+ "saimiri_boliviensis_boliviensis",
420
+ "salarias_fasciatus",
421
+ "salmo_salar",
422
+ "salmo_trutta",
423
+ "salvator_merianae",
424
+ "sander_lucioperca",
425
+ "sarcophilus_harrisii",
426
+ "sarcoptes_scabiei",
427
+ "schistocerca_americana_gca021461395v2rs",
428
+ "schistosoma_haematobium_gca000699445v2rs",
429
+ "schistosoma_mansoni",
430
+ "sciurus_vulgaris",
431
+ "scleropages_formosus",
432
+ "scophthalmus_maximus",
433
+ "serinus_canaria",
434
+ "seriola_dumerili",
435
+ "seriola_lalandi_dorsalis",
436
+ "sinocyclocheilus_anshuiensis",
437
+ "sinocyclocheilus_grahami",
438
+ "sinocyclocheilus_rhinocerous",
439
+ "sipha_flava_gca003268045v1",
440
+ "sitophilus_oryzae_gca002938485v2rs",
441
+ "solenopsis_invicta",
442
+ "sorex_araneus",
443
+ "sparus_aurata",
444
+ "spermophilus_dauricus",
445
+ "sphaeramia_orbicularis",
446
+ "sphenodon_punctatus",
447
+ "stachyris_ruficeps",
448
+ "stegastes_partitus",
449
+ "stegodyphus_dumicola_gca010614865v2rs",
450
+ "stegodyphus_mimosarum",
451
+ "stomoxys_calcitrans",
452
+ "strigamia_maritima",
453
+ "strigops_habroptila",
454
+ "strix_occidentalis_caurina",
455
+ "strongylocentrotus_purpuratus",
456
+ "strongyloides_ratti",
457
+ "struthio_camelus_australis",
458
+ "stylophora_pistillata_gca002571385v1",
459
+ "suricata_suricatta",
460
+ "sus_scrofa_usmarc",
461
+ "taeniopygia_guttata",
462
+ "takifugu_rubripes",
463
+ "teleopsis_dalmanni",
464
+ "terrapene_carolina_triunguis",
465
+ "tetranychus_urticae",
466
+ "tetraodon_nigroviridis",
467
+ "thelohanellus_kitauei",
468
+ "theropithecus_gelada",
469
+ "thrips_palmi_gca012932325v1rs",
470
+ "tigriopus_californicus_gca007210705",
471
+ "trialeurodes_vaporariorum_gca011764245",
472
+ "tribolium_castaneum",
473
+ "trichinella_spiralis",
474
+ "trichogramma_pretiosum_gca000599845v3",
475
+ "trichoplax_adhaerens",
476
+ "trichuris_muris",
477
+ "tupaia_belangeri",
478
+ "tursiops_truncatus",
479
+ "urocitellus_parryii",
480
+ "ursus_americanus",
481
+ "ursus_maritimus",
482
+ "ursus_thibetanus_thibetanus",
483
+ "varanus_komodoensis",
484
+ "varroa_destructor_gca002443255",
485
+ "venturia_canescens_gca019457755v1rs",
486
+ "vicugna_pacos",
487
+ "vombatus_ursinus",
488
+ "vulpes_vulpes",
489
+ "xenopus_tropicalis",
490
+ "xiphophorus_couchianus",
491
+ "xiphophorus_maculatus",
492
+ "zalophus_californianus",
493
+ "zerene_cesonia_gca012273895v2rs",
494
+ "zonotrichia_albicollis",
495
+ "zootermopsis_nevadensis",
496
+ "zosterops_lateralis_melanops"
497
+ ],
498
+ "cls_token": "[CLS]",
499
+ "mask_token": "[MASK]",
500
+ "pad_token": "[PAD]",
501
+ "sep_token": "[SEP]",
502
+ "unk_token": "[UNK]"
503
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_basic_tokenize": true,
4
+ "do_lower_case": false,
5
+ "mask_token": "[MASK]",
6
+ "max_len": 512,
7
+ "model_max_length": 2048,
8
+ "name_or_path": "/data/ouga/home/ag_gagneur/hingerl/BERTADN/preprocessing/tokenizer_upstream_metazoa",
9
+ "never_split": null,
10
+ "pad_token": "[PAD]",
11
+ "sep_token": "[SEP]",
12
+ "special_tokens_map_file": "/data/ouga/home/ag_gagneur/hingerl/.cache/huggingface/hub/models--zhihan1996--DNA_bert_6/snapshots/a79a8fd96ad172f964a4dbef3f4d7545a5034baa/special_tokens_map.json",
13
+ "strip_accents": null,
14
+ "tokenize_chinese_chars": true,
15
+ "tokenizer_class": "BertTokenizer",
16
+ "unk_token": "[UNK]"
17
+ }