sonalkum commited on
Commit
1e6d67a
·
1 Parent(s): a15624d
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Llama-2-7b-chat-hf-qformer/Qformer.json +339 -339
  2. Llama-2-7b-chat-hf-qformer/pytorch_model.bin.index.json +323 -323
  3. hf-dev-train/transformers-main/.circleci/TROUBLESHOOT.md +7 -0
  4. hf-dev-train/transformers-main/.circleci/config.yml +200 -0
  5. hf-dev-train/transformers-main/.circleci/create_circleci_config.py +478 -0
  6. hf-dev-train/transformers-main/.coveragerc +12 -0
  7. hf-dev-train/transformers-main/.gitattributes +4 -0
  8. hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/bug-report.yml +115 -0
  9. hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/config.yml +12 -0
  10. hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/feature-request.yml +31 -0
  11. hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/i18n.md +46 -0
  12. hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/migration.yml +72 -0
  13. hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/new-model-addition.yml +31 -0
  14. hf-dev-train/transformers-main/.github/PULL_REQUEST_TEMPLATE.md +76 -0
  15. hf-dev-train/transformers-main/.github/conda/build.sh +1 -0
  16. hf-dev-train/transformers-main/.github/conda/meta.yaml +54 -0
  17. hf-dev-train/transformers-main/.github/workflows/TROUBLESHOOT.md +9 -0
  18. hf-dev-train/transformers-main/.github/workflows/add-model-like.yml +80 -0
  19. hf-dev-train/transformers-main/.github/workflows/build-docker-images.yml +264 -0
  20. hf-dev-train/transformers-main/.github/workflows/build-past-ci-docker-images.yml +108 -0
  21. hf-dev-train/transformers-main/.github/workflows/build_documentation.yml +20 -0
  22. hf-dev-train/transformers-main/.github/workflows/build_pr_documentation.yml +17 -0
  23. hf-dev-train/transformers-main/.github/workflows/check_runner_status.yml +68 -0
  24. hf-dev-train/transformers-main/.github/workflows/delete_doc_comment.yml +13 -0
  25. hf-dev-train/transformers-main/.github/workflows/doctests.yml +81 -0
  26. hf-dev-train/transformers-main/.github/workflows/model-templates.yml +81 -0
  27. hf-dev-train/transformers-main/.github/workflows/release-conda.yml +47 -0
  28. hf-dev-train/transformers-main/.github/workflows/self-nightly-scheduled.yml +304 -0
  29. hf-dev-train/transformers-main/.github/workflows/self-past-caller.yml +136 -0
  30. hf-dev-train/transformers-main/.github/workflows/self-past.yml +275 -0
  31. hf-dev-train/transformers-main/.github/workflows/self-push-caller.yml +54 -0
  32. hf-dev-train/transformers-main/.github/workflows/self-push.yml +585 -0
  33. hf-dev-train/transformers-main/.github/workflows/self-scheduled.yml +495 -0
  34. hf-dev-train/transformers-main/.github/workflows/stale.yml +27 -0
  35. hf-dev-train/transformers-main/.github/workflows/update_metdata.yml +40 -0
  36. hf-dev-train/transformers-main/.github/workflows/update_tiny_models.yml +47 -0
  37. hf-dev-train/transformers-main/.gitignore +169 -0
  38. hf-dev-train/transformers-main/CITATION.cff +82 -0
  39. hf-dev-train/transformers-main/CODE_OF_CONDUCT.md +133 -0
  40. hf-dev-train/transformers-main/CONTRIBUTING.md +393 -0
  41. hf-dev-train/transformers-main/ISSUES.md +277 -0
  42. hf-dev-train/transformers-main/LICENSE +203 -0
  43. hf-dev-train/transformers-main/MANIFEST.in +1 -0
  44. hf-dev-train/transformers-main/Makefile +114 -0
  45. hf-dev-train/transformers-main/README.md +507 -0
  46. hf-dev-train/transformers-main/README_es.md +494 -0
  47. hf-dev-train/transformers-main/README_hd.md +0 -0
  48. hf-dev-train/transformers-main/README_ja.md +528 -0
  49. hf-dev-train/transformers-main/README_ko.md +442 -0
  50. hf-dev-train/transformers-main/README_zh-hans.md +467 -0
Llama-2-7b-chat-hf-qformer/Qformer.json CHANGED
@@ -1,341 +1,341 @@
1
  {
2
- "model.query_tokens": "pytorch_model-00004-of-00002.bin",
3
- "model.qformer_proj_norm.weight": "pytorch_model-00004-of-00002.bin",
4
- "model.qformer_proj_norm.bias": "pytorch_model-00004-of-00002.bin",
5
- "model.Qformer.bert.embeddings.position_ids": "pytorch_model-00004-of-00002.bin",
6
- "model.Qformer.bert.embeddings.word_embeddings.weight": "pytorch_model-00004-of-00002.bin",
7
- "model.Qformer.bert.embeddings.position_embeddings.weight": "pytorch_model-00004-of-00002.bin",
8
- "model.Qformer.bert.embeddings.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
9
- "model.Qformer.bert.embeddings.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
10
- "model.Qformer.bert.encoder.layer.0.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
11
- "model.Qformer.bert.encoder.layer.0.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
12
- "model.Qformer.bert.encoder.layer.0.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
13
- "model.Qformer.bert.encoder.layer.0.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
14
- "model.Qformer.bert.encoder.layer.0.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
15
- "model.Qformer.bert.encoder.layer.0.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
16
- "model.Qformer.bert.encoder.layer.0.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
17
- "model.Qformer.bert.encoder.layer.0.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
18
- "model.Qformer.bert.encoder.layer.0.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
19
- "model.Qformer.bert.encoder.layer.0.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
20
- "model.Qformer.bert.encoder.layer.0.crossattention.self.query.weight": "pytorch_model-00004-of-00002.bin",
21
- "model.Qformer.bert.encoder.layer.0.crossattention.self.query.bias": "pytorch_model-00004-of-00002.bin",
22
- "model.Qformer.bert.encoder.layer.0.crossattention.self.key.weight": "pytorch_model-00004-of-00002.bin",
23
- "model.Qformer.bert.encoder.layer.0.crossattention.self.key.bias": "pytorch_model-00004-of-00002.bin",
24
- "model.Qformer.bert.encoder.layer.0.crossattention.self.value.weight": "pytorch_model-00004-of-00002.bin",
25
- "model.Qformer.bert.encoder.layer.0.crossattention.self.value.bias": "pytorch_model-00004-of-00002.bin",
26
- "model.Qformer.bert.encoder.layer.0.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
27
- "model.Qformer.bert.encoder.layer.0.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
28
- "model.Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
29
- "model.Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
30
- "model.Qformer.bert.encoder.layer.0.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
31
- "model.Qformer.bert.encoder.layer.0.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
32
- "model.Qformer.bert.encoder.layer.0.output.dense.weight": "pytorch_model-00004-of-00002.bin",
33
- "model.Qformer.bert.encoder.layer.0.output.dense.bias": "pytorch_model-00004-of-00002.bin",
34
- "model.Qformer.bert.encoder.layer.0.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
35
- "model.Qformer.bert.encoder.layer.0.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
36
- "model.Qformer.bert.encoder.layer.0.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
37
- "model.Qformer.bert.encoder.layer.0.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
38
- "model.Qformer.bert.encoder.layer.0.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
39
- "model.Qformer.bert.encoder.layer.0.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
40
- "model.Qformer.bert.encoder.layer.0.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
41
- "model.Qformer.bert.encoder.layer.0.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
42
- "model.Qformer.bert.encoder.layer.1.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
43
- "model.Qformer.bert.encoder.layer.1.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
44
- "model.Qformer.bert.encoder.layer.1.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
45
- "model.Qformer.bert.encoder.layer.1.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
46
- "model.Qformer.bert.encoder.layer.1.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
47
- "model.Qformer.bert.encoder.layer.1.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
48
- "model.Qformer.bert.encoder.layer.1.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
49
- "model.Qformer.bert.encoder.layer.1.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
50
- "model.Qformer.bert.encoder.layer.1.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
51
- "model.Qformer.bert.encoder.layer.1.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
52
- "model.Qformer.bert.encoder.layer.1.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
53
- "model.Qformer.bert.encoder.layer.1.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
54
- "model.Qformer.bert.encoder.layer.1.output.dense.weight": "pytorch_model-00004-of-00002.bin",
55
- "model.Qformer.bert.encoder.layer.1.output.dense.bias": "pytorch_model-00004-of-00002.bin",
56
- "model.Qformer.bert.encoder.layer.1.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
57
- "model.Qformer.bert.encoder.layer.1.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
58
- "model.Qformer.bert.encoder.layer.1.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
59
- "model.Qformer.bert.encoder.layer.1.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
60
- "model.Qformer.bert.encoder.layer.1.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
61
- "model.Qformer.bert.encoder.layer.1.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
62
- "model.Qformer.bert.encoder.layer.1.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
63
- "model.Qformer.bert.encoder.layer.1.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
64
- "model.Qformer.bert.encoder.layer.2.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
65
- "model.Qformer.bert.encoder.layer.2.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
66
- "model.Qformer.bert.encoder.layer.2.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
67
- "model.Qformer.bert.encoder.layer.2.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
68
- "model.Qformer.bert.encoder.layer.2.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
69
- "model.Qformer.bert.encoder.layer.2.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
70
- "model.Qformer.bert.encoder.layer.2.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
71
- "model.Qformer.bert.encoder.layer.2.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
72
- "model.Qformer.bert.encoder.layer.2.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
73
- "model.Qformer.bert.encoder.layer.2.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
74
- "model.Qformer.bert.encoder.layer.2.crossattention.self.query.weight": "pytorch_model-00004-of-00002.bin",
75
- "model.Qformer.bert.encoder.layer.2.crossattention.self.query.bias": "pytorch_model-00004-of-00002.bin",
76
- "model.Qformer.bert.encoder.layer.2.crossattention.self.key.weight": "pytorch_model-00004-of-00002.bin",
77
- "model.Qformer.bert.encoder.layer.2.crossattention.self.key.bias": "pytorch_model-00004-of-00002.bin",
78
- "model.Qformer.bert.encoder.layer.2.crossattention.self.value.weight": "pytorch_model-00004-of-00002.bin",
79
- "model.Qformer.bert.encoder.layer.2.crossattention.self.value.bias": "pytorch_model-00004-of-00002.bin",
80
- "model.Qformer.bert.encoder.layer.2.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
81
- "model.Qformer.bert.encoder.layer.2.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
82
- "model.Qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
83
- "model.Qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
84
- "model.Qformer.bert.encoder.layer.2.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
85
- "model.Qformer.bert.encoder.layer.2.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
86
- "model.Qformer.bert.encoder.layer.2.output.dense.weight": "pytorch_model-00004-of-00002.bin",
87
- "model.Qformer.bert.encoder.layer.2.output.dense.bias": "pytorch_model-00004-of-00002.bin",
88
- "model.Qformer.bert.encoder.layer.2.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
89
- "model.Qformer.bert.encoder.layer.2.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
90
- "model.Qformer.bert.encoder.layer.2.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
91
- "model.Qformer.bert.encoder.layer.2.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
92
- "model.Qformer.bert.encoder.layer.2.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
93
- "model.Qformer.bert.encoder.layer.2.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
94
- "model.Qformer.bert.encoder.layer.2.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
95
- "model.Qformer.bert.encoder.layer.2.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
96
- "model.Qformer.bert.encoder.layer.3.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
97
- "model.Qformer.bert.encoder.layer.3.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
98
- "model.Qformer.bert.encoder.layer.3.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
99
- "model.Qformer.bert.encoder.layer.3.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
100
- "model.Qformer.bert.encoder.layer.3.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
101
- "model.Qformer.bert.encoder.layer.3.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
102
- "model.Qformer.bert.encoder.layer.3.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
103
- "model.Qformer.bert.encoder.layer.3.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
104
- "model.Qformer.bert.encoder.layer.3.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
105
- "model.Qformer.bert.encoder.layer.3.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
106
- "model.Qformer.bert.encoder.layer.3.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
107
- "model.Qformer.bert.encoder.layer.3.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
108
- "model.Qformer.bert.encoder.layer.3.output.dense.weight": "pytorch_model-00004-of-00002.bin",
109
- "model.Qformer.bert.encoder.layer.3.output.dense.bias": "pytorch_model-00004-of-00002.bin",
110
- "model.Qformer.bert.encoder.layer.3.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
111
- "model.Qformer.bert.encoder.layer.3.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
112
- "model.Qformer.bert.encoder.layer.3.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
113
- "model.Qformer.bert.encoder.layer.3.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
114
- "model.Qformer.bert.encoder.layer.3.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
115
- "model.Qformer.bert.encoder.layer.3.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
116
- "model.Qformer.bert.encoder.layer.3.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
117
- "model.Qformer.bert.encoder.layer.3.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
118
- "model.Qformer.bert.encoder.layer.4.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
119
- "model.Qformer.bert.encoder.layer.4.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
120
- "model.Qformer.bert.encoder.layer.4.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
121
- "model.Qformer.bert.encoder.layer.4.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
122
- "model.Qformer.bert.encoder.layer.4.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
123
- "model.Qformer.bert.encoder.layer.4.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
124
- "model.Qformer.bert.encoder.layer.4.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
125
- "model.Qformer.bert.encoder.layer.4.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
126
- "model.Qformer.bert.encoder.layer.4.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
127
- "model.Qformer.bert.encoder.layer.4.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
128
- "model.Qformer.bert.encoder.layer.4.crossattention.self.query.weight": "pytorch_model-00004-of-00002.bin",
129
- "model.Qformer.bert.encoder.layer.4.crossattention.self.query.bias": "pytorch_model-00004-of-00002.bin",
130
- "model.Qformer.bert.encoder.layer.4.crossattention.self.key.weight": "pytorch_model-00004-of-00002.bin",
131
- "model.Qformer.bert.encoder.layer.4.crossattention.self.key.bias": "pytorch_model-00004-of-00002.bin",
132
- "model.Qformer.bert.encoder.layer.4.crossattention.self.value.weight": "pytorch_model-00004-of-00002.bin",
133
- "model.Qformer.bert.encoder.layer.4.crossattention.self.value.bias": "pytorch_model-00004-of-00002.bin",
134
- "model.Qformer.bert.encoder.layer.4.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
135
- "model.Qformer.bert.encoder.layer.4.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
136
- "model.Qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
137
- "model.Qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
138
- "model.Qformer.bert.encoder.layer.4.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
139
- "model.Qformer.bert.encoder.layer.4.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
140
- "model.Qformer.bert.encoder.layer.4.output.dense.weight": "pytorch_model-00004-of-00002.bin",
141
- "model.Qformer.bert.encoder.layer.4.output.dense.bias": "pytorch_model-00004-of-00002.bin",
142
- "model.Qformer.bert.encoder.layer.4.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
143
- "model.Qformer.bert.encoder.layer.4.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
144
- "model.Qformer.bert.encoder.layer.4.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
145
- "model.Qformer.bert.encoder.layer.4.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
146
- "model.Qformer.bert.encoder.layer.4.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
147
- "model.Qformer.bert.encoder.layer.4.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
148
- "model.Qformer.bert.encoder.layer.4.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
149
- "model.Qformer.bert.encoder.layer.4.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
150
- "model.Qformer.bert.encoder.layer.5.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
151
- "model.Qformer.bert.encoder.layer.5.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
152
- "model.Qformer.bert.encoder.layer.5.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
153
- "model.Qformer.bert.encoder.layer.5.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
154
- "model.Qformer.bert.encoder.layer.5.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
155
- "model.Qformer.bert.encoder.layer.5.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
156
- "model.Qformer.bert.encoder.layer.5.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
157
- "model.Qformer.bert.encoder.layer.5.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
158
- "model.Qformer.bert.encoder.layer.5.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
159
- "model.Qformer.bert.encoder.layer.5.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
160
- "model.Qformer.bert.encoder.layer.5.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
161
- "model.Qformer.bert.encoder.layer.5.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
162
- "model.Qformer.bert.encoder.layer.5.output.dense.weight": "pytorch_model-00004-of-00002.bin",
163
- "model.Qformer.bert.encoder.layer.5.output.dense.bias": "pytorch_model-00004-of-00002.bin",
164
- "model.Qformer.bert.encoder.layer.5.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
165
- "model.Qformer.bert.encoder.layer.5.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
166
- "model.Qformer.bert.encoder.layer.5.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
167
- "model.Qformer.bert.encoder.layer.5.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
168
- "model.Qformer.bert.encoder.layer.5.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
169
- "model.Qformer.bert.encoder.layer.5.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
170
- "model.Qformer.bert.encoder.layer.5.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
171
- "model.Qformer.bert.encoder.layer.5.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
172
- "model.Qformer.bert.encoder.layer.6.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
173
- "model.Qformer.bert.encoder.layer.6.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
174
- "model.Qformer.bert.encoder.layer.6.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
175
- "model.Qformer.bert.encoder.layer.6.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
176
- "model.Qformer.bert.encoder.layer.6.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
177
- "model.Qformer.bert.encoder.layer.6.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
178
- "model.Qformer.bert.encoder.layer.6.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
179
- "model.Qformer.bert.encoder.layer.6.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
180
- "model.Qformer.bert.encoder.layer.6.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
181
- "model.Qformer.bert.encoder.layer.6.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
182
- "model.Qformer.bert.encoder.layer.6.crossattention.self.query.weight": "pytorch_model-00004-of-00002.bin",
183
- "model.Qformer.bert.encoder.layer.6.crossattention.self.query.bias": "pytorch_model-00004-of-00002.bin",
184
- "model.Qformer.bert.encoder.layer.6.crossattention.self.key.weight": "pytorch_model-00004-of-00002.bin",
185
- "model.Qformer.bert.encoder.layer.6.crossattention.self.key.bias": "pytorch_model-00004-of-00002.bin",
186
- "model.Qformer.bert.encoder.layer.6.crossattention.self.value.weight": "pytorch_model-00004-of-00002.bin",
187
- "model.Qformer.bert.encoder.layer.6.crossattention.self.value.bias": "pytorch_model-00004-of-00002.bin",
188
- "model.Qformer.bert.encoder.layer.6.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
189
- "model.Qformer.bert.encoder.layer.6.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
190
- "model.Qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
191
- "model.Qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
192
- "model.Qformer.bert.encoder.layer.6.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
193
- "model.Qformer.bert.encoder.layer.6.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
194
- "model.Qformer.bert.encoder.layer.6.output.dense.weight": "pytorch_model-00004-of-00002.bin",
195
- "model.Qformer.bert.encoder.layer.6.output.dense.bias": "pytorch_model-00004-of-00002.bin",
196
- "model.Qformer.bert.encoder.layer.6.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
197
- "model.Qformer.bert.encoder.layer.6.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
198
- "model.Qformer.bert.encoder.layer.6.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
199
- "model.Qformer.bert.encoder.layer.6.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
200
- "model.Qformer.bert.encoder.layer.6.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
201
- "model.Qformer.bert.encoder.layer.6.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
202
- "model.Qformer.bert.encoder.layer.6.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
203
- "model.Qformer.bert.encoder.layer.6.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
204
- "model.Qformer.bert.encoder.layer.7.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
205
- "model.Qformer.bert.encoder.layer.7.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
206
- "model.Qformer.bert.encoder.layer.7.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
207
- "model.Qformer.bert.encoder.layer.7.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
208
- "model.Qformer.bert.encoder.layer.7.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
209
- "model.Qformer.bert.encoder.layer.7.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
210
- "model.Qformer.bert.encoder.layer.7.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
211
- "model.Qformer.bert.encoder.layer.7.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
212
- "model.Qformer.bert.encoder.layer.7.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
213
- "model.Qformer.bert.encoder.layer.7.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
214
- "model.Qformer.bert.encoder.layer.7.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
215
- "model.Qformer.bert.encoder.layer.7.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
216
- "model.Qformer.bert.encoder.layer.7.output.dense.weight": "pytorch_model-00004-of-00002.bin",
217
- "model.Qformer.bert.encoder.layer.7.output.dense.bias": "pytorch_model-00004-of-00002.bin",
218
- "model.Qformer.bert.encoder.layer.7.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
219
- "model.Qformer.bert.encoder.layer.7.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
220
- "model.Qformer.bert.encoder.layer.7.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
221
- "model.Qformer.bert.encoder.layer.7.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
222
- "model.Qformer.bert.encoder.layer.7.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
223
- "model.Qformer.bert.encoder.layer.7.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
224
- "model.Qformer.bert.encoder.layer.7.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
225
- "model.Qformer.bert.encoder.layer.7.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
226
- "model.Qformer.bert.encoder.layer.8.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
227
- "model.Qformer.bert.encoder.layer.8.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
228
- "model.Qformer.bert.encoder.layer.8.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
229
- "model.Qformer.bert.encoder.layer.8.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
230
- "model.Qformer.bert.encoder.layer.8.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
231
- "model.Qformer.bert.encoder.layer.8.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
232
- "model.Qformer.bert.encoder.layer.8.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
233
- "model.Qformer.bert.encoder.layer.8.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
234
- "model.Qformer.bert.encoder.layer.8.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
235
- "model.Qformer.bert.encoder.layer.8.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
236
- "model.Qformer.bert.encoder.layer.8.crossattention.self.query.weight": "pytorch_model-00004-of-00002.bin",
237
- "model.Qformer.bert.encoder.layer.8.crossattention.self.query.bias": "pytorch_model-00004-of-00002.bin",
238
- "model.Qformer.bert.encoder.layer.8.crossattention.self.key.weight": "pytorch_model-00004-of-00002.bin",
239
- "model.Qformer.bert.encoder.layer.8.crossattention.self.key.bias": "pytorch_model-00004-of-00002.bin",
240
- "model.Qformer.bert.encoder.layer.8.crossattention.self.value.weight": "pytorch_model-00004-of-00002.bin",
241
- "model.Qformer.bert.encoder.layer.8.crossattention.self.value.bias": "pytorch_model-00004-of-00002.bin",
242
- "model.Qformer.bert.encoder.layer.8.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
243
- "model.Qformer.bert.encoder.layer.8.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
244
- "model.Qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
245
- "model.Qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
246
- "model.Qformer.bert.encoder.layer.8.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
247
- "model.Qformer.bert.encoder.layer.8.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
248
- "model.Qformer.bert.encoder.layer.8.output.dense.weight": "pytorch_model-00004-of-00002.bin",
249
- "model.Qformer.bert.encoder.layer.8.output.dense.bias": "pytorch_model-00004-of-00002.bin",
250
- "model.Qformer.bert.encoder.layer.8.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
251
- "model.Qformer.bert.encoder.layer.8.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
252
- "model.Qformer.bert.encoder.layer.8.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
253
- "model.Qformer.bert.encoder.layer.8.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
254
- "model.Qformer.bert.encoder.layer.8.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
255
- "model.Qformer.bert.encoder.layer.8.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
256
- "model.Qformer.bert.encoder.layer.8.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
257
- "model.Qformer.bert.encoder.layer.8.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
258
- "model.Qformer.bert.encoder.layer.9.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
259
- "model.Qformer.bert.encoder.layer.9.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
260
- "model.Qformer.bert.encoder.layer.9.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
261
- "model.Qformer.bert.encoder.layer.9.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
262
- "model.Qformer.bert.encoder.layer.9.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
263
- "model.Qformer.bert.encoder.layer.9.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
264
- "model.Qformer.bert.encoder.layer.9.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
265
- "model.Qformer.bert.encoder.layer.9.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
266
- "model.Qformer.bert.encoder.layer.9.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
267
- "model.Qformer.bert.encoder.layer.9.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
268
- "model.Qformer.bert.encoder.layer.9.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
269
- "model.Qformer.bert.encoder.layer.9.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
270
- "model.Qformer.bert.encoder.layer.9.output.dense.weight": "pytorch_model-00004-of-00002.bin",
271
- "model.Qformer.bert.encoder.layer.9.output.dense.bias": "pytorch_model-00004-of-00002.bin",
272
- "model.Qformer.bert.encoder.layer.9.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
273
- "model.Qformer.bert.encoder.layer.9.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
274
- "model.Qformer.bert.encoder.layer.9.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
275
- "model.Qformer.bert.encoder.layer.9.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
276
- "model.Qformer.bert.encoder.layer.9.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
277
- "model.Qformer.bert.encoder.layer.9.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
278
- "model.Qformer.bert.encoder.layer.9.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
279
- "model.Qformer.bert.encoder.layer.9.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
280
- "model.Qformer.bert.encoder.layer.10.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
281
- "model.Qformer.bert.encoder.layer.10.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
282
- "model.Qformer.bert.encoder.layer.10.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
283
- "model.Qformer.bert.encoder.layer.10.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
284
- "model.Qformer.bert.encoder.layer.10.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
285
- "model.Qformer.bert.encoder.layer.10.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
286
- "model.Qformer.bert.encoder.layer.10.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
287
- "model.Qformer.bert.encoder.layer.10.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
288
- "model.Qformer.bert.encoder.layer.10.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
289
- "model.Qformer.bert.encoder.layer.10.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
290
- "model.Qformer.bert.encoder.layer.10.crossattention.self.query.weight": "pytorch_model-00004-of-00002.bin",
291
- "model.Qformer.bert.encoder.layer.10.crossattention.self.query.bias": "pytorch_model-00004-of-00002.bin",
292
- "model.Qformer.bert.encoder.layer.10.crossattention.self.key.weight": "pytorch_model-00004-of-00002.bin",
293
- "model.Qformer.bert.encoder.layer.10.crossattention.self.key.bias": "pytorch_model-00004-of-00002.bin",
294
- "model.Qformer.bert.encoder.layer.10.crossattention.self.value.weight": "pytorch_model-00004-of-00002.bin",
295
- "model.Qformer.bert.encoder.layer.10.crossattention.self.value.bias": "pytorch_model-00004-of-00002.bin",
296
- "model.Qformer.bert.encoder.layer.10.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
297
- "model.Qformer.bert.encoder.layer.10.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
298
- "model.Qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
299
- "model.Qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
300
- "model.Qformer.bert.encoder.layer.10.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
301
- "model.Qformer.bert.encoder.layer.10.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
302
- "model.Qformer.bert.encoder.layer.10.output.dense.weight": "pytorch_model-00004-of-00002.bin",
303
- "model.Qformer.bert.encoder.layer.10.output.dense.bias": "pytorch_model-00004-of-00002.bin",
304
- "model.Qformer.bert.encoder.layer.10.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
305
- "model.Qformer.bert.encoder.layer.10.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
306
- "model.Qformer.bert.encoder.layer.10.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
307
- "model.Qformer.bert.encoder.layer.10.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
308
- "model.Qformer.bert.encoder.layer.10.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
309
- "model.Qformer.bert.encoder.layer.10.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
310
- "model.Qformer.bert.encoder.layer.10.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
311
- "model.Qformer.bert.encoder.layer.10.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
312
- "model.Qformer.bert.encoder.layer.11.attention.self.query.weight": "pytorch_model-00004-of-00002.bin",
313
- "model.Qformer.bert.encoder.layer.11.attention.self.query.bias": "pytorch_model-00004-of-00002.bin",
314
- "model.Qformer.bert.encoder.layer.11.attention.self.key.weight": "pytorch_model-00004-of-00002.bin",
315
- "model.Qformer.bert.encoder.layer.11.attention.self.key.bias": "pytorch_model-00004-of-00002.bin",
316
- "model.Qformer.bert.encoder.layer.11.attention.self.value.weight": "pytorch_model-00004-of-00002.bin",
317
- "model.Qformer.bert.encoder.layer.11.attention.self.value.bias": "pytorch_model-00004-of-00002.bin",
318
- "model.Qformer.bert.encoder.layer.11.attention.output.dense.weight": "pytorch_model-00004-of-00002.bin",
319
- "model.Qformer.bert.encoder.layer.11.attention.output.dense.bias": "pytorch_model-00004-of-00002.bin",
320
- "model.Qformer.bert.encoder.layer.11.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
321
- "model.Qformer.bert.encoder.layer.11.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
322
- "model.Qformer.bert.encoder.layer.11.intermediate.dense.weight": "pytorch_model-00004-of-00002.bin",
323
- "model.Qformer.bert.encoder.layer.11.intermediate.dense.bias": "pytorch_model-00004-of-00002.bin",
324
- "model.Qformer.bert.encoder.layer.11.output.dense.weight": "pytorch_model-00004-of-00002.bin",
325
- "model.Qformer.bert.encoder.layer.11.output.dense.bias": "pytorch_model-00004-of-00002.bin",
326
- "model.Qformer.bert.encoder.layer.11.output.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
327
- "model.Qformer.bert.encoder.layer.11.output.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
328
- "model.Qformer.bert.encoder.layer.11.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.bin",
329
- "model.Qformer.bert.encoder.layer.11.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.bin",
330
- "model.Qformer.bert.encoder.layer.11.output_query.dense.weight": "pytorch_model-00004-of-00002.bin",
331
- "model.Qformer.bert.encoder.layer.11.output_query.dense.bias": "pytorch_model-00004-of-00002.bin",
332
- "model.Qformer.bert.encoder.layer.11.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
333
- "model.Qformer.bert.encoder.layer.11.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
334
- "model.Qformer.cls.predictions.bias": "pytorch_model-00004-of-00002.bin",
335
- "model.Qformer.cls.predictions.transform.dense.weight": "pytorch_model-00004-of-00002.bin",
336
- "model.Qformer.cls.predictions.transform.dense.bias": "pytorch_model-00004-of-00002.bin",
337
- "model.Qformer.cls.predictions.transform.LayerNorm.weight": "pytorch_model-00004-of-00002.bin",
338
- "model.Qformer.cls.predictions.transform.LayerNorm.bias": "pytorch_model-00004-of-00002.bin",
339
- "model.Qformer.cls.predictions.decoder.weight": "pytorch_model-00004-of-00002.bin",
340
- "model.Qformer.cls.predictions.decoder.bias": "pytorch_model-00004-of-00002.bin"
341
  }
 
1
  {
2
+ "model.query_tokens": "pytorch_model-00004-of-00002.safetensors",
3
+ "model.qformer_proj_norm.weight": "pytorch_model-00004-of-00002.safetensors",
4
+ "model.qformer_proj_norm.bias": "pytorch_model-00004-of-00002.safetensors",
5
+ "model.Qformer.bert.embeddings.position_ids": "pytorch_model-00004-of-00002.safetensors",
6
+ "model.Qformer.bert.embeddings.word_embeddings.weight": "pytorch_model-00004-of-00002.safetensors",
7
+ "model.Qformer.bert.embeddings.position_embeddings.weight": "pytorch_model-00004-of-00002.safetensors",
8
+ "model.Qformer.bert.embeddings.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
9
+ "model.Qformer.bert.embeddings.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
10
+ "model.Qformer.bert.encoder.layer.0.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
11
+ "model.Qformer.bert.encoder.layer.0.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
12
+ "model.Qformer.bert.encoder.layer.0.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
13
+ "model.Qformer.bert.encoder.layer.0.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
14
+ "model.Qformer.bert.encoder.layer.0.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
15
+ "model.Qformer.bert.encoder.layer.0.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
16
+ "model.Qformer.bert.encoder.layer.0.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
17
+ "model.Qformer.bert.encoder.layer.0.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
18
+ "model.Qformer.bert.encoder.layer.0.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
19
+ "model.Qformer.bert.encoder.layer.0.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
20
+ "model.Qformer.bert.encoder.layer.0.crossattention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
21
+ "model.Qformer.bert.encoder.layer.0.crossattention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
22
+ "model.Qformer.bert.encoder.layer.0.crossattention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
23
+ "model.Qformer.bert.encoder.layer.0.crossattention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
24
+ "model.Qformer.bert.encoder.layer.0.crossattention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
25
+ "model.Qformer.bert.encoder.layer.0.crossattention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
26
+ "model.Qformer.bert.encoder.layer.0.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
27
+ "model.Qformer.bert.encoder.layer.0.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
28
+ "model.Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
29
+ "model.Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
30
+ "model.Qformer.bert.encoder.layer.0.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
31
+ "model.Qformer.bert.encoder.layer.0.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
32
+ "model.Qformer.bert.encoder.layer.0.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
33
+ "model.Qformer.bert.encoder.layer.0.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
34
+ "model.Qformer.bert.encoder.layer.0.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
35
+ "model.Qformer.bert.encoder.layer.0.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
36
+ "model.Qformer.bert.encoder.layer.0.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
37
+ "model.Qformer.bert.encoder.layer.0.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
38
+ "model.Qformer.bert.encoder.layer.0.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
39
+ "model.Qformer.bert.encoder.layer.0.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
40
+ "model.Qformer.bert.encoder.layer.0.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
41
+ "model.Qformer.bert.encoder.layer.0.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
42
+ "model.Qformer.bert.encoder.layer.1.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
43
+ "model.Qformer.bert.encoder.layer.1.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
44
+ "model.Qformer.bert.encoder.layer.1.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
45
+ "model.Qformer.bert.encoder.layer.1.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
46
+ "model.Qformer.bert.encoder.layer.1.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
47
+ "model.Qformer.bert.encoder.layer.1.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
48
+ "model.Qformer.bert.encoder.layer.1.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
49
+ "model.Qformer.bert.encoder.layer.1.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
50
+ "model.Qformer.bert.encoder.layer.1.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
51
+ "model.Qformer.bert.encoder.layer.1.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
52
+ "model.Qformer.bert.encoder.layer.1.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
53
+ "model.Qformer.bert.encoder.layer.1.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
54
+ "model.Qformer.bert.encoder.layer.1.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
55
+ "model.Qformer.bert.encoder.layer.1.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
56
+ "model.Qformer.bert.encoder.layer.1.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
57
+ "model.Qformer.bert.encoder.layer.1.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
58
+ "model.Qformer.bert.encoder.layer.1.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
59
+ "model.Qformer.bert.encoder.layer.1.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
60
+ "model.Qformer.bert.encoder.layer.1.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
61
+ "model.Qformer.bert.encoder.layer.1.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
62
+ "model.Qformer.bert.encoder.layer.1.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
63
+ "model.Qformer.bert.encoder.layer.1.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
64
+ "model.Qformer.bert.encoder.layer.2.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
65
+ "model.Qformer.bert.encoder.layer.2.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
66
+ "model.Qformer.bert.encoder.layer.2.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
67
+ "model.Qformer.bert.encoder.layer.2.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
68
+ "model.Qformer.bert.encoder.layer.2.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
69
+ "model.Qformer.bert.encoder.layer.2.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
70
+ "model.Qformer.bert.encoder.layer.2.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
71
+ "model.Qformer.bert.encoder.layer.2.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
72
+ "model.Qformer.bert.encoder.layer.2.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
73
+ "model.Qformer.bert.encoder.layer.2.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
74
+ "model.Qformer.bert.encoder.layer.2.crossattention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
75
+ "model.Qformer.bert.encoder.layer.2.crossattention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
76
+ "model.Qformer.bert.encoder.layer.2.crossattention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
77
+ "model.Qformer.bert.encoder.layer.2.crossattention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
78
+ "model.Qformer.bert.encoder.layer.2.crossattention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
79
+ "model.Qformer.bert.encoder.layer.2.crossattention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
80
+ "model.Qformer.bert.encoder.layer.2.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
81
+ "model.Qformer.bert.encoder.layer.2.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
82
+ "model.Qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
83
+ "model.Qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
84
+ "model.Qformer.bert.encoder.layer.2.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
85
+ "model.Qformer.bert.encoder.layer.2.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
86
+ "model.Qformer.bert.encoder.layer.2.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
87
+ "model.Qformer.bert.encoder.layer.2.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
88
+ "model.Qformer.bert.encoder.layer.2.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
89
+ "model.Qformer.bert.encoder.layer.2.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
90
+ "model.Qformer.bert.encoder.layer.2.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
91
+ "model.Qformer.bert.encoder.layer.2.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
92
+ "model.Qformer.bert.encoder.layer.2.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
93
+ "model.Qformer.bert.encoder.layer.2.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
94
+ "model.Qformer.bert.encoder.layer.2.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
95
+ "model.Qformer.bert.encoder.layer.2.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
96
+ "model.Qformer.bert.encoder.layer.3.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
97
+ "model.Qformer.bert.encoder.layer.3.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
98
+ "model.Qformer.bert.encoder.layer.3.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
99
+ "model.Qformer.bert.encoder.layer.3.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
100
+ "model.Qformer.bert.encoder.layer.3.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
101
+ "model.Qformer.bert.encoder.layer.3.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
102
+ "model.Qformer.bert.encoder.layer.3.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
103
+ "model.Qformer.bert.encoder.layer.3.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
104
+ "model.Qformer.bert.encoder.layer.3.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
105
+ "model.Qformer.bert.encoder.layer.3.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
106
+ "model.Qformer.bert.encoder.layer.3.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
107
+ "model.Qformer.bert.encoder.layer.3.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
108
+ "model.Qformer.bert.encoder.layer.3.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
109
+ "model.Qformer.bert.encoder.layer.3.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
110
+ "model.Qformer.bert.encoder.layer.3.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
111
+ "model.Qformer.bert.encoder.layer.3.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
112
+ "model.Qformer.bert.encoder.layer.3.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
113
+ "model.Qformer.bert.encoder.layer.3.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
114
+ "model.Qformer.bert.encoder.layer.3.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
115
+ "model.Qformer.bert.encoder.layer.3.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
116
+ "model.Qformer.bert.encoder.layer.3.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
117
+ "model.Qformer.bert.encoder.layer.3.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
118
+ "model.Qformer.bert.encoder.layer.4.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
119
+ "model.Qformer.bert.encoder.layer.4.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
120
+ "model.Qformer.bert.encoder.layer.4.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
121
+ "model.Qformer.bert.encoder.layer.4.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
122
+ "model.Qformer.bert.encoder.layer.4.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
123
+ "model.Qformer.bert.encoder.layer.4.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
124
+ "model.Qformer.bert.encoder.layer.4.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
125
+ "model.Qformer.bert.encoder.layer.4.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
126
+ "model.Qformer.bert.encoder.layer.4.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
127
+ "model.Qformer.bert.encoder.layer.4.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
128
+ "model.Qformer.bert.encoder.layer.4.crossattention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
129
+ "model.Qformer.bert.encoder.layer.4.crossattention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
130
+ "model.Qformer.bert.encoder.layer.4.crossattention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
131
+ "model.Qformer.bert.encoder.layer.4.crossattention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
132
+ "model.Qformer.bert.encoder.layer.4.crossattention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
133
+ "model.Qformer.bert.encoder.layer.4.crossattention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
134
+ "model.Qformer.bert.encoder.layer.4.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
135
+ "model.Qformer.bert.encoder.layer.4.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
136
+ "model.Qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
137
+ "model.Qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
138
+ "model.Qformer.bert.encoder.layer.4.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
139
+ "model.Qformer.bert.encoder.layer.4.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
140
+ "model.Qformer.bert.encoder.layer.4.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
141
+ "model.Qformer.bert.encoder.layer.4.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
142
+ "model.Qformer.bert.encoder.layer.4.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
143
+ "model.Qformer.bert.encoder.layer.4.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
144
+ "model.Qformer.bert.encoder.layer.4.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
145
+ "model.Qformer.bert.encoder.layer.4.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
146
+ "model.Qformer.bert.encoder.layer.4.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
147
+ "model.Qformer.bert.encoder.layer.4.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
148
+ "model.Qformer.bert.encoder.layer.4.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
149
+ "model.Qformer.bert.encoder.layer.4.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
150
+ "model.Qformer.bert.encoder.layer.5.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
151
+ "model.Qformer.bert.encoder.layer.5.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
152
+ "model.Qformer.bert.encoder.layer.5.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
153
+ "model.Qformer.bert.encoder.layer.5.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
154
+ "model.Qformer.bert.encoder.layer.5.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
155
+ "model.Qformer.bert.encoder.layer.5.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
156
+ "model.Qformer.bert.encoder.layer.5.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
157
+ "model.Qformer.bert.encoder.layer.5.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
158
+ "model.Qformer.bert.encoder.layer.5.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
159
+ "model.Qformer.bert.encoder.layer.5.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
160
+ "model.Qformer.bert.encoder.layer.5.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
161
+ "model.Qformer.bert.encoder.layer.5.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
162
+ "model.Qformer.bert.encoder.layer.5.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
163
+ "model.Qformer.bert.encoder.layer.5.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
164
+ "model.Qformer.bert.encoder.layer.5.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
165
+ "model.Qformer.bert.encoder.layer.5.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
166
+ "model.Qformer.bert.encoder.layer.5.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
167
+ "model.Qformer.bert.encoder.layer.5.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
168
+ "model.Qformer.bert.encoder.layer.5.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
169
+ "model.Qformer.bert.encoder.layer.5.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
170
+ "model.Qformer.bert.encoder.layer.5.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
171
+ "model.Qformer.bert.encoder.layer.5.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
172
+ "model.Qformer.bert.encoder.layer.6.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
173
+ "model.Qformer.bert.encoder.layer.6.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
174
+ "model.Qformer.bert.encoder.layer.6.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
175
+ "model.Qformer.bert.encoder.layer.6.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
176
+ "model.Qformer.bert.encoder.layer.6.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
177
+ "model.Qformer.bert.encoder.layer.6.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
178
+ "model.Qformer.bert.encoder.layer.6.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
179
+ "model.Qformer.bert.encoder.layer.6.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
180
+ "model.Qformer.bert.encoder.layer.6.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
181
+ "model.Qformer.bert.encoder.layer.6.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
182
+ "model.Qformer.bert.encoder.layer.6.crossattention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
183
+ "model.Qformer.bert.encoder.layer.6.crossattention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
184
+ "model.Qformer.bert.encoder.layer.6.crossattention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
185
+ "model.Qformer.bert.encoder.layer.6.crossattention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
186
+ "model.Qformer.bert.encoder.layer.6.crossattention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
187
+ "model.Qformer.bert.encoder.layer.6.crossattention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
188
+ "model.Qformer.bert.encoder.layer.6.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
189
+ "model.Qformer.bert.encoder.layer.6.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
190
+ "model.Qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
191
+ "model.Qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
192
+ "model.Qformer.bert.encoder.layer.6.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
193
+ "model.Qformer.bert.encoder.layer.6.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
194
+ "model.Qformer.bert.encoder.layer.6.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
195
+ "model.Qformer.bert.encoder.layer.6.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
196
+ "model.Qformer.bert.encoder.layer.6.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
197
+ "model.Qformer.bert.encoder.layer.6.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
198
+ "model.Qformer.bert.encoder.layer.6.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
199
+ "model.Qformer.bert.encoder.layer.6.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
200
+ "model.Qformer.bert.encoder.layer.6.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
201
+ "model.Qformer.bert.encoder.layer.6.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
202
+ "model.Qformer.bert.encoder.layer.6.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
203
+ "model.Qformer.bert.encoder.layer.6.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
204
+ "model.Qformer.bert.encoder.layer.7.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
205
+ "model.Qformer.bert.encoder.layer.7.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
206
+ "model.Qformer.bert.encoder.layer.7.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
207
+ "model.Qformer.bert.encoder.layer.7.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
208
+ "model.Qformer.bert.encoder.layer.7.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
209
+ "model.Qformer.bert.encoder.layer.7.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
210
+ "model.Qformer.bert.encoder.layer.7.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
211
+ "model.Qformer.bert.encoder.layer.7.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
212
+ "model.Qformer.bert.encoder.layer.7.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
213
+ "model.Qformer.bert.encoder.layer.7.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
214
+ "model.Qformer.bert.encoder.layer.7.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
215
+ "model.Qformer.bert.encoder.layer.7.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
216
+ "model.Qformer.bert.encoder.layer.7.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
217
+ "model.Qformer.bert.encoder.layer.7.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
218
+ "model.Qformer.bert.encoder.layer.7.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
219
+ "model.Qformer.bert.encoder.layer.7.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
220
+ "model.Qformer.bert.encoder.layer.7.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
221
+ "model.Qformer.bert.encoder.layer.7.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
222
+ "model.Qformer.bert.encoder.layer.7.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
223
+ "model.Qformer.bert.encoder.layer.7.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
224
+ "model.Qformer.bert.encoder.layer.7.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
225
+ "model.Qformer.bert.encoder.layer.7.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
226
+ "model.Qformer.bert.encoder.layer.8.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
227
+ "model.Qformer.bert.encoder.layer.8.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
228
+ "model.Qformer.bert.encoder.layer.8.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
229
+ "model.Qformer.bert.encoder.layer.8.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
230
+ "model.Qformer.bert.encoder.layer.8.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
231
+ "model.Qformer.bert.encoder.layer.8.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
232
+ "model.Qformer.bert.encoder.layer.8.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
233
+ "model.Qformer.bert.encoder.layer.8.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
234
+ "model.Qformer.bert.encoder.layer.8.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
235
+ "model.Qformer.bert.encoder.layer.8.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
236
+ "model.Qformer.bert.encoder.layer.8.crossattention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
237
+ "model.Qformer.bert.encoder.layer.8.crossattention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
238
+ "model.Qformer.bert.encoder.layer.8.crossattention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
239
+ "model.Qformer.bert.encoder.layer.8.crossattention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
240
+ "model.Qformer.bert.encoder.layer.8.crossattention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
241
+ "model.Qformer.bert.encoder.layer.8.crossattention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
242
+ "model.Qformer.bert.encoder.layer.8.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
243
+ "model.Qformer.bert.encoder.layer.8.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
244
+ "model.Qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
245
+ "model.Qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
246
+ "model.Qformer.bert.encoder.layer.8.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
247
+ "model.Qformer.bert.encoder.layer.8.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
248
+ "model.Qformer.bert.encoder.layer.8.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
249
+ "model.Qformer.bert.encoder.layer.8.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
250
+ "model.Qformer.bert.encoder.layer.8.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
251
+ "model.Qformer.bert.encoder.layer.8.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
252
+ "model.Qformer.bert.encoder.layer.8.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
253
+ "model.Qformer.bert.encoder.layer.8.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
254
+ "model.Qformer.bert.encoder.layer.8.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
255
+ "model.Qformer.bert.encoder.layer.8.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
256
+ "model.Qformer.bert.encoder.layer.8.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
257
+ "model.Qformer.bert.encoder.layer.8.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
258
+ "model.Qformer.bert.encoder.layer.9.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
259
+ "model.Qformer.bert.encoder.layer.9.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
260
+ "model.Qformer.bert.encoder.layer.9.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
261
+ "model.Qformer.bert.encoder.layer.9.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
262
+ "model.Qformer.bert.encoder.layer.9.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
263
+ "model.Qformer.bert.encoder.layer.9.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
264
+ "model.Qformer.bert.encoder.layer.9.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
265
+ "model.Qformer.bert.encoder.layer.9.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
266
+ "model.Qformer.bert.encoder.layer.9.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
267
+ "model.Qformer.bert.encoder.layer.9.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
268
+ "model.Qformer.bert.encoder.layer.9.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
269
+ "model.Qformer.bert.encoder.layer.9.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
270
+ "model.Qformer.bert.encoder.layer.9.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
271
+ "model.Qformer.bert.encoder.layer.9.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
272
+ "model.Qformer.bert.encoder.layer.9.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
273
+ "model.Qformer.bert.encoder.layer.9.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
274
+ "model.Qformer.bert.encoder.layer.9.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
275
+ "model.Qformer.bert.encoder.layer.9.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
276
+ "model.Qformer.bert.encoder.layer.9.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
277
+ "model.Qformer.bert.encoder.layer.9.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
278
+ "model.Qformer.bert.encoder.layer.9.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
279
+ "model.Qformer.bert.encoder.layer.9.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
280
+ "model.Qformer.bert.encoder.layer.10.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
281
+ "model.Qformer.bert.encoder.layer.10.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
282
+ "model.Qformer.bert.encoder.layer.10.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
283
+ "model.Qformer.bert.encoder.layer.10.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
284
+ "model.Qformer.bert.encoder.layer.10.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
285
+ "model.Qformer.bert.encoder.layer.10.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
286
+ "model.Qformer.bert.encoder.layer.10.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
287
+ "model.Qformer.bert.encoder.layer.10.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
288
+ "model.Qformer.bert.encoder.layer.10.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
289
+ "model.Qformer.bert.encoder.layer.10.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
290
+ "model.Qformer.bert.encoder.layer.10.crossattention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
291
+ "model.Qformer.bert.encoder.layer.10.crossattention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
292
+ "model.Qformer.bert.encoder.layer.10.crossattention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
293
+ "model.Qformer.bert.encoder.layer.10.crossattention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
294
+ "model.Qformer.bert.encoder.layer.10.crossattention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
295
+ "model.Qformer.bert.encoder.layer.10.crossattention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
296
+ "model.Qformer.bert.encoder.layer.10.crossattention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
297
+ "model.Qformer.bert.encoder.layer.10.crossattention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
298
+ "model.Qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
299
+ "model.Qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
300
+ "model.Qformer.bert.encoder.layer.10.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
301
+ "model.Qformer.bert.encoder.layer.10.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
302
+ "model.Qformer.bert.encoder.layer.10.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
303
+ "model.Qformer.bert.encoder.layer.10.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
304
+ "model.Qformer.bert.encoder.layer.10.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
305
+ "model.Qformer.bert.encoder.layer.10.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
306
+ "model.Qformer.bert.encoder.layer.10.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
307
+ "model.Qformer.bert.encoder.layer.10.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
308
+ "model.Qformer.bert.encoder.layer.10.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
309
+ "model.Qformer.bert.encoder.layer.10.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
310
+ "model.Qformer.bert.encoder.layer.10.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
311
+ "model.Qformer.bert.encoder.layer.10.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
312
+ "model.Qformer.bert.encoder.layer.11.attention.self.query.weight": "pytorch_model-00004-of-00002.safetensors",
313
+ "model.Qformer.bert.encoder.layer.11.attention.self.query.bias": "pytorch_model-00004-of-00002.safetensors",
314
+ "model.Qformer.bert.encoder.layer.11.attention.self.key.weight": "pytorch_model-00004-of-00002.safetensors",
315
+ "model.Qformer.bert.encoder.layer.11.attention.self.key.bias": "pytorch_model-00004-of-00002.safetensors",
316
+ "model.Qformer.bert.encoder.layer.11.attention.self.value.weight": "pytorch_model-00004-of-00002.safetensors",
317
+ "model.Qformer.bert.encoder.layer.11.attention.self.value.bias": "pytorch_model-00004-of-00002.safetensors",
318
+ "model.Qformer.bert.encoder.layer.11.attention.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
319
+ "model.Qformer.bert.encoder.layer.11.attention.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
320
+ "model.Qformer.bert.encoder.layer.11.attention.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
321
+ "model.Qformer.bert.encoder.layer.11.attention.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
322
+ "model.Qformer.bert.encoder.layer.11.intermediate.dense.weight": "pytorch_model-00004-of-00002.safetensors",
323
+ "model.Qformer.bert.encoder.layer.11.intermediate.dense.bias": "pytorch_model-00004-of-00002.safetensors",
324
+ "model.Qformer.bert.encoder.layer.11.output.dense.weight": "pytorch_model-00004-of-00002.safetensors",
325
+ "model.Qformer.bert.encoder.layer.11.output.dense.bias": "pytorch_model-00004-of-00002.safetensors",
326
+ "model.Qformer.bert.encoder.layer.11.output.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
327
+ "model.Qformer.bert.encoder.layer.11.output.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
328
+ "model.Qformer.bert.encoder.layer.11.intermediate_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
329
+ "model.Qformer.bert.encoder.layer.11.intermediate_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
330
+ "model.Qformer.bert.encoder.layer.11.output_query.dense.weight": "pytorch_model-00004-of-00002.safetensors",
331
+ "model.Qformer.bert.encoder.layer.11.output_query.dense.bias": "pytorch_model-00004-of-00002.safetensors",
332
+ "model.Qformer.bert.encoder.layer.11.output_query.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
333
+ "model.Qformer.bert.encoder.layer.11.output_query.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
334
+ "model.Qformer.cls.predictions.bias": "pytorch_model-00004-of-00002.safetensors",
335
+ "model.Qformer.cls.predictions.transform.dense.weight": "pytorch_model-00004-of-00002.safetensors",
336
+ "model.Qformer.cls.predictions.transform.dense.bias": "pytorch_model-00004-of-00002.safetensors",
337
+ "model.Qformer.cls.predictions.transform.LayerNorm.weight": "pytorch_model-00004-of-00002.safetensors",
338
+ "model.Qformer.cls.predictions.transform.LayerNorm.bias": "pytorch_model-00004-of-00002.safetensors",
339
+ "model.Qformer.cls.predictions.decoder.weight": "pytorch_model-00004-of-00002.safetensors",
340
+ "model.Qformer.cls.predictions.decoder.bias": "pytorch_model-00004-of-00002.safetensors"
341
  }
Llama-2-7b-chat-hf-qformer/pytorch_model.bin.index.json CHANGED
@@ -3,328 +3,328 @@
3
  "total_size": 13476839424
4
  },
5
  "weight_map": {
6
- "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
- "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
8
- "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
9
- "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
10
- "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
11
- "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
12
- "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
13
- "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
14
- "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
15
- "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
16
- "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
17
- "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
18
- "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
19
- "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
20
- "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
21
- "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
22
- "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
23
- "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
24
- "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
25
- "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
26
- "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
27
- "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
28
- "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
29
- "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
30
- "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
31
- "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
32
- "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
33
- "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
34
- "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
35
- "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
36
- "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
37
- "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
38
- "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
39
- "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
40
- "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
41
- "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
42
- "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
43
- "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
44
- "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
45
- "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
46
- "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
47
- "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
48
- "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
49
- "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
50
- "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
51
- "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
52
- "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
53
- "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
54
- "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
55
- "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
56
- "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
57
- "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
58
- "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
59
- "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
60
- "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
61
- "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
62
- "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
63
- "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
64
- "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
65
- "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
66
- "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
67
- "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
68
- "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
69
- "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
70
- "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
71
- "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
72
- "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
73
- "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
74
- "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
75
- "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
76
- "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
77
- "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
78
- "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
79
- "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
80
- "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
81
- "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
82
- "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
83
- "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
84
- "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
85
- "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
86
- "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
87
- "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
88
- "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
89
- "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
90
- "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
91
- "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
92
- "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
93
- "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
94
- "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
95
- "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
96
- "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
97
- "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
98
- "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
99
- "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
100
- "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
101
- "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
102
- "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
103
- "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
104
- "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
105
- "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
106
- "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
107
- "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
108
- "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
109
- "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
110
- "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
111
- "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
112
- "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
113
- "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
114
- "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
115
- "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
116
- "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
117
- "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
118
- "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
119
- "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
120
- "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
121
- "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
122
- "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
123
- "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
124
- "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
125
- "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
126
- "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
127
- "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
128
- "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
129
- "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
130
- "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
131
- "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
132
- "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
133
- "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
134
- "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
135
- "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
136
- "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
137
- "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
138
- "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
139
- "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
140
- "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
141
- "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
142
- "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
143
- "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
144
- "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
145
- "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
146
- "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
147
- "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
148
- "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
149
- "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
150
- "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
151
- "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
152
- "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
153
- "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
154
- "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
155
- "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
156
- "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
157
- "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
158
- "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
159
- "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
160
- "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
161
- "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
162
- "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
163
- "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
164
- "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
165
- "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
166
- "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
167
- "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
168
- "model.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
169
- "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
170
- "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
171
- "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
172
- "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
173
- "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
174
- "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
175
- "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
176
- "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
177
- "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
178
- "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
179
- "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
180
- "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
181
- "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
182
- "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
183
- "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
184
- "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
185
- "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
186
- "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
187
- "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
188
- "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
189
- "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
190
- "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
191
- "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
192
- "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
193
- "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
194
- "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
195
- "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
196
- "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
197
- "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
198
- "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
199
- "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
200
- "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
201
- "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
202
- "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
203
- "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
204
- "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
205
- "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
206
- "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
207
- "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
208
- "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
209
- "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
210
- "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
211
- "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
212
- "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
213
- "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
214
- "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
215
- "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
216
- "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
217
- "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
218
- "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
219
- "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
220
- "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
221
- "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
222
- "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
223
- "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
224
- "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
225
- "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
226
- "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
227
- "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
228
- "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
229
- "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
230
- "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
231
- "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
232
- "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
233
- "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
234
- "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
235
- "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
236
- "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
237
- "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
238
- "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
239
- "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
240
- "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
241
- "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
242
- "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
243
- "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
244
- "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
245
- "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
246
- "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
247
- "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
248
- "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
249
- "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
250
- "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
251
- "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
252
- "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
253
- "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
254
- "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
255
- "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
256
- "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
257
- "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
258
- "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
259
- "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
260
- "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
261
- "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
262
- "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
263
- "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
264
- "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
265
- "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
266
- "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
267
- "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
268
- "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
269
- "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
270
- "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
271
- "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
272
- "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
273
- "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
274
- "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
275
- "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
276
- "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
277
- "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
278
- "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
279
- "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
280
- "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
281
- "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
282
- "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
283
- "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
284
- "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
285
- "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
286
- "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
287
- "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
288
- "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
289
- "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
290
- "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
291
- "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
292
- "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
293
- "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
294
- "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
295
- "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
296
- "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
297
- "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
298
- "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
299
- "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
300
- "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
301
- "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
302
- "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
303
- "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
304
- "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
305
- "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
306
- "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
307
- "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
308
- "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
309
- "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
310
- "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
311
- "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
312
- "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
313
- "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
314
- "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
315
- "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
316
- "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
317
- "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
318
- "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
319
- "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
320
- "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
321
- "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
322
- "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
323
- "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
324
- "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
325
- "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
326
- "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
327
- "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
328
- "model.norm.weight": "pytorch_model-00002-of-00002.bin"
329
  }
330
  }
 
3
  "total_size": 13476839424
4
  },
5
  "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00002-of-00002.safetensors",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
18
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
19
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
23
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
28
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
29
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
30
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
31
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
32
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
33
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
34
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
35
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
38
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
39
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
40
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
41
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
42
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
43
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
44
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
45
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
46
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
47
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
48
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
49
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
50
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
51
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
52
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
53
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
54
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
55
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
56
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
57
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
58
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
59
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
60
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
61
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
62
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
63
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
64
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
65
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
66
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
67
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
68
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
69
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
70
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
71
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
72
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
73
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
74
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
75
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
76
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
77
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
78
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
79
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
80
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
81
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
82
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
83
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
84
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
85
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
86
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
87
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
88
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
89
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
90
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
91
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
92
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
93
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
94
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
95
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
96
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
97
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
98
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
99
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
100
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
101
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
102
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
103
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
104
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
105
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
106
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
107
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
108
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
109
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
110
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
111
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
112
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
113
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
114
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
115
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
116
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
117
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
118
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
119
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
120
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
121
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
122
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
123
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
124
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
125
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
126
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
127
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
128
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
129
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
130
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
131
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
132
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
133
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
134
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
135
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
136
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
137
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
138
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
139
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
140
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
141
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
142
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
143
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
144
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
145
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
146
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
147
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
148
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
149
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
150
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
151
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
152
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
153
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
154
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
155
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
156
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
157
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
158
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
159
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
160
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
161
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
162
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
163
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
164
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
165
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
166
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
167
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
168
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
169
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
170
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
171
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
172
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
173
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
174
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
175
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
176
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
177
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
178
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
179
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
180
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
181
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
182
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
183
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
184
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
185
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
186
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
187
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
188
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
189
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
190
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
191
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
192
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
193
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
194
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
195
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
196
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
197
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
198
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
199
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
200
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
201
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
202
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
203
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
204
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
205
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
206
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
207
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
208
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
209
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
210
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
211
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
212
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
213
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
214
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
215
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
216
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
217
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
218
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
219
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
220
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
221
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
222
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
223
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
224
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
225
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
226
+ "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
227
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
228
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
229
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
230
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
231
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
232
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
233
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
234
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
235
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
236
+ "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
237
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
238
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
239
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
240
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
241
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
242
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
243
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
244
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
245
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
246
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
247
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
248
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
249
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
250
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
251
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
252
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
253
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
254
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
255
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
256
+ "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
257
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
258
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
259
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.safetensors",
260
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.safetensors",
261
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.safetensors",
262
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.safetensors",
263
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.safetensors",
264
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.safetensors",
265
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.safetensors",
266
+ "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.safetensors",
267
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.safetensors",
268
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
269
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
270
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
271
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
272
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
273
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
274
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
275
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
276
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
277
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
278
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
279
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
280
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
281
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
282
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
283
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
284
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
285
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
286
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
287
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
288
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
289
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
290
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
291
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
292
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
293
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
294
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
295
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
296
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
297
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
298
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
299
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
300
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
301
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
302
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
303
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
304
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
305
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
306
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
307
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
308
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
309
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
310
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
311
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
312
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
313
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
314
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
315
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
316
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
317
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
318
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
319
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.safetensors",
320
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.safetensors",
321
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.safetensors",
322
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.safetensors",
323
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.safetensors",
324
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.safetensors",
325
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.safetensors",
326
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.safetensors",
327
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.safetensors",
328
+ "model.norm.weight": "pytorch_model-00002-of-00002.safetensors"
329
  }
330
  }
hf-dev-train/transformers-main/.circleci/TROUBLESHOOT.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Troubleshooting
2
+
3
+ This is a document explaining how to deal with various issues on Circle-CI. The entries may include actual solutions or pointers to Issues that cover those.
4
+
5
+ ## Circle CI
6
+
7
+ * pytest worker runs out of resident RAM and gets killed by `cgroups`: https://github.com/huggingface/transformers/issues/11408
hf-dev-train/transformers-main/.circleci/config.yml ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 2.1
2
+ setup: true
3
+ orbs:
4
+ continuation: circleci/[email protected]
5
+
6
+ parameters:
7
+ nightly:
8
+ type: boolean
9
+ default: false
10
+
11
+ jobs:
12
+ # Ensure running with CircleCI/huggingface
13
+ check_circleci_user:
14
+ docker:
15
+ - image: cimg/python:3.8.12
16
+ parallelism: 1
17
+ steps:
18
+ - run: echo $CIRCLE_PROJECT_USERNAME
19
+ - run: |
20
+ if [ "$CIRCLE_PROJECT_USERNAME" = "huggingface" ]; then
21
+ exit 0
22
+ else
23
+ echo "The CI is running under $CIRCLE_PROJECT_USERNAME personal account. Please follow https://support.circleci.com/hc/en-us/articles/360008097173-Troubleshooting-why-pull-requests-are-not-triggering-jobs-on-my-organization- to fix it."; exit -1
24
+ fi
25
+ # Fetch the tests to run
26
+ fetch_tests:
27
+ working_directory: ~/transformers
28
+ docker:
29
+ - image: cimg/python:3.8.12
30
+ parallelism: 1
31
+ steps:
32
+ - checkout
33
+ - run: pip install --upgrade pip
34
+ - run: pip install GitPython
35
+ - run: pip install .
36
+ - run: mkdir -p test_preparation
37
+ - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt
38
+ - store_artifacts:
39
+ path: ~/transformers/tests_fetched_summary.txt
40
+ - run: |
41
+ if [ -f test_list.txt ]; then
42
+ cp test_list.txt test_preparation/test_list.txt
43
+ else
44
+ touch test_preparation/test_list.txt
45
+ fi
46
+ - run: |
47
+ if [ -f test_repo_utils.txt ]; then
48
+ mv test_repo_utils.txt test_preparation/test_repo_utils.txt
49
+ else
50
+ touch test_preparation/test_repo_utils.txt
51
+ fi
52
+ - run: python utils/tests_fetcher.py --filter_tests
53
+ - run: |
54
+ if [ -f test_list.txt ]; then
55
+ mv test_list.txt test_preparation/filtered_test_list.txt
56
+ else
57
+ touch test_preparation/filtered_test_list.txt
58
+ fi
59
+ - run: python utils/tests_fetcher.py --filters tests examples | tee examples_tests_fetched_summary.txt
60
+ - run: |
61
+ if [ -f test_list.txt ]; then
62
+ mv test_list.txt test_preparation/examples_test_list.txt
63
+ else
64
+ touch test_preparation/examples_test_list.txt
65
+ fi
66
+ - store_artifacts:
67
+ path: test_preparation/test_list.txt
68
+ - store_artifacts:
69
+ path: ~/transformers/test_preparation/filtered_test_list.txt
70
+ - store_artifacts:
71
+ path: test_preparation/examples_test_list.txt
72
+ - run: python .circleci/create_circleci_config.py --fetcher_folder test_preparation
73
+ - run: |
74
+ if [ ! -s test_preparation/generated_config.yml ]; then
75
+ echo "No tests to run, exiting early!"
76
+ circleci-agent step halt
77
+ fi
78
+ - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt
79
+ - store_artifacts:
80
+ path: test_preparation/generated_config.txt
81
+ - continuation/continue:
82
+ configuration_path: test_preparation/generated_config.yml
83
+
84
+ # To run all tests for the nightly build
85
+ fetch_all_tests:
86
+ working_directory: ~/transformers
87
+ docker:
88
+ - image: cimg/python:3.8.12
89
+ parallelism: 1
90
+ steps:
91
+ - checkout
92
+ - run: pip install --upgrade pip
93
+ - run: pip install GitPython
94
+ - run: pip install .
95
+ - run: |
96
+ mkdir test_preparation
97
+ echo -n "tests" > test_preparation/test_list.txt
98
+ echo -n "tests" > test_preparation/examples_test_list.txt
99
+ echo -n "tests/repo_utils" > test_preparation/test_repo_utils.txt
100
+ - run: |
101
+ echo -n "tests" > test_list.txt
102
+ python utils/tests_fetcher.py --filter_tests
103
+ mv test_list.txt test_preparation/filtered_test_list.txt
104
+ - run: python .circleci/create_circleci_config.py --fetcher_folder test_preparation
105
+ - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt
106
+ - store_artifacts:
107
+ path: test_preparation/generated_config.txt
108
+ - continuation/continue:
109
+ configuration_path: test_preparation/generated_config.yml
110
+
111
check_code_quality:
    # Runs the formatting / linting / doc-style checks on the repository.
    working_directory: ~/transformers
    docker:
        - image: cimg/python:3.8.12
    resource_class: large
    environment:
        TRANSFORMERS_IS_CI: yes
        PYTEST_TIMEOUT: 120
    parallelism: 1
    steps:
        - checkout
        # The restore keys must share the version prefix and spelling used by `save_cache`
        # below, otherwise the pip cache that gets saved is never found again.
        - restore_cache:
              keys:
                  - v0.6-code_quality-{{ checksum "setup.py" }}
                  # Prefix fallback: most recent cache for this job, whatever the setup.py checksum.
                  - v0.6-code_quality-
        - run: pip install --upgrade pip
        - run: pip install .[all,quality]
        - save_cache:
              key: v0.6-code_quality-{{ checksum "setup.py" }}
              paths:
                  - '~/.cache/pip'
        - run:
              name: Show installed libraries and their versions
              command: pip freeze | tee installed.txt
        - store_artifacts:
              path: ~/transformers/installed.txt
        - run: black --check examples tests src utils
        - run: ruff examples tests src utils
        - run: python utils/custom_init_isort.py --check_only
        - run: python utils/sort_auto_mappings.py --check_only
        - run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
        - run: python utils/check_doc_toc.py
143
+
144
check_repository_consistency:
    # Runs the repository consistency scripts under `utils/`.
    working_directory: ~/transformers
    docker:
        - image: cimg/python:3.8.12
    resource_class: large
    environment:
        TRANSFORMERS_IS_CI: yes
        PYTEST_TIMEOUT: 120
    parallelism: 1
    steps:
        - checkout
        # Restore and save must use the same version prefix, otherwise the saved pip cache
        # is never picked up by later runs.
        - restore_cache:
              keys:
                  - v0.6-repository_consistency-{{ checksum "setup.py" }}
                  # Prefix fallback: most recent cache for this job, whatever the setup.py checksum.
                  - v0.6-repository_consistency
        - run: pip install --upgrade pip
        - run: pip install .[all,quality]
        - save_cache:
              key: v0.6-repository_consistency-{{ checksum "setup.py" }}
              paths:
                  - '~/.cache/pip'
        - run:
              name: Show installed libraries and their versions
              command: pip freeze | tee installed.txt
        - store_artifacts:
              path: ~/transformers/installed.txt
        - run: python utils/check_copies.py
        - run: python utils/check_table.py
        - run: python utils/check_dummies.py
        - run: python utils/check_repo.py
        - run: python utils/check_inits.py
        - run: python utils/check_config_docstrings.py
        - run: python utils/check_config_attributes.py
        - run: python utils/check_doctest_list.py
        - run: make deps_table_check_updated
        - run: python utils/tests_fetcher.py --sanity_check
        - run: python utils/update_metadata.py --check-only
        - run: python utils/check_task_guides.py
182
+
183
+ workflows:
184
+ version: 2
185
+ setup_and_quality:
186
+ when:
187
+ not: <<pipeline.parameters.nightly>>
188
+ jobs:
189
+ - check_circleci_user
190
+ - check_code_quality
191
+ - check_repository_consistency
192
+ - fetch_tests
193
+
194
+ nightly:
195
+ when: <<pipeline.parameters.nightly>>
196
+ jobs:
197
+ - check_circleci_user
198
+ - check_code_quality
199
+ - check_repository_consistency
200
+ - fetch_all_tests
hf-dev-train/transformers-main/.circleci/create_circleci_config.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import copy
18
+ import glob
19
+ import os
20
+ import random
21
+ from dataclasses import dataclass
22
+ from typing import Any, Dict, List, Optional
23
+
24
+ import yaml
25
+
26
+
27
# Environment variables set on every generated job (a job's `additional_env` overrides them).
COMMON_ENV_VARIABLES = {
    "OMP_NUM_THREADS": 1,
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
    "RUN_PT_TF_CROSS_TESTS": False,
    "RUN_PT_FLAX_CROSS_TESTS": False,
}
# Pytest options shared by every job. A `None` value produces a bare `-<key>` flag,
# any other value produces `--<key>=<value>`.
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "s": None}
DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}]


@dataclass
class CircleCIJob:
    """
    Description of one test job, convertible to a CircleCI job dictionary with `to_dict()`.

    Attributes:
        name: Base name of the job; also used to build the `--make-reports` pytest flag.
        additional_env: Extra environment variables, merged over `COMMON_ENV_VARIABLES`.
        cache_name: Name used in the pip cache key (defaults to `name`).
        cache_version: Version prefix of the pip cache key.
        docker_image: Value of the job's `docker` entry (defaults to a copy of `DEFAULT_DOCKER_IMAGE`).
        install_steps: Commands (strings or CircleCI `run` dictionaries) executed before the tests.
        marker: Optional pytest marker, passed as `-m <marker>`.
        parallelism: Maximum number of executors; `None` is treated as 1.
        pytest_num_workers: Value passed to `pytest -n`.
        pytest_options: Extra pytest options, merged over `COMMON_PYTEST_OPTIONS`.
        resource_class: CircleCI resource class; omitted from the job when `None`.
        tests_to_run: Explicit test paths (a single string is wrapped in a list); when `None` the
            tests come from the pipeline parameter or from `filtered_test_list.txt`.
        working_directory: Working directory of the job.
    """

    name: str
    additional_env: Optional[Dict[str, Any]] = None
    cache_name: Optional[str] = None
    cache_version: str = "0.6"
    docker_image: Optional[List[Dict[str, str]]] = None
    install_steps: Optional[List[Any]] = None
    marker: Optional[str] = None
    parallelism: Optional[int] = 1
    pytest_num_workers: int = 8
    pytest_options: Optional[Dict[str, Any]] = None
    resource_class: Optional[str] = "xlarge"
    tests_to_run: Optional[List[str]] = None
    working_directory: str = "~/transformers"

    def __post_init__(self):
        # Deal with defaults for mutable attributes.
        if self.additional_env is None:
            self.additional_env = {}
        if self.cache_name is None:
            self.cache_name = self.name
        if self.docker_image is None:
            # Let's avoid changing the default list and make a copy.
            self.docker_image = copy.deepcopy(DEFAULT_DOCKER_IMAGE)
        if self.install_steps is None:
            self.install_steps = []
        if self.pytest_options is None:
            self.pytest_options = {}
        if isinstance(self.tests_to_run, str):
            self.tests_to_run = [self.tests_to_run]
        if self.parallelism is None:
            self.parallelism = 1

    def to_dict(self):
        """
        Build the CircleCI job dictionary (environment, docker image, steps) for this job.

        With `parallelism == 1` the tests are passed directly on the pytest command line. Otherwise the
        test list is expanded, shuffled, written to `tests.txt` and split between executors with
        `circleci tests split`; the number of executors is capped so each one gets roughly 10 tests.
        """
        env = COMMON_ENV_VARIABLES.copy()
        env.update(self.additional_env)
        job = {
            "working_directory": self.working_directory,
            "docker": self.docker_image,
            "environment": env,
        }
        if self.resource_class is not None:
            job["resource_class"] = self.resource_class
        if self.parallelism is not None:
            job["parallelism"] = self.parallelism

        cache_prefix = f"v{self.cache_version}-{self.cache_name}-"
        checksum_key = cache_prefix + '{{ checksum "setup.py" }}'
        steps = [
            "checkout",
            {"attach_workspace": {"at": "~/transformers/test_preparation"}},
            {"restore_cache": {"keys": [checksum_key, cache_prefix]}},
        ]
        steps.extend([{"run": step} for step in self.install_steps])
        steps.append({"save_cache": {"key": checksum_key, "paths": ["~/.cache/pip"]}})
        steps.append(
            {
                "run": {
                    "name": "Show installed libraries and their versions",
                    "command": "pip freeze | tee installed.txt",
                }
            }
        )
        steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}})

        all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options}
        pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()]
        pytest_flags.append(
            f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}"
        )
        test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
        if self.parallelism == 1:
            if self.tests_to_run is None:
                test_command += " << pipeline.parameters.tests_to_run >>"
            else:
                test_command += " " + " ".join(self.tests_to_run)
        else:
            # We need explicit list instead of `pipeline.parameters.tests_to_run` (only available at job runtime)
            tests = self.tests_to_run
            if tests is None:
                folder = os.environ.get("test_preparation_dir", os.getcwd())
                test_file = os.path.join(folder, "filtered_test_list.txt")
                if os.path.exists(test_file):
                    with open(test_file) as f:
                        # `split()` (no argument) drops empty entries and trailing newlines.
                        tests = f.read().split()
            if tests is None:
                # No explicit tests and no fetched list: nothing to expand.
                tests = []

            # expand the test list
            if tests == ["tests"]:
                tests = [os.path.join("tests", x) for x in os.listdir("tests")]
            expanded_tests = []
            for test in tests:
                if not test.endswith(".py") and test in ("tests/models", "tests/pipelines"):
                    expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)])
                else:
                    expanded_tests.append(test)
            # Avoid long tests always being collected together
            random.shuffle(expanded_tests)
            joined_tests = " ".join(expanded_tests)

            # Each executor to run ~10 tests. Count the tests, not the characters of the joined string.
            n_executors = max(len(expanded_tests) // 10, 1)
            # Avoid empty test list on some executor(s) or launching too many executors
            n_executors = min(n_executors, self.parallelism)
            job["parallelism"] = n_executors

            # Need to be newline separated for the command `circleci tests split` below
            command = f'echo {joined_tests} | tr " " "\\n" >> tests.txt'
            steps.append({"run": {"name": "Get tests", "command": command}})

            command = 'TESTS=$(circleci tests split tests.txt) && echo $TESTS > splitted_tests.txt'
            steps.append({"run": {"name": "Split tests", "command": command}})

            steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}})
            steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}})

            test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
            test_command += " $(cat splitted_tests.txt)"
        if self.marker is not None:
            test_command += f" -m {self.marker}"
        test_command += " | tee tests_output.txt"
        steps.append({"run": {"name": "Run tests", "command": test_command}})
        steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}})
        steps.append({"store_artifacts": {"path": "~/transformers/reports"}})
        job["steps"] = steps
        return job

    @property
    def job_name(self):
        """Name of the job in the generated config: `name` for examples jobs, `tests_<name>` otherwise."""
        return self.name if "examples" in self.name else f"tests_{self.name}"
178
+
179
+
180
+ # JOBS
181
+ torch_and_tf_job = CircleCIJob(
182
+ "torch_and_tf",
183
+ additional_env={"RUN_PT_TF_CROSS_TESTS": True},
184
+ install_steps=[
185
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs cmake",
186
+ "git lfs install",
187
+ "pip install --upgrade pip",
188
+ "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]",
189
+ "pip install tensorflow_probability",
190
+ "pip install git+https://github.com/huggingface/accelerate",
191
+ ],
192
+ marker="is_pt_tf_cross_test",
193
+ pytest_options={"rA": None, "durations": 0},
194
+ )
195
+
196
+
197
+ torch_and_flax_job = CircleCIJob(
198
+ "torch_and_flax",
199
+ additional_env={"RUN_PT_FLAX_CROSS_TESTS": True},
200
+ install_steps=[
201
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
202
+ "pip install --upgrade pip",
203
+ "pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]",
204
+ "pip install git+https://github.com/huggingface/accelerate",
205
+ ],
206
+ marker="is_pt_flax_cross_test",
207
+ pytest_options={"rA": None, "durations": 0},
208
+ )
209
+
210
+
211
+ torch_job = CircleCIJob(
212
+ "torch",
213
+ install_steps=[
214
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time",
215
+ "pip install --upgrade pip",
216
+ "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
217
+ "pip install git+https://github.com/huggingface/accelerate",
218
+ ],
219
+ parallelism=1,
220
+ pytest_num_workers=3,
221
+ )
222
+
223
+
224
+ tf_job = CircleCIJob(
225
+ "tf",
226
+ install_steps=[
227
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake",
228
+ "pip install --upgrade pip",
229
+ "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]",
230
+ "pip install tensorflow_probability",
231
+ ],
232
+ parallelism=1,
233
+ pytest_options={"rA": None},
234
+ )
235
+
236
+
237
+ flax_job = CircleCIJob(
238
+ "flax",
239
+ install_steps=[
240
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
241
+ "pip install --upgrade pip",
242
+ "pip install .[flax,testing,sentencepiece,flax-speech,vision]",
243
+ ],
244
+ parallelism=1,
245
+ pytest_options={"rA": None},
246
+ )
247
+
248
+
249
+ pipelines_torch_job = CircleCIJob(
250
+ "pipelines_torch",
251
+ additional_env={"RUN_PIPELINE_TESTS": True},
252
+ install_steps=[
253
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
254
+ "pip install --upgrade pip",
255
+ "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]",
256
+ ],
257
+ pytest_options={"rA": None},
258
+ marker="is_pipeline_test",
259
+ )
260
+
261
+
262
+ pipelines_tf_job = CircleCIJob(
263
+ "pipelines_tf",
264
+ additional_env={"RUN_PIPELINE_TESTS": True},
265
+ install_steps=[
266
+ "sudo apt-get -y update && sudo apt-get install -y cmake",
267
+ "pip install --upgrade pip",
268
+ "pip install .[sklearn,tf-cpu,testing,sentencepiece,vision]",
269
+ "pip install tensorflow_probability",
270
+ ],
271
+ pytest_options={"rA": None},
272
+ marker="is_pipeline_test",
273
+ )
274
+
275
+
276
+ custom_tokenizers_job = CircleCIJob(
277
+ "custom_tokenizers",
278
+ additional_env={"RUN_CUSTOM_TOKENIZERS": True},
279
+ install_steps=[
280
+ "sudo apt-get -y update && sudo apt-get install -y cmake",
281
+ {
282
+ "name": "install jumanpp",
283
+ "command":
284
+ "wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz\n"
285
+ "tar xvf jumanpp-2.0.0-rc3.tar.xz\n"
286
+ "mkdir jumanpp-2.0.0-rc3/bld\n"
287
+ "cd jumanpp-2.0.0-rc3/bld\n"
288
+ "sudo cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local\n"
289
+ "sudo make install\n",
290
+ },
291
+ "pip install --upgrade pip",
292
+ "pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]",
293
+ "python -m unidic download",
294
+ ],
295
+ parallelism=None,
296
+ resource_class=None,
297
+ tests_to_run=[
298
+ "./tests/models/bert_japanese/test_tokenization_bert_japanese.py",
299
+ "./tests/models/openai/test_tokenization_openai.py",
300
+ "./tests/models/clip/test_tokenization_clip.py",
301
+ ],
302
+ )
303
+
304
+
305
+ examples_torch_job = CircleCIJob(
306
+ "examples_torch",
307
+ cache_name="torch_examples",
308
+ install_steps=[
309
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
310
+ "pip install --upgrade pip",
311
+ "pip install .[sklearn,torch,sentencepiece,testing,torch-speech]",
312
+ "pip install -r examples/pytorch/_tests_requirements.txt",
313
+ ],
314
+ tests_to_run="./examples/pytorch/",
315
+ )
316
+
317
+
318
+ examples_tensorflow_job = CircleCIJob(
319
+ "examples_tensorflow",
320
+ cache_name="tensorflow_examples",
321
+ install_steps=[
322
+ "sudo apt-get -y update && sudo apt-get install -y cmake",
323
+ "pip install --upgrade pip",
324
+ "pip install .[sklearn,tensorflow,sentencepiece,testing]",
325
+ "pip install -r examples/tensorflow/_tests_requirements.txt",
326
+ ],
327
+ tests_to_run="./examples/tensorflow/",
328
+ )
329
+
330
+
331
+ examples_flax_job = CircleCIJob(
332
+ "examples_flax",
333
+ cache_name="flax_examples",
334
+ install_steps=[
335
+ "pip install --upgrade pip",
336
+ "pip install .[flax,testing,sentencepiece]",
337
+ "pip install -r examples/flax/_tests_requirements.txt",
338
+ ],
339
+ tests_to_run="./examples/flax/",
340
+ )
341
+
342
+
343
+ hub_job = CircleCIJob(
344
+ "hub",
345
+ install_steps=[
346
+ "sudo apt-get -y update && sudo apt-get install git-lfs",
347
+ 'git config --global user.email "[email protected]"',
348
+ 'git config --global user.name "ci"',
349
+ "pip install --upgrade pip",
350
+ "pip install .[torch,sentencepiece,testing]",
351
+ ],
352
+ marker="is_staging_test",
353
+ pytest_num_workers=1,
354
+ )
355
+
356
+
357
+ onnx_job = CircleCIJob(
358
+ "onnx",
359
+ install_steps=[
360
+ "sudo apt-get -y update && sudo apt-get install -y cmake",
361
+ "pip install --upgrade pip",
362
+ "pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]",
363
+ ],
364
+ pytest_options={"k onnx": None},
365
+ pytest_num_workers=1,
366
+ )
367
+
368
+
369
+ exotic_models_job = CircleCIJob(
370
+ "exotic_models",
371
+ install_steps=[
372
+ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev",
373
+ "pip install --upgrade pip",
374
+ "pip install .[torch,testing,vision]",
375
+ "pip install torchvision",
376
+ "pip install scipy",
377
+ "pip install 'git+https://github.com/facebookresearch/detectron2.git'",
378
+ "sudo apt install tesseract-ocr",
379
+ "pip install pytesseract",
380
+ "pip install natten",
381
+ ],
382
+ tests_to_run=[
383
+ "tests/models/*layoutlmv*",
384
+ "tests/models/*nat",
385
+ "tests/models/deta",
386
+ ],
387
+ pytest_num_workers=1,
388
+ pytest_options={"durations": 100},
389
+ )
390
+
391
+
392
+ repo_utils_job = CircleCIJob(
393
+ "repo_utils",
394
+ install_steps=[
395
+ "pip install --upgrade pip",
396
+ "pip install .[quality,testing,torch]",
397
+ ],
398
+ parallelism=None,
399
+ pytest_num_workers=1,
400
+ resource_class="large",
401
+ tests_to_run="tests/repo_utils",
402
+ )
403
+
404
+ REGULAR_TESTS = [
405
+ torch_and_tf_job,
406
+ torch_and_flax_job,
407
+ torch_job,
408
+ tf_job,
409
+ flax_job,
410
+ custom_tokenizers_job,
411
+ hub_job,
412
+ onnx_job,
413
+ exotic_models_job,
414
+ ]
415
+ EXAMPLES_TESTS = [
416
+ examples_torch_job,
417
+ examples_tensorflow_job,
418
+ examples_flax_job,
419
+ ]
420
+ PIPELINE_TESTS = [
421
+ pipelines_torch_job,
422
+ pipelines_tf_job,
423
+ ]
424
+ REPO_UTIL_TESTS = [repo_utils_job]
425
+
426
def _read_test_list(path):
    """Return the stripped content of the file at `path`, or an empty string when it does not exist."""
    if not os.path.exists(path):
        return ""
    with open(path) as f:
        return f.read().strip()


def create_circleci_config(folder=None):
    """
    Write `generated_config.yml` in `folder`, containing the jobs selected from the test-list files.

    The jobs are chosen from the files found in `folder`:
      - `test_list.txt` non-empty -> pipeline jobs
      - `filtered_test_list.txt` non-empty -> regular jobs
      - `examples_test_list.txt` non-empty -> examples jobs
      - `test_repo_utils.txt` non-empty -> repo utils jobs

    Nothing is written when no job is selected.

    Args:
        folder: Directory holding the test-list files; defaults to the current working directory.
    """
    if folder is None:
        folder = os.getcwd()
    # Used in CircleCIJob.to_dict() to expand the test list (for using parallelism)
    os.environ["test_preparation_dir"] = folder
    jobs = []

    all_test_list = _read_test_list(os.path.join(folder, "test_list.txt"))
    if len(all_test_list) > 0:
        jobs.extend(PIPELINE_TESTS)

    # Always a string (never a list), because it becomes the default of a string-typed parameter below.
    test_list = _read_test_list(os.path.join(folder, "filtered_test_list.txt"))
    if len(test_list) > 0:
        jobs.extend(REGULAR_TESTS)

    example_file = os.path.join(folder, "examples_test_list.txt")
    if os.path.exists(example_file) and os.path.getsize(example_file) > 0:
        jobs.extend(EXAMPLES_TESTS)

    repo_util_file = os.path.join(folder, "test_repo_utils.txt")
    if os.path.exists(repo_util_file) and os.path.getsize(repo_util_file) > 0:
        jobs.extend(REPO_UTIL_TESTS)

    if len(jobs) > 0:
        config = {"version": "2.1"}
        config["parameters"] = {
            # Only used to accept the parameters from the trigger
            "nightly": {"type": "boolean", "default": False},
            "tests_to_run": {"type": "string", "default": test_list},
        }
        config["jobs"] = {j.job_name: j.to_dict() for j in jobs}
        config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}}
        with open(os.path.join(folder, "generated_config.yml"), "w") as f:
            f.write(yaml.dump(config, indent=2, width=1000000, sort_keys=False))
469
+
470
+
471
if __name__ == "__main__":
    # Command-line entry point: build the CircleCI config from the test lists in `--fetcher_folder`.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--fetcher_folder",
        type=str,
        default=None,
        help=(
            "Folder containing the test list files to read; the generated config is written there "
            "(defaults to the current working directory)."
        ),
    )
    args = parser.parse_args()

    create_circleci_config(args.fetcher_folder)
hf-dev-train/transformers-main/.coveragerc ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [run]
2
+ source=transformers
3
+ omit =
4
+ # skip conversion scripts from testing for now
5
+ */convert_*
6
+ */__main__.py
7
+ [report]
8
+ exclude_lines =
9
+ pragma: no cover
10
+ raise
11
+ except
12
+ register_parameter
hf-dev-train/transformers-main/.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.py eol=lf
2
+ *.rst eol=lf
3
+ *.md eol=lf
4
+ *.mdx eol=lf
hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/bug-report.yml ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "\U0001F41B Bug Report"
2
+ description: Submit a bug report to help us improve transformers
3
+ body:
4
+ - type: textarea
5
+ id: system-info
6
+ attributes:
7
+ label: System Info
8
+ description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
9
+ placeholder: transformers version, platform, python version, ...
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: who-can-help
15
+ attributes:
16
+ label: Who can help?
17
+ description: |
18
+ Your issue will be replied to more quickly if you can figure out the right person to tag with @
19
+ If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
20
+
21
+ All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and
22
+ a core maintainer will ping the right person.
23
+
24
+ Please tag fewer than 3 people.
25
+
26
+ Models:
27
+
28
+ - text models: @ArthurZucker and @younesbelkada
29
+ - vision models: @amyeroberts
30
+ - speech models: @sanchit-gandhi
31
+ - graph models: @clefourrier
32
+
33
+ Library:
34
+
35
+ - flax: @sanchit-gandhi
36
+ - generate: @gante
37
+ - pipelines: @Narsil
38
+ - tensorflow: @gante and @Rocketknight1
39
+ - tokenizers: @ArthurZucker
40
+ - trainer: @sgugger
41
+
42
+ Integrations:
43
+
44
+ - deepspeed: HF Trainer: @stas00, Accelerate: @pacman100
45
+ - ray/raytune: @richardliaw, @amogkam
46
+ - Big Model Inference: @sgugger @muellerzr
47
+
48
+ Documentation: @sgugger, @stevhliu and @MKhalusova
49
+
50
+ Model hub:
51
+
52
+ - for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator.
53
+
54
+ HF projects:
55
+
56
+ - accelerate: [different repo](https://github.com/huggingface/accelerate)
57
+ - datasets: [different repo](https://github.com/huggingface/datasets)
58
+ - diffusers: [different repo](https://github.com/huggingface/diffusers)
59
+ - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
60
+
61
+ Maintained examples (not research project or legacy):
62
+
63
+ - Flax: @sanchit-gandhi
64
+ - PyTorch: @sgugger
65
+ - TensorFlow: @Rocketknight1
66
+
67
+ Research projects are not maintained and should be taken as is.
68
+
69
+ placeholder: "@Username ..."
70
+
71
+ - type: checkboxes
72
+ id: information-scripts-examples
73
+ attributes:
74
+ label: Information
75
+ description: 'The problem arises when using:'
76
+ options:
77
+ - label: "The official example scripts"
78
+ - label: "My own modified scripts"
79
+
80
+ - type: checkboxes
81
+ id: information-tasks
82
+ attributes:
83
+ label: Tasks
84
+ description: "The tasks I am working on are:"
85
+ options:
86
+ - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
87
+ - label: "My own task or dataset (give details below)"
88
+
89
+ - type: textarea
90
+ id: reproduction
91
+ validations:
92
+ required: true
93
+ attributes:
94
+ label: Reproduction
95
+ description: |
96
+ Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
97
+ If you have code snippets, error messages, stack traces please provide them here as well.
98
+ Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
99
+ Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
100
+
101
+ placeholder: |
102
+ Steps to reproduce the behavior:
103
+
104
+ 1.
105
+ 2.
106
+ 3.
107
+
108
+
109
+ - type: textarea
110
+ id: expected-behavior
111
+ validations:
112
+ required: true
113
+ attributes:
114
+ label: Expected behavior
115
+ description: "A clear and concise description of what you would expect to happen."
hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ blank_issues_enabled: true
2
+ version: 2.1
3
+ contact_links:
4
+ - name: Model checkpoints on the Hugging Face Hub
5
+ url: https://huggingface.co/models
6
+ about: Open a Pull request / Discussion related to a specific model checkpoint directly on the Hugging Face Hub
7
+ - name: Website Related
8
+ url: https://github.com/huggingface/hub-docs/issues
9
+ about: Feature requests and bug reports related to the website
10
+ - name: Forum
11
+ url: https://discuss.huggingface.co/
12
+ about: General usage questions and community discussions
hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/feature-request.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "\U0001F680 Feature request"
2
+ description: Submit a proposal/request for a new transformers feature
3
+ labels: [ "feature" ]
4
+ body:
5
+ - type: textarea
6
+ id: feature-request
7
+ validations:
8
+ required: true
9
+ attributes:
10
+ label: Feature request
11
+ description: |
12
+ A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist.
13
+
14
+ - type: textarea
15
+ id: motivation
16
+ validations:
17
+ required: true
18
+ attributes:
19
+ label: Motivation
20
+ description: |
21
+ Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
22
+
23
+
24
+ - type: textarea
25
+ id: contribution
26
+ validations:
27
+ required: true
28
+ attributes:
29
+ label: Your contribution
30
+ description: |
31
+ Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md)
hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/i18n.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: 🌐 Translating a new language?
3
+ about: Start a new translation effort in your language
4
+ title: '[i18n-<languageCode>] Translating docs to <languageName>'
5
+ labels: WIP
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ <!--
11
+ Note: Please search to see if an issue already exists for the language you are trying to translate.
12
+ -->
13
+
14
+ Hi!
15
+
16
+ Let's bring the documentation to all the <languageName>-speaking community 🌐 (currently 0 out of 267 complete)
17
+
18
+ Who would want to translate? Please follow the 🤗 [TRANSLATING guide](https://github.com/huggingface/transformers/blob/main/docs/TRANSLATING.md). Here is a list of the files ready for translation. Let us know in this issue if you'd like to translate any, and we'll add your name to the list.
19
+
20
+ Some notes:
21
+
22
+ * Please translate using an informal tone (imagine you are talking with a friend about transformers 🤗).
23
+ * Please translate in a gender-neutral way.
24
+ * Add your translations to the folder called `<languageCode>` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source).
25
+ * Register your translation in `<languageCode>/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml).
26
+ * Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @ArthurZucker, @sgugger for review.
27
+ * 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/).
28
+
29
+ ## Get Started section
30
+
31
+ - [ ] [index.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/index.mdx) https://github.com/huggingface/transformers/pull/20180
32
+ - [ ] [quicktour.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/quicktour.mdx) (waiting for initial PR to go through)
33
+ - [ ] [installation.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/installation.mdx).
34
+
35
+ ## Tutorial section
36
+ - [ ] [pipeline_tutorial.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/pipeline_tutorial.mdx)
37
+ - [ ] [autoclass_tutorial.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/autoclass_tutorial.mdx)
38
+ - [ ] [preprocessing.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/preprocessing.mdx)
39
+ - [ ] [training.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/training.mdx)
40
+ - [ ] [accelerate.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/accelerate.mdx)
41
+ - [ ] [model_sharing.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/model_sharing.mdx)
42
+ - [ ] [multilingual.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/multilingual.mdx)
43
+
44
+ <!--
45
+ Keep on adding more as you go 🔥
46
+ -->
hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/migration.yml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
2
+ description: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
3
+ labels: [ "migration" ]
4
+ body:
5
+ - type: textarea
6
+ id: system-info
7
+ attributes:
8
+ label: System Info
9
+ description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
10
+ render: shell
11
+ placeholder: transformers version, platform, python version, ...
12
+ validations:
13
+ required: true
14
+
15
+ - type: checkboxes
16
+ id: information-scripts-examples
17
+ attributes:
18
+ label: Information
19
+ description: 'The problem arises when using:'
20
+ options:
21
+ - label: "The official example scripts"
22
+ - label: "My own modified scripts"
23
+
24
+ - type: checkboxes
25
+ id: information-tasks
26
+ attributes:
27
+ label: Tasks
28
+ description: "The tasks I am working on are:"
29
+ options:
30
+ - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
31
+ - label: "My own task or dataset (give details below)"
32
+
33
+ - type: textarea
34
+ id: reproduction
35
+ validations:
36
+ required: true
37
+ attributes:
38
+ label: Reproduction
39
+ description: |
40
+ Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
41
+ If you have code snippets, error messages, stack traces please provide them here as well.
42
+ Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
43
+ Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
44
+
45
+ placeholder: |
46
+ Steps to reproduce the behavior:
47
+
48
+ 1.
49
+ 2.
50
+ 3.
51
+
52
+
53
+ - type: textarea
54
+ id: expected-behavior
55
+ validations:
56
+ required: true
57
+ attributes:
58
+ label: Expected behavior
59
+ description: "A clear and concise description of what you would expect to happen."
60
+ render: shell
61
+
62
+ - type: checkboxes
63
+ id: checklist
64
+ attributes:
65
+ label: Checklist
66
+ options:
67
+ - label: "I have read the migration guide in the readme.
68
+ ([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
69
+ [pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))"
70
+ required: true
71
+ - label: "I checked if a related official extension example runs on my machine."
72
+ required: true
hf-dev-train/transformers-main/.github/ISSUE_TEMPLATE/new-model-addition.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "\U0001F31F New model addition"
2
+ description: Submit a proposal/request to implement a new model
3
+ labels: [ "New model" ]
4
+
5
+ body:
6
+ - type: textarea
7
+ id: description-request
8
+ validations:
9
+ required: true
10
+ attributes:
11
+ label: Model description
12
+ description: |
13
+ Put any and all important information relative to the model
14
+
15
+ - type: checkboxes
16
+ id: information-tasks
17
+ attributes:
18
+ label: Open source status
19
+ description: |
20
+ Please note that if the model implementation isn't available or if the weights aren't open-source, we are less likely to implement it in `transformers`.
21
+ options:
22
+ - label: "The model implementation is available"
23
+ - label: "The model weights are available"
24
+
25
+ - type: textarea
26
+ id: additional-info
27
+ attributes:
28
+ label: Provide useful links for the implementation
29
+ description: |
30
+ Please provide information regarding the implementation, the weights, and the authors.
31
+ Please mention the authors by @gh-username if you're aware of their usernames.
hf-dev-train/transformers-main/.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # What does this PR do?
2
+
3
+ <!--
4
+ Congratulations! You've made it this far! You're not quite done yet though.
5
+
6
+ Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.
7
+
8
+ Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.
9
+
10
+ Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
11
+ -->
12
+
13
+ <!-- Remove if not applicable -->
14
+
15
+ Fixes # (issue)
16
+
17
+
18
+ ## Before submitting
19
+ - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
20
+ - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
21
+ Pull Request section?
22
+ - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
23
+ to it if that's the case.
24
+ - [ ] Did you make sure to update the documentation with your changes? Here are the
25
+ [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
26
+ [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
27
+ - [ ] Did you write any new necessary tests?
28
+
29
+
30
+ ## Who can review?
31
+
32
+ Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
33
+ members/contributors who may be interested in your PR.
34
+
35
+ <!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @
36
+
37
+ If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
38
+ Please tag fewer than 3 people.
39
+
40
+ Models:
41
+
42
+ - text models: @ArthurZucker and @younesbelkada
43
+ - vision models: @amyeroberts
44
+ - speech models: @sanchit-gandhi
45
+ - graph models: @clefourrier
46
+
47
+ Library:
48
+
49
+ - flax: @sanchit-gandhi
50
+ - generate: @gante
51
+ - pipelines: @Narsil
52
+ - tensorflow: @gante and @Rocketknight1
53
+ - tokenizers: @ArthurZucker
54
+ - trainer: @sgugger
55
+
56
+ Integrations:
57
+
58
+ - deepspeed: HF Trainer: @stas00, Accelerate: @pacman100
59
+ - ray/raytune: @richardliaw, @amogkam
60
+
61
+ Documentation: @sgugger, @stevhliu and @MKhalusova
62
+
63
+ HF projects:
64
+
65
+ - accelerate: [different repo](https://github.com/huggingface/accelerate)
66
+ - datasets: [different repo](https://github.com/huggingface/datasets)
67
+ - diffusers: [different repo](https://github.com/huggingface/diffusers)
68
+ - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
69
+
70
+ Maintained examples (not research project or legacy):
71
+
72
+ - Flax: @sanchit-gandhi
73
+ - PyTorch: @sgugger
74
+ - TensorFlow: @Rocketknight1
75
+
76
+ -->
hf-dev-train/transformers-main/.github/conda/build.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ $PYTHON setup.py install # Python command to install the script.
hf-dev-train/transformers-main/.github/conda/meta.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% set name = "transformers" %}
2
+
3
+ package:
4
+ name: "{{ name|lower }}"
5
+ version: "{{ TRANSFORMERS_VERSION }}"
6
+
7
+ source:
8
+ path: ../../
9
+
10
+ build:
11
+ noarch: python
12
+
13
+ requirements:
14
+ host:
15
+ - python
16
+ - pip
17
+ - numpy >=1.17
18
+ - dataclasses
19
+ - importlib_metadata
20
+ - huggingface_hub
21
+ - packaging
22
+ - filelock
23
+ - requests
24
+ - tqdm >=4.27
25
+ - sacremoses
26
+ - regex !=2019.12.17
27
+ - protobuf
28
+ - tokenizers >=0.11.1,!=0.11.3,<0.13
29
+ - pyyaml >=5.1
30
+ run:
31
+ - python
32
+ - numpy >=1.17
33
+ - dataclasses
34
+ - importlib_metadata
35
+ - huggingface_hub
36
+ - packaging
37
+ - filelock
38
+ - requests
39
+ - tqdm >=4.27
40
+ - sacremoses
41
+ - regex !=2019.12.17
42
+ - protobuf
43
+ - tokenizers >=0.11.1,!=0.11.3,<0.13
44
+ - pyyaml >=5.1
45
+
46
+ test:
47
+ imports:
48
+ - transformers
49
+
50
+ about:
51
+ home: https://huggingface.co
52
+ license: Apache License 2.0
53
+ license_file: LICENSE
54
+ summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0."
hf-dev-train/transformers-main/.github/workflows/TROUBLESHOOT.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Troubleshooting
2
+
3
+ This is a document explaining how to deal with various issues on github-actions self-hosted CI. The entries may include actual solutions or pointers to Issues that cover those.
4
+
5
+ ## GitHub Actions (self-hosted CI)
6
+
7
+ * Deepspeed
8
+
9
+ - if jit build hangs, clear out `rm -rf ~/.cache/torch_extensions/` reference: https://github.com/huggingface/transformers/pull/12723
hf-dev-train/transformers-main/.github/workflows/add-model-like.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Add model like runner
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ paths:
9
+ - "src/**"
10
+ - "tests/**"
11
+ - ".github/**"
12
+ types: [opened, synchronize, reopened]
13
+
14
+ jobs:
15
+ run_tests_templates_like:
16
+ name: "Add new model like template tests"
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v3
20
+
21
+ - name: Install dependencies
22
+ run: |
23
+ sudo apt -y update && sudo apt install -y libsndfile1-dev
24
+
25
+ - name: Load cached virtual environment
26
+ uses: actions/cache@v2
27
+ id: cache
28
+ with:
29
+ path: ~/venv/
30
+ key: v4-tests_model_like-${{ hashFiles('setup.py') }}
31
+
32
+ - name: Create virtual environment on cache miss
33
+ if: steps.cache.outputs.cache-hit != 'true'
34
+ run: |
35
+ python -m venv ~/venv && . ~/venv/bin/activate
36
+ pip install --upgrade pip!=21.3
37
+ pip install -e .[dev]
38
+
39
+ - name: Check transformers location
40
+ # make `transformers` available as package (required since we use `-e` flag) and check it's indeed from the repo.
41
+ run: |
42
+ . ~/venv/bin/activate
43
+ python setup.py develop
44
+ transformers_install=$(pip list -e | grep transformers)
45
+ transformers_install_array=($transformers_install)
46
+ transformers_loc=${transformers_install_array[-1]}
47
+ transformers_repo_loc=$(pwd .)
48
+ if [ "$transformers_loc" != "$transformers_repo_loc" ]; then
49
+ echo "transformers is from $transformers_loc but it shoud be from $transformers_repo_loc/src."
50
+ echo "A fix is required. Stop testing."
51
+ exit 1
52
+ fi
53
+
54
+ - name: Create model files
55
+ run: |
56
+ . ~/venv/bin/activate
57
+ transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
58
+ make style
59
+ make fix-copies
60
+
61
+ - name: Run all PyTorch modeling test
62
+ run: |
63
+ . ~/venv/bin/activate
64
+ python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_new_models tests/bert_new/test_modeling_bert_new.py
65
+
66
+ - name: Run style changes
67
+ run: |
68
+ . ~/venv/bin/activate
69
+ make style && make quality && make repo-consistency
70
+
71
+ - name: Failure short reports
72
+ if: ${{ always() }}
73
+ run: cat reports/tests_new_models/failures_short.txt
74
+
75
+ - name: Test suite reports artifacts
76
+ if: ${{ always() }}
77
+ uses: actions/upload-artifact@v3
78
+ with:
79
+ name: run_all_tests_new_models_test_reports
80
+ path: reports/tests_new_models
hf-dev-train/transformers-main/.github/workflows/build-docker-images.yml ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build docker images (scheduled)
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - docker-image*
7
+ repository_dispatch:
8
+ workflow_call:
9
+ inputs:
10
+ image_postfix:
11
+ required: true
12
+ type: string
13
+ schedule:
14
+ - cron: "0 1 * * *"
15
+
16
+ concurrency:
17
+ group: docker-images-builds
18
+ cancel-in-progress: false
19
+
20
+ jobs:
21
+ latest-docker:
22
+ name: "Latest PyTorch + TensorFlow [dev]"
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - name: Cleanup disk
26
+ run: |
27
+ sudo ls -l /usr/local/lib/
28
+ sudo ls -l /usr/share/
29
+ sudo du -sh /usr/local/lib/
30
+ sudo du -sh /usr/share/
31
+ sudo rm -rf /usr/local/lib/android
32
+ sudo rm -rf /usr/share/dotnet
33
+ sudo du -sh /usr/local/lib/
34
+ sudo du -sh /usr/share/
35
+ -
36
+ name: Set up Docker Buildx
37
+ uses: docker/setup-buildx-action@v2
38
+ -
39
+ name: Check out code
40
+ uses: actions/checkout@v3
41
+ -
42
+ name: Login to DockerHub
43
+ uses: docker/login-action@v2
44
+ with:
45
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
46
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
47
+ -
48
+ name: Build and push
49
+ uses: docker/build-push-action@v3
50
+ with:
51
+ context: ./docker/transformers-all-latest-gpu
52
+ build-args: |
53
+ REF=main
54
+ push: true
55
+ tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
56
+ # Push CI images still need to be re-built daily
57
+ -
58
+ name: Build and push (for Push CI) in a daily basis
59
+ # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
60
+ # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
61
+ if: inputs.image_postfix != '-push-ci'
62
+ uses: docker/build-push-action@v3
63
+ with:
64
+ context: ./docker/transformers-all-latest-gpu
65
+ build-args: |
66
+ REF=main
67
+ push: true
68
+ tags: huggingface/transformers-all-latest-gpu-push-ci
69
+
70
+ latest-with-torch-nightly-docker:
71
+ name: "Nightly PyTorch + Stable TensorFlow"
72
+ # Push CI doesn't need this image
73
+ if: inputs.image_postfix != '-push-ci'
74
+ runs-on: ubuntu-latest
75
+ steps:
76
+ -
77
+ name: Set up Docker Buildx
78
+ uses: docker/setup-buildx-action@v2
79
+ -
80
+ name: Check out code
81
+ uses: actions/checkout@v3
82
+ -
83
+ name: Login to DockerHub
84
+ uses: docker/login-action@v2
85
+ with:
86
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
87
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
88
+ -
89
+ name: Build and push
90
+ uses: docker/build-push-action@v3
91
+ with:
92
+ context: ./docker/transformers-all-latest-gpu
93
+ build-args: |
94
+ REF=main
95
+ PYTORCH=pre
96
+ push: true
97
+ tags: huggingface/transformers-all-latest-torch-nightly-gpu
98
+
99
+ latest-torch-deepspeed-docker:
100
+ name: "Latest PyTorch + DeepSpeed"
101
+ runs-on: ubuntu-latest
102
+ steps:
103
+ -
104
+ name: Set up Docker Buildx
105
+ uses: docker/setup-buildx-action@v2
106
+ -
107
+ name: Check out code
108
+ uses: actions/checkout@v3
109
+ -
110
+ name: Login to DockerHub
111
+ uses: docker/login-action@v2
112
+ with:
113
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
114
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
115
+ -
116
+ name: Build and push
117
+ uses: docker/build-push-action@v3
118
+ with:
119
+ context: ./docker/transformers-pytorch-deepspeed-latest-gpu
120
+ build-args: |
121
+ REF=main
122
+ push: true
123
+ tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
124
+
125
+ # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
126
+ latest-torch-deepspeed-docker-for-push-ci-daily-build:
127
+ name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
128
+ runs-on: ubuntu-latest
129
+ steps:
130
+ -
131
+ name: Set up Docker Buildx
132
+ uses: docker/setup-buildx-action@v2
133
+ -
134
+ name: Check out code
135
+ uses: actions/checkout@v3
136
+ -
137
+ name: Login to DockerHub
138
+ uses: docker/login-action@v2
139
+ with:
140
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
141
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
142
+ # Push CI images still need to be re-built daily
143
+ -
144
+ name: Build and push (for Push CI) in a daily basis
145
+ # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
146
+ # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
147
+ if: inputs.image_postfix != '-push-ci'
148
+ uses: docker/build-push-action@v3
149
+ with:
150
+ context: ./docker/transformers-pytorch-deepspeed-latest-gpu
151
+ build-args: |
152
+ REF=main
153
+ push: true
154
+ tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
155
+
156
+ nightly-torch-deepspeed-docker:
157
+ name: "Nightly PyTorch + DeepSpeed"
158
+ # Push CI doesn't need this image
159
+ if: inputs.image_postfix != '-push-ci'
160
+ runs-on: ubuntu-latest
161
+ steps:
162
+ -
163
+ name: Set up Docker Buildx
164
+ uses: docker/setup-buildx-action@v2
165
+ -
166
+ name: Check out code
167
+ uses: actions/checkout@v3
168
+ -
169
+ name: Login to DockerHub
170
+ uses: docker/login-action@v2
171
+ with:
172
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
173
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
174
+ -
175
+ name: Build and push
176
+ uses: docker/build-push-action@v3
177
+ with:
178
+ context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
179
+ build-args: |
180
+ REF=main
181
+ push: true
182
+ tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
183
+
184
+ doc-builder:
185
+ name: "Doc builder"
186
+ # Push CI doesn't need this image
187
+ if: inputs.image_postfix != '-push-ci'
188
+ runs-on: ubuntu-latest
189
+ steps:
190
+ -
191
+ name: Set up Docker Buildx
192
+ uses: docker/setup-buildx-action@v2
193
+ -
194
+ name: Check out code
195
+ uses: actions/checkout@v3
196
+ -
197
+ name: Login to DockerHub
198
+ uses: docker/login-action@v2
199
+ with:
200
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
201
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
202
+ -
203
+ name: Build and push
204
+ uses: docker/build-push-action@v3
205
+ with:
206
+ context: ./docker/transformers-doc-builder
207
+ push: true
208
+ tags: huggingface/transformers-doc-builder
209
+
210
+ latest-pytorch:
211
+ name: "Latest PyTorch [dev]"
212
+ # Push CI doesn't need this image
213
+ if: inputs.image_postfix != '-push-ci'
214
+ runs-on: ubuntu-latest
215
+ steps:
216
+ -
217
+ name: Set up Docker Buildx
218
+ uses: docker/setup-buildx-action@v2
219
+ -
220
+ name: Check out code
221
+ uses: actions/checkout@v3
222
+ -
223
+ name: Login to DockerHub
224
+ uses: docker/login-action@v2
225
+ with:
226
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
227
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
228
+ -
229
+ name: Build and push
230
+ uses: docker/build-push-action@v3
231
+ with:
232
+ context: ./docker/transformers-pytorch-gpu
233
+ build-args: |
234
+ REF=main
235
+ push: true
236
+ tags: huggingface/transformers-pytorch-gpu
237
+
238
+ latest-tensorflow:
239
+ name: "Latest TensorFlow [dev]"
240
+ # Push CI doesn't need this image
241
+ if: inputs.image_postfix != '-push-ci'
242
+ runs-on: ubuntu-latest
243
+ steps:
244
+ -
245
+ name: Set up Docker Buildx
246
+ uses: docker/setup-buildx-action@v2
247
+ -
248
+ name: Check out code
249
+ uses: actions/checkout@v3
250
+ -
251
+ name: Login to DockerHub
252
+ uses: docker/login-action@v2
253
+ with:
254
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
255
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
256
+ -
257
+ name: Build and push
258
+ uses: docker/build-push-action@v3
259
+ with:
260
+ context: ./docker/transformers-tensorflow-gpu
261
+ build-args: |
262
+ REF=main
263
+ push: true
264
+ tags: huggingface/transformers-tensorflow-gpu
hf-dev-train/transformers-main/.github/workflows/build-past-ci-docker-images.yml ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build docker images (Past CI)
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - past-ci-docker-image*
7
+
8
+ concurrency:
9
+ group: docker-images-builds
10
+ cancel-in-progress: false
11
+
12
+ jobs:
13
+ past-pytorch-docker:
14
+ name: "Past PyTorch Docker"
15
+ strategy:
16
+ fail-fast: false
17
+ matrix:
18
+ version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"]
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ -
22
+ name: Set up Docker Buildx
23
+ uses: docker/setup-buildx-action@v2
24
+ -
25
+ name: Check out code
26
+ uses: actions/checkout@v3
27
+ -
28
+ name: Login to DockerHub
29
+ uses: docker/login-action@v2
30
+ with:
31
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
32
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
33
+ -
34
+ name: Build and push
35
+ uses: docker/build-push-action@v3
36
+ with:
37
+ context: ./docker/transformers-past-gpu
38
+ build-args: |
39
+ REF=main
40
+ FRAMEWORK=pytorch
41
+ VERSION=${{ matrix.version }}
42
+ push: true
43
+ tags: huggingface/transformers-pytorch-past-${{ matrix.version }}-gpu
44
+
45
+ past-tensorflow-docker:
46
+ name: "Past TensorFlow Docker"
47
+ strategy:
48
+ fail-fast: false
49
+ matrix:
50
+ version: ["2.8", "2.7", "2.6", "2.5"]
51
+ runs-on: ubuntu-latest
52
+ steps:
53
+ -
54
+ name: Set up Docker Buildx
55
+ uses: docker/setup-buildx-action@v2
56
+ -
57
+ name: Check out code
58
+ uses: actions/checkout@v3
59
+ -
60
+ name: Login to DockerHub
61
+ uses: docker/login-action@v2
62
+ with:
63
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
64
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
65
+ -
66
+ name: Build and push
67
+ uses: docker/build-push-action@v3
68
+ with:
69
+ context: ./docker/transformers-past-gpu
70
+ build-args: |
71
+ REF=main
72
+ FRAMEWORK=tensorflow
73
+ VERSION=${{ matrix.version }}
74
+ push: true
75
+ tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
76
+
77
+ past-tensorflow-docker-2-4:
78
+ name: "Past TensorFlow Docker"
79
+ strategy:
80
+ fail-fast: false
81
+ matrix:
82
+ version: ["2.4"]
83
+ runs-on: ubuntu-latest
84
+ steps:
85
+ -
86
+ name: Set up Docker Buildx
87
+ uses: docker/setup-buildx-action@v2
88
+ -
89
+ name: Check out code
90
+ uses: actions/checkout@v3
91
+ -
92
+ name: Login to DockerHub
93
+ uses: docker/login-action@v2
94
+ with:
95
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
96
+ password: ${{ secrets.DOCKERHUB_PASSWORD }}
97
+ -
98
+ name: Build and push
99
+ uses: docker/build-push-action@v3
100
+ with:
101
+ context: ./docker/transformers-past-gpu
102
+ build-args: |
103
+ REF=main
104
+ BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04
105
+ FRAMEWORK=tensorflow
106
+ VERSION=${{ matrix.version }}
107
+ push: true
108
+ tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
hf-dev-train/transformers-main/.github/workflows/build_documentation.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build documentation
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ - doc-builder*
8
+ - v*-release
9
+ - use_templates
10
+
11
+ jobs:
12
+ build:
13
+ uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
14
+ with:
15
+ commit_sha: ${{ github.sha }}
16
+ package: transformers
17
+ notebook_folder: transformers_doc
18
+ languages: de en es fr it ko pt zh
19
+ secrets:
20
+ token: ${{ secrets.HUGGINGFACE_PUSH }}
hf-dev-train/transformers-main/.github/workflows/build_pr_documentation.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build PR Documentation
2
+
3
+ on:
4
+ pull_request:
5
+
6
+ concurrency:
7
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
8
+ cancel-in-progress: true
9
+
10
+ jobs:
11
+ build:
12
+ uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13
+ with:
14
+ commit_sha: ${{ github.event.pull_request.head.sha }}
15
+ pr_number: ${{ github.event.number }}
16
+ package: transformers
17
+ languages: de en es fr it ko pt zh
hf-dev-train/transformers-main/.github/workflows/check_runner_status.yml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Self-hosted runner (check runner status)
2
+
3
+ # Note that each job's dependencies go into a corresponding docker file.
4
+ #
5
+ # For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
6
+ # `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
7
+ # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
8
+
9
+ on:
10
+ repository_dispatch:
11
+ schedule:
12
+ # run every hour
13
+ - cron: "0 */1 * * *"
14
+
15
+ env:
16
+ TRANSFORMERS_IS_CI: yes
17
+
18
+ jobs:
19
+ check_runner_status:
20
+ name: Check Runner Status
21
+ runs-on: ubuntu-latest
22
+ outputs:
23
+ offline_runners: ${{ steps.set-offline_runners.outputs.offline_runners }}
24
+ steps:
25
+ - name: Checkout transformers
26
+ uses: actions/checkout@v3
27
+ with:
28
+ fetch-depth: 2
29
+
30
+ - name: Check Runner Status
31
+ run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker,single-gpu-doctest-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
32
+
33
+ - id: set-offline_runners
34
+ name: Set output for offline runners
35
+ if: ${{ always() }}
36
+ run: |
37
+ offline_runners=$(python3 -c 'fp = open("offline_runners.txt"); failed = fp.read(); fp.close(); print(failed)')
38
+ echo "offline_runners=$offline_runners" >> $GITHUB_OUTPUT
39
+
40
+ send_results:
41
+ name: Send results to webhook
42
+ runs-on: ubuntu-latest
43
+ needs: check_runner_status
44
+ if: ${{ failure() }}
45
+ steps:
46
+ - name: Preliminary job status
47
+ shell: bash
48
+ run: |
49
+ echo "Runner availability: ${{ needs.check_runner_status.result }}"
50
+
51
+ - uses: actions/checkout@v3
52
+ - uses: actions/download-artifact@v3
53
+ - name: Send message to Slack
54
+ env:
55
+ CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
56
+ CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
57
+ CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
58
+ CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
59
+ CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
60
+ ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
61
+ CI_EVENT: runner status check
62
+ RUNNER_STATUS: ${{ needs.check_runner_status.result }}
63
+ OFFLINE_RUNNERS: ${{ needs.check_runner_status.outputs.offline_runners }}
64
+ # No test matrix argument is passed to `notification_service.py` here (this workflow has no `setup` job);
65
+ # the runner status is provided through the `RUNNER_STATUS` and `OFFLINE_RUNNERS` environment variables set above.
66
+ run: |
67
+ pip install slack_sdk
68
+ python utils/notification_service.py
hf-dev-train/transformers-main/.github/workflows/delete_doc_comment.yml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Delete dev documentation
2
+
3
+ on:
4
+ pull_request:
5
+ types: [ closed ]
6
+
7
+
8
+ jobs:
9
+ delete:
10
+ uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
11
+ with:
12
+ pr_number: ${{ github.event.number }}
13
+ package: transformers
hf-dev-train/transformers-main/.github/workflows/doctests.yml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Doctests
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - doctest*
7
+ repository_dispatch:
8
+ schedule:
9
+ - cron: "0 2 * * *"
10
+
11
+
12
+ env:
13
+ HF_HOME: /mnt/cache
14
+ TRANSFORMERS_IS_CI: yes
15
+ RUN_SLOW: yes
16
+ OMP_NUM_THREADS: 16
17
+ MKL_NUM_THREADS: 16
18
+ SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
19
+ TF_FORCE_GPU_ALLOW_GROWTH: true
20
+
21
+ jobs:
22
+ run_doctests:
23
+ runs-on: [self-hosted, doc-tests-gpu]
24
+ container:
25
+ image: huggingface/transformers-all-latest-gpu
26
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
27
+ steps:
28
+ - uses: actions/checkout@v3
29
+ - name: NVIDIA-SMI
30
+ run: |
31
+ nvidia-smi
32
+
33
+ - name: GPU visibility
34
+ run: |
35
+ python3 utils/print_env.py
36
+
37
+ - name: Show installed libraries and their versions
38
+ run: pip freeze
39
+
40
+ - name: Prepare files for doctests
41
+ run: |
42
+ python3 utils/prepare_for_doc_test.py src docs
43
+
44
+ - name: Run doctests
45
+ run: |
46
+ python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
47
+
48
+ - name: Clean files after doctests
49
+ run: |
50
+ python3 utils/prepare_for_doc_test.py src docs --remove_new_line
51
+
52
+ - name: Failure short reports
53
+ if: ${{ failure() }}
54
+ continue-on-error: true
55
+ run: cat reports/doc_tests_gpu/failures_short.txt
56
+
57
+ - name: Test suite reports artifacts
58
+ if: ${{ always() }}
59
+ uses: actions/upload-artifact@v3
60
+ with:
61
+ name: doc_tests_gpu_test_reports
62
+ path: reports/doc_tests_gpu
63
+
64
+
65
+ send_results:
66
+ name: Send results to webhook
67
+ runs-on: ubuntu-latest
68
+ if: always()
69
+ needs: [run_doctests]
70
+ steps:
71
+ - uses: actions/checkout@v3
72
+ - uses: actions/download-artifact@v3
73
+ - name: Send message to Slack
74
+ env:
75
+ CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
76
+ CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_DOCS }}
77
+ CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_DOCS }}
78
+ CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
79
+ run: |
80
+ pip install slack_sdk
81
+ python utils/notification_service_doc_tests.py
hf-dev-train/transformers-main/.github/workflows/model-templates.yml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Model templates runner
2
+
3
+ on:
4
+ repository_dispatch:
5
+ schedule:
6
+ - cron: "0 2 * * *"
7
+
8
+ jobs:
9
+ run_tests_templates:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Checkout repository
13
+ uses: actions/checkout@v3
14
+
15
+ - name: Install dependencies
16
+ run: |
17
+ sudo apt -y update && sudo apt install -y libsndfile1-dev
18
+
19
+ - name: Load cached virtual environment
20
+ uses: actions/cache@v2
21
+ id: cache
22
+ with:
23
+ path: ~/venv/
24
+ key: v4-tests_templates-${{ hashFiles('setup.py') }}
25
+
26
+ - name: Create virtual environment on cache miss
27
+ if: steps.cache.outputs.cache-hit != 'true'
28
+ run: |
29
+ python -m venv ~/venv && . ~/venv/bin/activate
30
+ pip install --upgrade pip!=21.3
31
+ pip install -e .[dev]
32
+
33
+ - name: Check transformers location
34
+ # make `transformers` available as package (required since we use `-e` flag) and check it's indeed from the repo.
35
+ run: |
36
+ . ~/venv/bin/activate
37
+ python setup.py develop
38
+ transformer_loc=$(pip show transformers | grep "Location: " | cut -c11-)
39
+ transformer_repo_loc=$(pwd .)
40
+ if [ "$transformer_loc" != "$transformer_repo_loc/src" ]; then
41
+ echo "transformers is from $transformer_loc but it should be from $transformer_repo_loc/src."
42
+ echo "A fix is required. Stop testing."
43
+ exit 1
44
+ fi
45
+
46
+ - name: Create model files
47
+ run: |
48
+ . ~/venv/bin/activate
49
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
50
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
51
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
52
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
53
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
54
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
55
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
56
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
57
+ make style
58
+ python utils/check_table.py --fix_and_overwrite
59
+ python utils/check_dummies.py --fix_and_overwrite
60
+ python utils/check_copies.py --fix_and_overwrite
61
+
62
+ - name: Run all non-slow tests
63
+ run: |
64
+ . ~/venv/bin/activate
65
+ python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_templates tests/*template*
66
+
67
+ - name: Run style changes
68
+ run: |
69
+ . ~/venv/bin/activate
70
+ make style && make quality && make repo-consistency
71
+
72
+ - name: Failure short reports
73
+ if: ${{ always() }}
74
+ run: cat reports/tests_templates/failures_short.txt
75
+
76
+ - name: Test suite reports artifacts
77
+ if: ${{ always() }}
78
+ uses: actions/upload-artifact@v3
79
+ with:
80
+ name: run_all_tests_templates_test_reports
81
+ path: reports/tests_templates
hf-dev-train/transformers-main/.github/workflows/release-conda.yml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Release - Conda
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - v*
7
+ branches:
8
+ - conda_*
9
+
10
+ env:
11
+ ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
12
+
13
+ jobs:
14
+ build_and_package:
15
+ runs-on: ubuntu-latest
16
+ defaults:
17
+ run:
18
+ shell: bash -l {0}
19
+
20
+ steps:
21
+ - name: Checkout repository
22
+ uses: actions/checkout@v1
23
+
24
+ - name: Install miniconda
25
+ uses: conda-incubator/setup-miniconda@v2
26
+ with:
27
+ auto-update-conda: true
28
+ auto-activate-base: false
29
+ python-version: 3.8
30
+ activate-environment: "build-transformers"
31
+ channels: huggingface
32
+
33
+ - name: Setup conda env
34
+ run: |
35
+ conda install -c defaults anaconda-client conda-build
36
+
37
+ - name: Extract version
38
+ run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV
39
+
40
+ - name: Build conda packages
41
+ run: |
42
+ conda info
43
+ conda list
44
+ conda-build .github/conda
45
+
46
+ - name: Upload to Anaconda
47
+ run: anaconda upload `conda-build .github/conda --output` --force
hf-dev-train/transformers-main/.github/workflows/self-nightly-scheduled.yml ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Self-hosted runner (nightly)
2
+
3
+ # Note that each job's dependencies go into a corresponding docker file.
4
+ #
5
+ # For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
6
+ # `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
7
+ # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
8
+
9
+ on:
10
+ repository_dispatch:
11
+ # Disabled temporarily until the test suite can be run in under 12 hours.
12
+ # schedule:
13
+ # - cron: "0 16 * * *"
14
+
15
+ env:
16
+ HF_HOME: /mnt/cache
17
+ TRANSFORMERS_IS_CI: yes
18
+ OMP_NUM_THREADS: 8
19
+ MKL_NUM_THREADS: 8
20
+ RUN_SLOW: yes
21
+ SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
22
+ TF_FORCE_GPU_ALLOW_GROWTH: true
23
+ RUN_PT_TF_CROSS_TESTS: 1
24
+
25
+ jobs:
26
+ check_runner_status:
27
+ name: Check Runner Status
28
+ runs-on: ubuntu-latest
29
+ steps:
30
+ - name: Checkout transformers
31
+ uses: actions/checkout@v3
32
+ with:
33
+ fetch-depth: 2
34
+
35
+ - name: Check Runner Status
36
+ run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
37
+
38
+ check_runners:
39
+ name: Check Runners
40
+ needs: check_runner_status
41
+ strategy:
42
+ matrix:
43
+ machine_type: [single-gpu, multi-gpu]
44
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
45
+ container:
46
+ image: huggingface/transformers-all-latest-torch-nightly-gpu
47
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
48
+ steps:
49
+ - name: NVIDIA-SMI
50
+ run: |
51
+ nvidia-smi
52
+
53
+ setup:
54
+ name: Setup
55
+ needs: check_runners
56
+ strategy:
57
+ matrix:
58
+ machine_type: [single-gpu, multi-gpu]
59
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
60
+ container:
61
+ image: huggingface/transformers-all-latest-torch-nightly-gpu
62
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
63
+ outputs:
64
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
65
+ steps:
66
+ - name: Update clone
67
+ working-directory: /transformers
68
+ run: |
69
+ git fetch && git checkout ${{ github.sha }}
70
+
71
+ - name: Cleanup
72
+ working-directory: /transformers
73
+ run: |
74
+ rm -rf tests/__pycache__
75
+ rm -rf tests/models/__pycache__
76
+ rm -rf reports
77
+
78
+ - name: Show installed libraries and their versions
79
+ working-directory: /transformers
80
+ run: pip freeze
81
+
82
+ - id: set-matrix
83
+ name: Identify models to test
84
+ working-directory: /transformers/tests
85
+ run: |
86
+ echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
87
+
88
+ - name: NVIDIA-SMI
89
+ run: |
90
+ nvidia-smi
91
+
92
+ run_tests_single_gpu:
93
+ name: Model tests
94
+ strategy:
95
+ fail-fast: false
96
+ matrix:
97
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
98
+ machine_type: [single-gpu]
99
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
100
+ container:
101
+ image: huggingface/transformers-all-latest-torch-nightly-gpu
102
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
103
+ needs: setup
104
+ steps:
105
+ - name: Echo folder ${{ matrix.folders }}
106
+ shell: bash
107
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
108
+ # set the artifact folder names (because the character `/` is not allowed).
109
+ run: |
110
+ echo "${{ matrix.folders }}"
111
+ matrix_folders=${{ matrix.folders }}
112
+ matrix_folders=${matrix_folders/'models/'/'models_'}
113
+ echo "$matrix_folders"
114
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
115
+
116
+ - name: Update clone
117
+ working-directory: /transformers
118
+ run: git fetch && git checkout ${{ github.sha }}
119
+
120
+ - name: NVIDIA-SMI
121
+ run: |
122
+ nvidia-smi
123
+
124
+ - name: Environment
125
+ working-directory: /transformers
126
+ run: |
127
+ python3 utils/print_env.py
128
+
129
+ - name: Show installed libraries and their versions
130
+ working-directory: /transformers
131
+ run: pip freeze
132
+
133
+ - name: Run all tests on GPU
134
+ working-directory: /transformers
135
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
136
+
137
+ - name: Failure short reports
138
+ if: ${{ failure() }}
139
+ continue-on-error: true
140
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
141
+
142
+ - name: Test suite reports artifacts
143
+ if: ${{ always() }}
144
+ uses: actions/upload-artifact@v3
145
+ with:
146
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
147
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
148
+
149
+ run_tests_multi_gpu:
150
+ name: Model tests
151
+ strategy:
152
+ fail-fast: false
153
+ matrix:
154
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
155
+ machine_type: [multi-gpu]
156
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
157
+ container:
158
+ image: huggingface/transformers-all-latest-torch-nightly-gpu
159
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
160
+ needs: setup
161
+ steps:
162
+ - name: Echo folder ${{ matrix.folders }}
163
+ shell: bash
164
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
165
+ # set the artifact folder names (because the character `/` is not allowed).
166
+ run: |
167
+ echo "${{ matrix.folders }}"
168
+ matrix_folders=${{ matrix.folders }}
169
+ matrix_folders=${matrix_folders/'models/'/'models_'}
170
+ echo "$matrix_folders"
171
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
172
+
173
+ - name: Update clone
174
+ working-directory: /transformers
175
+ run: git fetch && git checkout ${{ github.sha }}
176
+
177
+ - name: NVIDIA-SMI
178
+ run: |
179
+ nvidia-smi
180
+
181
+ - name: Environment
182
+ working-directory: /transformers
183
+ run: |
184
+ python3 utils/print_env.py
185
+
186
+ - name: Show installed libraries and their versions
187
+ working-directory: /transformers
188
+ run: pip freeze
189
+
190
+ - name: Run all tests on GPU
191
+ working-directory: /transformers
192
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
193
+
194
+ - name: Failure short reports
195
+ if: ${{ failure() }}
196
+ continue-on-error: true
197
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
198
+
199
+ - name: Test suite reports artifacts
200
+ if: ${{ always() }}
201
+ uses: actions/upload-artifact@v3
202
+ with:
203
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
204
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
205
+
206
+ run_all_tests_torch_cuda_extensions_gpu:
207
+ name: Torch CUDA extension tests
208
+ strategy:
209
+ fail-fast: false
210
+ matrix:
211
+ machine_type: [single-gpu, multi-gpu]
212
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
213
+ needs: setup
214
+ container:
215
+ image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
216
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
217
+ steps:
218
+ - name: Update clone
219
+ working-directory: /workspace/transformers
220
+ run: git fetch && git checkout ${{ github.sha }}
221
+
222
+ - name: Remove cached torch extensions
223
+ run: rm -rf /github/home/.cache/torch_extensions/
224
+
225
+ # To avoid unknown test failures
226
+ - name: Pre build DeepSpeed *again*
227
+ working-directory: /workspace
228
+ run: |
229
+ python3 -m pip uninstall -y deepspeed
230
+ rm -rf DeepSpeed
231
+ git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
232
+ DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
233
+
234
+ - name: NVIDIA-SMI
235
+ run: |
236
+ nvidia-smi
237
+
238
+ - name: Environment
239
+ working-directory: /workspace/transformers
240
+ run: |
241
+ python utils/print_env.py
242
+
243
+ - name: Show installed libraries and their versions
244
+ working-directory: /workspace/transformers
245
+ run: pip freeze
246
+
247
+ - name: Run all tests on GPU
248
+ working-directory: /workspace/transformers
249
+ run: |
250
+ python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
251
+
252
+ - name: Failure short reports
253
+ if: ${{ failure() }}
254
+ continue-on-error: true
255
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
256
+
257
+ - name: Test suite reports artifacts
258
+ if: ${{ always() }}
259
+ uses: actions/upload-artifact@v3
260
+ with:
261
+ name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
262
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
263
+
264
+ send_results:
265
+ name: Send results to webhook
266
+ runs-on: ubuntu-latest
267
+ if: always()
268
+ needs: [
269
+ check_runner_status,
270
+ check_runners,
271
+ setup,
272
+ run_tests_single_gpu,
273
+ run_tests_multi_gpu,
274
+ run_all_tests_torch_cuda_extensions_gpu
275
+ ]
276
+ steps:
277
+ - name: Preliminary job status
278
+ shell: bash
279
+ # For the meaning of these environment variables, see the job `Setup`
280
+ run: |
281
+ echo "Runner availability: ${{ needs.check_runner_status.result }}"
282
+ echo "Runner status: ${{ needs.check_runners.result }}"
283
+ echo "Setup status: ${{ needs.setup.result }}"
284
+
285
+ - uses: actions/checkout@v3
286
+ - uses: actions/download-artifact@v3
287
+ - name: Send message to Slack
288
+ env:
289
+ CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
290
+ CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
291
+ CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
292
+ CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
293
+ CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
294
+ ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
295
+ CI_EVENT: nightly-build
296
+ RUNNER_STATUS: ${{ needs.check_runner_status.result }}
297
+ RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
298
+ SETUP_STATUS: ${{ needs.setup.result }}
299
+ # We pass `needs.setup.outputs.matrix` as the argument. Processing in `notification_service.py` is required to change
300
+ # `models/bert` to `models_bert`, as the artifact names use `_` instead of `/`.
301
+ run: |
302
+ pip install slack_sdk
303
+ pip show slack_sdk
304
+ python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
hf-dev-train/transformers-main/.github/workflows/self-past-caller.yml ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Self-hosted runner (past-ci-caller)
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - run-past-ci*
7
+
8
+ jobs:
9
+ run_past_ci_pytorch_1-11:
10
+ name: PyTorch 1.11
11
+ if: always()
12
+ uses: ./.github/workflows/self-past.yml
13
+ with:
14
+ framework: pytorch
15
+ version: "1.11"
16
+ secrets: inherit
17
+
18
+ run_past_ci_pytorch_1-10:
19
+ name: PyTorch 1.10
20
+ if: always()
21
+ needs: [run_past_ci_pytorch_1-11]
22
+ uses: ./.github/workflows/self-past.yml
23
+ with:
24
+ framework: pytorch
25
+ version: "1.10"
26
+ secrets: inherit
27
+
28
+ run_past_ci_pytorch_1-9:
29
+ name: PyTorch 1.9
30
+ if: always()
31
+ needs: [run_past_ci_pytorch_1-10]
32
+ uses: ./.github/workflows/self-past.yml
33
+ with:
34
+ framework: pytorch
35
+ version: "1.9"
36
+ secrets: inherit
37
+
38
+ run_past_ci_pytorch_1-8:
39
+ name: PyTorch 1.8
40
+ if: always()
41
+ needs: [run_past_ci_pytorch_1-9]
42
+ uses: ./.github/workflows/self-past.yml
43
+ with:
44
+ framework: pytorch
45
+ version: "1.8"
46
+ secrets: inherit
47
+
48
+ run_past_ci_pytorch_1-7:
49
+ name: PyTorch 1.7
50
+ if: always()
51
+ needs: [run_past_ci_pytorch_1-8]
52
+ uses: ./.github/workflows/self-past.yml
53
+ with:
54
+ framework: pytorch
55
+ version: "1.7"
56
+ secrets: inherit
57
+
58
+ run_past_ci_pytorch_1-6:
59
+ name: PyTorch 1.6
60
+ if: always()
61
+ needs: [run_past_ci_pytorch_1-7]
62
+ uses: ./.github/workflows/self-past.yml
63
+ with:
64
+ framework: pytorch
65
+ version: "1.6"
66
+ secrets: inherit
67
+
68
+ run_past_ci_pytorch_1-5:
69
+ name: PyTorch 1.5
70
+ if: always()
71
+ needs: [run_past_ci_pytorch_1-6]
72
+ uses: ./.github/workflows/self-past.yml
73
+ with:
74
+ framework: pytorch
75
+ version: "1.5"
76
+ secrets: inherit
77
+
78
+ run_past_ci_pytorch_1-4:
79
+ name: PyTorch 1.4
80
+ if: always()
81
+ needs: [run_past_ci_pytorch_1-5]
82
+ uses: ./.github/workflows/self-past.yml
83
+ with:
84
+ framework: pytorch
85
+ version: "1.4"
86
+ secrets: inherit
87
+
88
+ run_past_ci_tensorflow_2-8:
89
+ name: TensorFlow 2.8
90
+ if: always()
91
+ needs: [run_past_ci_pytorch_1-4]
92
+ uses: ./.github/workflows/self-past.yml
93
+ with:
94
+ framework: tensorflow
95
+ version: "2.8"
96
+ secrets: inherit
97
+
98
+ run_past_ci_tensorflow_2-7:
99
+ name: TensorFlow 2.7
100
+ if: always()
101
+ needs: [run_past_ci_tensorflow_2-8]
102
+ uses: ./.github/workflows/self-past.yml
103
+ with:
104
+ framework: tensorflow
105
+ version: "2.7"
106
+ secrets: inherit
107
+
108
+ run_past_ci_tensorflow_2-6:
109
+ name: TensorFlow 2.6
110
+ if: always()
111
+ needs: [run_past_ci_tensorflow_2-7]
112
+ uses: ./.github/workflows/self-past.yml
113
+ with:
114
+ framework: tensorflow
115
+ version: "2.6"
116
+ secrets: inherit
117
+
118
+ run_past_ci_tensorflow_2-5:
119
+ name: TensorFlow 2.5
120
+ if: always()
121
+ needs: [run_past_ci_tensorflow_2-6]
122
+ uses: ./.github/workflows/self-past.yml
123
+ with:
124
+ framework: tensorflow
125
+ version: "2.5"
126
+ secrets: inherit
127
+
128
+ run_past_ci_tensorflow_2-4:
129
+ name: TensorFlow 2.4
130
+ if: always()
131
+ needs: [run_past_ci_tensorflow_2-5]
132
+ uses: ./.github/workflows/self-past.yml
133
+ with:
134
+ framework: tensorflow
135
+ version: "2.4"
136
+ secrets: inherit
hf-dev-train/transformers-main/.github/workflows/self-past.yml ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Self-hosted runner (past)
2
+
3
+ # Note that each job's dependencies go into a corresponding docker file.
4
+ #
5
+ # For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
6
+ # `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
7
+ # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
8
+
9
+ on:
10
+ workflow_call:
11
+ inputs:
12
+ framework:
13
+ required: true
14
+ type: string
15
+ version:
16
+ required: true
17
+ type: string
18
+ # Use this to control the commit to test against
19
+ sha:
20
+ default: 'main'
21
+ required: false
22
+ type: string
23
+
24
+ env:
25
+ HF_HOME: /mnt/cache
26
+ TRANSFORMERS_IS_CI: yes
27
+ OMP_NUM_THREADS: 8
28
+ MKL_NUM_THREADS: 8
29
+ RUN_SLOW: yes
30
+ SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
31
+ TF_FORCE_GPU_ALLOW_GROWTH: true
32
+ RUN_PT_TF_CROSS_TESTS: 1
33
+
34
+ jobs:
35
+ check_runner_status:
36
+ name: Check Runner Status
37
+ runs-on: ubuntu-latest
38
+ steps:
39
+ - name: Checkout transformers
40
+ uses: actions/checkout@v3
41
+ with:
42
+ fetch-depth: 2
43
+
44
+ - name: Check Runner Status
45
+ run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
46
+
47
+ check_runners:
48
+ name: Check Runners
49
+ needs: check_runner_status
50
+ strategy:
51
+ matrix:
52
+ machine_type: [single-gpu, multi-gpu]
53
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
54
+ container:
55
+ image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
56
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
57
+ steps:
58
+ - name: NVIDIA-SMI
59
+ run: |
60
+ nvidia-smi
61
+
62
+ setup:
63
+ name: Setup
64
+ needs: check_runners
65
+ strategy:
66
+ matrix:
67
+ machine_type: [single-gpu, multi-gpu]
68
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
69
+ container:
70
+ image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
71
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
72
+ outputs:
73
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
74
+ steps:
75
+ - name: Update clone
76
+ working-directory: /transformers
77
+ run: git fetch && git checkout ${{ inputs.sha }}
78
+
79
+ - name: Cleanup
80
+ working-directory: /transformers
81
+ run: |
82
+ rm -rf tests/__pycache__
83
+ rm -rf tests/models/__pycache__
84
+ rm -rf reports
85
+
86
+ - name: Show installed libraries and their versions
87
+ working-directory: /transformers
88
+ run: pip freeze
89
+
90
+ - id: set-matrix
91
+ working-directory: /transformers
92
+ name: Identify models to test
93
+ run: |
94
+ cd tests
95
+ echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
96
+
97
+ run_tests_single_gpu:
98
+ name: Model tests
99
+ strategy:
100
+ fail-fast: false
101
+ matrix:
102
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
103
+ machine_type: [single-gpu]
104
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
105
+ container:
106
+ image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
107
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
108
+ needs: setup
109
+ steps:
110
+ - name: Update clone
111
+ working-directory: /transformers
112
+ run: git fetch && git checkout ${{ inputs.sha }}
113
+
114
+ - name: Echo folder ${{ matrix.folders }}
115
+ shell: bash
116
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
117
+ # set the artifact folder names (because the character `/` is not allowed).
118
+ run: |
119
+ echo "${{ matrix.folders }}"
120
+ matrix_folders=${{ matrix.folders }}
121
+ matrix_folders=${matrix_folders/'models/'/'models_'}
122
+ echo "$matrix_folders"
123
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
124
+
125
+ - name: NVIDIA-SMI
126
+ run: |
127
+ nvidia-smi
128
+
129
+ - name: Environment
130
+ working-directory: /transformers
131
+ run: |
132
+ python3 utils/print_env.py
133
+
134
+ - name: Show installed libraries and their versions
135
+ working-directory: /transformers
136
+ run: pip freeze
137
+
138
+ - name: Run all tests on GPU
139
+ working-directory: /transformers
140
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
141
+
142
+ - name: Failure short reports
143
+ if: ${{ failure() }}
144
+ continue-on-error: true
145
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
146
+
147
+ - name: Save job name
148
+ if: ${{ always() }}
149
+ shell: bash
150
+ run: |
151
+ matrix_folders=${matrix_folders/'models_'/'models/'}
152
+ job_name="Model tests ($matrix_folders, ${{ matrix.machine_type }})"
153
+ echo "$job_name"
154
+ echo "$job_name" > /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/job_name.txt
155
+
156
+ - name: Test suite reports artifacts
157
+ if: ${{ always() }}
158
+ uses: actions/upload-artifact@v3
159
+ with:
160
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
161
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
162
+
163
+ run_tests_multi_gpu:
164
+ name: Model tests
165
+ strategy:
166
+ fail-fast: false
167
+ matrix:
168
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
169
+ machine_type: [multi-gpu]
170
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
171
+ container:
172
+ image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
173
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
174
+ needs: setup
175
+ steps:
176
+ - name: Update clone
177
+ working-directory: /transformers
178
+ run: git fetch && git checkout ${{ inputs.sha }}
179
+
180
+ - name: Echo folder ${{ matrix.folders }}
181
+ shell: bash
182
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
183
+ # set the artifact folder names (because the character `/` is not allowed).
184
+ run: |
185
+ echo "${{ matrix.folders }}"
186
+ matrix_folders=${{ matrix.folders }}
187
+ matrix_folders=${matrix_folders/'models/'/'models_'}
188
+ echo "$matrix_folders"
189
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
190
+
191
+ - name: NVIDIA-SMI
192
+ run: |
193
+ nvidia-smi
194
+
195
+ - name: Environment
196
+ working-directory: /transformers
197
+ run: |
198
+ python3 utils/print_env.py
199
+
200
+ - name: Show installed libraries and their versions
201
+ working-directory: /transformers
202
+ run: pip freeze
203
+
204
+ - name: Run all tests on GPU
205
+ working-directory: /transformers
206
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
207
+
208
+ - name: Failure short reports
209
+ if: ${{ failure() }}
210
+ continue-on-error: true
211
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
212
+
213
+ - name: Save job name
214
+ if: ${{ always() }}
215
+ shell: bash
216
+ run: |
217
+ matrix_folders=${matrix_folders/'models_'/'models/'}
218
+ job_name="Model tests ($matrix_folders, ${{ matrix.machine_type }})"
219
+ echo "$job_name"
220
+ echo "$job_name" > /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/job_name.txt
221
+
222
+ - name: Test suite reports artifacts
223
+ if: ${{ always() }}
224
+ uses: actions/upload-artifact@v3
225
+ with:
226
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
227
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
228
+
229
+ send_results:
230
+ name: Send results to webhook
231
+ runs-on: ubuntu-latest
232
+ if: always()
233
+ needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
234
+ steps:
235
+ - name: Preliminary job status
236
+ shell: bash
237
+ # Print the results of the prerequisite jobs (`check_runner_status`, `check_runners`, `setup`)
238
+ run: |
239
+ echo "Runner availability: ${{ needs.check_runner_status.result }}"
240
+ echo "Runner status: ${{ needs.check_runners.result }}"
241
+ echo "Setup status: ${{ needs.setup.result }}"
242
+
243
+ - uses: actions/checkout@v3
244
+ - uses: actions/download-artifact@v3
245
+
246
+ # Create a directory to store test failure tables in the next step
247
+ - name: Create directory
248
+ run: mkdir test_failure_tables
249
+
250
+ - name: Send message to Slack
251
+ env:
252
+ CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
253
+ CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
254
+ CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
255
+ CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
256
+ CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
257
+ ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
258
+ CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
259
+ RUNNER_STATUS: ${{ needs.check_runner_status.result }}
260
+ RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
261
+ SETUP_STATUS: ${{ needs.setup.result }}
262
+ # We pass `needs.setup.outputs.matrix` as the argument. `notification_service.py` must convert
263
+ # `models/bert` to `models_bert`, because the artifact names use `_` instead of `/`.
264
+ run: |
265
+ pip install slack_sdk
266
+ pip show slack_sdk
267
+ python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
268
+
269
+ # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
270
+ - name: Failure table artifacts
271
+ if: ${{ always() }}
272
+ uses: actions/upload-artifact@v3
273
+ with:
274
+ name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }}
275
+ path: test_failure_tables
hf-dev-train/transformers-main/.github/workflows/self-push-caller.yml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Used to trigger self-push CI
2
+ name: Self-hosted runner (push-caller)
3
+
4
+ on:
5
+ push:
6
+ branches:
7
+ - main
8
+ paths:
9
+ - "src/**"
10
+ - "tests/**"
11
+ - ".github/**"
12
+ - "templates/**"
13
+ - "utils/**"
14
+
15
+ jobs:
16
+ check-for-setup:
17
+ runs-on: ubuntu-latest
18
+ name: Check if setup was changed
19
+ outputs:
20
+ changed: ${{ steps.was_changed.outputs.changed }}
21
+ steps:
22
+ - uses: actions/checkout@v3
23
+ with:
24
+ fetch-depth: "2"
25
+
26
+ - name: Get changed files
27
+ id: changed-files
28
+ uses: tj-actions/[email protected]
29
+
30
+ - name: Was setup changed
31
+ id: was_changed
32
+ run: |
33
+ for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
34
+ if [ `basename "${file}"` = "setup.py" ]; then
35
+ echo "changed=1" >> $GITHUB_OUTPUT
36
+ fi
37
+ done
38
+
39
+ build-docker-containers:
40
+ needs: check-for-setup
41
+ if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
42
+ uses: ./.github/workflows/build-docker-images.yml
43
+ with:
44
+ image_postfix: "-push-ci"
45
+ secrets: inherit
46
+
47
+ run_push_ci:
48
+ name: Trigger Push CI
49
+ runs-on: ubuntu-latest
50
+ if: ${{ always() }}
51
+ needs: build-docker-containers
52
+ steps:
53
+ - name: Trigger push CI via workflow_run
54
+ run: echo "Trigger push CI via workflow_run"
hf-dev-train/transformers-main/.github/workflows/self-push.yml ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Self-hosted runner (push)
2
+
3
+ on:
4
+ workflow_run:
5
+ workflows: ["Self-hosted runner (push-caller)"]
6
+ branches: ["main"]
7
+ types: [completed]
8
+ push:
9
+ branches:
10
+ - ci_*
11
+ - ci-*
12
+ paths:
13
+ - "src/**"
14
+ - "tests/**"
15
+ - ".github/**"
16
+ - "templates/**"
17
+ - "utils/**"
18
+ repository_dispatch:
19
+
20
+ env:
21
+ HF_HOME: /mnt/cache
22
+ TRANSFORMERS_IS_CI: yes
23
+ OMP_NUM_THREADS: 8
24
+ MKL_NUM_THREADS: 8
25
+ PYTEST_TIMEOUT: 60
26
+ TF_FORCE_GPU_ALLOW_GROWTH: true
27
+ RUN_PT_TF_CROSS_TESTS: 1
28
+
29
+ jobs:
30
+ check_runner_status:
31
+ name: Check Runner Status
32
+ runs-on: ubuntu-latest
33
+ steps:
34
+ - name: Checkout transformers
35
+ uses: actions/checkout@v3
36
+ with:
37
+ fetch-depth: 2
38
+
39
+ - name: Check Runner Status
40
+ run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
41
+
42
+ check_runners:
43
+ name: Check Runners
44
+ needs: check_runner_status
45
+ strategy:
46
+ matrix:
47
+ machine_type: [single-gpu, multi-gpu]
48
+ runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
49
+ container:
50
+ image: huggingface/transformers-all-latest-gpu-push-ci
51
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
52
+ steps:
53
+ - name: NVIDIA-SMI
54
+ run: |
55
+ nvidia-smi
56
+
57
+ setup:
58
+ name: Setup
59
+ needs: check_runners
60
+ strategy:
61
+ matrix:
62
+ machine_type: [single-gpu, multi-gpu]
63
+ runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
64
+ container:
65
+ image: huggingface/transformers-all-latest-gpu-push-ci
66
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
67
+ outputs:
68
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
69
+ test_map: ${{ steps.set-matrix.outputs.test_map }}
70
+ steps:
71
+ # Necessary to get the correct branch name and commit SHA for `workflow_run` event
72
+ # We also take into account the `push` event (we might want to test some changes in a branch)
73
+ - name: Prepare custom environment variables
74
+ shell: bash
75
+ # `CI_BRANCH_PUSH`: The branch name from the push event
76
+ # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
77
+ # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
78
+ # `CI_SHA_PUSH`: The commit SHA from the push event
79
+ # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
80
+ # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
81
+ run: |
82
+ CI_BRANCH_PUSH=${{ github.event.ref }}
83
+ CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
84
+ CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
85
+ CI_SHA_PUSH=${{ github.event.head_commit.id }}
86
+ CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
87
+ echo $CI_BRANCH_PUSH
88
+ echo $CI_BRANCH_WORKFLOW_RUN
89
+ echo $CI_SHA_PUSH
90
+ echo $CI_SHA_WORKFLOW_RUN
91
+ [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
92
+ [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
93
+
94
+ - name: print environment variables
95
+ run: |
96
+ echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
97
+ echo "env.CI_SHA = ${{ env.CI_SHA }}"
98
+
99
+ - name: Update clone using environment variables
100
+ working-directory: /transformers
101
+ run: |
102
+ echo "original branch = $(git branch --show-current)"
103
+ git fetch && git checkout ${{ env.CI_BRANCH }}
104
+ echo "updated branch = $(git branch --show-current)"
105
+ git checkout ${{ env.CI_SHA }}
106
+ echo "log = $(git log -n 1)"
107
+
108
+ - name: Cleanup
109
+ working-directory: /transformers
110
+ run: |
111
+ rm -rf tests/__pycache__
112
+ rm -rf tests/models/__pycache__
113
+ rm -rf reports
114
+
115
+ - name: Show installed libraries and their versions
116
+ working-directory: /transformers
117
+ run: pip freeze
118
+
119
+ - name: Fetch the tests to run
120
+ working-directory: /transformers
121
+ # TODO: add `git-python` in the docker images
122
+ run: |
123
+ pip install --upgrade git-python
124
+ python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
125
+
126
+ - name: Report fetched tests
127
+ uses: actions/upload-artifact@v3
128
+ with:
129
+ name: test_fetched
130
+ path: /transformers/test_preparation.txt
131
+
132
+ - id: set-matrix
133
+ name: Organize tests into models
134
+ working-directory: /transformers
135
+ # `keys` is used as the GitHub Actions matrix for jobs, e.g. `models/bert`, `tokenization`, `pipeline`, etc.
136
+ # `test_map` is used to get the actual identified test files under each key.
137
+ # If there is no test to run (so no `test_map.json` file), create a dummy map (an empty matrix would fail).
138
+ run: |
139
+ if [ -f test_map.json ]; then
140
+ keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
141
+ test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
142
+ else
143
+ keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
144
+ test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
145
+ fi
146
+ echo $keys
147
+ echo $test_map
148
+ echo "matrix=$keys" >> $GITHUB_OUTPUT
149
+ echo "test_map=$test_map" >> $GITHUB_OUTPUT
150
+
151
+ run_tests_single_gpu:
152
+ name: Model tests
153
+ needs: setup
154
+ # `dummy` means there is no test to run
155
+ if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
156
+ strategy:
157
+ fail-fast: false
158
+ matrix:
159
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
160
+ machine_type: [single-gpu]
161
+ runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
162
+ container:
163
+ image: huggingface/transformers-all-latest-gpu-push-ci
164
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
165
+ steps:
166
+ # Necessary to get the correct branch name and commit SHA for `workflow_run` event
167
+ # We also take into account the `push` event (we might want to test some changes in a branch)
168
+ - name: Prepare custom environment variables
169
+ shell: bash
170
+ # For the meaning of these environment variables, see the job `Setup`
171
+ run: |
172
+ CI_BRANCH_PUSH=${{ github.event.ref }}
173
+ CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
174
+ CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
175
+ CI_SHA_PUSH=${{ github.event.head_commit.id }}
176
+ CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
177
+ echo $CI_BRANCH_PUSH
178
+ echo $CI_BRANCH_WORKFLOW_RUN
179
+ echo $CI_SHA_PUSH
180
+ echo $CI_SHA_WORKFLOW_RUN
181
+ [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
182
+ [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
183
+
184
+ - name: print environment variables
185
+ run: |
186
+ echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
187
+ echo "env.CI_SHA = ${{ env.CI_SHA }}"
188
+
189
+ - name: Update clone using environment variables
190
+ working-directory: /transformers
191
+ run: |
192
+ echo "original branch = $(git branch --show-current)"
193
+ git fetch && git checkout ${{ env.CI_BRANCH }}
194
+ echo "updated branch = $(git branch --show-current)"
195
+ git checkout ${{ env.CI_SHA }}
196
+ echo "log = $(git log -n 1)"
197
+
198
+ - name: Echo folder ${{ matrix.folders }}
199
+ shell: bash
200
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
201
+ # set the artifact folder names (because the character `/` is not allowed).
202
+ run: |
203
+ echo "${{ matrix.folders }}"
204
+ echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
205
+ matrix_folders=${{ matrix.folders }}
206
+ matrix_folders=${matrix_folders/'models/'/'models_'}
207
+ echo "$matrix_folders"
208
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
209
+
210
+ - name: NVIDIA-SMI
211
+ run: |
212
+ nvidia-smi
213
+
214
+ - name: Environment
215
+ working-directory: /transformers
216
+ run: |
217
+ python3 utils/print_env.py
218
+
219
+ - name: Show installed libraries and their versions
220
+ working-directory: /transformers
221
+ run: pip freeze
222
+
223
+ - name: Run all non-slow selected tests on GPU
224
+ working-directory: /transformers
225
+ run: |
226
+ python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
227
+
228
+ - name: Failure short reports
229
+ if: ${{ failure() }}
230
+ continue-on-error: true
231
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
232
+
233
+ - name: Test suite reports artifacts
234
+ if: ${{ always() }}
235
+ uses: actions/upload-artifact@v3
236
+ with:
237
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
238
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
239
+
240
+ run_tests_multi_gpu:
241
+ name: Model tests
242
+ needs: setup
243
+ # `dummy` means there is no test to run
244
+ if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
245
+ strategy:
246
+ fail-fast: false
247
+ matrix:
248
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
249
+ machine_type: [multi-gpu]
250
+ runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
251
+ container:
252
+ image: huggingface/transformers-all-latest-gpu-push-ci
253
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
254
+ steps:
255
+ # Necessary to get the correct branch name and commit SHA for `workflow_run` event
256
+ # We also take into account the `push` event (we might want to test some changes in a branch)
257
+ - name: Prepare custom environment variables
258
+ shell: bash
259
+ # For the meaning of these environment variables, see the job `Setup`
260
+ run: |
261
+ CI_BRANCH_PUSH=${{ github.event.ref }}
262
+ CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
263
+ CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
264
+ CI_SHA_PUSH=${{ github.event.head_commit.id }}
265
+ CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
266
+ echo $CI_BRANCH_PUSH
267
+ echo $CI_BRANCH_WORKFLOW_RUN
268
+ echo $CI_SHA_PUSH
269
+ echo $CI_SHA_WORKFLOW_RUN
270
+ [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
271
+ [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
272
+
273
+ - name: print environment variables
274
+ run: |
275
+ echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
276
+ echo "env.CI_SHA = ${{ env.CI_SHA }}"
277
+
278
+ - name: Update clone using environment variables
279
+ working-directory: /transformers
280
+ run: |
281
+ echo "original branch = $(git branch --show-current)"
282
+ git fetch && git checkout ${{ env.CI_BRANCH }}
283
+ echo "updated branch = $(git branch --show-current)"
284
+ git checkout ${{ env.CI_SHA }}
285
+ echo "log = $(git log -n 1)"
286
+
287
+ - name: Echo folder ${{ matrix.folders }}
288
+ shell: bash
289
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
290
+ # set the artifact folder names (because the character `/` is not allowed).
291
+ run: |
292
+ echo "${{ matrix.folders }}"
293
+ echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
294
+ matrix_folders=${{ matrix.folders }}
295
+ matrix_folders=${matrix_folders/'models/'/'models_'}
296
+ echo "$matrix_folders"
297
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
298
+
299
+ - name: NVIDIA-SMI
300
+ run: |
301
+ nvidia-smi
302
+
303
+ - name: Environment
304
+ working-directory: /transformers
305
+ run: |
306
+ python3 utils/print_env.py
307
+
308
+ - name: Show installed libraries and their versions
309
+ working-directory: /transformers
310
+ run: pip freeze
311
+
312
+ - name: Run all non-slow selected tests on GPU
313
+ env:
314
+ MKL_SERVICE_FORCE_INTEL: 1
315
+ working-directory: /transformers
316
+ run: |
317
+ python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
318
+
319
+ - name: Failure short reports
320
+ if: ${{ failure() }}
321
+ continue-on-error: true
322
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
323
+
324
+ - name: Test suite reports artifacts
325
+ if: ${{ always() }}
326
+ uses: actions/upload-artifact@v3
327
+ with:
328
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
329
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
330
+
331
+ run_tests_torch_cuda_extensions_single_gpu:
332
+ name: Torch CUDA extension tests
333
+ needs: setup
334
+ if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
335
+ strategy:
336
+ fail-fast: false
337
+ matrix:
338
+ machine_type: [single-gpu]
339
+ runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
340
+ container:
341
+ image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
342
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
343
+ steps:
344
+ # Necessary to get the correct branch name and commit SHA for `workflow_run` event
345
+ # We also take into account the `push` event (we might want to test some changes in a branch)
346
+ - name: Prepare custom environment variables
347
+ shell: bash
348
+ # For the meaning of these environment variables, see the job `Setup`
349
+ run: |
350
+ CI_BRANCH_PUSH=${{ github.event.ref }}
351
+ CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
352
+ CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
353
+ CI_SHA_PUSH=${{ github.event.head_commit.id }}
354
+ CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
355
+ echo $CI_BRANCH_PUSH
356
+ echo $CI_BRANCH_WORKFLOW_RUN
357
+ echo $CI_SHA_PUSH
358
+ echo $CI_SHA_WORKFLOW_RUN
359
+ [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
360
+ [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
361
+
362
+ - name: print environment variables
363
+ run: |
364
+ echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
365
+ echo "env.CI_SHA = ${{ env.CI_SHA }}"
366
+
367
+ - name: Update clone using environment variables
368
+ working-directory: /workspace/transformers
369
+ run: |
370
+ echo "original branch = $(git branch --show-current)"
371
+ git fetch && git checkout ${{ env.CI_BRANCH }}
372
+ echo "updated branch = $(git branch --show-current)"
373
+ git checkout ${{ env.CI_SHA }}
374
+ echo "log = $(git log -n 1)"
375
+
376
+ - name: Remove cached torch extensions
377
+ run: rm -rf /github/home/.cache/torch_extensions/
378
+
379
+ # To avoid unknown test failures
380
+ - name: Pre build DeepSpeed *again*
381
+ working-directory: /workspace
382
+ run: |
383
+ python3 -m pip uninstall -y deepspeed
384
+ DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
385
+
386
+ - name: NVIDIA-SMI
387
+ run: |
388
+ nvidia-smi
389
+
390
+ - name: Environment
391
+ working-directory: /workspace/transformers
392
+ run: |
393
+ python utils/print_env.py
394
+
395
+ - name: Show installed libraries and their versions
396
+ working-directory: /workspace/transformers
397
+ run: pip freeze
398
+
399
+ - name: Run all non-slow selected tests on GPU
400
+ working-directory: /workspace/transformers
401
+ # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
402
+ run: |
403
+ python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
404
+
405
+ - name: Failure short reports
406
+ if: ${{ failure() }}
407
+ continue-on-error: true
408
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
409
+
410
+ - name: Test suite reports artifacts
411
+ if: ${{ always() }}
412
+ uses: actions/upload-artifact@v3
413
+ with:
414
+ name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
415
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
416
+
417
+ run_tests_torch_cuda_extensions_multi_gpu:
418
+ name: Torch CUDA extension tests
419
+ needs: setup
420
+ if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
421
+ strategy:
422
+ fail-fast: false
423
+ matrix:
424
+ machine_type: [multi-gpu]
425
+ runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
426
+ container:
427
+ image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
428
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
429
+ steps:
430
+ # Necessary to get the correct branch name and commit SHA for `workflow_run` event
431
+ # We also take into account the `push` event (we might want to test some changes in a branch)
432
+ - name: Prepare custom environment variables
433
+ shell: bash
434
+ # For the meaning of these environment variables, see the job `Setup`
435
+ run: |
436
+ CI_BRANCH_PUSH=${{ github.event.ref }}
437
+ CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
438
+ CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
439
+ CI_SHA_PUSH=${{ github.event.head_commit.id }}
440
+ CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
441
+ echo $CI_BRANCH_PUSH
442
+ echo $CI_BRANCH_WORKFLOW_RUN
443
+ echo $CI_SHA_PUSH
444
+ echo $CI_SHA_WORKFLOW_RUN
445
+ [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
446
+ [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
447
+
448
+ - name: print environment variables
449
+ run: |
450
+ echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
451
+ echo "env.CI_SHA = ${{ env.CI_SHA }}"
452
+
453
+ - name: Update clone using environment variables
454
+ working-directory: /workspace/transformers
455
+ run: |
456
+ echo "original branch = $(git branch --show-current)"
457
+ git fetch && git checkout ${{ env.CI_BRANCH }}
458
+ echo "updated branch = $(git branch --show-current)"
459
+ git checkout ${{ env.CI_SHA }}
460
+ echo "log = $(git log -n 1)"
461
+
462
+ - name: Remove cached torch extensions
463
+ run: rm -rf /github/home/.cache/torch_extensions/
464
+
465
+ # To avoid unknown test failures
466
+ - name: Pre build DeepSpeed *again*
467
+ working-directory: /workspace
468
+ run: |
469
+ python3 -m pip uninstall -y deepspeed
470
+ DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
471
+
472
+ - name: NVIDIA-SMI
473
+ run: |
474
+ nvidia-smi
475
+
476
+ - name: Environment
477
+ working-directory: /workspace/transformers
478
+ run: |
479
+ python utils/print_env.py
480
+
481
+ - name: Show installed libraries and their versions
482
+ working-directory: /workspace/transformers
483
+ run: pip freeze
484
+
485
+ - name: Run all non-slow selected tests on GPU
486
+ working-directory: /workspace/transformers
487
+ # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
488
+ run: |
489
+ python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
490
+
491
+ - name: Failure short reports
492
+ if: ${{ failure() }}
493
+ continue-on-error: true
494
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
495
+
496
+ - name: Test suite reports artifacts
497
+ if: ${{ always() }}
498
+ uses: actions/upload-artifact@v3
499
+ with:
500
+ name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
501
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
502
+
503
+ send_results:
504
+ name: Send results to webhook
505
+ runs-on: ubuntu-latest
506
+ if: always()
507
+ needs: [
508
+ check_runner_status,
509
+ check_runners,
510
+ setup,
511
+ run_tests_single_gpu,
512
+ run_tests_multi_gpu,
513
+ run_tests_torch_cuda_extensions_single_gpu,
514
+ run_tests_torch_cuda_extensions_multi_gpu
515
+ ]
516
+ steps:
517
+ - name: Preliminary job status
518
+ shell: bash
519
+ # Print the results of the prerequisite jobs (`check_runner_status`, `setup`, `check_runners`)
520
+ run: |
521
+ echo "Runner availability: ${{ needs.check_runner_status.result }}"
522
+ echo "Setup status: ${{ needs.setup.result }}"
523
+ echo "Runner status: ${{ needs.check_runners.result }}"
524
+
525
+ # Necessary to get the correct branch name and commit SHA for `workflow_run` event
526
+ # We also take into account the `push` event (we might want to test some changes in a branch)
527
+ - name: Prepare custom environment variables
528
+ shell: bash
529
+ # For the meaning of these environment variables, see the job `Setup`
530
+ run: |
531
+ CI_BRANCH_PUSH=${{ github.event.ref }}
532
+ CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
533
+ CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
534
+ CI_SHA_PUSH=${{ github.event.head_commit.id }}
535
+ CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
536
+ echo $CI_BRANCH_PUSH
537
+ echo $CI_BRANCH_WORKFLOW_RUN
538
+ echo $CI_SHA_PUSH
539
+ echo $CI_SHA_WORKFLOW_RUN
540
+ [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
541
+ [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
542
+
543
+ - name: print environment variables
544
+ run: |
545
+ echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
546
+ echo "env.CI_SHA = ${{ env.CI_SHA }}"
547
+
548
+ - uses: actions/checkout@v3
549
+ # Fetch more history to avoid failures when multiple commits are merged into `main` in a short period of time:
550
+ # checking out an old commit beyond the fetch depth fails with `fatal: reference is not a tree: ...`.
551
+ # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
552
+ with:
553
+ fetch-depth: 20
554
+
555
+ - name: Update clone using environment variables
556
+ run: |
557
+ echo "original branch = $(git branch --show-current)"
558
+ git fetch && git checkout ${{ env.CI_BRANCH }}
559
+ echo "updated branch = $(git branch --show-current)"
560
+ git checkout ${{ env.CI_SHA }}
561
+ echo "log = $(git log -n 1)"
562
+
563
+ - uses: actions/download-artifact@v3
564
+ - name: Send message to Slack
565
+ env:
566
+ CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
567
+ CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
568
+ CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
569
+ CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
570
+ CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
571
+ ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
572
+ CI_EVENT: push
573
+ CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
574
+ CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
575
+ CI_SHA: ${{ env.CI_SHA }}
576
+ RUNNER_STATUS: ${{ needs.check_runner_status.result }}
577
+ RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
578
+ SETUP_STATUS: ${{ needs.setup.result }}
579
+
580
+ # We pass `needs.setup.outputs.matrix` as the argument. `notification_service.py` must convert
581
+ # `models/bert` to `models_bert`, because the artifact names use `_` instead of `/`.
582
+ run: |
583
+ pip install slack_sdk
584
+ pip show slack_sdk
585
+ python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
hf-dev-train/transformers-main/.github/workflows/self-scheduled.yml ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Self-hosted runner (scheduled)
2
+
3
+ # Note that each job's dependencies go into a corresponding docker file.
4
+ #
5
+ # For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
6
+ # `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
7
+ # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
8
+
9
+ on:
10
+ repository_dispatch:
11
+ schedule:
12
+ - cron: "0 2 * * *"
13
+
14
+ env:
15
+ HF_HOME: /mnt/cache
16
+ TRANSFORMERS_IS_CI: yes
17
+ OMP_NUM_THREADS: 8
18
+ MKL_NUM_THREADS: 8
19
+ RUN_SLOW: yes
20
+ SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
21
+ TF_FORCE_GPU_ALLOW_GROWTH: true
22
+ RUN_PT_TF_CROSS_TESTS: 1
23
+
24
+ jobs:
25
+ check_runner_status:
26
+ name: Check Runner Status
27
+ runs-on: ubuntu-latest
28
+ steps:
29
+ - name: Checkout transformers
30
+ uses: actions/checkout@v3
31
+ with:
32
+ fetch-depth: 2
33
+
34
+ - name: Check Runner Status
35
+ run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
36
+
37
+ check_runners:
38
+ name: Check Runners
39
+ needs: check_runner_status
40
+ strategy:
41
+ matrix:
42
+ machine_type: [single-gpu, multi-gpu]
43
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
44
+ container:
45
+ image: huggingface/transformers-all-latest-gpu
46
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
47
+ steps:
48
+ - name: NVIDIA-SMI
49
+ run: |
50
+ nvidia-smi
51
+
52
+ setup:
53
+ name: Setup
54
+ needs: check_runners
55
+ strategy:
56
+ matrix:
57
+ machine_type: [single-gpu, multi-gpu]
58
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
59
+ container:
60
+ image: huggingface/transformers-all-latest-gpu
61
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
62
+ outputs:
63
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
64
+ steps:
65
+ - name: Update clone
66
+ working-directory: /transformers
67
+ run: |
68
+ git fetch && git checkout ${{ github.sha }}
69
+
70
+ - name: Cleanup
71
+ working-directory: /transformers
72
+ run: |
73
+ rm -rf tests/__pycache__
74
+ rm -rf tests/models/__pycache__
75
+ rm -rf reports
76
+
77
+ - name: Show installed libraries and their versions
78
+ working-directory: /transformers
79
+ run: pip freeze
80
+
81
+ - id: set-matrix
82
+ name: Identify models to test
83
+ working-directory: /transformers/tests
84
+ run: |
85
+ echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
86
+
87
+ - name: NVIDIA-SMI
88
+ run: |
89
+ nvidia-smi
90
+
91
+ run_tests_single_gpu:
92
+ name: Model tests
93
+ strategy:
94
+ fail-fast: false
95
+ matrix:
96
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
97
+ machine_type: [single-gpu]
98
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
99
+ container:
100
+ image: huggingface/transformers-all-latest-gpu
101
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
102
+ needs: setup
103
+ steps:
104
+ - name: Echo folder ${{ matrix.folders }}
105
+ shell: bash
106
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
107
+ # set the artifact folder names (because the character `/` is not allowed).
108
+ run: |
109
+ echo "${{ matrix.folders }}"
110
+ matrix_folders=${{ matrix.folders }}
111
+ matrix_folders=${matrix_folders/'models/'/'models_'}
112
+ echo "$matrix_folders"
113
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
114
+
115
+ - name: Update clone
116
+ working-directory: /transformers
117
+ run: git fetch && git checkout ${{ github.sha }}
118
+
119
+ - name: NVIDIA-SMI
120
+ run: |
121
+ nvidia-smi
122
+
123
+ - name: Environment
124
+ working-directory: /transformers
125
+ run: |
126
+ python3 utils/print_env.py
127
+
128
+ - name: Show installed libraries and their versions
129
+ working-directory: /transformers
130
+ run: pip freeze
131
+
132
+ - name: Run all tests on GPU
133
+ working-directory: /transformers
134
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
135
+
136
+ - name: Failure short reports
137
+ if: ${{ failure() }}
138
+ continue-on-error: true
139
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
140
+
141
+ - name: Test suite reports artifacts
142
+ if: ${{ always() }}
143
+ uses: actions/upload-artifact@v3
144
+ with:
145
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
146
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
147
+
148
+ run_tests_multi_gpu:
149
+ name: Model tests
150
+ strategy:
151
+ fail-fast: false
152
+ matrix:
153
+ folders: ${{ fromJson(needs.setup.outputs.matrix) }}
154
+ machine_type: [multi-gpu]
155
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
156
+ container:
157
+ image: huggingface/transformers-all-latest-gpu
158
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
159
+ needs: setup
160
+ steps:
161
+ - name: Echo folder ${{ matrix.folders }}
162
+ shell: bash
163
+ # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
164
+ # set the artifact folder names (because the character `/` is not allowed).
165
+ run: |
166
+ echo "${{ matrix.folders }}"
167
+ matrix_folders=${{ matrix.folders }}
168
+ matrix_folders=${matrix_folders/'models/'/'models_'}
169
+ echo "$matrix_folders"
170
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
171
+
172
+ - name: Update clone
173
+ working-directory: /transformers
174
+ run: git fetch && git checkout ${{ github.sha }}
175
+
176
+ - name: NVIDIA-SMI
177
+ run: |
178
+ nvidia-smi
179
+
180
+ - name: Environment
181
+ working-directory: /transformers
182
+ run: |
183
+ python3 utils/print_env.py
184
+
185
+ - name: Show installed libraries and their versions
186
+ working-directory: /transformers
187
+ run: pip freeze
188
+
189
+ - name: Run all tests on GPU
190
+ working-directory: /transformers
191
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
192
+
193
+ - name: Failure short reports
194
+ if: ${{ failure() }}
195
+ continue-on-error: true
196
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
197
+
198
+ - name: Test suite reports artifacts
199
+ if: ${{ always() }}
200
+ uses: actions/upload-artifact@v3
201
+ with:
202
+ name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
203
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
204
+
205
+ run_examples_gpu:
206
+ name: Examples directory
207
+ strategy:
208
+ fail-fast: false
209
+ matrix:
210
+ machine_type: [single-gpu]
211
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
212
+ container:
213
+ image: huggingface/transformers-all-latest-gpu
214
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
215
+ needs: setup
216
+ steps:
217
+ - name: Update clone
218
+ working-directory: /transformers
219
+ run: git fetch && git checkout ${{ github.sha }}
220
+
221
+ - name: NVIDIA-SMI
222
+ run: |
223
+ nvidia-smi
224
+
225
+ - name: Environment
226
+ working-directory: /transformers
227
+ run: |
228
+ python3 utils/print_env.py
229
+
230
+ - name: Show installed libraries and their versions
231
+ working-directory: /transformers
232
+ run: pip freeze
233
+
234
+ - name: Run examples tests on GPU
235
+ working-directory: /transformers
236
+ run: |
237
+ pip install -r examples/pytorch/_tests_requirements.txt
238
+ python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
239
+
240
+ - name: Failure short reports
241
+ if: ${{ failure() }}
242
+ continue-on-error: true
243
+ run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
244
+
245
+ - name: Test suite reports artifacts
246
+ if: ${{ always() }}
247
+ uses: actions/upload-artifact@v3
248
+ with:
249
+ name: ${{ matrix.machine_type }}_run_examples_gpu
250
+ path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
251
+
252
+ run_pipelines_torch_gpu:
253
+ name: PyTorch pipelines
254
+ strategy:
255
+ fail-fast: false
256
+ matrix:
257
+ machine_type: [single-gpu, multi-gpu]
258
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
259
+ container:
260
+ image: huggingface/transformers-pytorch-gpu
261
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
262
+ needs: setup
263
+ steps:
264
+ - name: Update clone
265
+ working-directory: /transformers
266
+ run: git fetch && git checkout ${{ github.sha }}
267
+
268
+ - name: NVIDIA-SMI
269
+ run: |
270
+ nvidia-smi
271
+
272
+ - name: Environment
273
+ working-directory: /transformers
274
+ run: |
275
+ python3 utils/print_env.py
276
+
277
+ - name: Show installed libraries and their versions
278
+ working-directory: /transformers
279
+ run: pip freeze
280
+
281
+ - name: Run all pipeline tests on GPU
282
+ working-directory: /transformers
283
+ run: |
284
+ python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
285
+
286
+ - name: Failure short reports
287
+ if: ${{ failure() }}
288
+ continue-on-error: true
289
+ run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
290
+
291
+ - name: Test suite reports artifacts
292
+ if: ${{ always() }}
293
+ uses: actions/upload-artifact@v3
294
+ with:
295
+ name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
296
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
297
+
298
+ run_pipelines_tf_gpu:
299
+ name: TensorFlow pipelines
300
+ strategy:
301
+ fail-fast: false
302
+ matrix:
303
+ machine_type: [single-gpu, multi-gpu]
304
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
305
+ container:
306
+ image: huggingface/transformers-tensorflow-gpu
307
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
308
+ needs: setup
309
+ steps:
310
+ - name: Update clone
311
+ working-directory: /transformers
312
+ run: |
313
+ git fetch && git checkout ${{ github.sha }}
314
+
315
+ - name: NVIDIA-SMI
316
+ run: |
317
+ nvidia-smi
318
+
319
+ - name: Environment
320
+ working-directory: /transformers
321
+ run: |
322
+ python3 utils/print_env.py
323
+
324
+ - name: Show installed libraries and their versions
325
+ working-directory: /transformers
326
+ run: pip freeze
327
+
328
+ - name: Run all pipeline tests on GPU
329
+ working-directory: /transformers
330
+ run: |
331
+ python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
332
+
333
+ - name: Failure short reports
334
+ if: ${{ always() }}
335
+ run: |
336
+ cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
337
+
338
+ - name: Test suite reports artifacts
339
+ if: ${{ always() }}
340
+ uses: actions/upload-artifact@v3
341
+ with:
342
+ name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
343
+ path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
344
+
345
+ run_all_tests_torch_cuda_extensions_gpu:
346
+ name: Torch CUDA extension tests
347
+ strategy:
348
+ fail-fast: false
349
+ matrix:
350
+ machine_type: [single-gpu, multi-gpu]
351
+ runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
352
+ needs: setup
353
+ container:
354
+ image: huggingface/transformers-pytorch-deepspeed-latest-gpu
355
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
356
+ steps:
357
+ - name: Update clone
358
+ working-directory: /workspace/transformers
359
+ run: git fetch && git checkout ${{ github.sha }}
360
+
361
+ - name: Remove cached torch extensions
362
+ run: rm -rf /github/home/.cache/torch_extensions/
363
+
364
+ # To avoid unknown test failures
365
+ - name: Pre build DeepSpeed *again*
366
+ working-directory: /workspace
367
+ run: |
368
+ python3 -m pip uninstall -y deepspeed
369
+ DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
370
+
371
+ - name: NVIDIA-SMI
372
+ run: |
373
+ nvidia-smi
374
+
375
+ - name: Environment
376
+ working-directory: /workspace/transformers
377
+ run: |
378
+ python utils/print_env.py
379
+
380
+ - name: Show installed libraries and their versions
381
+ working-directory: /workspace/transformers
382
+ run: pip freeze
383
+
384
+ - name: Run all tests on GPU
385
+ working-directory: /workspace/transformers
386
+ run: |
387
+ python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
388
+
389
+ - name: Failure short reports
390
+ if: ${{ failure() }}
391
+ continue-on-error: true
392
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
393
+
394
+ - name: Test suite reports artifacts
395
+ if: ${{ always() }}
396
+ uses: actions/upload-artifact@v3
397
+ with:
398
+ name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
399
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
400
+
401
+ run_extract_warnings:
402
+ name: Extract warnings in CI artifacts
403
+ runs-on: ubuntu-latest
404
+ if: always()
405
+ needs: [
406
+ check_runner_status,
407
+ check_runners,
408
+ setup,
409
+ run_tests_single_gpu,
410
+ run_tests_multi_gpu,
411
+ run_examples_gpu,
412
+ run_pipelines_tf_gpu,
413
+ run_pipelines_torch_gpu,
414
+ run_all_tests_torch_cuda_extensions_gpu
415
+ ]
416
+ steps:
417
+ - name: Checkout transformers
418
+ uses: actions/checkout@v3
419
+ with:
420
+ fetch-depth: 2
421
+
422
+ - name: Install transformers
423
+ run: pip install transformers
424
+
425
+ - name: Show installed libraries and their versions
426
+ run: pip freeze
427
+
428
+ - name: Create output directory
429
+ run: mkdir warnings_in_ci
430
+
431
+ - uses: actions/download-artifact@v3
432
+ with:
433
+ path: warnings_in_ci
434
+
435
+ - name: Show artifacts
436
+ run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
437
+ working-directory: warnings_in_ci
438
+
439
+ - name: Extract warnings in CI artifacts
440
+ run: |
441
+ python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
442
+ echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
443
+
444
+ - name: Upload artifact
445
+ if: ${{ always() }}
446
+ uses: actions/upload-artifact@v3
447
+ with:
448
+ name: warnings_in_ci
449
+ path: warnings_in_ci/selected_warnings.json
450
+
451
+ send_results:
452
+ name: Send results to webhook
453
+ runs-on: ubuntu-latest
454
+ if: always()
455
+ needs: [
456
+ check_runner_status,
457
+ check_runners,
458
+ setup,
459
+ run_tests_single_gpu,
460
+ run_tests_multi_gpu,
461
+ run_examples_gpu,
462
+ run_pipelines_tf_gpu,
463
+ run_pipelines_torch_gpu,
464
+ run_all_tests_torch_cuda_extensions_gpu,
465
+ run_extract_warnings
466
+ ]
467
+ steps:
468
+ - name: Preliminary job status
469
+ shell: bash
470
+ # For the meaning of these environment variables, see the job `Setup`
471
+ run: |
472
+ echo "Runner availability: ${{ needs.check_runner_status.result }}"
473
+ echo "Runner status: ${{ needs.check_runners.result }}"
474
+ echo "Setup status: ${{ needs.setup.result }}"
475
+
476
+ - uses: actions/checkout@v3
477
+ - uses: actions/download-artifact@v3
478
+ - name: Send message to Slack
479
+ env:
480
+ CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
481
+ CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
482
+ CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
483
+ CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
484
+ CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
485
+ ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
486
+ CI_EVENT: scheduled
487
+ RUNNER_STATUS: ${{ needs.check_runner_status.result }}
488
+ RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
489
+ SETUP_STATUS: ${{ needs.setup.result }}
490
+ # We pass `needs.setup.outputs.matrix` as the argument. `notification_service.py` must convert entries such as
491
+ # `models/bert` to `models_bert`, because the artifact names use `_` instead of `/`.
492
+ run: |
493
+ pip install slack_sdk
494
+ pip show slack_sdk
495
+ python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
hf-dev-train/transformers-main/.github/workflows/stale.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Stale Bot
2
+
3
+ on:
4
+ schedule:
5
+ - cron: "0 15 * * *"
6
+
7
+ jobs:
8
+ close_stale_issues:
9
+ name: Close Stale Issues
10
+ if: github.repository == 'huggingface/transformers'
11
+ runs-on: ubuntu-latest
12
+ env:
13
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
14
+ steps:
15
+ - uses: actions/checkout@v3
16
+
17
+ - name: Setup Python
18
+ uses: actions/setup-python@v4
19
+ with:
20
+ python-version: 3.7
21
+
22
+ - name: Install requirements
23
+ run: |
24
+ pip install PyGithub
25
+ - name: Close stale issues
26
+ run: |
27
+ python scripts/stale.py
hf-dev-train/transformers-main/.github/workflows/update_metdata.yml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Update Transformers metadata
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ - update_transformers_metadata
8
+
9
+ jobs:
10
+ build_and_package:
11
+ runs-on: ubuntu-latest
12
+ defaults:
13
+ run:
14
+ shell: bash -l {0}
15
+
16
+ steps:
17
+ - uses: actions/checkout@v3
18
+
19
+ - name: Load cached virtual environment
20
+ uses: actions/cache@v2
21
+ id: cache
22
+ with:
23
+ path: ~/venv/
24
+ key: v3-metadata-${{ hashFiles('setup.py') }}
25
+
26
+ - name: Create virtual environment on cache miss
27
+ if: steps.cache.outputs.cache-hit != 'true'
28
+ run: |
29
+ python -m venv ~/venv && . ~/venv/bin/activate
30
+ pip install --upgrade pip
31
+
32
+ - name: Setup environment
33
+ run: |
34
+ . ~/venv/bin/activate
35
+ pip install git+https://github.com/huggingface/transformers#egg=transformers[dev]
36
+
37
+ - name: Update metadata
38
+ run: |
39
+ . ~/venv/bin/activate
40
+ python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }}
hf-dev-train/transformers-main/.github/workflows/update_tiny_models.yml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Update Tiny Models
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - update_tiny_models*
7
+ repository_dispatch:
8
+ schedule:
9
+ - cron: "0 2 * * *"
10
+
11
+ env:
12
+ TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
13
+
14
+ jobs:
15
+ update_tiny_models:
16
+ name: Update tiny models
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - name: Checkout transformers
20
+ uses: actions/checkout@v3
21
+ with:
22
+ fetch-depth: 2
23
+
24
+ - name: Install
25
+ run: |
26
+ python -m pip install -U .[dev]
27
+ python -m pip install -U natten
28
+
29
+ - name: Update tiny models
30
+ run: |
31
+ python utils/update_tiny_models.py
32
+
33
+ - name: Full report
34
+ run: cat tiny_models/reports/tiny_model_creation_report.json
35
+
36
+ - name: Failure report
37
+ run: cat tiny_models/reports/simple_failed_report.txt
38
+
39
+ - name: Summary report
40
+ run: cat tiny_models/reports/tiny_model_summary.json
41
+
42
+ - name: Test suite reports artifacts
43
+ if: ${{ always() }}
44
+ uses: actions/upload-artifact@v3
45
+ with:
46
+ name: tiny_model_creation_reports
47
+ path: tiny_models/reports
hf-dev-train/transformers-main/.gitignore ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Initially taken from GitHub's Python gitignore file
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # tests and logs
12
+ tests/fixtures/cached_*_text.txt
13
+ logs/
14
+ lightning_logs/
15
+ lang_code_data/
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ .python-version
90
+
91
+ # celery beat schedule file
92
+ celerybeat-schedule
93
+
94
+ # SageMath parsed files
95
+ *.sage.py
96
+
97
+ # Environments
98
+ .env
99
+ .venv
100
+ env/
101
+ venv/
102
+ ENV/
103
+ env.bak/
104
+ venv.bak/
105
+
106
+ # Spyder project settings
107
+ .spyderproject
108
+ .spyproject
109
+
110
+ # Rope project settings
111
+ .ropeproject
112
+
113
+ # mkdocs documentation
114
+ /site
115
+
116
+ # mypy
117
+ .mypy_cache/
118
+ .dmypy.json
119
+ dmypy.json
120
+
121
+ # Pyre type checker
122
+ .pyre/
123
+
124
+ # vscode
125
+ .vs
126
+ .vscode
127
+
128
+ # Pycharm
129
+ .idea
130
+
131
+ # TF code
132
+ tensorflow_code
133
+
134
+ # Models
135
+ proc_data
136
+
137
+ # examples
138
+ runs
139
+ /runs_old
140
+ /wandb
141
+ /examples/runs
142
+ /examples/**/*.args
143
+ /examples/rag/sweep
144
+
145
+ # data
146
+ /data
147
+ serialization_dir
148
+
149
+ # emacs
150
+ *.*~
151
+ debug.env
152
+
153
+ # vim
154
+ .*.swp
155
+
156
+ #ctags
157
+ tags
158
+
159
+ # pre-commit
160
+ .pre-commit*
161
+
162
+ # .lock
163
+ *.lock
164
+
165
+ # DS_Store (MacOS)
166
+ .DS_Store
167
+
168
+ # ruff
169
+ .ruff_cache
hf-dev-train/transformers-main/CITATION.cff ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: "1.2.0"
2
+ date-released: 2020-10
3
+ message: "If you use this software, please cite it using these metadata."
4
+ title: "Transformers: State-of-the-Art Natural Language Processing"
5
+ url: "https://github.com/huggingface/transformers"
6
+ authors:
7
+ - family-names: Wolf
8
+ given-names: Thomas
9
+ - family-names: Debut
10
+ given-names: Lysandre
11
+ - family-names: Sanh
12
+ given-names: Victor
13
+ - family-names: Chaumond
14
+ given-names: Julien
15
+ - family-names: Delangue
16
+ given-names: Clement
17
+ - family-names: Moi
18
+ given-names: Anthony
19
+ - family-names: Cistac
20
+ given-names: Pierric
21
+ - family-names: Ma
22
+ given-names: Clara
23
+ - family-names: Jernite
24
+ given-names: Yacine
25
+ - family-names: Plu
26
+ given-names: Julien
27
+ - family-names: Xu
28
+ given-names: Canwen
29
+ - family-names: "Le Scao"
30
+ given-names: Teven
31
+ - family-names: Gugger
32
+ given-names: Sylvain
33
+ - family-names: Drame
34
+ given-names: Mariama
35
+ - family-names: Lhoest
36
+ given-names: Quentin
37
+ - family-names: Rush
38
+ given-names: "Alexander M."
39
+ preferred-citation:
40
+ type: conference-paper
41
+ authors:
42
+ - family-names: Wolf
43
+ given-names: Thomas
44
+ - family-names: Debut
45
+ given-names: Lysandre
46
+ - family-names: Sanh
47
+ given-names: Victor
48
+ - family-names: Chaumond
49
+ given-names: Julien
50
+ - family-names: Delangue
51
+ given-names: Clement
52
+ - family-names: Moi
53
+ given-names: Anthony
54
+ - family-names: Cistac
55
+ given-names: Pierric
56
+ - family-names: Ma
57
+ given-names: Clara
58
+ - family-names: Jernite
59
+ given-names: Yacine
60
+ - family-names: Plu
61
+ given-names: Julien
62
+ - family-names: Xu
63
+ given-names: Canwen
64
+ - family-names: "Le Scao"
65
+ given-names: Teven
66
+ - family-names: Gugger
67
+ given-names: Sylvain
68
+ - family-names: Drame
69
+ given-names: Mariama
70
+ - family-names: Lhoest
71
+ given-names: Quentin
72
+ - family-names: Rush
73
+ given-names: "Alexander M."
74
+ booktitle: "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations"
75
+ month: 10
76
+ start: 38
77
+ end: 45
78
+ title: "Transformers: State-of-the-Art Natural Language Processing"
79
+ year: 2020
80
+ publisher: "Association for Computational Linguistics"
81
+ url: "https://www.aclweb.org/anthology/2020.emnlp-demos.6"
82
+ address: "Online"
hf-dev-train/transformers-main/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Contributor Covenant Code of Conduct
3
+
4
+ ## Our Pledge
5
+
6
+ We as members, contributors, and leaders pledge to make participation in our
7
+ community a harassment-free experience for everyone, regardless of age, body
8
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
9
+ identity and expression, level of experience, education, socio-economic status,
10
+ nationality, personal appearance, race, caste, color, religion, or sexual
11
+ identity and orientation.
12
+
13
+ We pledge to act and interact in ways that contribute to an open, welcoming,
14
+ diverse, inclusive, and healthy community.
15
+
16
+ ## Our Standards
17
+
18
+ Examples of behavior that contributes to a positive environment for our
19
+ community include:
20
+
21
+ * Demonstrating empathy and kindness toward other people
22
+ * Being respectful of differing opinions, viewpoints, and experiences
23
+ * Giving and gracefully accepting constructive feedback
24
+ * Accepting responsibility and apologizing to those affected by our mistakes,
25
+ and learning from the experience
26
+ * Focusing on what is best not just for us as individuals, but for the overall
27
+ community
28
+
29
+ Examples of unacceptable behavior include:
30
+
31
+ * The use of sexualized language or imagery, and sexual attention or advances of
32
+ any kind
33
+ * Trolling, insulting or derogatory comments, and personal or political attacks
34
+ * Public or private harassment
35
+ * Publishing others' private information, such as a physical or email address,
36
+ without their explicit permission
37
+ * Other conduct which could reasonably be considered inappropriate in a
38
+ professional setting
39
+
40
+ ## Enforcement Responsibilities
41
+
42
+ Community leaders are responsible for clarifying and enforcing our standards of
43
+ acceptable behavior and will take appropriate and fair corrective action in
44
+ response to any behavior that they deem inappropriate, threatening, offensive,
45
+ or harmful.
46
+
47
+ Community leaders have the right and responsibility to remove, edit, or reject
48
+ comments, commits, code, wiki edits, issues, and other contributions that are
49
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
50
+ decisions when appropriate.
51
+
52
+ ## Scope
53
+
54
+ This Code of Conduct applies within all community spaces, and also applies when
55
+ an individual is officially representing the community in public spaces.
56
+ Examples of representing our community include using an official e-mail address,
57
+ posting via an official social media account, or acting as an appointed
58
+ representative at an online or offline event.
59
+
60
+ ## Enforcement
61
+
62
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
63
+ reported to the community leaders responsible for enforcement at
64
+ feedback@huggingface.co.
65
+ All complaints will be reviewed and investigated promptly and fairly.
66
+
67
+ All community leaders are obligated to respect the privacy and security of the
68
+ reporter of any incident.
69
+
70
+ ## Enforcement Guidelines
71
+
72
+ Community leaders will follow these Community Impact Guidelines in determining
73
+ the consequences for any action they deem in violation of this Code of Conduct:
74
+
75
+ ### 1. Correction
76
+
77
+ **Community Impact**: Use of inappropriate language or other behavior deemed
78
+ unprofessional or unwelcome in the community.
79
+
80
+ **Consequence**: A private, written warning from community leaders, providing
81
+ clarity around the nature of the violation and an explanation of why the
82
+ behavior was inappropriate. A public apology may be requested.
83
+
84
+ ### 2. Warning
85
+
86
+ **Community Impact**: A violation through a single incident or series of
87
+ actions.
88
+
89
+ **Consequence**: A warning with consequences for continued behavior. No
90
+ interaction with the people involved, including unsolicited interaction with
91
+ those enforcing the Code of Conduct, for a specified period of time. This
92
+ includes avoiding interactions in community spaces as well as external channels
93
+ like social media. Violating these terms may lead to a temporary or permanent
94
+ ban.
95
+
96
+ ### 3. Temporary Ban
97
+
98
+ **Community Impact**: A serious violation of community standards, including
99
+ sustained inappropriate behavior.
100
+
101
+ **Consequence**: A temporary ban from any sort of interaction or public
102
+ communication with the community for a specified period of time. No public or
103
+ private interaction with the people involved, including unsolicited interaction
104
+ with those enforcing the Code of Conduct, is allowed during this period.
105
+ Violating these terms may lead to a permanent ban.
106
+
107
+ ### 4. Permanent Ban
108
+
109
+ **Community Impact**: Demonstrating a pattern of violation of community
110
+ standards, including sustained inappropriate behavior, harassment of an
111
+ individual, or aggression toward or disparagement of classes of individuals.
112
+
113
+ **Consequence**: A permanent ban from any sort of public interaction within the
114
+ community.
115
+
116
+ ## Attribution
117
+
118
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
119
+ version 2.1, available at
120
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
121
+
122
+ Community Impact Guidelines were inspired by
123
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
124
+
125
+ For answers to common questions about this code of conduct, see the FAQ at
126
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
127
+ [https://www.contributor-covenant.org/translations][translations].
128
+
129
+ [homepage]: https://www.contributor-covenant.org
130
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
131
+ [Mozilla CoC]: https://github.com/mozilla/diversity
132
+ [FAQ]: https://www.contributor-covenant.org/faq
133
+ [translations]: https://www.contributor-covenant.org/translations
hf-dev-train/transformers-main/CONTRIBUTING.md ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # Contribute to 🤗 Transformers
18
+
19
+ Everyone is welcome to contribute, and we value everybody's contribution. Code
20
+ contributions are not the only way to help the community. Answering questions, helping
21
+ others, and improving the documentation are also immensely valuable.
22
+
23
+ It also helps us if you spread the word! Reference the library in blog posts
24
+ about the awesome projects it made possible, shout out on Twitter every time it has
25
+ helped you, or simply ⭐️ the repository to say thank you.
26
+
27
+ However you choose to contribute, please be mindful and respect our
28
+ [code of conduct](https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md).
29
+
30
+ **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
31
+
32
+ ## Ways to contribute
33
+
34
+ There are several ways you can contribute to 🤗 Transformers:
35
+
36
+ * Fix outstanding issues with the existing code.
37
+ * Submit issues related to bugs or desired new features.
38
+ * Implement new models.
39
+ * Contribute to the examples or to the documentation.
40
+
41
+ If you don't know where to start, there is a special [Good First
42
+ Issue](https://github.com/huggingface/transformers/contribute) listing. It will give you a list of
43
+ open issues that are beginner-friendly and help you start contributing to open-source. Just comment in the issue that you'd like to work
44
+ on it.
45
+
46
+ For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀
47
+
48
+ > All contributions are equally valuable to the community. 🥰
49
+
50
+ ## Fixing outstanding issues
51
+
52
+ If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#create-a-pull-request) and open a Pull Request!
53
+
54
+ ## Submitting a bug-related issue or feature request
55
+
56
+ Do your best to follow these guidelines when submitting a bug-related issue or a feature
57
+ request. It will make it easier for us to come back to you quickly and with good
58
+ feedback.
59
+
60
+ ### Did you find a bug?
61
+
62
+ The 🤗 Transformers library is robust and reliable thanks to users who report the problems they encounter.
63
+
64
+ Before you report an issue, we would really appreciate it if you could **make sure the bug was not
65
+ already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask on the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.
66
+
67
+ Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
68
+
69
+ * Your **OS type and version** and **Python**, **PyTorch** and
70
+ **TensorFlow** versions when applicable.
71
+ * A short, self-contained, code snippet that allows us to reproduce the bug in
72
+ less than 30s.
73
+ * The *full* traceback if an exception is raised.
74
+ * Attach any other additional information, like screenshots, you think may help.
75
+
76
+ To get the OS and software versions automatically, run the following command:
77
+
78
+ ```bash
79
+ transformers-cli env
80
+ ```
81
+
82
+ You can also run the same command from the root of the repository:
83
+
84
+ ```bash
85
+ python src/transformers/commands/transformers_cli.py env
86
+ ```
87
+
88
+ ### Do you want a new feature?
89
+
90
+ If there is a new feature you'd like to see in 🤗 Transformers, please open an issue and describe:
91
+
92
+ 1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it a feature related to something you need for a project? Is it something you worked on and think it could benefit the community?
93
+
94
+ Whatever it is, we'd love to hear about it!
95
+
96
+ 2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you.
97
+ 3. Provide a *code snippet* that demonstrates the feature's usage.
98
+ 4. If the feature is related to a paper, please include a link.
99
+
100
+ If your issue is well written we're already 80% of the way there by the time you create it.
101
+
102
+ We have added [templates](https://github.com/huggingface/transformers/tree/main/templates) to help you get started with your issue.
103
+
104
+ ## Do you want to implement a new model?
105
+
106
+ New models are constantly released and if you want to implement a new model, please provide the following information:
107
+
108
+ * A short description of the model and link to the paper.
109
+ * Link to the implementation if it is open-sourced.
110
+ * Link to the model weights if they are available.
111
+
112
+ If you are willing to contribute the model yourself, let us know so we can help you add it to 🤗 Transformers!
113
+
114
+ We have added a [detailed guide and templates](https://github.com/huggingface/transformers/tree/main/templates) to help you get started with adding a new model, and we also have a more technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
115
+
116
+ ## Do you want to add documentation?
117
+
118
+ We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be happy to make the changes or help you make a contribution if you're interested!
119
+
120
+ For more details about how to generate, build, and write the documentation, take a look at the documentation [README](https://github.com/huggingface/transformers/tree/main/docs).
121
+
122
+ ## Create a Pull Request
123
+
124
+ Before writing any code, we strongly advise you to search through the existing PRs or
125
+ issues to make sure nobody is already working on the same thing. If you are
126
+ unsure, it is always a good idea to open an issue to get some feedback.
127
+
128
+ You will need basic `git` proficiency to contribute to
129
+ 🤗 Transformers. While `git` is not the easiest tool to use, it has the greatest
130
+ manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro
131
+ Git](https://git-scm.com/book/en/v2) is a very good reference.
132
+
133
+ You'll need **[Python 3.7](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:
134
+
135
+ 1. Fork the [repository](https://github.com/huggingface/transformers) by
136
+ clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code
137
+ under your GitHub user account.
138
+
139
+ 2. Clone your fork to your local disk, and add the base repository as a remote:
140
+
141
+ ```bash
142
+ git clone git@github.com:<your Github handle>/transformers.git
143
+ cd transformers
144
+ git remote add upstream https://github.com/huggingface/transformers.git
145
+ ```
146
+
147
+ 3. Create a new branch to hold your development changes:
148
+
149
+ ```bash
150
+ git checkout -b a-descriptive-name-for-my-changes
151
+ ```
152
+
153
+ 🚨 **Do not** work on the `main` branch!
154
+
155
+ 4. Set up a development environment by running the following command in a virtual environment:
156
+
157
+ ```bash
158
+ pip install -e ".[dev]"
159
+ ```
160
+
161
+ If 🤗 Transformers was already installed in the virtual environment, remove
162
+ it with `pip uninstall transformers` before reinstalling it in editable
163
+ mode with the `-e` flag.
164
+
165
+ Depending on your OS, you may need to install some external libraries as well if the `pip` installation fails.
166
+
167
+ For macOS, you will likely need [MeCab](https://taku910.github.io/mecab/) which can be installed from Homebrew:
168
+
169
+ ```bash
170
+ brew install mecab
171
+ ```
172
+
173
+ 5. Develop the features on your branch.
174
+
175
+ As you work on your code, you should make sure the test suite
176
+ passes. Run the tests impacted by your changes like this:
177
+
178
+ ```bash
179
+ pytest tests/<TEST_TO_RUN>.py
180
+ ```
181
+
182
+ For more information about tests, check out the
183
+ [Testing](https://huggingface.co/docs/transformers/testing) guide.
184
+
185
+ 🤗 Transformers relies on `black` and `ruff` to format its source code
186
+ consistently. After you make changes, apply automatic style corrections and code verifications
187
+ that can't be automated in one go with:
188
+
189
+ ```bash
190
+ make fixup
191
+ ```
192
+
193
+ This target is also optimized to only work with files modified by the PR you're working on.
194
+
195
+ If you prefer to run the checks one after the other, the following command applies the
196
+ style corrections:
197
+
198
+ ```bash
199
+ make style
200
+ ```
201
+
202
+ 🤗 Transformers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
203
+ controls are run by the CI, but you can run the same checks with:
204
+
205
+ ```bash
206
+ make quality
207
+ ```
208
+
209
+ Finally, we have a lot of scripts to make sure we didn't forget to update
210
+ some files when adding a new model. You can run these scripts with:
211
+
212
+ ```bash
213
+ make repo-consistency
214
+ ```
215
+
216
+ To learn more about those checks and how to fix any issues with them, check out the
217
+ [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.
218
+
219
+ If you're modifying documents under `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
220
+ make sure you install the documentation builder:
221
+
222
+ ```bash
223
+ pip install ".[docs]"
224
+ ```
225
+
226
+ Run the following command from the root of the repository:
227
+
228
+ ```bash
229
+ doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build
230
+ ```
231
+
232
+ This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
233
+ Markdown files with your favorite editor. You can also preview the docs on GitHub when you open a pull request.
234
+
235
+ Once you're happy with your changes, add changed files with `git add` and
236
+ record your changes locally with `git commit`:
237
+
238
+ ```bash
239
+ git add modified_file.py
240
+ git commit
241
+ ```
242
+
243
+ Please remember to write [good commit
244
+ messages](https://chris.beams.io/posts/git-commit/) to clearly communicate the changes you made!
245
+
246
+ To keep your copy of the code up to date with the original
247
+ repository, rebase your branch on `upstream/main` *before* you open a pull request or if requested by a maintainer:
248
+
249
+ ```bash
250
+ git fetch upstream
251
+ git rebase upstream/main
252
+ ```
253
+
254
+ Push your changes to your branch:
255
+
256
+ ```bash
257
+ git push -u origin a-descriptive-name-for-my-changes
258
+ ```
259
+
260
+ If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.
261
+
262
+ 6. Now you can go to your fork of the repository on GitHub and click on **Pull request** to open a pull request. Make sure you tick off all the boxes in our [checklist](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.
263
+
264
+ 7. It's ok if maintainers request changes, it happens to our core contributors
265
+ too! So everyone can see the changes in the pull request, work in your local
266
+ branch and push the changes to your fork. They will automatically appear in
267
+ the pull request.
268
+
269
+ ### Pull request checklist
270
+
271
+ ☐ The pull request title should summarize your contribution.<br>
272
+ ☐ If your pull request addresses an issue, please mention the issue number in the pull
273
+ request description to make sure they are linked (and people viewing the issue know you
274
+ are working on it).<br>
275
+ ☐ To indicate a work in progress please prefix the title with `[WIP]`. These are
276
+ useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
277
+ ☐ Make sure existing tests pass.<br>
278
+ ☐ If adding a new feature, also add tests for it.<br>
279
+ - If you are adding a new model, make sure you use
280
+ `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests.
281
+ - If you are adding new `@slow` tests, make sure they pass using
282
+ `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
283
+ - If you are adding a new tokenizer, write tests and make sure
284
+ `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
285
+ CircleCI does not run the slow tests, but GitHub Actions does every night!<br>
286
+
287
+ ☐ All public methods must have informative docstrings (see
288
+ [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py)
289
+ for an example).<br>
290
+ ☐ Due to the rapidly growing repository, don't add any images, videos and other
291
+ non-text files that'll significantly weigh down the repository. Instead, use a Hub
292
+ repository such as [`hf-internal-testing`](https://huggingface.co/hf-internal-testing)
293
+ to host these files and reference them by URL. We recommend placing documentation
294
+ related images in the following repository:
295
+ [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
296
+ You can open a PR on this dataset repository and ask a Hugging Face member to merge it.
297
+
298
+ For more information about the checks run on a pull request, take a look at our [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.
299
+
300
+ ### Tests
301
+
302
+ An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
303
+ the [tests](https://github.com/huggingface/transformers/tree/main/tests) folder and examples tests in the
304
+ [examples](https://github.com/huggingface/transformers/tree/main/examples) folder.
305
+
306
+ We like `pytest` and `pytest-xdist` because it's faster. From the root of the
307
+ repository, specify a *path to a subfolder or a test file* to run the test.
308
+
309
+ ```bash
310
+ python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model
311
+ ```
312
+
313
+ Similarly, for the `examples` directory, specify a *path to a subfolder or test file* to run the test. For example, the following command tests the text classification subfolder in the PyTorch `examples` directory:
314
+
315
+ ```bash
316
+ pip install -r examples/xxx/requirements.txt # only needed the first time
317
+ python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
318
+ ```
319
+
320
+ In fact, this is actually how our `make test` and `make test-examples` commands are implemented (not including the `pip install`)!
321
+
322
+ You can also specify a smaller set of tests in order to test only the feature
323
+ you're working on.
324
+
325
+ By default, slow tests are skipped but you can set the `RUN_SLOW` environment variable to
326
+ `yes` to run them. This will download many gigabytes of models so make sure you
327
+ have enough disk space, a good internet connection or a lot of patience!
328
+
329
+ <Tip warning={true}>
330
+
331
+ Remember to specify a *path to a subfolder or a test file* to run the test. Otherwise, you'll run all the tests in the `tests` or `examples` folder, which will take a very long time!
332
+
333
+ </Tip>
334
+
335
+ ```bash
336
+ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model
337
+ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
338
+ ```
339
+
340
+ Like the slow tests, there are other environment variables available which are not enabled by default during testing:
341
+ - `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
342
+ - `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
343
+ - `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.
344
+
345
+ More environment variables and additional information can be found in the [testing_utils.py](src/transformers/testing_utils.py).
346
+
347
+ 🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
348
+ `pytest`-specific features in the test suite itself.
349
+
350
+ This means `unittest` is fully supported. Here's how to run tests with
351
+ `unittest`:
352
+
353
+ ```bash
354
+ python -m unittest discover -s tests -t . -v
355
+ python -m unittest discover -s examples -t examples -v
356
+ ```
357
+
358
+ ### Style guide
359
+
360
+ For documentation strings, 🤗 Transformers follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
361
+ Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
362
+ for more information.
363
+
364
+ ### Develop on Windows
365
+
366
+ On Windows (unless you're working in [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) or WSL), you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
367
+
368
+ ```bash
369
+ git config core.autocrlf input
370
+ ```
371
+
372
+ One way to run the `make` command on Windows is with MSYS2:
373
+
374
+ 1. [Download MSYS2](https://www.msys2.org/), and we assume it's installed in `C:\msys64`.
375
+ 2. Open the command line `C:\msys64\msys2.exe` (it should be available from the **Start** menu).
376
+ 3. Run in the shell: `pacman -Syu` and install `make` with `pacman -S make`.
377
+ 4. Add `C:\msys64\usr\bin` to your PATH environment variable.
378
+
379
+ You can now use `make` from any terminal (Powershell, cmd.exe, etc.)! 🎉
380
+
381
+ ### Sync a forked repository with upstream main (the Hugging Face repository)
382
+
383
+ When updating the main branch of a forked repository, please follow these steps to avoid pinging the upstream repository which adds reference notes to each upstream PR, and sends unnecessary notifications to the developers involved in these PRs.
384
+
385
+ 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead, merge directly into the forked main.
386
+ 2. If a PR is absolutely necessary, use the following steps after checking out your branch:
387
+
388
+ ```bash
389
+ git checkout -b your-branch-for-syncing
390
+ git pull --squash --no-commit upstream main
391
+ git commit -m '<your message without GitHub references>'
392
+ git push --set-upstream origin your-branch-for-syncing
393
+ ```
hf-dev-train/transformers-main/ISSUES.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # How To Request Support
18
+
19
+ This is an Open Source Project so please be mindful that like in any other project of this kind there is no obligation to answer all requests for help.
20
+
21
+ However, we want to encourage you to ask for help whenever you think it's needed! We are happy about every question we get because it allows us to better understand your needs, possible misunderstandings, and most importantly a way for you to help us make this library better. That being said, this document's main purpose is to provide guidelines on how you can formulate your requests to increase your chances to be understood and to get support.
22
+
23
+ There are two main venues to receive support: [the forums](https://discuss.huggingface.co/) and [the GitHub issues](https://github.com/huggingface/transformers/issues).
24
+
25
+ ## The Forums
26
+
27
+ [The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.
28
+
29
+ If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers should you proceed to file an [issue](https://github.com/huggingface/transformers/issues).
30
+
31
+ In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some examples of such questions:
32
+
33
+ * "I would like to use a BertModel within a RL-Agent for a customer support service. How can I use a BertForMaskedLM in my ChatBotModel?"
34
+
35
+ * "Could you please explain why T5 has no positional embedding matrix under T5Model?"
36
+
37
+ * "How should I set my generation parameters for translation?"
38
+
39
+ * "How to train T5 on De->En translation?"
40
+
41
+
42
+ ## The GitHub Issues
43
+
44
+ Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).
45
+
46
+ You are not required to read the following guidelines before opening an issue. However, if you notice that your issue doesn't get any replies, chances are that the developers have one or several difficulties with its quality. In this case, reading the following points and adjusting your issue accordingly could help.
47
+
48
+ 1. Before posting an issue, first search for already posted issues, since chances are someone has already asked a similar question before you.
49
+
50
+ If you use Google your search query should be:
51
+
52
+ ```
53
+ "huggingface" "transformers" your query
54
+ ```
55
+
56
+ The first two quoted words tell Google to limit the search to the context of the Huggingface Transformers. The remainder is your query - most commonly this would be the error message the software fails with. We will go deeper into details shortly.
57
+
58
+ The results of such a query will typically match GitHub issues, Hugging Face forums, StackExchange, and blogs.
59
+
60
+ If you find relevant hints, you may choose to continue the discussion there if you have follow up questions.
61
+
62
+ If what you found is similar but doesn't quite answer your problem, please, post a new issue and do include links to similar issues or forum discussions you may have found.
63
+
64
+ Let's look at some examples:
65
+
66
+ The error message, often referred to as an assertion, tells us what went wrong. Here is an example of an assertion:
67
+
68
+ ```python
69
+ Traceback (most recent call last):
70
+ File "<string>", line 1, in <module>
71
+ File "/transformers/src/transformers/__init__.py", line 34, in <module>
72
+ from . import dependency_versions_check
73
+ File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
74
+ from .utils import is_tokenizers_available
75
+ File "/transformers/src/transformers/utils/import_utils.py", line 40, in <module>
76
+ from tqdm.auto import tqdm
77
+ ModuleNotFoundError: No module named 'tqdm.auto'
78
+ ```
79
+
80
+ and it typically includes a traceback, so that we can see the full stack of calls the program made before it fails. This gives us the context to know why the program failed.
81
+
82
+ Going back to the above example: if you received this error, look at the very last line of the error, which is:
83
+
84
+ ```python
85
+ ModuleNotFoundError: No module named 'tqdm.auto'
86
+ ```
87
+
88
+ And now we can use it to do the searching on your favorite search engine:
89
+
90
+ 1. first for `"huggingface" "transformers" "ModuleNotFoundError: No module named 'tqdm.auto'"`
91
+ 2. if you don't find relevant results, then search for just `"ModuleNotFoundError: No module named 'tqdm.auto'"`
92
+ 3. and finally if nothing still comes up, then remove the outside quotes: `ModuleNotFoundError: No module named 'tqdm.auto'`
93
+
94
+ If the error includes any messages that include bits unique to your filesystem, always remove those in the search query since other users will not have the same filesystem as yours. For example:
95
+
96
+ ```bash
97
+ python -c 'open("/tmp/wrong_path.txt", "r")'
98
+ Traceback (most recent call last):
99
+ File "<string>", line 1, in <module>
100
+ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/wrong_path.txt'
101
+ ```
102
+ Here you'd search for just: `"FileNotFoundError: [Errno 2] No such file or directory"`
103
+
104
+ If the local information that you removed was inside the error message, you may need to remove the double quotes, since your query is no longer exact. So if the error message was something like:
105
+
106
+ ```bash
107
+ ValueError: '/tmp/wrong_path.txt' cannot be found
108
+ ```
109
+
110
+ then you'd search for `"ValueError" "cannot be found"`
111
+
112
+ As you search you will notice that when you don't use quotes often the search engines will return a variety of unrelated hits, which may or may not be what you want.
113
+
114
+ Experiment with different ways and find which approach gives the most satisfactory results.
115
+
116
+ 2. Keep the issue short, providing the information that you think will aid the developers to understand your situation. Put yourself in the shoes of the person who has never seen your code or knows anything about your custom setup. This mental exercise will help to develop an intuition to what/what not to share.
117
+
118
+ 3. If there is a software failure, always provide the full traceback, for example:
119
+
120
+ ```python
121
+ $ python -c 'import transformers'
122
+ Traceback (most recent call last):
123
+ File "<string>", line 1, in <module>
124
+ File "/transformers/src/transformers/__init__.py", line 34, in <module>
125
+ from . import dependency_versions_check
126
+ File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
127
+ from .utils import is_tokenizers_available
128
+ File "/transformers/src/transformers/utils/import_utils.py", line 40, in <module>
129
+ from tqdm.auto import tqdm
130
+ ModuleNotFoundError: No module named 'tqdm.auto'
131
+ ```
132
+
133
+ As compared to providing just the last line of the error message, e.g.:
134
+ ```python
135
+ ModuleNotFoundError: No module named 'tqdm.auto'
136
+ ```
137
+ which is not sufficient.
138
+
139
+ If your application is running on more than one GPU (e.g. under `DistributedDataParallel`) and typically getting every log and traceback printed multiple times, please make sure that you paste only one copy of it. At times the traceback from parallel processes may get interleaved - so either disentangle these or change the loggers to log only for `local_rank==0` so that only one process logs things.
140
+
141
+ 4. When quoting a traceback, command line instructions and any type of code always enclose it in triple backticks inside the editor window, that is:
142
+
143
+ ````
144
+ ```
145
+ git clone https://github.com/huggingface/transformers
146
+ cd transformers
147
+ pip install .
148
+ ```
149
+ ````
150
+
151
+ If it's a command line with a long argument list, please consider breaking it down using backslashes and new lines. Here is an example of a good command line quote:
152
+
153
+ ```bash
154
+ cd examples/seq2seq
155
+ python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
156
+ --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
157
+ --output_dir output_dir --overwrite_output_dir \
158
+ --do_train --n_train 500 --num_train_epochs 1 \
159
+ --per_device_train_batch_size 1 --freeze_embeds \
160
+ --src_lang en_XX --tgt_lang ro_RO --task translation \
161
+ --fp16 --sharded_ddp
162
+ ```
163
+
164
+ If you don't break it up, one has to scroll horizontally which often makes it quite difficult to quickly see what's happening.
165
+
166
+ The backslashes allow us to copy the command directly into the console to run it, without needing to edit it.
167
+
168
+ 5. Include only the important information that you think will help the developer to quickly identify the problem.
169
+
170
+ For example applications often create huge amounts of logs. Ask yourself whether providing all or parts of the log is useful.
171
+
172
+ Pasting 100-1000 lines of log into the issue is an immediate turn off, since it will take a lot of time to figure out where the pertinent parts of the log are.
173
+
174
+ Attaching a full log can be helpful if it's done as an attachment, or if it's enclosed in the following html code in the comment editor window:
175
+
176
+ ```
177
+ <details>
178
+ <summary>Full log</summary>
179
+ <pre>
180
+
181
+ many
182
+ lines
183
+ go
184
+ here
185
+
186
+ </pre>
187
+ </details>
188
+ ```
189
+
190
+ which would result in the following entry, which can be opened if desired, but otherwise takes little space.
191
+
192
+ <details>
193
+ <summary>Full log</summary>
194
+ <pre>
195
+ many
196
+ lines
197
+ go
198
+ here
199
+ </pre>
200
+ </details>
201
+
202
+ You could also provide a link to a pastebin service, but this is less beneficial since those links tend to expire quickly and future readers of your issue might not be able to access that log file anymore and may lack some context.
203
+
204
+ 6. If this is an issue in your code, do try to reduce that code to a minimal example that still demonstrates the problem. Please ask at the forums if you have a hard time figuring out how to do that. Please realize that we don't have the luxury of having time to try and understand all of your custom code.
205
+
206
+ If you really tried to make a short reproducible code example but couldn't figure it out, it might be that having a traceback will give the developer enough information to know what's going on. But if it is not enough and we can't reproduce the problem, we can't really solve it.
207
+
208
+ Do not despair if you can't figure it out from the beginning, just share what you can and perhaps someone else will be able to help you at the forums.
209
+
210
+ If your setup involves any custom datasets, the best way to help us reproduce the problem is to create a [Google Colab notebook](https://colab.research.google.com/) that demonstrates the issue and once you verify that the issue still exists, include a link to that notebook in the Issue. Just make sure that you don't copy and paste the location bar url of the open notebook - as this is private and we won't be able to open it. Instead, you need to click on `Share` in the right upper corner of the notebook, select `Get Link` and then copy and paste the public link it will give to you.
211
+
212
+ 7. If you forked off some of this project's code or example applications, please, do not ask us to go into your code repository and figure out what you may have done. The code is already very complex and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with time on their hands to make a lengthy investigation. Albeit, you might find someone at the forums who will be generous to do this for you.
213
+
214
+ 8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.
215
+
216
+ We understand that this is not always possible, especially when APIs change, in which case file an issue against the highest library version your environment can support.
217
+
218
+ Of course, if you upgrade the library, always retest that the problem is still there.
219
+
220
+ 9. Please do not ask us to reproduce an issue with your custom data, since we don't have it. So, either you should use some existing dataset supported by HF datasets or you need to supply code that generates a small sample on the fly, or some other quick and simple way to get it.
221
+
222
+ Please do not send us any non-public domain data that may require a license or a permission to be used.
223
+
224
+ 10. Do not tag multiple developers on the issue unless you know this is expected, either because you asked them and they gave you an explicit permission to tag them or the issue template instructs you to do so.
225
+
226
+ The "who to tag for what domain" part of the issue template is there to help users direct their questions to the right developers who are designated maintainers of project's specific domains. They can then decide at their own discretion to tag other developers if they feel it'd help move the issue forward.
227
+
228
+ We currently don't have a triage service and we trust your capacity to identify the right domain and thus the persons to tag in your issue. If you are not sure, please use the forums to ask for guidance.
229
+
230
+ When in doubt, err on the side of not tagging a given person. If you tag multiple people out of context or without permission, don't be surprised if you get no response at all. Please remember that every time you tag someone, they get a notification and you're taking their time without their permission. Please be sensitive to that.
231
+
232
+ If you got helped by one of the developers in the past please don't tag them in future issues, unless they are listed in the issue template for the domain you are asking about or that developer gave you an explicit permission to tag them in future issues.
233
+
234
+ If you see a certain developer doing multiple and/or recent commits into a specific area of the project that you feel is relevant to your issue, it is not a good reason to tag them. Various developers may be fixing things that prevent them from moving forward, but often their work is focused on a totally different domain. And while they may or may not know how to help you with the problem at hand, it would benefit the whole community much more if they focus on the domain of their unique expertise.
235
+
236
+ 11. Use the Edit button. Take your time, and re-read and improve the wording and formatting to make your posts and comments as easy to understand as possible.
237
+
238
+ Avoid posting multiple comments in a row, as each comment generates a notification for the developers tagged in that issue. If you happened to post multiple comments in a row, and nobody followed up yet - consider merging those into one or a few comments while editing the combined content to be coherent.
239
+
240
+ If you choose to edit your older comments after others posted follow-up comments, you need to be aware that your modifications might not be noticed, so if it's more than a typo fix, try to write a new comment flagging that something has been changed in the previous comments.
241
+
242
+ For example, the very first comment is the most important one. If while the thread unfolds you realize that things aren't as they seemed to you originally you may want to edit the first post to reflect the up-to-date understanding of the issue at hand so that it helps those who read your issue in the future quickly understand what's going on and not need to sift through dozens of comments. It also helps to indicate that the post was edited. So, those reading the thread later can understand why there might be certain discontinuity in the information flow.
243
+
244
+ Use bullets and items if you have lists of items and the outcome improves overall readability.
245
+
246
+ Use backticks to refer to class and function names, e.g. `BartModel` and `generate` as these stand out and improve the speed of a reader's comprehension.
247
+
248
+ Try not to use italics and bold text too much as these often make the text more difficult to read.
249
+
250
+
251
+ 12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to.
252
+
253
+ To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".
254
+
255
+ For example the first link is a link to an issue, and the second to a specific comment in the same issue:
256
+
257
+ 1. https://github.com/huggingface/transformers/issues/9257
258
+ 2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162
259
+
260
+
261
+ 13. If you are replying to the last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.
262
+
263
+ But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:
264
+
265
+ ```
266
+ > How big is your gpu cluster?
267
+
268
+ Our cluster is made of 256 gpus.
269
+ ```
270
+
271
+ If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
272
+
273
+ In general the best way to figure out what works the best is to learn from issues posted by other people - see which issues get great responses and which get little to no response - observe what the posters who received great responses did differently from those who did not.
274
+
275
+ Thank you for reading this somewhat lengthy document. We would like to conclude that these are not absolute rules, but friendly advice that will help maximize the chances for us to understand what you are trying to communicate, reproduce the problem, and then resolve it to your satisfaction and the benefit of the whole community.
276
+
277
+ If after reading this document there are remaining questions on how and why or there is a need for further elucidation, please, don't hesitate to ask your question in [this thread](https://discuss.huggingface.co/t/how-to-request-support/3128).
hf-dev-train/transformers-main/LICENSE ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2018- The Hugging Face team. All rights reserved.
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
hf-dev-train/transformers-main/MANIFEST.in ADDED
@@ -0,0 +1 @@
 
 
1
+ include LICENSE
hf-dev-train/transformers-main/Makefile ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples
2
+
3
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
4
+ export PYTHONPATH = src
5
+
6
+ check_dirs := examples tests src utils
7
+
8
+ modified_only_fixup:
9
+ $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
10
+ @if test -n "$(modified_py_files)"; then \
11
+ echo "Checking/fixing $(modified_py_files)"; \
12
+ black $(modified_py_files); \
13
+ ruff $(modified_py_files) --fix; \
14
+ else \
15
+ echo "No library .py files were modified"; \
16
+ fi
17
+
18
+ # Update src/transformers/dependency_versions_table.py
19
+
20
+ deps_table_update:
21
+ @python setup.py deps_table_update
22
+
23
+ deps_table_check_updated:
24
+ @md5sum src/transformers/dependency_versions_table.py > md5sum.saved
25
+ @python setup.py deps_table_update
26
+ @md5sum -c --quiet md5sum.saved || (printf "\nError: the version dependency table is outdated.\nPlease run 'make fixup' or 'make style' and commit the changes.\n\n" && exit 1)
27
+ @rm md5sum.saved
28
+
29
+ # autogenerating code
30
+
31
+ autogenerate_code: deps_table_update
32
+
33
+ # Check that the repo is in a good state
34
+
35
+ repo-consistency:
36
+ python utils/check_copies.py
37
+ python utils/check_table.py
38
+ python utils/check_dummies.py
39
+ python utils/check_repo.py
40
+ python utils/check_inits.py
41
+ python utils/check_config_docstrings.py
42
+ python utils/check_config_attributes.py
43
+ python utils/check_doctest_list.py
44
+ python utils/tests_fetcher.py --sanity_check
45
+ python utils/update_metadata.py --check-only
46
+ python utils/check_task_guides.py
47
+
48
+ # this target runs checks on all files
49
+
50
+ quality:
51
+ black --check $(check_dirs)
52
+ python utils/custom_init_isort.py --check_only
53
+ python utils/sort_auto_mappings.py --check_only
54
+ ruff $(check_dirs)
55
+ doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
56
+ python utils/check_doc_toc.py
57
+
58
+ # Format source code automatically and check if there are any problems left that need manual fixing
59
+
60
+ extra_style_checks:
61
+ python utils/custom_init_isort.py
62
+ python utils/sort_auto_mappings.py
63
+ doc-builder style src/transformers docs/source --max_len 119 --path_to_docs docs/source
64
+ python utils/check_doc_toc.py --fix_and_overwrite
65
+
66
+ # this target runs checks on all files and potentially modifies some of them
67
+
68
+ style:
69
+ black $(check_dirs)
70
+ ruff $(check_dirs) --fix
71
+ ${MAKE} autogenerate_code
72
+ ${MAKE} extra_style_checks
73
+
74
+ # Super fast fix and check target that only works on relevant modified files since the branch was made
75
+
76
+ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
77
+
78
+ # Make marked copies of snippets of code conform to the original
79
+
80
+ fix-copies:
81
+ python utils/check_copies.py --fix_and_overwrite
82
+ python utils/check_table.py --fix_and_overwrite
83
+ python utils/check_dummies.py --fix_and_overwrite
84
+ python utils/check_task_guides.py --fix_and_overwrite
85
+
86
+ # Run tests for the library
87
+
88
+ test:
89
+ python -m pytest -n auto --dist=loadfile -s -v ./tests/
90
+
91
+ # Run tests for examples
92
+
93
+ test-examples:
94
+ python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/
95
+
96
+ # Run tests for SageMaker DLC release
97
+
98
+ test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker]
99
+ TEST_SAGEMAKER=True python -m pytest -n auto -s -v ./tests/sagemaker
100
+
101
+
102
+ # Release stuff
103
+
104
+ pre-release:
105
+ python utils/release.py
106
+
107
+ pre-patch:
108
+ python utils/release.py --patch
109
+
110
+ post-release:
111
+ python utils/release.py --post_release
112
+
113
+ post-patch:
114
+ python utils/release.py --post_release --patch
hf-dev-train/transformers-main/README.md ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ <p align="center">
18
+ <picture>
19
+ <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
20
+ <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
21
+ <img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
22
+ </picture>
23
+ <br/>
24
+ <br/>
25
+ </p>
26
+
27
+ <p align="center">
28
+ <a href="https://circleci.com/gh/huggingface/transformers">
29
+ <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
30
+ </a>
31
+ <a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
32
+ <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
33
+ </a>
34
+ <a href="https://huggingface.co/docs/transformers/index">
35
+ <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
36
+ </a>
37
+ <a href="https://github.com/huggingface/transformers/releases">
38
+ <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
39
+ </a>
40
+ <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
41
+ <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
42
+ </a>
43
+ <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
44
+ </p>
45
+
46
+ <h4 align="center">
47
+ <p>
48
+ <b>English</b> |
49
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
50
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
51
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
52
+ <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
53
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
54
+ <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
55
+ </p>
56
+ </h4>
57
+
58
+ <h3 align="center">
59
+ <p>State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow</p>
60
+ </h3>
61
+
62
+ <h3 align="center">
63
+ <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
64
+ </h3>
65
+
66
+ 🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
67
+
68
+ These models can be applied on:
69
+
70
+ * 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, text generation, in over 100 languages.
71
+ * 🖼️ Images, for tasks like image classification, object detection, and segmentation.
72
+ * 🗣️ Audio, for tasks like speech recognition and audio classification.
73
+
74
+ Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
75
+
76
+ 🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments.
77
+
78
+ 🤗 Transformers is backed by the three most popular deep learning libraries — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.
79
+
80
+ ## Online demos
81
+
82
+ You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models.
83
+
84
+ Here are a few examples:
85
+
86
+ In Natural Language Processing:
87
+ - [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
88
+ - [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
89
+ - [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
90
+ - [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
91
+ - [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
92
+ - [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
93
+ - [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
94
+
95
+ In Computer Vision:
96
+ - [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224)
97
+ - [Object Detection with DETR](https://huggingface.co/facebook/detr-resnet-50)
98
+ - [Semantic Segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
99
+ - [Panoptic Segmentation with MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco)
100
+ - [Depth Estimation with DPT](https://huggingface.co/docs/transformers/model_doc/dpt)
101
+ - [Video Classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
102
+ - [Universal Segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
103
+
104
+ In Audio:
105
+ - [Automatic Speech Recognition with Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h)
106
+ - [Keyword Spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
107
+ - [Audio Classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
108
+
109
+ In Multimodal tasks:
110
+ - [Table Question Answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
111
+ - [Visual Question Answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
112
+ - [Zero-shot Image Classification with CLIP](https://huggingface.co/openai/clip-vit-large-patch14)
113
+ - [Document Question Answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
114
+ - [Zero-shot Video Classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
115
+
116
+ **[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team, is the official demo of this repo’s text generation capabilities.
117
+
118
+ ## If you are looking for custom support from the Hugging Face team
119
+
120
+ <a target="_blank" href="https://huggingface.co/support">
121
+ <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
122
+ </a><br>
123
+
124
+ ## Quick tour
125
+
126
+ To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
127
+
128
+ ```python
129
+ >>> from transformers import pipeline
130
+
131
+ # Allocate a pipeline for sentiment-analysis
132
+ >>> classifier = pipeline('sentiment-analysis')
133
+ >>> classifier('We are very happy to introduce pipeline to the transformers repository.')
134
+ [{'label': 'POSITIVE', 'score': 0.9996980428695679}]
135
+ ```
136
+
137
+ The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here the answer is "positive" with a confidence of 99.97%.
138
+
139
+ Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:
140
+
141
+ ``` python
142
+ >>> import requests
143
+ >>> from PIL import Image
144
+ >>> from transformers import pipeline
145
+
146
+ # Download an image with cute cats
147
+ >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
148
+ >>> image_data = requests.get(url, stream=True).raw
149
+ >>> image = Image.open(image_data)
150
+
151
+ # Allocate a pipeline for object detection
152
+ >>> object_detector = pipeline('object-detection')
153
+ >>> object_detector(image)
154
+ [{'score': 0.9982201457023621,
155
+ 'label': 'remote',
156
+ 'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
157
+ {'score': 0.9960021376609802,
158
+ 'label': 'remote',
159
+ 'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
160
+ {'score': 0.9954745173454285,
161
+ 'label': 'couch',
162
+ 'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
163
+ {'score': 0.9988006353378296,
164
+ 'label': 'cat',
165
+ 'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
166
+ {'score': 0.9986783862113953,
167
+ 'label': 'cat',
168
+ 'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
169
+ ```
170
+
171
+ Here we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:
172
+
173
+ <h3 align="center">
174
+ <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
175
+ <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
176
+ </h3>
177
+
178
+ You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).
179
+
180
+ In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version:
181
+ ```python
182
+ >>> from transformers import AutoTokenizer, AutoModel
183
+
184
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
185
+ >>> model = AutoModel.from_pretrained("bert-base-uncased")
186
+
187
+ >>> inputs = tokenizer("Hello world!", return_tensors="pt")
188
+ >>> outputs = model(**inputs)
189
+ ```
190
+
191
+ And here is the equivalent code for TensorFlow:
192
+ ```python
193
+ >>> from transformers import AutoTokenizer, TFAutoModel
194
+
195
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
196
+ >>> model = TFAutoModel.from_pretrained("bert-base-uncased")
197
+
198
+ >>> inputs = tokenizer("Hello world!", return_tensors="tf")
199
+ >>> outputs = model(**inputs)
200
+ ```
201
+
202
+ The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator.
203
+
204
+ The model itself is a regular [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.
205
+
206
+ ## Why should I use transformers?
207
+
208
+ 1. Easy-to-use state-of-the-art models:
209
+ - High performance on natural language understanding & generation, computer vision, and audio tasks.
210
+ - Low barrier to entry for educators and practitioners.
211
+ - Few user-facing abstractions with just three classes to learn.
212
+ - A unified API for using all our pretrained models.
213
+
214
+ 1. Lower compute costs, smaller carbon footprint:
215
+ - Researchers can share trained models instead of always retraining.
216
+ - Practitioners can reduce compute time and production costs.
217
+ - Dozens of architectures with over 60,000 pretrained models across all modalities.
218
+
219
+ 1. Choose the right framework for every part of a model's lifetime:
220
+ - Train state-of-the-art models in 3 lines of code.
221
+ - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
222
+ - Seamlessly pick the right framework for training, evaluation and production.
223
+
224
+ 1. Easily customize a model or an example to your needs:
225
+ - We provide examples for each architecture to reproduce the results published by its original authors.
226
+ - Model internals are exposed as consistently as possible.
227
+ - Model files can be used independently of the library for quick experiments.
228
+
229
+ ## Why shouldn't I use transformers?
230
+
231
+ - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
232
+ - The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)).
233
+ - While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.
234
+
235
+ ## Installation
236
+
237
+ ### With pip
238
+
239
+ This repository is tested on Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ and TensorFlow 2.3+.
240
+
241
+ You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
242
+
243
+ First, create a virtual environment with the version of Python you're going to use and activate it.
244
+
245
+ Then, you will need to install at least one of Flax, PyTorch or TensorFlow.
246
+ Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific installation command for your platform.
247
+
248
+ When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
249
+
250
+ ```bash
251
+ pip install transformers
252
+ ```
253
+
254
+ If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).
255
+
256
+ ### With conda
257
+
258
+ Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
259
+
260
+ 🤗 Transformers can be installed using conda as follows:
261
+
262
+ ```shell script
263
+ conda install -c huggingface transformers
264
+ ```
265
+
266
+ Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.
267
+
268
+ > **_NOTE:_** On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).
269
+
270
+ ## Model architectures
271
+
272
+ **[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
273
+
274
+ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
275
+
276
+ 🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
277
+
278
+ 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
279
+ 1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
280
+ 1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
281
+ 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
282
+ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
283
+ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
284
+ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
285
+ 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
286
+ 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
287
+ 1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
288
+ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
289
+ 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
290
+ 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
291
+ 1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
292
+ 1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
293
+ 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
294
+ 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
295
+ 1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
296
+ 1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
297
+ 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
298
+ 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
299
+ 1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
300
+ 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
301
+ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
302
+ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
303
+ 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
304
+ 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
305
+ 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
306
+ 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
307
+ 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
308
+ 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
309
+ 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
310
+ 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
311
+ 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
312
+ 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
313
+ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
314
+ 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
315
+ 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
316
+ 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
317
+ 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
318
+ 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
319
+ 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
320
+ 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
321
+ 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
322
+ 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
323
+ 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
324
+ 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
325
+ 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
326
+ 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
327
+ 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
328
+ 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
329
+ 1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
330
+ 1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
331
+ 1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
332
+ 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
333
+ 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
334
+ 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
335
+ 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
336
+ 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
337
+ 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
338
+ 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
339
+ 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
340
+ 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
341
+ 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
342
+ 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
343
+ 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
344
+ 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
345
+ 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
346
+ 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
347
+ 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
348
+ 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
349
+ 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
350
+ 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
351
+ 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
352
+ 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
353
+ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
354
+ 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
355
+ 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
356
+ 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
357
+ 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
358
+ 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
359
+ 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
360
+ 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
361
+ 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
362
+ 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
363
+ 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
364
+ 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
365
+ 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
366
+ 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
367
+ 1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
368
+ 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
369
+ 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
370
+ 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
371
+ 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
372
+ 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
373
+ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
374
+ 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
375
+ 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
376
+ 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
377
+ 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
378
+ 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
379
+ 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
380
+ 1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
381
+ 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
382
+ 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
383
+ 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
384
+ 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
385
+ 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
386
+ 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
387
+ 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
388
+ 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
389
+ 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
390
+ 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
391
+ 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
392
+ 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
393
+ 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
394
+ 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
395
+ 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/main/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
396
+ 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
397
+ 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
398
+ 1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
399
+ 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
400
+ 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
401
+ 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
402
+ 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from DeepMind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
403
+ 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
404
+ 1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
405
+ 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
406
+ 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
407
+ 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
408
+ 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
409
+ 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
410
+ 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
411
+ 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
412
+ 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
413
+ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
414
+ 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
415
+ 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
416
+ 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
417
+ 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
418
+ 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
419
+ 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
420
+ 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
421
+ 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
422
+ 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
423
+ 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
424
+ 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
425
+ 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
426
+ 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
427
+ 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
428
+ 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
429
+ 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
430
+ 1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
431
+ 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
432
+ 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
433
+ 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
434
+ 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
435
+ 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
436
+ 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
437
+ 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
438
+ 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
439
+ 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
440
+ 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
441
+ 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
442
+ 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
443
+ 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
444
+ 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
445
+ 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
446
+ 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
447
+ 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
448
+ 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
449
+ 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
450
+ 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
451
+ 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
452
+ 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
453
+ 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
454
+ 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
455
+ 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
456
+ 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
457
+ 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
458
+ 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
459
+ 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
460
+ 1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
461
+ 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
462
+ 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
463
+ 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
464
+ 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
465
+ 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
466
+ 1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
467
+ 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
468
+ 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
469
+ 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
470
+ 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
471
+ 1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
472
+ 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
473
+
474
+ To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
475
+
476
+ These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples).
477
+
478
+
479
+ ## Learn more
480
+
481
+ | Section | Description |
482
+ |-|-|
483
+ | [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials |
484
+ | [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers |
485
+ | [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models |
486
+ | [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
487
+ | [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks |
488
+ | [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community |
489
+ | [Migration](https://huggingface.co/docs/transformers/migration) | Migrate to 🤗 Transformers from `pytorch-transformers` or `pytorch-pretrained-bert` |
490
+
491
+ ## Citation
492
+
493
+ We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
494
+ ```bibtex
495
+ @inproceedings{wolf-etal-2020-transformers,
496
+ title = "Transformers: State-of-the-Art Natural Language Processing",
497
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
498
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
499
+ month = oct,
500
+ year = "2020",
501
+ address = "Online",
502
+ publisher = "Association for Computational Linguistics",
503
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
504
+ pages = "38--45"
505
+ }
506
+ ```
507
+
hf-dev-train/transformers-main/README_es.md ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ <p align="center">
18
+ <br>
19
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
20
+ <br>
21
+ </p>
22
+ <p align="center">
23
+ <a href="https://circleci.com/gh/huggingface/transformers">
24
+ <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
25
+ </a>
26
+ <a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
27
+ <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
28
+ </a>
29
+ <a href="https://huggingface.co/docs/transformers/index">
30
+ <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
31
+ </a>
32
+ <a href="https://github.com/huggingface/transformers/releases">
33
+ <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
34
+ </a>
35
+ <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
36
+ <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
37
+ </a>
38
+ <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
39
+ </p>
40
+
41
+ <h4 align="center">
42
+ <p>
43
+ <a href="https://github.com/huggingface/transformers/">English</a> |
44
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
45
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
46
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
47
+ <b>Español</b> |
48
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
49
+ <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
50
+ </p>
51
+ </h4>
52
+
53
+ <h3 align="center">
54
+ <p>Lo último de Machine Learning para JAX, PyTorch y TensorFlow</p>
55
+ </h3>
56
+
57
+ <h3 align="center">
58
+ <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
59
+ </h3>
60
+
61
+ 🤗 Transformers aporta miles de modelos preentrenados para realizar tareas en diferentes modalidades como texto, visión y audio.
62
+
63
+ Estos modelos pueden ser aplicados en:
64
+
65
+ * 📝 Texto, para tareas como clasificación de texto, extracción de información, responder preguntas, resumir, traducir, generación de texto, en más de 100 idiomas.
66
+ * 🖼️ Imágenes, para tareas como clasificación de imágenes, detección de objetos, y segmentación.
67
+ * 🗣️ Audio, para tareas como reconocimiento de voz y clasificación de audio.
68
+
69
+ Los modelos de Transformer también pueden realizar tareas en **muchas modalidades combinadas**, como responder preguntas, reconocimiento óptico de caracteres, extracción de información de documentos escaneados, clasificación de video, y respuesta de preguntas visuales.
70
+
71
+ 🤗 Transformers aporta APIs para descargar rápidamente y usar estos modelos preentrenados en un texto dado, afinarlos en tus propios sets de datos y compartirlos con la comunidad en nuestro [centro de modelos](https://huggingface.co/models). Al mismo tiempo, cada módulo de Python que define una arquitectura es completamente independiente y se puede modificar para permitir experimentos de investigación rápidos.
72
+
73
+ 🤗 Transformers está respaldado por las tres bibliotecas de deep learning más populares — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) y [TensorFlow](https://www.tensorflow.org/) — con una perfecta integración entre ellas. Es sencillo entrenar sus modelos con una antes de cargarlos para la inferencia con la otra.
74
+
75
+ ## Demostraciones en línea
76
+
77
+ Puedes probar la mayoría de nuestros modelos directamente en sus páginas desde el [centro de modelos](https://huggingface.co/models). También ofrecemos [alojamiento de modelos privados, control de versiones y una API de inferencia](https://huggingface.co/pricing) para modelos públicos y privados.
78
+
79
+ Aquí hay algunos ejemplos:
80
+
81
+ En procesamiento del lenguaje natural:
82
+ - [Terminación de palabras enmascaradas con BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
83
+ - [Reconocimiento del nombre de la entidad con Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
84
+ - [Generación de texto con GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
85
+ - [Inferencia del lenguaje natural con RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
86
+ - [Resumen con BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
87
+ - [Responder a preguntas con DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
88
+ - [Traducción con T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
89
+
90
+ En visión de ordenador:
91
+ - [Clasificación de imágenes con ViT](https://huggingface.co/google/vit-base-patch16-224)
92
+ - [Detección de objetos con DETR](https://huggingface.co/facebook/detr-resnet-50)
93
+ - [Segmentación semántica con SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
94
+ - [Segmentación panóptica con DETR](https://huggingface.co/facebook/detr-resnet-50-panoptic)
95
+ - [Segmentación Universal con OneFormer (Segmentación Semántica, de Instancia y Panóptica con un solo modelo)](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
96
+
97
+ En Audio:
98
+ - [Reconocimiento de voz automático con Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h)
99
+ - [Detección de palabras clave con Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
100
+
101
+ En tareas multimodales:
102
+ - [Respuesta visual a preguntas con ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
103
+
104
+ **[Escribe con Transformer](https://transformer.huggingface.co)**, construido por el equipo de Hugging Face, es la demostración oficial de las capacidades de generación de texto de este repositorio.
105
+
106
+ ## Si está buscando soporte personalizado del equipo de Hugging Face
107
+
108
+ <a target="_blank" href="https://huggingface.co/support">
109
+ <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
110
+ </a><br>
111
+
112
+ ## Tour rápido
113
+
114
+ Para usar inmediatamente un modelo en una entrada determinada (texto, imagen, audio, ...), proporcionamos la API de `pipeline`. Los pipelines agrupan un modelo previamente entrenado con el preprocesamiento que se usó durante el entrenamiento de ese modelo. Aquí se explica cómo usar rápidamente un pipeline para clasificar textos positivos frente a negativos:
115
+
116
+ ```python
117
+ >>> from transformers import pipeline
118
+
119
+ # Allocate a pipeline for sentiment-analysis
120
+ >>> classifier = pipeline('sentiment-analysis')
121
+ >>> classifier('We are very happy to introduce pipeline to the transformers repository.')
122
+ [{'label': 'POSITIVE', 'score': 0.9996980428695679}]
123
+ ```
124
+
125
+ La segunda línea de código descarga y almacena en caché el modelo previamente entrenado que usa la canalización, mientras que la tercera lo evalúa en el texto dado. Aquí la respuesta es "positiva" con una confianza del 99,97%.
126
+
127
+ Muchas tareas tienen un `pipeline` preentrenado listo para funcionar, en NLP pero también en visión por ordenador y habla. Por ejemplo, podemos extraer fácilmente los objetos detectados en una imagen:
128
+
129
+ ```python
130
+ >>> import requests
131
+ >>> from PIL import Image
132
+ >>> from transformers import pipeline
133
+
134
+ # Download an image with cute cats
135
+ >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
136
+ >>> image_data = requests.get(url, stream=True).raw
137
+ >>> image = Image.open(image_data)
138
+
139
+ # Allocate a pipeline for object detection
140
+ >>> object_detector = pipeline('object-detection')
141
+ >>> object_detector(image)
142
+ [{'score': 0.9982201457023621,
143
+ 'label': 'remote',
144
+ 'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
145
+ {'score': 0.9960021376609802,
146
+ 'label': 'remote',
147
+ 'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
148
+ {'score': 0.9954745173454285,
149
+ 'label': 'couch',
150
+ 'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
151
+ {'score': 0.9988006353378296,
152
+ 'label': 'cat',
153
+ 'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
154
+ {'score': 0.9986783862113953,
155
+ 'label': 'cat',
156
+ 'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
157
+ ```
158
+
159
+ Aquí obtenemos una lista de objetos detectados en la imagen, con un cuadro que rodea el objeto y una puntuación de confianza. Aquí está la imagen original a la izquierda, con las predicciones mostradas a la derecha:
160
+
161
+ <h3 align="center">
162
+ <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
163
+ <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
164
+ </h3>
165
+
166
+ Puedes obtener más información sobre las tareas admitidas por la API de `pipeline` en [este tutorial](https://huggingface.co/docs/transformers/task_summary).
167
+
168
+ Además de `pipeline`, para descargar y usar cualquiera de los modelos previamente entrenados en su tarea dada, todo lo que necesita son tres líneas de código. Aquí está la versión de PyTorch:
169
+ ```python
170
+ >>> from transformers import AutoTokenizer, AutoModel
171
+
172
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
173
+ >>> model = AutoModel.from_pretrained("bert-base-uncased")
174
+
175
+ >>> inputs = tokenizer("Hello world!", return_tensors="pt")
176
+ >>> outputs = model(**inputs)
177
+ ```
178
+
179
+ Y aquí está el código equivalente para TensorFlow:
180
+ ```python
181
+ >>> from transformers import AutoTokenizer, TFAutoModel
182
+
183
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
184
+ >>> model = TFAutoModel.from_pretrained("bert-base-uncased")
185
+
186
+ >>> inputs = tokenizer("Hello world!", return_tensors="tf")
187
+ >>> outputs = model(**inputs)
188
+ ```
189
+
190
+ El tokenizador es responsable de todo el preprocesamiento que espera el modelo preentrenado y se puede llamar directamente en una sola cadena (como en los ejemplos anteriores) o en una lista. Dará como resultado un diccionario que puedes usar en el código descendente o simplemente pasarlo directamente a tu modelo usando el operador de desempaquetado de argumentos `**`.
191
+
192
+ El modelo en sí es un [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) normal o un [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (dependiendo de tu backend) que puedes usar de forma habitual. [Este tutorial](https://huggingface.co/docs/transformers/training) explica cómo integrar un modelo de este tipo en un ciclo de entrenamiento PyTorch o TensorFlow clásico, o cómo usar nuestra API `Trainer` para ajustarlo rápidamente con un nuevo conjunto de datos.
193
+
194
+ ## ¿Por qué debo usar transformers?
195
+
196
+ 1. Modelos de última generación fáciles de usar:
197
+ - Alto rendimiento en comprensión y generación de lenguaje natural, visión artificial y tareas de audio.
198
+ - Baja barrera de entrada para educadores y profesionales.
199
+ - Pocas abstracciones de cara al usuario con solo tres clases para aprender.
200
+ - Una API unificada para usar todos nuestros modelos preentrenados.
201
+
202
+ 1. Menores costes de cómputo, menor huella de carbono:
203
+ - Los investigadores pueden compartir modelos entrenados en lugar de siempre volver a entrenar.
204
+ - Los profesionales pueden reducir el tiempo de cómputo y los costos de producción.
205
+ - Docenas de arquitecturas con más de 60 000 modelos preentrenados en todas las modalidades.
206
+
207
+ 1. Elija el marco adecuado para cada parte de la vida útil de un modelo:
208
+ - Entrene modelos de última generación en 3 líneas de código.
209
+ - Mueva un solo modelo entre los marcos TF2.0/PyTorch/JAX a voluntad.
210
+ - Elija sin problemas el marco adecuado para la formación, la evaluación y la producción.
211
+
212
+ 1. Personalice fácilmente un modelo o un ejemplo según sus necesidades:
213
+ - Proporcionamos ejemplos de cada arquitectura para reproducir los resultados publicados por sus autores originales.
214
+ - Los internos del modelo están expuestos lo más consistentemente posible.
215
+ - Los archivos modelo se pueden usar independientemente de la biblioteca para experimentos rápidos.
216
+
217
+ ## ¿Por qué no debería usar transformers?
218
+
219
+ - Esta biblioteca no es una caja de herramientas modular de bloques de construcción para redes neuronales. El código en los archivos del modelo no se refactoriza con abstracciones adicionales a propósito, de modo que los investigadores puedan iterar rápidamente en cada uno de los modelos sin sumergirse en abstracciones/archivos adicionales.
220
+ - La API de entrenamiento no está diseñada para funcionar con cualquier modelo, sino que está optimizada para funcionar con los modelos proporcionados por la biblioteca. Para bucles genéricos de aprendizaje automático, debe usar otra biblioteca (posiblemente, [Accelerate](https://huggingface.co/docs/accelerate)).
221
+ - Si bien nos esforzamos por presentar tantos casos de uso como sea posible, los scripts en nuestra [carpeta de ejemplos](https://github.com/huggingface/transformers/tree/main/examples) son solo eso: ejemplos. Se espera que no funcionen de forma inmediata en su problema específico y que deba cambiar algunas líneas de código para adaptarlas a sus necesidades.
222
+
223
+ ## Instalación
224
+
225
+ ### Con pip
226
+
227
+ Este repositorio está probado en Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ y TensorFlow 2.3+.
228
+
229
+ Deberías instalar 🤗 Transformers en un [ambiente virtual](https://docs.python.org/3/library/venv.html). Si no estás familiarizado con los entornos virtuales de Python, consulta la [guía de usuario](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
230
+
231
+ Primero, crea un entorno virtual con la versión de Python que vas a usar y actívalo.
232
+
233
+ Luego, deberás instalar al menos uno de Flax, PyTorch o TensorFlow.
234
+ Por favor, ve a la [página de instalación de TensorFlow](https://www.tensorflow.org/install/), [página de instalación de PyTorch](https://pytorch.org/get-started/locally/#start-locally) y/o las páginas de instalación de [Flax](https://github.com/google/flax#quick-install) y [Jax](https://github.com/google/jax#installation) con respecto al comando de instalación específico para tu plataforma.
235
+
236
+ Cuando se ha instalado uno de esos backends, los 🤗 Transformers se pueden instalar usando pip de la siguiente manera:
237
+
238
+ ```bash
239
+ pip install transformers
240
+ ```
241
+
242
+ Si deseas jugar con los ejemplos o necesitas la última versión del código y no puedes esperar a una nueva versión, tienes que [instalar la librería de la fuente](https://huggingface.co/docs/transformers/installation#installing-from-source).
243
+
244
+ ### Con conda
245
+
246
+ Desde la versión v4.0.0 de Transformers, ahora tenemos un canal conda: `huggingface`.
247
+
248
+ 🤗 Transformers se puede instalar usando conda de la siguiente manera:
249
+
250
+ ```shell script
251
+ conda install -c huggingface transformers
252
+ ```
253
+
254
+ Sigue las páginas de instalación de Flax, PyTorch o TensorFlow para ver cómo instalarlos con conda.
255
+
256
+ > **_NOTA:_** En Windows, es posible que se le pida que active el modo de desarrollador para beneficiarse del almacenamiento en caché. Si esta no es una opción para usted, háganoslo saber en [esta issue](https://github.com/huggingface/huggingface_hub/issues/1062).
257
+
258
+ ## Arquitecturas modelo
259
+
260
+ **[Todos los puntos de control del modelo](https://huggingface.co/models)** aportados por 🤗 Transformers están perfectamente integrados desde huggingface.co [Centro de modelos](https://huggingface.co) donde son subidos directamente por los [usuarios](https://huggingface.co/users) y [organizaciones](https://huggingface.co/organizations).
261
+
262
+ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
263
+
264
+ 🤗 Transformers actualmente proporciona las siguientes arquitecturas (ver [aquí](https://huggingface.co/docs/transformers/model_summary) para un resumen de alto nivel de cada una de ellas):
265
+
266
+ 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
267
+ 1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
268
+ 1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
269
+ 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
270
+ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
271
+ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
272
+ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
273
+ 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
274
+ 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
275
+ 1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
276
+ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
277
+ 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
278
+ 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
279
+ 1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
280
+ 1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
281
+ 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
282
+ 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
283
+ 1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
284
+ 1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
285
+ 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
286
+ 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
287
+ 1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
288
+ 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
289
+ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
290
+ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
291
+ 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
292
+ 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
293
+ 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
294
+ 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
295
+ 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
296
+ 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
297
+ 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
298
+ 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
299
+ 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
300
+ 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
301
+ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
302
+ 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
303
+ 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
304
+ 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
305
+ 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
306
+ 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
307
+ 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
308
+ 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
309
+ 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
310
+ 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
311
+ 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
312
+ 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
313
+ 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
314
+ 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
315
+ 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
316
+ 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
317
+ 1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
318
+ 1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
319
+ 1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
320
+ 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
321
+ 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
322
+ 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
323
+ 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
324
+ 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
325
+ 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
326
+ 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
327
+ 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
328
+ 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
329
+ 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
330
+ 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
331
+ 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
332
+ 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
333
+ 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
334
+ 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
335
+ 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
336
+ 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
337
+ 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
338
+ 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
339
+ 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
340
+ 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
341
+ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
342
+ 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
343
+ 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
344
+ 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
345
+ 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
346
+ 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
347
+ 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
348
+ 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
349
+ 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
350
+ 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
351
+ 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
352
+ 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
353
+ 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
354
+ 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
355
+ 1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
356
+ 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
357
+ 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
358
+ 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
359
+ 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
360
+ 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
361
+ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
362
+ 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
363
+ 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
364
+ 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
365
+ 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
366
+ 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
367
+ 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
368
+ 1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
369
+ 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
370
+ 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
371
+ 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
372
+ 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
373
+ 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
374
+ 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
375
+ 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
376
+ 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
377
+ 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
378
+ 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
379
+ 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
380
+ 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
381
+ 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
382
+ 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
383
+ 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/main/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
384
+ 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
385
+ 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
386
+ 1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
387
+ 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
388
+ 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
389
+ 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
390
+ 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from DeepMind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
391
+ 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
392
+ 1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
393
+ 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
394
+ 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
395
+ 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
396
+ 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
397
+ 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
398
+ 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
399
+ 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
400
+ 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
401
+ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
402
+ 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
403
+ 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
404
+ 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
405
+ 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
406
+ 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
407
+ 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
408
+ 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
409
+ 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
410
+ 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
411
+ 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
412
+ 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
413
+ 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
414
+ 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
415
+ 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
416
+ 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
417
+ 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
418
+ 1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
419
+ 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
420
+ 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
421
+ 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
422
+ 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
423
+ 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
424
+ 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
425
+ 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
426
+ 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
427
+ 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
428
+ 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
429
+ 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
430
+ 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
431
+ 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
432
+ 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
433
+ 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
434
+ 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
435
+ 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
436
+ 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
437
+ 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
438
+ 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
439
+ 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
440
+ 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
441
+ 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
442
+ 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
443
+ 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
444
+ 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
445
+ 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
446
+ 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
447
+ 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
448
+ 1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
449
+ 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
450
+ 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
451
+ 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
452
+ 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
453
+ 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
454
+ 1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
455
+ 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
456
+ 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
457
+ 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
458
+ 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
459
+ 1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
460
+ 1. ¿Quieres aportar un nuevo modelo? Hemos agregado una **guía detallada y plantillas** para guiarte en el proceso de agregar un nuevo modelo. Puedes encontrarlas en la carpeta de [`templates`](./templates) del repositorio. Asegúrate de revisar las [pautas de contribución](./CONTRIBUTING.md) y comunícate con los mantenedores o abre un issue para recopilar comentarios antes de comenzar tu PR.
461
+
462
+ Para comprobar si cada modelo tiene una implementación en Flax, PyTorch o TensorFlow, o tiene un tokenizador asociado respaldado por la librería 🤗 Tokenizers, ve a [esta tabla](https://huggingface.co/docs/transformers/index#supported-frameworks).
463
+
464
+ Estas implementaciones se han probado en varios conjuntos de datos (consulte los scripts de ejemplo) y deberían coincidir con el rendimiento de las implementaciones originales. Puede encontrar más detalles sobre el rendimiento en la sección Examples de la [documentación](https://github.com/huggingface/transformers/tree/main/examples).
465
+
466
+
467
+ ## Aprender más
468
+
469
+ | Sección | Descripción |
470
+ |-|-|
471
+ | [Documentación](https://huggingface.co/docs/transformers/) | Toda la documentación de la API y tutoriales |
472
+ | [Resumen de tareas](https://huggingface.co/docs/transformers/task_summary) | Tareas soportadas por 🤗 Transformers |
473
+ | [Tutorial de preprocesamiento](https://huggingface.co/docs/transformers/preprocessing) | Usando la clase `Tokenizer` para preparar datos para los modelos |
474
+ | [Entrenamiento y puesta a punto](https://huggingface.co/docs/transformers/training) | Usando los modelos aportados por 🤗 Transformers en un bucle de entreno de PyTorch/TensorFlow y la API de `Trainer` |
475
+ | [Recorrido rápido: secuencias de comandos de ajuste/uso](https://github.com/huggingface/transformers/tree/main/examples) | Scripts de ejemplo para ajustar modelos en una amplia gama de tareas |
476
+ | [Compartir y subir modelos](https://huggingface.co/docs/transformers/model_sharing) | Carga y comparte tus modelos perfeccionados con la comunidad |
477
+ | [Migración](https://huggingface.co/docs/transformers/migration) | Migra a 🤗 Transformers desde `pytorch-transformers` o `pytorch-pretrained-bert` |
478
+
479
+ ## Citación
480
+
481
+ Ahora tenemos un [artículo](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) que puedes citar para la librería de 🤗 Transformers:
482
+ ```bibtex
483
+ @inproceedings{wolf-etal-2020-transformers,
484
+ title = "Transformers: State-of-the-Art Natural Language Processing",
485
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
486
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
487
+ month = oct,
488
+ year = "2020",
489
+ address = "Online",
490
+ publisher = "Association for Computational Linguistics",
491
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
492
+ pages = "38--45"
493
+ }
494
+ ```
hf-dev-train/transformers-main/README_hd.md ADDED
The diff for this file is too large to render. See raw diff
 
hf-dev-train/transformers-main/README_ja.md ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ <!---
18
+ A useful guide for English-Japanese translation of Hugging Face documentation
19
+ - Use square quotes, e.g.,「引用」
20
+
21
+ Dictionary
22
+
23
+ API: API(翻訳しない)
24
+ add: 追加
25
+ checkpoint: チェックポイント
26
+ code: コード
27
+ community: コミュニティ
28
+ confidence: 信頼度
29
+ dataset: データセット
30
+ documentation: ドキュメント
31
+ example: 例
32
+ finetune: 微調整
33
+ Hugging Face: Hugging Face(翻訳しない)
34
+ implementation: 実装
35
+ inference: 推論
36
+ library: ライブラリ
37
+ module: モジュール
38
+ NLP/Natural Language Processing: NLPと表示される場合は翻訳されず、Natural Language Processingと表示される場合は翻訳される
39
+ online demos: オンラインデモ
40
+ pipeline: pipeline(翻訳しない)
41
+ pretrained/pretrain: 学習済み
42
+ Python data structures (e.g., list, set, dict): リスト、セット、ディクショナリと訳され、括弧内は原文英語
43
+ repository: repository(翻訳しない)
44
+ summary: 概要
45
+ token-: token-(翻訳しない)
46
+ Trainer: Trainer(翻訳しない)
47
+ transformer: transformer(翻訳しない)
48
+ tutorial: チュートリアル
49
+ user: ユーザ
50
+ -->
51
+
52
+ <p align="center">
53
+ <br>
54
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
55
+ <br>
56
+ </p>
57
+ <p align="center">
58
+ <a href="https://circleci.com/gh/huggingface/transformers">
59
+ <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
60
+ </a>
61
+ <a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
62
+ <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
63
+ </a>
64
+ <a href="https://huggingface.co/docs/transformers/index">
65
+ <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
66
+ </a>
67
+ <a href="https://github.com/huggingface/transformers/releases">
68
+ <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
69
+ </a>
70
+ <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
71
+ <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
72
+ </a>
73
+ <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
74
+ </p>
75
+
76
+ <h4 align="center">
77
+ <p>
78
+ <a href="https://github.com/huggingface/transformers/">English</a> |
79
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
80
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
81
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
82
+ <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
83
+ <b>日本語</b> |
84
+ <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
85
+ </p>
86
+ </h4>
87
+
88
+ <h3 align="center">
89
+ <p>JAX、PyTorch、TensorFlowのための最先端機械学習</p>
90
+ </h3>
91
+
92
+ <h3 align="center">
93
+ <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
94
+ </h3>
95
+
96
+ 🤗Transformersは、テキスト、視覚、音声などの異なるモダリティに対してタスクを実行するために、事前に学習させた数千のモデルを提供します。
97
+
98
+ これらのモデルは次のような場合に適用できます:
99
+
100
+ * 📝 テキストは、テキストの分類、情報抽出、質問応答、要約、翻訳、テキスト生成などのタスクのために、100以上の言語に対応しています。
101
+ * 🖼️ 画像分類、物体検出、セグメンテーションなどのタスクのための画像。
102
+ * 🗣️ 音声は、音声認識や音声分類などのタスクに使用します。
103
+
104
+ トランスフォーマーモデルは、テーブル質問応答、光学文字認識、スキャン文書からの情報抽出、ビデオ分類、視覚的質問応答など、**複数のモダリティを組み合わせた**タスクも実行可能です。
105
+
106
+ 🤗Transformersは、与えられたテキストに対してそれらの事前学習されたモデルを素早くダウンロードして使用し、あなた自身のデータセットでそれらを微調整し、私たちの[model hub](https://huggingface.co/models)でコミュニティと共有するためのAPIを提供します。同時に、アーキテクチャを定義する各Pythonモジュールは完全にスタンドアロンであり、迅速な研究実験を可能にするために変更することができます。
107
+
108
+ 🤗Transformersは[Jax](https://jax.readthedocs.io/en/latest/)、[PyTorch](https://pytorch.org/)、[TensorFlow](https://www.tensorflow.org/)という3大ディープラーニングライブラリーに支えられ、それぞれのライブラリをシームレスに統合しています。片方でモデルを学習してから、もう片方で推論用にロードするのは簡単なことです。
109
+
110
+ ## オンラインデモ
111
+
112
+ [model hub](https://huggingface.co/models)から、ほとんどのモデルのページで直接テストすることができます。また、パブリックモデル、プライベートモデルに対して、[プライベートモデルのホスティング、バージョニング、推論API](https://huggingface.co/pricing)を提供しています。
113
+
114
+ 以下はその一例です:
115
+
116
+ 自然言語処理にて:
117
+ - [BERTによるマスクドワード補完](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
118
+ - [Electraによる名前実体認識](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
119
+ - [GPT-2によるテキスト生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
120
+ - [RoBERTaによる自然言語推論](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
121
+ - [BARTによる要約](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
122
+ - [DistilBERTによる質問応答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
123
+ - [T5による翻訳](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
124
+
125
+ コンピュータビジョンにて:
126
+ - [ViTによる画像分類](https://huggingface.co/google/vit-base-patch16-224)
127
+ - [DETRによる物体検出](https://huggingface.co/facebook/detr-resnet-50)
128
+ - [SegFormerによるセマンティックセグメンテーション](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
129
+ - [DETRによるパノプティックセグメンテーション](https://huggingface.co/facebook/detr-resnet-50-panoptic)
130
+
131
+ オーディオにて:
132
+ - [Wav2Vec2による自動音声認識](https://huggingface.co/facebook/wav2vec2-base-960h)
133
+ - [Wav2Vec2によるキーワード検索](https://huggingface.co/superb/wav2vec2-base-superb-ks)
134
+
135
+ マルチモーダルなタスクにて:
136
+ - [ViLTによる視覚的質問応答](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
137
+
138
+ Hugging Faceチームによって作られた **[トランスフォーマーを使った書き込み](https://transformer.huggingface.co)** は、このリポジトリのテキスト生成機能の公式デモである。
139
+
140
+ ## Hugging Faceチームによるカスタム・サポートをご希望の場合
141
+
142
+ <a target="_blank" href="https://huggingface.co/support">
143
+ <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
144
+ </a><br>
145
+
146
+ ## クイックツアー
147
+
148
+ 与えられた入力(テキスト、画像、音声、...)に対してすぐにモデルを使うために、我々は`pipeline`というAPIを提供しております。pipelineは、学習済みのモデルと、そのモデルの学習時に使用された前処理をグループ化したものです。以下は、肯定的なテキストと否定的なテキストを分類するためにpipelineを使用する方法です:
149
+
150
+ ```python
151
+ >>> from transformers import pipeline
152
+
153
+ # Allocate a pipeline for sentiment-analysis
154
+ >>> classifier = pipeline('sentiment-analysis')
155
+ >>> classifier('We are very happy to introduce pipeline to the transformers repository.')
156
+ [{'label': 'POSITIVE', 'score': 0.9996980428695679}]
157
+ ```
158
+
159
+ 2行目のコードでは、pipelineで使用される事前学習済みモデルをダウンロードしてキャッシュし、3行目では与えられたテキストに対してそのモデルを評価します。ここでは、答えは99.97%の信頼度で「ポジティブ」です。
160
+
161
+ 自然言語処理だけでなく、コンピュータビジョンや音声処理においても、多くのタスクにはあらかじめ訓練された`pipeline`が用意されている。例えば、画像から検出された物体を簡単に抽出することができる:
162
+
163
+ ``` python
164
+ >>> import requests
165
+ >>> from PIL import Image
166
+ >>> from transformers import pipeline
167
+
168
+ # Download an image with cute cats
169
+ >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
170
+ >>> image_data = requests.get(url, stream=True).raw
171
+ >>> image = Image.open(image_data)
172
+
173
+ # Allocate a pipeline for object detection
174
+ >>> object_detector = pipeline('object-detection')
175
+ >>> object_detector(image)
176
+ [{'score': 0.9982201457023621,
177
+ 'label': 'remote',
178
+ 'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
179
+ {'score': 0.9960021376609802,
180
+ 'label': 'remote',
181
+ 'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
182
+ {'score': 0.9954745173454285,
183
+ 'label': 'couch',
184
+ 'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
185
+ {'score': 0.9988006353378296,
186
+ 'label': 'cat',
187
+ 'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
188
+ {'score': 0.9986783862113953,
189
+ 'label': 'cat',
190
+ 'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
191
+ ```
192
+
193
+ ここでは、画像から検出されたオブジェクトのリストが得られ、オブジェクトを囲むボックスと信頼度スコアが表示されます。左側が元画像、右側が予測結果を表示したものです:
194
+
195
+ <h3 align="center">
196
+ <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
197
+ <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
198
+ </h3>
199
+
200
+ [このチュートリアル](https://huggingface.co/docs/transformers/task_summary)では、`pipeline`APIでサポートされているタスクについて詳しく説明しています。
201
+
202
+ `pipeline`に加えて、与えられたタスクに学習済みのモデルをダウンロードして使用するために必要なのは、3行のコードだけです。以下はPyTorchのバージョンです:
203
+ ```python
204
+ >>> from transformers import AutoTokenizer, AutoModel
205
+
206
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
207
+ >>> model = AutoModel.from_pretrained("bert-base-uncased")
208
+
209
+ >>> inputs = tokenizer("Hello world!", return_tensors="pt")
210
+ >>> outputs = model(**inputs)
211
+ ```
212
+
213
+ そしてこちらはTensorFlowと同等のコードとなります:
214
+ ```python
215
+ >>> from transformers import AutoTokenizer, TFAutoModel
216
+
217
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
218
+ >>> model = TFAutoModel.from_pretrained("bert-base-uncased")
219
+
220
+ >>> inputs = tokenizer("Hello world!", return_tensors="tf")
221
+ >>> outputs = model(**inputs)
222
+ ```
223
+
224
+ トークナイザは学習済みモデルが期待するすべての前処理を担当し、単一の文字列 (上記の例のように) またはリストに対して直接呼び出すことができます。これは下流のコードで使用できる辞書を出力します。また、単純に ** 引数展開演算子を使用してモデルに直接渡すこともできます。
225
+
226
+ モデル自体は通常の[PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) または [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (バックエンドによって異なる)で、通常通り使用することが可能です。[このチュートリアル](https://huggingface.co/docs/transformers/training)では、このようなモデルを従来のPyTorchやTensorFlowの学習ループに統合する方法や、私たちの`Trainer`APIを使って新しいデータセットで素早く微調整を行う方法について説明します。
227
+
228
+ ## なぜtransformersを使う必要があるのでしょうか?
229
+
230
+ 1. 使いやすい最新モデル:
231
+ - 自然言語理解・生成、コンピュータビジョン、オーディオの各タスクで高いパフォーマンスを発揮します。
232
+ - 教育者、実務者にとっての低い参入障壁。
233
+ - 学習するクラスは3つだけで、ユーザが直面する抽象化はほとんどありません。
234
+ - 学習済みモデルを利用するための統一されたAPI。
235
+
236
+ 1. 低い計算コスト、少ないカーボンフットプリント:
237
+ - 研究者は、常に再トレーニングを行うのではなく、トレーニングされたモデルを共有することができます。
238
+ - 実務家は、計算時間や生産コストを削減することができます。
239
+ - すべてのモダリティにおいて、60,000以上の事前学習済みモデルを持つ数多くのアーキテクチャを提供します。
240
+
241
+ 1. モデルのライフタイムのあらゆる部分で適切なフレームワークを選択可能:
242
+ - 3行のコードで最先端のモデルをトレーニング。
243
+ - TF2.0/PyTorch/JAXフレームワーク間で1つのモデルを自在に移動させる。
244
+ - 学習、評価、生産に適したフレームワークをシームレスに選択できます。
245
+
246
+ 1. モデルやサンプルをニーズに合わせて簡単にカスタマイズ可能:
247
+ - 原著者が発表した結果を再現するために、各アーキテクチャの例を提供しています。
248
+ - モデル内部は可能な限り一貫して公開されています。
249
+ - モデルファイルはライブラリとは独立して利用することができ、迅速な実験が可能です。
250
+
251
+ ## なぜtransformersを使ってはいけないのでしょうか?
252
+
253
+ - このライブラリは、ニューラルネットのためのビルディングブロックのモジュール式ツールボックスではありません。モデルファイルのコードは、研究者が追加の抽象化/ファイルに飛び込むことなく、各モデルを素早く反復できるように、意図的に追加の抽象化でリファクタリングされていません。
254
+ - 学習APIはどのようなモデルでも動作するわけではなく、ライブラリが提供するモデルで動作するように最適化されています。一般的な機械学習のループには、別のライブラリ(おそらく[Accelerate](https://huggingface.co/docs/accelerate))を使用する必要があります。
255
+ - 私たちはできるだけ多くの使用例を紹介するよう努力していますが、[examples フォルダ](https://github.com/huggingface/transformers/tree/main/examples) にあるスクリプトはあくまで例です。あなたの特定の問題に対してすぐに動作するわけではなく、あなたのニーズに合わせるために数行のコードを変更する必要があることが予想されます。
256
+
257
+ ## インストール
258
+
259
+ ### pipにて
260
+
261
+ このリポジトリは、Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+, TensorFlow 2.3+ でテストされています。
262
+
263
+ 🤗Transformersは[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。Pythonの仮想環境に慣れていない場合は、[ユーザーガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)を確認してください。
264
+
265
+ まず、使用するバージョンのPythonで仮想環境を作成し、アクティベートします。
266
+
267
+ その後、Flax, PyTorch, TensorFlowのうち少なくとも1つをインストールする必要があります。
268
+ [TensorFlowインストールページ](https://www.tensorflow.org/install/)、[PyTorchインストールページ](https://pytorch.org/get-started/locally/#start-locally)、[Flax](https://github.com/google/flax#quick-install)、[Jax](https://github.com/google/jax#installation)インストールページで、お使いのプラットフォーム別のインストールコマンドを参照してください。
269
+
270
+ これらのバックエンドのいずれかがインストールされている場合、🤗Transformersは以下のようにpipを使用してインストールすることができます:
271
+
272
+ ```bash
273
+ pip install transformers
274
+ ```
275
+
276
+ もしサンプルを試したい、またはコードの最先端が必要で、新しいリリースを待てない場合は、[ライブラリをソースからインストール](https://huggingface.co/docs/transformers/installation#installing-from-source)する必要があります。
277
+
278
+ ### condaにて
279
+
280
+ Transformersバージョン4.0.0から、condaチャンネルを搭載しました: `huggingface`。
281
+
282
+ 🤗Transformersは以下のようにcondaを使ってインストールすることができます:
283
+
284
+ ```shell script
285
+ conda install -c huggingface transformers
286
+ ```
287
+
288
+ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それぞれのインストールページに従ってください。
289
+
290
+ > **_注意:_** Windowsでは、キャッシュの恩恵を受けるために、デベロッパーモードを有効にするよう促されることがあります。デベロッパーモードを有効にできない場合は、[このissue](https://github.com/huggingface/huggingface_hub/issues/1062)でお知らせください。
291
+
292
+ ## モデルアーキテクチャ
293
+
294
+ 🤗Transformersが提供する **[全モデルチェックポイント](https://huggingface.co/models)** は、[ユーザー](https://huggingface.co/users)や[組織](https://huggingface.co/organizations)によって直接アップロードされるhuggingface.co [model hub](https://huggingface.co)からシームレスに統合されています。
295
+
296
+ 現在のチェックポイント数: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
297
+
298
+ 🤗Transformersは現在、以下のアーキテクチャを提供しています(それぞれのハイレベルな要約は[こちら](https://huggingface.co/docs/transformers/model_summary)を参照してください):
299
+
300
+ 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago から) Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut から公開された研究論文: [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942)
301
+ 1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research から) Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig から公開された研究論文: [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918)
302
+ 1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679)
303
+ 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (MIT から) Yuan Gong, Yu-An Chung, James Glass から公開された研究論文: [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778)
304
+ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (Facebook から) Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer から公開された研究論文: [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461)
305
+ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (École polytechnique から) Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis から公開された研究論文: [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321)
306
+ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research から) Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen から公開された研究論文: [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701)
307
+ 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (Microsoft から) Hangbo Bao, Li Dong, Furu Wei から公開された研究論文: [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254)
308
+ 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (Google から) Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova から公開された研究論文: [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
309
+ 1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (Google から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461)
310
+ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research から) Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen から公開された研究論文: [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/)
311
+ 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062)
312
+ 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062)
313
+ 1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9)
314
+ 1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby から公開された研究論文: [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370)
315
+ 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637)
316
+ 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637)
317
+ 1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086)
318
+ 1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce から) Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. から公開された研究論文 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597)
319
+ 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (BigScience workshop から) [BigScience Workshop](https://bigscience.huggingface.co/) から公開されました.
320
+ 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa から) Adrian de Wynter and Daniel J. Perry から公開された研究論文: [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499)
321
+ 1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (Harbin Institute of Technology/Microsoft Research Asia/Intel Labs から) Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan から公開された研究論文: [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657)
322
+ 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research から) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel から公開された研究論文: [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626)
323
+ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894)
324
+ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874)
325
+ 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335)
326
+ 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov から公開された研究論文: [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)
327
+ 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
328
+ 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003)
329
+ 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474)
330
+ 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia から) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang から公開された研究論文: [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152)
331
+ 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech から) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan から公開された研究論文: [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496)
332
+ 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI から) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie から公開された研究論文: [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545)
333
+ 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (Facebook AI から) Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie から公開された研究論文: [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808)
334
+ 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University から) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun から公開された研究論文: [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413)
335
+ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858)
336
+ 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808)
337
+ 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555)
338
+ 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654)
339
+ 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654)
340
+ 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345)
341
+ 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research から) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai から公開された研究論文: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159)
342
+ 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook から) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou から公開された研究論文: [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877)
343
+ 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin から) Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. から公開された研究論文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137)
344
+ 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook から) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko から公開された研究論文: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872)
345
+ 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research から) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan から公開された研究論文: [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536)
346
+ 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs から) Ali Hassani and Humphrey Shi から公開された研究論文: [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001)
347
+ 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace から), Victor Sanh, Lysandre Debut and Thomas Wolf. 同じ手法で GPT2, RoBERTa と Multilingual BERT の圧縮を行いました.圧縮されたモデルはそれぞれ [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) と名付けられました. 公開された研究論文: [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108)
348
+ 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research から) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei から公開された研究論文: [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378)
349
+ 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER から) Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park から公開された研究論文: [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664)
350
+ 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook から) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih から公開された研究論文: [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906)
351
+ 1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (Intel Labs から) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun から公開された研究論文: [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413)
352
+ 1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (Snap Research から) Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. から公開された研究論文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191)
353
+ 1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (Google Brain から) Mingxing Tan, Quoc V. Le から公開された研究論文: [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946)
354
+ 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University から) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning から公開された研究論文: [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555)
355
+ 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461)
356
+ 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu から) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu から公開された研究論文: [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223)
357
+ 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu から) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. から公開された研究論文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674)
358
+ 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902)
359
+ 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints)
360
+ 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints)
361
+ 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372)
362
+ 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482)
363
+ 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824)
364
+ 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236)
365
+ 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100)
366
+ 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436)
367
+ 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/)
368
+ 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI から) Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy から公開されたレポジトリー : [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo)
369
+ 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI から) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach から公開された研究論文: [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)
370
+ 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (ABEJA から) Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori からリリース.
371
+ 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI から) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** から公開された研究論文: [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/)
372
+ 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/)
373
+ 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf)
374
+ 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました.
375
+ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234).
376
+ 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094)
377
+ 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447)
378
+ 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321)
379
+ 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI から) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever から公開された研究論文: [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/)
380
+ 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (Beihang University, UC Berkeley, Rutgers University, SEDD Company から) Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang から公開された研究論文: [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436)
381
+ 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI から) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever から公開された研究論文: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf)
382
+ 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia から) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou から公開された研究論文: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318)
383
+ 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia から) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou から公開された研究論文: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740)
384
+ 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia から) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei から公開された研究論文: [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387)
385
+ 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (Microsoft Research Asia から) Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei から公開された研究論文: [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836)
386
+ 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150)
387
+ 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI から) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze から公開された研究論文: [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136)
388
+ 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology から) Jiapeng Wang, Lianwen Jin, Kai Ding から公開された研究論文: [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669)
389
+ 1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (The FAIR team of Meta AI から) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. から公開された研究論文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
390
+ 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150)
391
+ 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI から) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang から公開された研究論文: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916)
392
+ 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia から) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto から公開された研究論文: [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057)
393
+ 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (UNC Chapel Hill から) Hao Tan and Mohit Bansal から公開された研究論文: [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490)
394
+ 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (Facebook から) Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert から公開された研究論文: [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161)
395
+ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook から) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin から公開された研究論文: [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125)
396
+ 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg Tiedemann から. [OPUS](http://opus.nlpl.eu/) を使いながら学習された "Machine translation" (マシントランスレーション) モデル. [Marian Framework](https://marian-nmt.github.io/) はMicrosoft Translator Team が現在開発中です.
397
+ 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518)
398
+ 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)
399
+ 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC から) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov から公開された研究論文: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278)
400
+ 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer から公開された研究論文: [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210)
401
+ 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan から公開された研究論文: [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401)
402
+ 1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (Facebook から) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. から公開された研究論文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655)
403
+ 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
404
+ 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
405
+ 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research から) Peng Wang, Cheng Da, and Cong Yao から公開された研究論文: [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)
406
+ 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia から) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka から公開された研究論文: [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151)
407
+ 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain から) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou から公開された研究論文: [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984)
408
+ 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. から) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam から公開された研究論文: [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861)
409
+ 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. から) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen から公開された研究論文: [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381)
410
+ 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple から) Sachin Mehta and Mohammad Rastegari から公開された研究論文: [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178)
411
+ 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research から) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu から公開された研究論文: [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297)
412
+ 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI から) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel から公開された研究論文: [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934)
413
+ 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box から) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen から公開された研究論文: [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131)
414
+ 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs から) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi から公開された研究論文: [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143)
415
+ 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab から) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu から公開された研究論文: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204)
416
+ 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)
417
+ 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/main/model_doc/nllb-moe)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)
418
+ 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902)
419
+ 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220)
420
+ 1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068)
421
+ 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230)
422
+ 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)
423
+ 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347)
424
+ 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795)
425
+ 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research から) Dat Quoc Nguyen and Anh Tuan Nguyen から公開された研究論文: [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/)
426
+ 1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (Google から) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova から公開された研究論文: [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)
427
+ 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP から) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang から公開された研究論文: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333)
428
+ 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs から) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng から公開された研究論文: [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418)
429
+ 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063)
430
+ 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA から) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius から公開された研究論文: [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602)
431
+ 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook から) Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela から公開された研究論文: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401)
432
+ 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research から) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang から公開された研究論文: [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909)
433
+ 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (Google Research から) Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya から公開された研究論文: [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451)
434
+ 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (META Platforms から) Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár から公開された研究論文: [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678)
435
+ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research から) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder から公開された研究論文: [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821)
436
+ 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research から) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun から公開された研究論文: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)
437
+ 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook から), Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov から公開された研究論文: [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692)
438
+ 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038)
439
+ 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf)
440
+ 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
441
+ 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203)
442
+ 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
443
+ 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
444
+ 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research から) Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei から公開された研究論文: [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205)
445
+ 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook から), Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino から公開された研究論文: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171)
446
+ 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678)
447
+ 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438)
448
+ 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316)
449
+ 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
450
+ 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883)
451
+ 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345)
452
+ 1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961)
453
+ 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開された研究論文: [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683)
454
+ 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開されたレポジトリー [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511)
455
+ 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061)
456
+ 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349)
457
+ 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653)
458
+ 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から).
459
+ 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095)
460
+ 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)
461
+ 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860)
462
+ 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282)
463
+ 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (UNC Chapel Hill から) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal から公開された研究論文: [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
464
+ 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1)
465
+ 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597)
466
+ 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752)
467
+ 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun から公開された研究論文: [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)
468
+ 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University から) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu から公開された研究論文: [Visual Attention Network](https://arxiv.org/abs/2202.09741)
469
+ 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University から) Zhan Tong, Yibing Song, Jue Wang, Limin Wang から公開された研究論文: [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602)
470
+ 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain から) Wonjae Kim, Bokyung Son, Ildoo Kim から公開された研究論文: [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334)
471
+ 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)
472
+ 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP から) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang から公開された研究論文: [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557)
473
+ 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)
474
+ 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI から) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick から公開された研究論文: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377)
475
+ 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI から) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas から公開された研究論文: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141)
476
+ 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477)
477
+ 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI から) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino から公開された研究論文: [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171)
478
+ 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI から) Qiantong Xu, Alexei Baevski, Michael Auli から公開された研究論文: [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680)
479
+ 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research から) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei から公開された研究論文: [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900)
480
+ 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI から) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever から公開された研究論文: [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf)
481
+ 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research から) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling から公開された研究論文: [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816)
482
+ 1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI から) Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe から公開された研究論文: [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255)
483
+ 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (Facebook AI から) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li から公開された研究論文: [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668)
484
+ 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook から) Guillaume Lample and Alexis Conneau から公開された研究論文: [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291)
485
+ 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063)
486
+ 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI から), Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov から公開された研究論文: [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116)
487
+ 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI から), Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau から公開された研究論文: [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572)
488
+ 1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (Meta AI から) Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa から公開された研究論文: [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472)
489
+ 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU から) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le から公開された研究論文: [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237)
490
+ 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI から) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli から公開された研究論文: [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296)
491
+ 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI から) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979)
492
+ 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (Huazhong University of Science & Technology から) Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu から公開された研究論文: [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666)
493
+ 1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison から) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh から公開された研究論文: [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714)
494
+ 1. 新しいモデルを投稿したいですか?新しいモデルを追加するためのガイドとして、**詳細なガイドとテンプレート**が追加されました。これらはリポジトリの[`templates`](./templates)フォルダにあります。PRを始める前に、必ず[コントリビューションガイド](./CONTRIBUTING.md)を確認し、メンテナに連絡するか、フィードバックを収集するためにissueを開いてください。
495
+
496
+ 各モデルがFlax、PyTorch、TensorFlowで実装されているか、🤗Tokenizersライブラリに支えられた関連トークナイザを持っているかは、[この表](https://huggingface.co/docs/transformers/index#supported-frameworks)を参照してください。
497
+
498
+ これらの実装はいくつかのデータセットでテストされており(サンプルスクリプトを参照)、オリジナルの実装の性能と一致するはずです。性能の詳細は[ドキュメント](https://github.com/huggingface/transformers/tree/main/examples)のExamplesセクションで見ることができます。
499
+
500
+
501
+ ## さらに詳しく
502
+
503
+ | セクション | 概要 |
504
+ |-|-|
505
+ | [ドキュメント](https://huggingface.co/docs/transformers/) | 完全なAPIドキュメントとチュートリアル |
506
+ | [タスク概要](https://huggingface.co/docs/transformers/task_summary) | 🤗Transformersがサポートするタスク |
507
+ | [前処理チュートリアル](https://huggingface.co/docs/transformers/preprocessing) | モデル用のデータを準備するために`Tokenizer`クラスを使用 |
508
+ | [トレーニングと微調整](https://huggingface.co/docs/transformers/training) | PyTorch/TensorFlowの学習ループと`Trainer`APIで🤗Transformersが提供するモデルを使用 |
509
+ | [クイックツアー: 微調整/使用方法スクリプト](https://github.com/huggingface/transformers/tree/main/examples) | 様々なタスクでモデルの微調整を行うためのスクリプト例 |
510
+ | [モデルの共有とアップロード](https://huggingface.co/docs/transformers/model_sharing) | 微調整したモデルをアップロードしてコミュニティで共有する |
511
+ | [マイグレーション](https://huggingface.co/docs/transformers/migration) | `pytorch-transformers`または`pytorch-pretrained-bert`から🤗Transformers に移行する |
512
+
513
+ ## 引用
514
+
515
+ 🤗 Transformers ライブラリについて引用できる[論文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)ができました:
516
+ ```bibtex
517
+ @inproceedings{wolf-etal-2020-transformers,
518
+ title = "Transformers: State-of-the-Art Natural Language Processing",
519
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
520
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
521
+ month = oct,
522
+ year = "2020",
523
+ address = "Online",
524
+ publisher = "Association for Computational Linguistics",
525
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
526
+ pages = "38--45"
527
+ }
528
+ ```
hf-dev-train/transformers-main/README_ko.md ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ <p align="center">
18
+ <br>
19
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
20
+ <br>
21
+ </p>
22
+ <p align="center">
23
+ <a href="https://circleci.com/gh/huggingface/transformers">
24
+ <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
25
+ </a>
26
+ <a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
27
+ <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
28
+ </a>
29
+ <a href="https://huggingface.co/docs/transformers/index">
30
+ <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
31
+ </a>
32
+ <a href="https://github.com/huggingface/transformers/releases">
33
+ <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
34
+ </a>
35
+ <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
36
+ <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
37
+ </a>
38
+ <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
39
+ </p>
40
+
41
+ <h4 align="center">
42
+ <p>
43
+ <a href="https://github.com/huggingface/transformers/">English</a> |
44
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
45
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
46
+ <b>한국어</b> |
47
+ <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
48
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
49
+ <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
50
+ </p>
51
+ </h4>
52
+
53
+ <h3 align="center">
54
+ <p> Jax, PyTorch, TensorFlow를 위한 최첨단 자연어처리</p>
55
+ </h3>
56
+
57
+ <h3 align="center">
58
+ <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
59
+ </h3>
60
+
61
+ 🤗 Transformers는 분류, 정보 추출, 질문 답변, 요약, 번역, 문장 생성 등을 100개 이상의 언어로 수행할 수 있는 수천개의 사전학습된 모델을 제공합니다. 우리의 목표는 모두가 최첨단의 NLP 기술을 쉽게 사용하는 것입니다.
62
+
63
+ 🤗 Transformers는 이러한 사전학습 모델을 빠르게 다운로드해 특정 텍스트에 사용하고, 원하는 데이터로 fine-tuning해 커뮤니티나 우리의 [모델 허브](https://huggingface.co/models)에 공유할 수 있도록 API를 제공합니다. 또한, 모델 구조를 정의하는 각 파이썬 모듈은 완전히 독립적이어서 연구 실험을 위해 손쉽게 수정할 수 있습니다.
64
+
65
+ 🤗 Transformers는 가장 유명한 3개의 딥러닝 라이브러리를 지원합니다. 이들은 서로 완벽히 연동됩니다 — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/). 간단하게 이 라이브러리 중 하나로 모델을 학습하고, 또 다른 라이브러리로 추론을 위해 모델을 불러올 수 있습니다.
66
+
67
+ ## 온라인 데모
68
+
69
+ 대부분의 모델을 [모델 허브](https://huggingface.co/models) 페이지에서 바로 테스트해볼 수 있습니다. 공개 및 비공개 모델을 위한 [비공개 모델 호스팅, 버전 관리, 추론 API](https://huggingface.co/pricing)도 제공합니다.
70
+
71
+ 예시:
72
+ - [BERT로 마스킹된 단어 완성하기](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
73
+ - [Electra를 이용한 개체명 인식](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
74
+ - [GPT-2로 텍스트 생성하기](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
75
+ - [RoBERTa로 자연어 추론하기](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
76
+ - [BART를 이용한 요약](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
77
+ - [DistilBERT를 이용한 질문 답변](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
78
+ - [T5로 번역하기](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
79
+
80
+ **[Transformer와 글쓰기](https://transformer.huggingface.co)** 는 이 저장소의 텍스트 생성 능력에 관한 Hugging Face 팀의 공식 데모입니다.
81
+
82
+ ## Hugging Face 팀의 커스텀 지원을 원한다면
83
+
84
+ <a target="_blank" href="https://huggingface.co/support">
85
+ <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
86
+ </a><br>
87
+
88
+ ## 퀵 투어
89
+
90
+ 원하는 텍스트에 바로 모델을 사용할 수 있도록, 우리는 `pipeline` API를 제공합니다. Pipeline은 사전학습 모델과 그 모델을 학습할 때 적용한 전처리 방식을 하나로 합칩니다. 다음은 긍정적인 텍스트와 부정적인 텍스트를 분류하기 위해 pipeline을 사용한 간단한 예시입니다:
91
+
92
+ ```python
93
+ >>> from transformers import pipeline
94
+
95
+ # Allocate a pipeline for sentiment-analysis
96
+ >>> classifier = pipeline('sentiment-analysis')
97
+ >>> classifier('We are very happy to introduce pipeline to the transformers repository.')
98
+ [{'label': 'POSITIVE', 'score': 0.9996980428695679}]
99
+ ```
100
+
101
+ 코드의 두번째 줄은 pipeline이 사용하는 사전학습 모델을 다운로드하고 캐시로 저장합니다. 세번째 줄에선 그 모델이 주어진 텍스트를 평가합니다. 여기서 모델은 99.97%의 확률로 텍스트가 긍정적이라고 평가했습니다.
102
+
103
+ 많은 NLP 과제들을 `pipeline`으로 바로 수행할 수 있습니다. 예를 들어, 질문과 문맥이 주어지면 손쉽게 답변을 추출할 수 있습니다:
104
+
105
+ ```python
106
+ >>> from transformers import pipeline
107
+
108
+ # Allocate a pipeline for question-answering
109
+ >>> question_answerer = pipeline('question-answering')
110
+ >>> question_answerer({
111
+ ... 'question': 'What is the name of the repository ?',
112
+ ... 'context': 'Pipeline has been included in the huggingface/transformers repository'
113
+ ... })
114
+ {'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
115
+
116
+ ```
117
+
118
+ 답변뿐만 아니라, 여기에 사용된 사전학습 모델은 확신도와 토크나이즈된 문장 속 답변의 시작점, 끝점까지 반환합니다. [이 튜토리얼](https://huggingface.co/docs/transformers/task_summary)에서 `pipeline` API가 지원하는 다양한 과제를 확인할 수 있습니다.
119
+
120
+ 코드 3줄로 원하는 과제에 맞게 사전학습 모델을 다운로드 받고 사용할 수 있습니다. 다음은 PyTorch 버전입니다:
121
+ ```python
122
+ >>> from transformers import AutoTokenizer, AutoModel
123
+
124
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
125
+ >>> model = AutoModel.from_pretrained("bert-base-uncased")
126
+
127
+ >>> inputs = tokenizer("Hello world!", return_tensors="pt")
128
+ >>> outputs = model(**inputs)
129
+ ```
130
+ 다음은 TensorFlow 버전입니다:
131
+ ```python
132
+ >>> from transformers import AutoTokenizer, TFAutoModel
133
+
134
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
135
+ >>> model = TFAutoModel.from_pretrained("bert-base-uncased")
136
+
137
+ >>> inputs = tokenizer("Hello world!", return_tensors="tf")
138
+ >>> outputs = model(**inputs)
139
+ ```
140
+
141
+ 토크나이저는 사전학습 모델의 모든 전처리를 책임집니다. 그리고 (위의 예시처럼) 1개의 스트링이나 리스트도 처리할 수 있습니다. 토크나이저는 딕셔너리를 반환하는데, 이는 다운스트림 코드에 사용하거나 언패킹 연산자 ** 를 이용해 모델에 바로 전달할 수도 있습니다.
142
+
143
+ 모델 자체는 일반적으로 사용되는 [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)나 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)입니다. [이 튜토리얼](https://huggingface.co/transformers/training.html)은 이러한 모델을 표준적인 PyTorch나 TensorFlow 학습 과정에서 사용하는 방법, 또는 새로운 데이터로 fine-tune하기 위해 `Trainer` API를 사용하는 방법을 설명해줍니다.
144
+
145
+ ## 왜 transformers를 사용해야 할까요?
146
+
147
+ 1. 손쉽게 사용할 수 있는 최첨단 모델:
148
+ - NLU와 NLG 과제에서 뛰어난 성능을 보입니다.
149
+ - 교육자와 실무자에게 진입 장벽이 낮습니다.
150
+ - 3개의 클래스만 배우면 바로 사용할 수 있습니다.
151
+ - 하나의 API로 모든 사전학습 모델을 사용할 수 있습니다.
152
+
153
+ 1. 더 적은 계산 비용, 더 적은 탄소 발자국:
154
+ - 연구자들은 모델을 계속 다시 학습시키는 대신 학습된 모델을 공유할 수 있습니다.
155
+ - 실무자들은 학습에 필요한 시간과 비용을 절약할 수 있습니다.
156
+ - 수십개의 모델 구조, 2,000개 이상의 사전학습 모델, 100개 이상의 언어로 학습된 모델 등.
157
+
158
+ 1. 모델의 각 생애주기에 적합한 프레임워크:
159
+ - 코드 3줄로 최첨단 모델을 학습하세요.
160
+ - 자유롭게 모델을 TF2.0나 PyTorch 프레임워크로 변환하세요.
161
+ - 학습, 평가, 공개 등 각 단계에 맞는 프레임워크를 원하는대로 선택하세요.
162
+
163
+ 1. 필요한 대로 모델이나 예시를 커스터마이즈하세요:
164
+ - 우리는 저자가 공개한 결과를 재현하기 위해 각 모델 구조의 예시를 제공합니다.
165
+ - 모델 내부 구조는 가능한 일관적으로 공개되어 있습니다.
166
+ - 빠른 실험을 위해 모델 파일은 라이브러리와 독립적으로 사용될 수 있습니다.
167
+
168
+ ## 왜 transformers를 사용하지 말아야 할까요?
169
+
170
+ - 이 라이브러리는 신경망 블록을 만들기 위한 모듈이 아닙니다. 연구자들이 여러 파일을 살펴보지 않고 바로 각 모델을 사용할 수 있도록, 모델 파일 코드의 추상화 수준을 적정하게 유지했습니다.
171
+ - 학습 API는 모든 모델에 적용할 수 있도록 만들어지진 않았지만, 라이브러리가 제공하는 모델들에 적용할 수 있도록 최적화되었습니다. 일반적인 머신 러닝을 위해선, 다른 라이브러리를 사용하세요.
172
+ - 가능한 많은 사용 예시를 보여드리고 싶어서, [예시 폴더](https://github.com/huggingface/transformers/tree/main/examples)의 스크립트를 준비했습니다. 이 스크립트들을 수정 없이 특정한 문제에 바로 적용하지 못할 수 있습니다. 필요에 맞게 일부 코드를 수정해야 할 수 있습니다.
173
+
174
+ ## 설치
175
+
176
+ ### pip로 설치하기
177
+
178
+ 이 저장소는 Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+, TensorFlow 2.3+에서 테스트 되었습니다.
179
+
180
+ [가상 환경](https://docs.python.org/3/library/venv.html)에 🤗 Transformers를 설치하세요. Python 가상 환경에 익숙하지 않다면, [사용자 가이드](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)를 확인하세요.
181
+
182
+ 우선, 사용할 Python 버전으로 가상 환경을 만들고 실행하세요.
183
+
184
+ 그 다음, Flax, PyTorch, TensorFlow 중 적어도 하나는 설치해야 합니다.
185
+ 플랫폼에 맞는 설치 명령어를 확인하기 위해 [TensorFlow 설치 페이지](https://www.tensorflow.org/install/), [PyTorch 설치 페이지](https://pytorch.org/get-started/locally/#start-locally), [Flax 설치 페이지](https://github.com/google/flax#quick-install)를 확인하세요.
186
+
187
+ 이들 중 적어도 하나가 설치되었다면, 🤗 Transformers는 다음과 같이 pip을 이용해 설치할 수 있습니다:
188
+
189
+ ```bash
190
+ pip install transformers
191
+ ```
192
+
193
+ 예시들을 체험해보고 싶거나, 최최최첨단 코드를 원하거나, 새로운 버전이 나올 때까지 기다릴 수 없다면 [라이브러리를 소스에서 바로 설치](https://huggingface.co/docs/transformers/installation#installing-from-source)하셔야 합니다.
194
+
195
+ ### conda로 설치하기
196
+
197
+ Transformers 버전 v4.0.0부터, conda 채널이 생겼습니다: `huggingface`.
198
+
199
+ 🤗 Transformers는 다음과 같이 conda로 설치할 수 있습니다:
200
+
201
+ ```shell script
202
+ conda install -c huggingface transformers
203
+ ```
204
+
205
+ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 방법을 확인하세요.
206
+
207
+ ## 모델 구조
208
+
209
+ **🤗 Transformers가 제공하는 [모든 모델 체크포인트](https://huggingface.co/models)** 는 huggingface.co [모델 허브](https://huggingface.co)에 완벽히 연동되어 있습니다. [개인](https://huggingface.co/users)과 [기관](https://huggingface.co/organizations)이 모델 허브에 직접 업로드할 수 있습니다.
210
+
211
+ 현재 사용 가능한 모델 체크포인트의 개수: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
212
+
213
+ 🤗 Transformers는 다음 모델들을 제공합니다 (각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/model_summary)서 확인하세요):
214
+
215
+ 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
216
+ 1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research 에서 제공)은 Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.의 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918)논문과 함께 발표했습니다.
217
+ 1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
218
+ 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
219
+ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
220
+ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
221
+ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
222
+ 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
223
+ 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
224
+ 1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
225
+ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
226
+ 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
227
+ 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
228
+ 1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
229
+ 1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
230
+ 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
231
+ 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
232
+ 1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
233
+ 1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce 에서 제공)은 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.의 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597)논문과 함께 발표했습니다.
234
+ 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
235
+ 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa 에서) Adrian de Wynter and Daniel J. Perry 의 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 논문과 함께 발표했습니다.
236
+ 1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
237
+ 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research 에서) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 의 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 논문과 함께 발표했습니다.
238
+ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다.
239
+ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다.
240
+ 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다.
241
+ 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다.
242
+ 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다.
243
+ 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다.
244
+ 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다.
245
+ 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia 에서) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 의 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 논문과 함께 발표했습니다.
246
+ 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech 에서) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 의 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 논문과 함께 발표했습니다.
247
+ 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI 에서) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 의 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 논문과 함께 발표했습니다.
248
+ 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
249
+ 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University 에서) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 의 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 논문과 함께 발표했습니다.
250
+ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다.
251
+ 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다.
252
+ 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다.
253
+ 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다.
254
+ 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다.
255
+ 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google 에서) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 의 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 논문과 함께 발표했습니다.
256
+ 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research 에서) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 의 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 논문과 함께 발표했습니다.
257
+ 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook 에서) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 의 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 논문과 함께 발표했습니다.
258
+ 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin 에서 제공)은 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.의 [NMS Strikes Back](https://arxiv.org/abs/2212.06137)논문과 함께 발표했습니다.
259
+ 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook 에서) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 의 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 논문과 함께 발표했습니다.
260
+ 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research 에서) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 의 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 논문과 함께 발표했습니다.
261
+ 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs 에서) Ali Hassani and Humphrey Shi 의 [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) 논문과 함께 발표했습니다.
262
+ 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace 에서) Victor Sanh, Lysandre Debut and Thomas Wolf 의 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 논문과 함께 발표했습니다. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT.
263
+ 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research 에서) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 의 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 논문과 함께 발표했습니다.
264
+ 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER 에서) Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 의 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 논문과 함께 발표했습니다.
265
+ 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook 에서) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 의 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 논문과 함께 발표했습니다.
266
+ 1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (Intel Labs 에서) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 의 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 논문과 함께 발표했습니다.
267
+ 1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
268
+ 1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
269
+ 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University 에서) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 의 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 논문과 함께 발표했습니다.
270
+ 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research 에서) Sascha Rothe, Shashi Narayan, Aliaksei Severyn 의 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 논문과 함께 발표했습니다.
271
+ 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu 에서) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 의 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) 논문과 함께 발표했습니다.
272
+ 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu 에서 제공)은 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.의 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674)논문과 함께 발표했습니다.
273
+ 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
274
+ 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
275
+ 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
276
+ 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
277
+ 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
278
+ 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
279
+ 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
280
+ 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
281
+ 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
282
+ 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
283
+ 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
284
+ 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI 에서) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 의 [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) 논문과 함께 발표했습니다.
285
+ 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
286
+ 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI 에서) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 의 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 논문과 함께 발표했습니다.
287
+ 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
288
+ 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren 의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다.
289
+ 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama).
290
+ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft 에서) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다.
291
+ 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다.
292
+ 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다.
293
+ 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다.
294
+ 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI 에서) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 의 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 논문과 함께 발표했습니다.
295
+ 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
296
+ 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI 에서) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever 의 [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) 논문과 함께 발표했습니다.
297
+ 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia 에서) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 의 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 논문과 함께 발표했습니다.
298
+ 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia 에서) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 의 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 논문과 함께 발표했습니다.
299
+ 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia 에서) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 의 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 논문과 함께 발표했습니다.
300
+ 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (Microsoft Research Asia 에서) Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 의 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 논문과 함께 발표했습니다.
301
+ 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다.
302
+ 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI 에서) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 의 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 논문과 함께 발표했습니다.
303
+ 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology 에서) Jiapeng Wang, Lianwen Jin, Kai Ding 의 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 논문과 함께 발표했습니다.
304
+ 1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (The FAIR team of Meta AI 에서) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample 의 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) 논문과 함께 발표했습니다.
305
+ 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다.
306
+ 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI 에서) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 의 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 논문과 함께 발표했습니다.
307
+ 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia 에서) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 의 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 논문과 함께 발표했습니다.
308
+ 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (UNC Chapel Hill 에서) Hao Tan and Mohit Bansal 의 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 논문과 함께 발표했습니다.
309
+ 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (Facebook 에서) Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert 의 [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) 논문과 함께 발표했습니다.
310
+ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook 에서) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 의 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 논문과 함께 발표했습니다.
311
+ 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
312
+ 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia 에서) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 의 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 논문과 함께 발표했습니다.
313
+ 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC 에서) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 의 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 논문과 함께 발표했습니다.
314
+ 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC 에서) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 의 [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) 논문과 함께 발표했습니다.
315
+ 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 의 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 논문과 함께 발표했습니다.
316
+ 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 의 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 논문과 함께 발표했습니다.
317
+ 1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (Facebook 에서) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer 의 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 논문과 함께 발표했습니다.
318
+ 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다.
319
+ 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다.
320
+ 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research 에서) Peng Wang, Cheng Da, and Cong Yao 의 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) 논문과 함께 발표했습니다.
321
+ 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia 에서) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 의 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 논문과 함께 발표했습니다.
322
+ 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain 에서) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 의 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 논문과 함께 발표했습니다.
323
+ 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. 에서) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 의 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 논문과 함께 발표했습니다.
324
+ 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. 에서) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 의 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 논문과 함께 발표했습니다.
325
+ 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple 에서) Sachin Mehta and Mohammad Rastegari 의 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 논문과 함께 발표했습니다.
326
+ 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research 에서) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 의 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 논문과 함께 발표했습니다.
327
+ 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI 에서) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 의 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 논문과 함께 발표했습니다.
328
+ 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box 에서) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 의 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 논문과 함께 발표했습니다.
329
+ 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs 에서) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 의 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 논문과 함께 발표했습니다.
330
+ 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab 에서) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 의 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 논문과 함께 발표했습니다.
331
+ 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta 에서) the NLLB team 의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 논문과 함께 발표했습니다.
332
+ 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/main/model_doc/nllb-moe)** (Meta 에서) the NLLB team 의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 논문과 함께 발표했습니다.
333
+ 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison 에서) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 의 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 논문과 함께 발표했습니다.
334
+ 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다.
335
+ 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다.
336
+ 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다.
337
+ 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다.
338
+ 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다.
339
+ 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다.
340
+ 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research 에서) Dat Quoc Nguyen and Anh Tuan Nguyen 의 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 논문과 함께 발표했습니다.
341
+ 1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (Google 에서) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova 의 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 논문과 함께 발표했습니다.
342
+ 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP 에서) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 의 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 논문과 함께 발표했습니다.
343
+ 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs 에서) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 의 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 논문과 함께 발표했습니다.
344
+ 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다.
345
+ 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA 에서) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 의 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 논문과 함께 발표했습니다.
346
+ 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook 에서) Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 의 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 논문과 함께 발표했습니다.
347
+ 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research 에서) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 의 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 논문과 함께 발표했습니다.
348
+ 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (Google Research 에서) Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 의 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 논문과 함께 발표했습니다.
349
+ 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (META Research 에서) Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár 의 [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) 논문과 함께 발표했습니다.
350
+ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research 에서) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 의 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 논문과 함께 발표했습니다.
351
+ 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research 에서) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 의 [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 논문과 함께 발표했습니다.
352
+ 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook 에서) Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 의 [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 논문과 함께 발표했습니다.
353
+ 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다.
354
+ 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다.
355
+ 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다.
356
+ 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다.
357
+ 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
358
+ 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
359
+ 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research 에서) Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei 의 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 논문과 함께 발표했습니다.
360
+ 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook 에서) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 의 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 논문과 함께 발표했습니다.
361
+ 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook 에서) Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 의 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 논문과 함께 발표했습니다.
362
+ 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University 에서) Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 의 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 논문과 함께 발표했습니다.
363
+ 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley 에서) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 의 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 논문과 함께 발표했습니다.
364
+ 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft 에서) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 의 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 논문과 함께 발표했습니다.
365
+ 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft 에서) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 의 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 논문과 함께 발표했습니다.
366
+ 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다.
367
+ 1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google 에서) William Fedus, Barret Zoph, Noam Shazeer 의 [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 논문과 함께 발표했습니다.
368
+ 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI 에서) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 의 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 논문과 함께 발표했습니다.
369
+ 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
370
+ 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research 에서) Brandon Smock, Rohith Pesala, Robin Abraham 의 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 논문과 함께 발표했습니다.
371
+ 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI 에서) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 의 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 논문과 함께 발표했습니다.
372
+ 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research 에서) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 의 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 논문과 함께 발표했습니다.
373
+ 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
374
+ 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다.
375
+ 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levine 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다.
376
+ 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다.
377
+ 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다.
378
+ 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (UNC Chapel Hill 에서) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 의 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 논문과 함께 발표했습니다.
379
+ 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다.
380
+ 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다.
381
+ 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다.
382
+ 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University 에서 제공)은 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.의 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)논문과 함께 발표했습니다.
383
+ 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University 에서) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 의 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 논문과 함께 발표했습니다.
384
+ 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University 에서) Zhan Tong, Yibing Song, Jue Wang, Limin Wang 의 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 논문과 함께 발표했습니다.
385
+ 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain 에서) Wonjae Kim, Bokyung Son, Ildoo Kim 의 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 논문과 함께 발표했습니다.
386
+ 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다.
387
+ 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP 에서) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 의 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 논문과 함께 발표했습니다.
388
+ 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다.
389
+ 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI 에서) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 의 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 논문과 함께 발표했습니다.
390
+ 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI 에서) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 의 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) 논문과 함께 발표했습니다.
391
+ 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI 에서) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 의 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 논문과 함께 발표했습니다.
392
+ 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI 에서) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 의 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 논문과 함께 발표했습니다.
393
+ 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI 에서) Qiantong Xu, Alexei Baevski, Michael Auli 의 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 논문과 함께 발표했습니다.
394
+ 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research 에서) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei 의 [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) 논문과 함께 발표했습니다.
395
+ 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever 의 [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 논문과 함께 발표했습니다.
396
+ 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research 에서) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 의 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 논문과 함께 발표했습니다.
397
+ 1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI 에서 제공)은 Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.의 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255)논문과 함께 발표했습니다.
398
+ 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (Facebook AI 에서 제공) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li 의 [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) 논문과 함께 발표했습니다.
399
+ 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook 에서) Guillaume Lample and Alexis Conneau 의 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 논문과 함께 발표했습니다.
400
+ 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다.
401
+ 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI 에서) Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 의 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 논문과 함께 발표했습니다.
402
+ 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI 에서) Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau 의 [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) 논문과 함께 발표했습니다.
403
+ 1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (Meta AI 에서) Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa 의 [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 논문과 함께 발표했습니다.
404
+ 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU 에서) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 의 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 논문과 함께 발표했습니다.
405
+ 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI 에서) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 의 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 논문과 함께 발표했습니다.
406
+ 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI 에서) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 의 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 논문과 함께 발표했습니다.
407
+ 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (Huazhong University of Science & Technology 에서) Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 의 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 논문과 함께 발표했습니다.
408
+ 1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison 에서) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 의 [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) 논문과 함께 발표했습니다.
409
+ 1. 새로운 모델을 올리고 싶나요? 우리가 **상세한 가이드와 템플릿** 으로 새로운 모델을 올리도록 도와드릴게요. 가이드와 템플릿은 이 저장소의 [`templates`](./templates) 폴더에서 확인하실 수 있습니다. [컨트리뷰션 가이드라인](./CONTRIBUTING.md)을 꼭 확인해주시고, PR을 올리기 전에 메인테이너에게 연락하거나 이슈를 오픈해 피드백을 받으시길 바랍니다.
410
+
411
+ 각 모델이 Flax, PyTorch, TensorFlow로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/index#supported-frameworks)를 확인하세요.
412
+
413
+ 이 구현은 여러 데이터로 검증되었고 (예시 스크립트를 참고하세요) 오리지널 구현의 성능과 같아야 합니다. [도큐먼트](https://huggingface.co/docs/transformers/examples)의 Examples 섹션에서 성능에 대한 자세한 설명을 확인할 수 있습니다.
414
+
415
+ ## 더 알아보기
416
+
417
+ | 섹션 | 설명 |
418
+ |-|-|
419
+ | [도큐먼트](https://huggingface.co/transformers/) | 전체 API 도큐먼트와 튜토리얼 |
420
+ | [과제 요약](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers가 지원하는 과제들 |
421
+ | [전처리 튜토리얼](https://huggingface.co/docs/transformers/preprocessing) | `Tokenizer` 클래스를 이용해 모델을 위한 데이터 준비하기 |
422
+ | [학습과 fine-tuning](https://huggingface.co/docs/transformers/training) | 🤗 Transformers가 제공하는 모델 PyTorch/TensorFlow 학습 과정과 `Trainer` API에서 사용하기 |
423
+ | [퀵 투어: Fine-tuning/사용 스크립트](https://github.com/huggingface/transformers/tree/main/examples) | 다양한 과제에서 모델 fine-tuning하는 예시 스크립트 |
424
+ | [모델 공유 및 업로드](https://huggingface.co/docs/transformers/model_sharing) | 커뮤니티에 fine-tune된 모델을 업로드 및 공유하기 |
425
+ | [마이그레이션](https://huggingface.co/docs/transformers/migration) | `pytorch-transformers`나 `pytorch-pretrained-bert`에서 🤗 Transformers로 이동하기|
426
+
427
+ ## 인용
428
+
429
+ 🤗 Transformers 라이브러리를 인용하고 싶다면, 이 [논문](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)을 인용해 주세요:
430
+ ```bibtex
431
+ @inproceedings{wolf-etal-2020-transformers,
432
+ title = "Transformers: State-of-the-Art Natural Language Processing",
433
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
434
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
435
+ month = oct,
436
+ year = "2020",
437
+ address = "Online",
438
+ publisher = "Association for Computational Linguistics",
439
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
440
+ pages = "38--45"
441
+ }
442
+ ```
hf-dev-train/transformers-main/README_zh-hans.md ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ <!---
18
+ A useful guide for English-Chinese translation of Hugging Face documentation
19
+ - Add space around English words and numbers when they appear between Chinese characters. E.g., 共 100 多种语言; 使用 transformers 库。
20
+ - Use square quotes, e.g.,「引用」
21
+
22
+ Dictionary
23
+
24
+ Hugging Face: 抱抱脸
25
+ token: 词符(并用括号标注原英文)
26
+ tokenize: 词符化(并用括号标注原英文)
27
+ tokenizer: 词符化器(并用括号标注原英文)
28
+ transformer: transformer(不翻译)
29
+ pipeline: 流水线
30
+ API: API (不翻译)
31
+ inference: 推理
32
+ Trainer: 训练器。当作为类名出现时不翻译。
33
+ pretrained/pretrain: 预训练
34
+ finetune: 微调
35
+ community: 社区
36
+ example: 当特指仓库中 example 目录时翻译为「用例」
37
+ Python data structures (e.g., list, set, dict): 翻译为列表,集合,词典,并用括号标注原英文
38
+ NLP/Natural Language Processing: 以 NLP 出现时不翻译,以 Natural Language Processing 出现时翻译为自然语言处理
39
+ checkpoint: 检查点
40
+ -->
41
+
42
+ <p align="center">
43
+ <br>
44
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
45
+ <br>
46
+ </p>
47
+ <p align="center">
48
+ <a href="https://circleci.com/gh/huggingface/transformers">
49
+ <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
50
+ </a>
51
+ <a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
52
+ <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
53
+ </a>
54
+ <a href="https://huggingface.co/docs/transformers/index">
55
+ <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
56
+ </a>
57
+ <a href="https://github.com/huggingface/transformers/releases">
58
+ <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
59
+ </a>
60
+ <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
61
+ <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
62
+ </a>
63
+ <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
64
+ </p>
65
+
66
+ <h4 align="center">
67
+ <p>
68
+ <a href="https://github.com/huggingface/transformers/">English</a> |
69
+ <b>简体中文</b> |
70
+ <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
71
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
72
+ <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
73
+ <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
74
+ <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
75
+ </p>
76
+ </h4>
77
+
78
+ <h3 align="center">
79
+ <p>为 Jax、PyTorch 和 TensorFlow 打造的先进的自然语言处理</p>
80
+ </h3>
81
+
82
+ <h3 align="center">
83
+ <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
84
+ </h3>
85
+
86
+ 🤗 Transformers 提供了数以千计的预训练模型,支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨是让最先进的 NLP 技术人人易用。
87
+
88
+ 🤗 Transformers 提供了便于快速下载和使用的API,让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时,每个定义的 Python 模块均完全独立,方便修改和快速研究实验。
89
+
90
+ 🤗 Transformers 支持三个最热门的深度学习库: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。
91
+
92
+ ## 在线演示
93
+
94
+ 你可以直接在模型页面上测试大多数 [model hub](https://huggingface.co/models) 上的模型。 我们也提供了 [私有模型托管、模型版本管理以及推理API](https://huggingface.co/pricing)。
95
+
96
+ 这里是一些例子:
97
+ - [用 BERT 做掩码填词](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
98
+ - [用 Electra 做命名实体识别](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
99
+ - [用 GPT-2 做文本生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
100
+ - [用 RoBERTa 做自然语言推理](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
101
+ - [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
102
+ - [用 DistilBERT 做问答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
103
+ - [用 T5 做翻译](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
104
+
105
+ **[Write With Transformer](https://transformer.huggingface.co)**,由抱抱脸团队打造,是一个文本生成的官方 demo。
106
+
107
+ ## 如果你在寻找由抱抱脸团队提供的定制化支持服务
108
+
109
+ <a target="_blank" href="https://huggingface.co/support">
110
+ <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
111
+ </a><br>
112
+
113
+ ## 快速上手
114
+
115
+ 我们为快速使用模型提供了 `pipeline` (流水线)API。流水线聚合了预训练模型和对应的文本预处理。下面是一个快速使用流水线去判断正负面情绪的例子:
116
+
117
+ ```python
118
+ >>> from transformers import pipeline
119
+
120
+ # 使用情绪分析流水线
121
+ >>> classifier = pipeline('sentiment-analysis')
122
+ >>> classifier('We are very happy to introduce pipeline to the transformers repository.')
123
+ [{'label': 'POSITIVE', 'score': 0.9996980428695679}]
124
+ ```
125
+
126
+ 第二行代码下载并缓存了流水线使用的预训练模型,而第三行代码则在给定的文本上进行了评估。这里的答案“正面” (positive) 具有 99.97% 的置信度。
127
+
128
+ 许多的 NLP 任务都有开箱即用的预训练流水线。比如说,我们可以轻松地从给定文本中抽取问题答案:
129
+
130
+ ``` python
131
+ >>> from transformers import pipeline
132
+
133
+ # 使用问答流水线
134
+ >>> question_answerer = pipeline('question-answering')
135
+ >>> question_answerer({
136
+ ... 'question': 'What is the name of the repository ?',
137
+ ... 'context': 'Pipeline has been included in the huggingface/transformers repository'
138
+ ... })
139
+ {'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
140
+
141
+ ```
142
+
143
+ 除了给出答案,预训练模型还给出了对应的置信度分数、答案在词符化 (tokenized) 后的文本中开始和结束的位置。你可以从[这个教程](https://huggingface.co/docs/transformers/task_summary)了解更多流水线API支持的任务。
144
+
145
+ 要在你的任务上下载和使用任意预训练模型也很简单,只需三行代码。这里是 PyTorch 版的示例:
146
+ ```python
147
+ >>> from transformers import AutoTokenizer, AutoModel
148
+
149
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
150
+ >>> model = AutoModel.from_pretrained("bert-base-uncased")
151
+
152
+ >>> inputs = tokenizer("Hello world!", return_tensors="pt")
153
+ >>> outputs = model(**inputs)
154
+ ```
155
+ 这里是等效的 TensorFlow 代码:
156
+ ```python
157
+ >>> from transformers import AutoTokenizer, TFAutoModel
158
+
159
+ >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
160
+ >>> model = TFAutoModel.from_pretrained("bert-base-uncased")
161
+
162
+ >>> inputs = tokenizer("Hello world!", return_tensors="tf")
163
+ >>> outputs = model(**inputs)
164
+ ```
165
+
166
+ 词符化器 (tokenizer) 为所有的预训练模型提供了预处理,并可以直接对单个字符串进行调用(比如上面的例子)或对列表 (list) 调用。它会输出一个你可以在下游代码里使用或直接通过 `**` 解包表达式传给模型的词典 (dict)。
167
+
168
+ 模型本身是一个常规的 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)(取决于你的后端),可以常规方式使用。 [这个教程](https://huggingface.co/transformers/training.html)解释了如何将这样的模型整合到经典的 PyTorch 或 TensorFlow 训练循环中,或是如何使用我们的 `Trainer`(训练器)API 来在一个新的数据集上快速微调。
169
+
170
+ ## 为什么要用 transformers?
171
+
172
+ 1. 便于使用的先进模型:
173
+ - NLU 和 NLG 上表现优越
174
+ - 对教学和实践友好且低门槛
175
+ - 高级抽象,只需了解三个类
176
+ - 对所有模型统一的API
177
+
178
+ 1. 更低计算开销,更少的碳排放:
179
+ - 研究人员可以分享已训练的模型而非每次从头开始训练
180
+ - 工程师可以减少计算用时和生产环境开销
181
+ - 数十种模型架构、两千多个预训练模型、100多种语言支持
182
+
183
+ 1. 对于模型生命周期的每一个部分都面面俱到:
184
+ - 训练先进的模型,只需 3 行代码
185
+ - 模型在不同深度学习框架间任意转移,随你心意
186
+ - 为训练、评估和生产选择最适合的框架,衔接无缝
187
+
188
+ 1. 为你的需求轻松定制专属模型和用例:
189
+ - 我们为每种模型架构提供了多个用例来复现原论文结果
190
+ - 模型内部结构保持透明一致
191
+ - 模型文件可单独使用,方便魔改和快速实验
192
+
193
+ ## 什么情况下我不该用 transformers?
194
+
195
+ - 本库并不是模块化的神经网络工具箱。模型文件中的代码特意呈若璞玉,未经额外抽象封装,以便研究人员快速迭代魔改而不致溺于抽象和文件跳转之中。
196
+ - `Trainer` API 并非兼容任何模型,只为本库之模型优化。若是在寻找适用于通用机器学习的训练循环实现,请另觅他库。
197
+ - 尽管我们已尽力而为,[examples 目录](https://github.com/huggingface/transformers/tree/main/examples)中的脚本也仅为用例而已。对于你的特定问题,它们并不一定开箱即用,可能需要改几行代码以适之。
198
+
199
+ ## 安装
200
+
201
+ ### 使用 pip
202
+
203
+ 这个仓库已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下经过测试。
204
+
205
+ 你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境,请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
206
+
207
+ 首先,用你打算使用的版本的 Python 创建一个虚拟环境并激活。
208
+
209
+ 然后,你需要安装 Flax、PyTorch 或 TensorFlow 其中之一。关于在你使用的平台上安装这些框架,请参阅 [TensorFlow 安装页](https://www.tensorflow.org/install/), [PyTorch 安装页](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安装页](https://github.com/google/flax#quick-install)。
210
+
211
+ 当这些后端之一安装成功后, 🤗 Transformers 可依此安装:
212
+
213
+ ```bash
214
+ pip install transformers
215
+ ```
216
+
217
+ 如果你想要试试用例或者想在正式发布前使用最新的开发中代码,你得[从源代码安装](https://huggingface.co/docs/transformers/installation#installing-from-source)。
218
+
219
+ ### 使用 conda
220
+
221
+ 自 Transformers 4.0.0 版始,我们有了一个 conda 频道: `huggingface`。
222
+
223
+ 🤗 Transformers 可以通过 conda 依此安装:
224
+
225
+ ```shell script
226
+ conda install -c huggingface transformers
227
+ ```
228
+
229
+ 要通过 conda 安装 Flax、PyTorch 或 TensorFlow 其中之一,请参阅它们各自安装页的说明。
230
+
231
+ ## 模型架构
232
+
233
+ 🤗 Transformers 支持的[**所有的模型检查点**](https://huggingface.co/models)由[用户](https://huggingface.co/users)和[组织](https://huggingface.co/organizations)上传,均与 huggingface.co [model hub](https://huggingface.co) 无缝整合。
234
+
235
+ 目前的检查点数量: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
236
+
237
+ 🤗 Transformers 目前支持如下的架构(模型概述请阅[这里](https://huggingface.co/docs/transformers/model_summary)):
238
+
239
+ 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。
240
+ 1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (来自 Google Research) 伴随论文 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) 由 Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig 发布。
241
+ 1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (来自 BAAI) 伴随论文 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 由 Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell 发布。
242
+ 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (来自 MIT) 伴随论文 [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) 由 Yuan Gong, Yu-An Chung, James Glass 发布。
243
+ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。
244
+ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。
245
+ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (来自 VinAI Research) 伴随论文 [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) 由 Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen 发布。
246
+ 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (来自 Microsoft) 伴随论文 [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) 由 Hangbo Bao, Li Dong, Furu Wei 发布。
247
+ 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。
248
+ 1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
249
+ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。
250
+ 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
251
+ 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
252
+ 1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。
253
+ 1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。
254
+ 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
255
+ 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
256
+ 1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。
257
+ 1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (来自 Salesforce) 伴随论文 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) 由 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi 发布。
258
+ 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
259
+ 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。
260
+ 1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
261
+ 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。
262
+ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。
263
+ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。
264
+ 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。
265
+ 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。
266
+ 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。
267
+ 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。
268
+ 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。
269
+ 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。
270
+ 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。
271
+ 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。
272
+ 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
273
+ 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。
274
+ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。
275
+ 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。
276
+ 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。
277
+ 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
278
+ 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
279
+ 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (来自 Berkeley/Facebook/Google) 伴随论文 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 由 Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 发布。
280
+ 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (来自 SenseTime Research) 伴随论文 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 由 Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 发布。
281
+ 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。
282
+ 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (来自 The University of Texas at Austin) 伴随论文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137) 由 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl 发布。
283
+ 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。
284
+ 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。
285
+ 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (来自 SHI Labs) 伴随论文 [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) 由 Ali Hassani and Humphrey Shi 发布。
286
+ 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) 和德语版 DistilBERT。
287
+ 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (来自 Microsoft Research) 伴随论文 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 由 Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 发布。
288
+ 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。
289
+ 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
290
+ 1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。
291
+ 1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (来自 Snap Research) 伴随论文 [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) 由 Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren 发布。
292
+ 1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
293
+ 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。
294
+ 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
295
+ 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (来自 Baidu) 伴随论文 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) 由 Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 发布。
296
+ 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (来自 Baidu) 伴随论文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) 由 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang 发布。
297
+ 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
298
+ 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
299
+ 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
300
+ 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
301
+ 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。
302
+ 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
303
+ 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
304
+ 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。
305
+ 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。
306
+ 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
307
+ 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy。
308
+ 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
309
+ 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (来自 ABEJA) 由 Shinya Otani, Takayoshi Makabe, Anuj Arora, Kyo Hattori 发布。
310
+ 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
311
+ 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 随仓库 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
312
+ 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
313
+ 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama).
314
+ 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
315
+ 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。
316
+ 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
317
+ 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
318
+ 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
319
+ 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
320
+ 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
321
+ 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
322
+ 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
323
+ 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
324
+ 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。
325
+ 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
326
+ 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。
327
+ 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (来自 South China University of Technology) 伴随论文 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 由 Jiapeng Wang, Lianwen Jin, Kai Ding 发布。
328
+ 1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (来自 The FAIR team of Meta AI) 伴随论文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) 由 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample 发布。
329
+ 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
330
+ 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (来自 Google AI) 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。
331
+ 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
332
+ 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (来自 UNC Chapel Hill) 伴随论文 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 由 Hao Tan and Mohit Bansal 发布。
333
+ 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (来自 Facebook) 伴随论文 [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) 由 Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert 发布。
334
+ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。
335
+ 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。
336
+ 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (来自 Microsoft Research Asia) 伴随论文 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 由 Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 发布。
337
+ 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。
338
+ 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
339
+ 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。
340
+ 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。
341
+ 1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (来自 Facebook) 伴随论文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 由 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer 发布。
342
+ 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
343
+ 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
344
+ 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (来自 Alibaba Research) 伴随论文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) 由 Peng Wang, Cheng Da, and Cong Yao 发布。
345
+ 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (来自 Studio Ousia) 伴随论文 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 由 Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 发布。
346
+ 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (来自 CMU/Google Brain) 伴随论文 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 由 Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 发布。
347
+ 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (来自 Google Inc.) 伴随论文 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 由 Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 发布。
348
+ 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (来自 Google Inc.) 伴随论文 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 由 Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 发布。
349
+ 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (来自 Apple) 伴随论文 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 由 Sachin Mehta and Mohammad Rastegari 发布。
350
+ 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
351
+ 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
352
+ 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。
353
+ 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (来自 SHI Labs) 伴随论文 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 由 Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 发布。
354
+ 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (来自 华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。
355
+ 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。
356
+ 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/main/model_doc/nllb-moe)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。
357
+ 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。
358
+ 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (来自 SHI Labs) 伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。
359
+ 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。
360
+ 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。
361
+ 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
362
+ 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。
363
+ 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。
364
+ 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。
365
+ 1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (来自 Google) 伴随论文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 由 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova 发布。
366
+ 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。
367
+ 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。
368
+ 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
369
+ 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。
370
+ 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (来自 Facebook) 伴随论文 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 由 Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 发布。
371
+ 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。
372
+ 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
373
+ 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
374
+ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
375
+ 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
376
+ 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
377
+ 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。
378
+ 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。
379
+ 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
380
+ 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
381
+ 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
382
+ 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
383
+ 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (来自 Microsoft Research) 伴随论文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 由 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei 发布。
384
+ 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。
385
+ 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
386
+ 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
387
+ 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
388
+ 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。
389
+ 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。
390
+ 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。
391
+ 1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
392
+ 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
393
+ 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
394
+ 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。
395
+ 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
396
+ 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。
397
+ 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
398
+ 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
399
+ 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
400
+ 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
401
+ 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
402
+ 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (来自 UNC Chapel Hill) 伴随论文 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 由 Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 发布。
403
+ 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
404
+ 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
405
+ 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。
406
+ 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (来自 Peking University) 伴随论文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 由 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun 发布。
407
+ 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。
408
+ 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (来自 Multimedia Computing Group, Nanjing University) 伴随论文 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 由 Zhan Tong, Yibing Song, Jue Wang, Limin Wang 发布。
409
+ 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。
410
+ 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
411
+ 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
412
+ 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
413
+ 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。
414
+ 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) 由 Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布。
415
+ 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。
416
+ 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。
417
+ 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。
418
+ 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
419
+ 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (来自 OpenAI) 伴随论文 [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 由 Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever 发布。
420
+ 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (来自 Microsoft Research) 伴随论文 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 由 Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 发布。
421
+ 1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (来自 Meta AI) 伴随论文 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) 由 Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe 发布。
422
+ 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
423
+ 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
424
+ 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
425
+ 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。
426
+ 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (来自 Facebook AI) 伴随论文 [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) 由 Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau 发布。
427
+ 1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (来自 Meta AI) 伴随论文 [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 由 Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa 发布。
428
+ 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。
429
+ 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。
430
+ 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。
431
+ 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (来自 Huazhong University of Science & Technology) 伴随论文 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 由 Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 发布。
432
+ 1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。
433
+ 1. 想要贡献新的模型?我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到它们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。
434
+
435
+ 要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现,或其是否在 🤗 Tokenizers 库中有对应词符化器(tokenizer),敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。
436
+
437
+ 这些实现均已于多个数据集测试(请参看用例脚本)并应与原版实现表现相当。你可以在用例文档的[此节](https://huggingface.co/docs/transformers/examples)中了解表现的细节。
438
+
439
+
440
+ ## 了解更多
441
+
442
+ | 章节 | 描述 |
443
+ |-|-|
444
+ | [文档](https://huggingface.co/transformers/) | 完整的 API 文档和教程 |
445
+ | [任务总结](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支持的任务 |
446
+ | [预处理教程](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 来为模型准备数据 |
447
+ | [训练和微调](https://huggingface.co/docs/transformers/training) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
448
+ | [快速上手:微调和用例脚本](https://github.com/huggingface/transformers/tree/main/examples) | 为各种任务提供的用例脚本 |
449
+ | [模型分享和上传](https://huggingface.co/docs/transformers/model_sharing) | 和社区上传和分享你微调的模型 |
450
+ | [迁移](https://huggingface.co/docs/transformers/migration) | 从 `pytorch-transformers` 或 `pytorch-pretrained-bert` 迁移到 🤗 Transformers |
451
+
452
+ ## 引用
453
+
454
+ 我们已将此库的[论文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)正式发表,如果你使用了 🤗 Transformers 库,请引用:
455
+ ```bibtex
456
+ @inproceedings{wolf-etal-2020-transformers,
457
+ title = "Transformers: State-of-the-Art Natural Language Processing",
458
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
459
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
460
+ month = oct,
461
+ year = "2020",
462
+ address = "Online",
463
+ publisher = "Association for Computational Linguistics",
464
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
465
+ pages = "38--45"
466
+ }
467
+ ```