Aston-xMAD committed
Commit 9382e3f
1 Parent(s): 825cfb0

init commit

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. CITATION.cff +82 -0
  2. CODE_OF_CONDUCT.md +133 -0
  3. CONTRIBUTING.md +394 -0
  4. ISSUES.md +277 -0
  5. LICENSE +203 -0
  6. Makefile +124 -0
  7. README.md +6 -6
  8. README_test_result.md +58 -0
  9. SECURITY.md +40 -0
  10. __pycache__/app_local.cpython-310.pyc +0 -0
  11. app.py +419 -0
  12. backups/app_backup.py +63 -0
  13. backups/app_local_enabled_streaming_but_inefficient.py +205 -0
  14. backups/app_local_v0.py +187 -0
  15. backups/app_local_v1-1.py +228 -0
  16. backups/app_local_v1.py +375 -0
  17. backups/app_local_v2.py +191 -0
  18. backups/app_local_v3.py +211 -0
  19. backups/app_local_v4-1.py +234 -0
  20. backups/app_local_with_graph.py +235 -0
  21. backups/app_major_backup.py +235 -0
  22. backups/app_pic.py +40 -0
  23. backups/app_unquantized_backup.py +146 -0
  24. backups/app_v0.py +188 -0
  25. backups/app_v1.py +207 -0
  26. backups/app_v2.py +215 -0
  27. chats.json +1850 -0
  28. chats_sys_none.json +1390 -0
  29. conftest.py +142 -0
  30. docker/transformers-all-latest-gpu/Dockerfile +63 -0
  31. docker/transformers-doc-builder/Dockerfile +18 -0
  32. docker/transformers-gpu/Dockerfile +31 -0
  33. docker/transformers-past-gpu/Dockerfile +59 -0
  34. docker/transformers-pytorch-amd-gpu/Dockerfile +39 -0
  35. docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +48 -0
  36. docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +53 -0
  37. docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +64 -0
  38. docker/transformers-pytorch-gpu/Dockerfile +33 -0
  39. docker/transformers-pytorch-tpu/Dockerfile +65 -0
  40. docker/transformers-pytorch-tpu/bert-base-cased.jsonnet +38 -0
  41. docker/transformers-pytorch-tpu/dataset.yaml +32 -0
  42. docker/transformers-pytorch-tpu/docker-entrypoint.sh +8 -0
  43. docker/transformers-quantization-latest-gpu/Dockerfile +60 -0
  44. docker/transformers-tensorflow-gpu/Dockerfile +25 -0
  45. docs/README.md +397 -0
  46. docs/TRANSLATING.md +57 -0
  47. docs/source/_config.py +14 -0
  48. docs/source/de/_config.py +14 -0
  49. docs/source/de/_toctree.yml +42 -0
  50. docs/source/de/accelerate.md +136 -0
CITATION.cff ADDED
@@ -0,0 +1,82 @@
+ cff-version: "1.2.0"
+ date-released: 2020-10
+ message: "If you use this software, please cite it using these metadata."
+ title: "Transformers: State-of-the-Art Natural Language Processing"
+ url: "https://github.com/huggingface/transformers"
+ authors:
+ - family-names: Wolf
+ given-names: Thomas
+ - family-names: Debut
+ given-names: Lysandre
+ - family-names: Sanh
+ given-names: Victor
+ - family-names: Chaumond
+ given-names: Julien
+ - family-names: Delangue
+ given-names: Clement
+ - family-names: Moi
+ given-names: Anthony
+ - family-names: Cistac
+ given-names: Perric
+ - family-names: Ma
+ given-names: Clara
+ - family-names: Jernite
+ given-names: Yacine
+ - family-names: Plu
+ given-names: Julien
+ - family-names: Xu
+ given-names: Canwen
+ - family-names: "Le Scao"
+ given-names: Teven
+ - family-names: Gugger
+ given-names: Sylvain
+ - family-names: Drame
+ given-names: Mariama
+ - family-names: Lhoest
+ given-names: Quentin
+ - family-names: Rush
+ given-names: "Alexander M."
+ preferred-citation:
+ type: conference-paper
+ authors:
+ - family-names: Wolf
+ given-names: Thomas
+ - family-names: Debut
+ given-names: Lysandre
+ - family-names: Sanh
+ given-names: Victor
+ - family-names: Chaumond
+ given-names: Julien
+ - family-names: Delangue
+ given-names: Clement
+ - family-names: Moi
+ given-names: Anthony
+ - family-names: Cistac
+ given-names: Perric
+ - family-names: Ma
+ given-names: Clara
+ - family-names: Jernite
+ given-names: Yacine
+ - family-names: Plu
+ given-names: Julien
+ - family-names: Xu
+ given-names: Canwen
+ - family-names: "Le Scao"
+ given-names: Teven
+ - family-names: Gugger
+ given-names: Sylvain
+ - family-names: Drame
+ given-names: Mariama
+ - family-names: Lhoest
+ given-names: Quentin
+ - family-names: Rush
+ given-names: "Alexander M."
+ booktitle: "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations"
+ month: 10
+ start: 38
+ end: 45
+ title: "Transformers: State-of-the-Art Natural Language Processing"
+ year: 2020
+ publisher: "Association for Computational Linguistics"
+ url: "https://www.aclweb.org/anthology/2020.emnlp-demos.6"
+ address: "Online"
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,133 @@
+
+ # Contributor Covenant Code of Conduct
+
+ ## Our Pledge
+
+ We as members, contributors, and leaders pledge to make participation in our
+ community a harassment-free experience for everyone, regardless of age, body
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
+ identity and expression, level of experience, education, socio-economic status,
+ nationality, personal appearance, race, caste, color, religion, or sexual
+ identity and orientation.
+
+ We pledge to act and interact in ways that contribute to an open, welcoming,
+ diverse, inclusive, and healthy community.
+
+ ## Our Standards
+
+ Examples of behavior that contributes to a positive environment for our
+ community include:
+
+ * Demonstrating empathy and kindness toward other people
+ * Being respectful of differing opinions, viewpoints, and experiences
+ * Giving and gracefully accepting constructive feedback
+ * Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+ * Focusing on what is best not just for us as individuals, but for the overall
+ community
+
+ Examples of unacceptable behavior include:
+
+ * The use of sexualized language or imagery, and sexual attention or advances of
+ any kind
+ * Trolling, insulting or derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or email address,
+ without their explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+ ## Enforcement Responsibilities
+
+ Community leaders are responsible for clarifying and enforcing our standards of
+ acceptable behavior and will take appropriate and fair corrective action in
+ response to any behavior that they deem inappropriate, threatening, offensive,
+ or harmful.
+
+ Community leaders have the right and responsibility to remove, edit, or reject
+ comments, commits, code, wiki edits, issues, and other contributions that are
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
+ decisions when appropriate.
+
+ ## Scope
+
+ This Code of Conduct applies within all community spaces, and also applies when
+ an individual is officially representing the community in public spaces.
+ Examples of representing our community include using an official e-mail address,
+ posting via an official social media account, or acting as an appointed
+ representative at an online or offline event.
+
+ ## Enforcement
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported to the community leaders responsible for enforcement at
+ All complaints will be reviewed and investigated promptly and fairly.
+
+ All community leaders are obligated to respect the privacy and security of the
+ reporter of any incident.
+
+ ## Enforcement Guidelines
+
+ Community leaders will follow these Community Impact Guidelines in determining
+ the consequences for any action they deem in violation of this Code of Conduct:
+
+ ### 1. Correction
+
+ **Community Impact**: Use of inappropriate language or other behavior deemed
+ unprofessional or unwelcome in the community.
+
+ **Consequence**: A private, written warning from community leaders, providing
+ clarity around the nature of the violation and an explanation of why the
+ behavior was inappropriate. A public apology may be requested.
+
+ ### 2. Warning
+
+ **Community Impact**: A violation through a single incident or series of
+ actions.
+
+ **Consequence**: A warning with consequences for continued behavior. No
+ interaction with the people involved, including unsolicited interaction with
+ those enforcing the Code of Conduct, for a specified period of time. This
+ includes avoiding interactions in community spaces as well as external channels
+ like social media. Violating these terms may lead to a temporary or permanent
+ ban.
+
+ ### 3. Temporary Ban
+
+ **Community Impact**: A serious violation of community standards, including
+ sustained inappropriate behavior.
+
+ **Consequence**: A temporary ban from any sort of interaction or public
+ communication with the community for a specified period of time. No public or
+ private interaction with the people involved, including unsolicited interaction
+ with those enforcing the Code of Conduct, is allowed during this period.
+ Violating these terms may lead to a permanent ban.
+
+ ### 4. Permanent Ban
+
+ **Community Impact**: Demonstrating a pattern of violation of community
+ standards, including sustained inappropriate behavior, harassment of an
+ individual, or aggression toward or disparagement of classes of individuals.
+
+ **Consequence**: A permanent ban from any sort of public interaction within the
+ community.
+
+ ## Attribution
+
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+ version 2.1, available at
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+ Community Impact Guidelines were inspired by
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+ For answers to common questions about this code of conduct, see the FAQ at
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+ [https://www.contributor-covenant.org/translations][translations].
+
+ [homepage]: https://www.contributor-covenant.org
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+ [Mozilla CoC]: https://github.com/mozilla/diversity
+ [FAQ]: https://www.contributor-covenant.org/faq
+ [translations]: https://www.contributor-covenant.org/translations
CONTRIBUTING.md ADDED
@@ -0,0 +1,394 @@
+ <!---
+ Copyright 2020 The HuggingFace Team. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+ # Contribute to 🤗 Transformers
+
+ Everyone is welcome to contribute, and we value everybody's contribution. Code
+ contributions are not the only way to help the community. Answering questions, helping
+ others, and improving the documentation are also immensely valuable.
+
+ It also helps us if you spread the word! Reference the library in blog posts
+ about the awesome projects it made possible, shout out on Twitter every time it has
+ helped you, or simply ⭐️ the repository to say thank you.
+
+ However you choose to contribute, please be mindful and respect our
+ [code of conduct](https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md).
+
+ **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
+
+ ## Ways to contribute
+
+ There are several ways you can contribute to 🤗 Transformers:
+
+ * Fix outstanding issues with the existing code.
+ * Submit issues related to bugs or desired new features.
+ * Implement new models.
+ * Contribute to the examples or to the documentation.
+
+ If you don't know where to start, there is a special [Good First
+ Issue](https://github.com/huggingface/transformers/contribute) listing. It will give you a list of
+ open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.
+
+ For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀
+
+ > All contributions are equally valuable to the community. 🥰
+
+ ## Fixing outstanding issues
+
+ If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](#create-a-pull-request) and open a Pull Request!
+
+ ## Submitting a bug-related issue or feature request
+
+ Do your best to follow these guidelines when submitting a bug-related issue or a feature
+ request. It will make it easier for us to come back to you quickly and with good
+ feedback.
+
+ ### Did you find a bug?
+
+ The 🤗 Transformers library is robust and reliable thanks to users who report the problems they encounter.
+
+ Before you report an issue, we would really appreciate it if you could **make sure the bug was not
+ already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.
+
+ Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
+
+ * Your **OS type and version** and **Python**, **PyTorch** and
+ **TensorFlow** versions when applicable.
+ * A short, self-contained, code snippet that allows us to reproduce the bug in
+ less than 30s.
+ * The *full* traceback if an exception is raised.
+ * Attach any other additional information, like screenshots, you think may help.
+
+ To get the OS and software versions automatically, run the following command:
+
+ ```bash
+ transformers-cli env
+ ```
+
+ You can also run the same command from the root of the repository:
+
+ ```bash
+ python src/transformers/commands/transformers_cli.py env
+ ```
+
+ ### Do you want a new feature?
+
+ If there is a new feature you'd like to see in 🤗 Transformers, please open an issue and describe:
+
+ 1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it a feature related to something you need for a project? Is it something you worked on and think it could benefit the community?
+
+ Whatever it is, we'd love to hear about it!
+
+ 2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you.
+ 3. Provide a *code snippet* that demonstrates the feature's usage.
+ 4. If the feature is related to a paper, please include a link.
+
+ If your issue is well written we're already 80% of the way there by the time you create it.
+
+ We have added [templates](https://github.com/huggingface/transformers/tree/main/templates) to help you get started with your issue.
+
+ ## Do you want to implement a new model?
+
+ New models are constantly released and if you want to implement a new model, please provide the following information:
+
+ * A short description of the model and a link to the paper.
+ * Link to the implementation if it is open-sourced.
+ * Link to the model weights if they are available.
+
+ If you are willing to contribute the model yourself, let us know so we can help you add it to 🤗 Transformers!
+
+ We have a technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
+
+ ## Do you want to add documentation?
+
+ We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be happy to make the changes or help you make a contribution if you're interested!
+
+ For more details about how to generate, build, and write the documentation, take a look at the documentation [README](https://github.com/huggingface/transformers/tree/main/docs).
+
+ ## Create a Pull Request
+
+ Before writing any code, we strongly advise you to search through the existing PRs or
+ issues to make sure nobody is already working on the same thing. If you are
+ unsure, it is always a good idea to open an issue to get some feedback.
+
+ You will need basic `git` proficiency to contribute to
+ 🤗 Transformers. While `git` is not the easiest tool to use, it has the greatest
+ manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro
+ Git](https://git-scm.com/book/en/v2) is a very good reference.
+
+ You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:
+
+ 1. Fork the [repository](https://github.com/huggingface/transformers) by
+ clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code
+ under your GitHub user account.
+
+ 2. Clone your fork to your local disk, and add the base repository as a remote:
+
+ ```bash
+ git clone git@github.com:<your Github handle>/transformers.git
+ cd transformers
+ git remote add upstream https://github.com/huggingface/transformers.git
+ ```
+
+ 3. Create a new branch to hold your development changes:
+
+ ```bash
+ git checkout -b a-descriptive-name-for-my-changes
+ ```
+
+ 🚨 **Do not** work on the `main` branch!
+
+ 4. Set up a development environment by running the following command in a virtual environment:
+
+ ```bash
+ pip install -e ".[dev]"
+ ```
+
+ If 🤗 Transformers was already installed in the virtual environment, remove
+ it with `pip uninstall transformers` before reinstalling it in editable
+ mode with the `-e` flag.
+
+ Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
+ failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
+ (PyTorch, TensorFlow and/or Flax) then do:
+
+ ```bash
+ pip install -e ".[quality]"
+ ```
+
+ which should be enough for most use cases.
+
+ 5. Develop the features in your branch.
+
+ As you work on your code, you should make sure the test suite
+ passes. Run the tests impacted by your changes like this:
+
+ ```bash
+ pytest tests/<TEST_TO_RUN>.py
+ ```
+
+ For more information about tests, check out the
+ [Testing](https://huggingface.co/docs/transformers/testing) guide.
+
+ 🤗 Transformers relies on `black` and `ruff` to format its source code
+ consistently. After you make changes, apply automatic style corrections and code verifications
+ that can't be automated in one go with:
+
+ ```bash
+ make fixup
+ ```
+
+ This target is also optimized to only work with files modified by the PR you're working on.
+
+ If you prefer to run the checks one after the other, the following command applies the
+ style corrections:
+
+ ```bash
+ make style
+ ```
+
+ 🤗 Transformers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
+ controls are run by the CI, but you can run the same checks with:
+
+ ```bash
+ make quality
+ ```
+
+ Finally, we have a lot of scripts to make sure we don't forget to update
+ some files when adding a new model. You can run these scripts with:
+
+ ```bash
+ make repo-consistency
+ ```
+
+ To learn more about those checks and how to fix any issues with them, check out the
+ [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.
+
+ If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
+ make sure you install the documentation builder:
+
+ ```bash
+ pip install ".[docs]"
+ ```
+
+ Run the following command from the root of the repository:
+
+ ```bash
+ doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build
+ ```
+
+ This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
+ Markdown files with your favorite editor. You can also preview the docs on GitHub when you open a pull request.
+
+ Once you're happy with your changes, add the changed files with `git add` and
+ record your changes locally with `git commit`:
+
+ ```bash
+ git add modified_file.py
+ git commit
+ ```
+
+ Please remember to write [good commit
+ messages](https://chris.beams.io/posts/git-commit/) to clearly communicate the changes you made!
+
+ To keep your copy of the code up to date with the original
+ repository, rebase your branch on `upstream/branch` *before* you open a pull request or if requested by a maintainer:
+
+ ```bash
+ git fetch upstream
+ git rebase upstream/main
+ ```
+
+ Push your changes to your branch:
+
+ ```bash
+ git push -u origin a-descriptive-name-for-my-changes
+ ```
+
+ If you've already opened a pull request, you'll need to force push with the `--force` flag, as shown in the sketch below. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.
+
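For example, a force push after a rebase might look like the following; the branch name is simply the illustrative one used in the steps above, and you should only ever force push to your own pull request branch:

```bash
# Overwrite the remote copy of your PR branch after a local rebase
git push --force origin a-descriptive-name-for-my-changes
```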
+ 6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.
+
+ 7. It's ok if maintainers request changes, it happens to our core contributors
+ too! So everyone can see the changes in the pull request, work in your local
+ branch and push the changes to your fork. They will automatically appear in
+ the pull request.
+
+ ### Pull request checklist
+
+ ☐ The pull request title should summarize your contribution.<br>
+ ☐ If your pull request addresses an issue, please mention the issue number in the pull
+ request description to make sure they are linked (and people viewing the issue know you
+ are working on it).<br>
+ ☐ To indicate a work in progress please prefix the title with `[WIP]`. These are
+ useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
+ ☐ Make sure existing tests pass.<br>
+ ☐ If adding a new feature, also add tests for it.<br>
+ - If you are adding a new model, make sure you use
+ `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests.
+ - If you are adding new `@slow` tests, make sure they pass using
+ `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
+ - If you are adding a new tokenizer, write tests and make sure
+ `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
+ - CircleCI does not run the slow tests, but GitHub Actions does every night!<br>
+
+ ☐ All public methods must have informative docstrings (see
+ [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py)
+ for an example).<br>
+ ☐ Due to the rapidly growing repository, don't add any images, videos and other
+ non-text files that'll significantly weigh down the repository. Instead, use a Hub
+ repository such as [`hf-internal-testing`](https://huggingface.co/hf-internal-testing)
+ to host these files and reference them by URL. We recommend placing documentation
+ related images in the following repository:
+ [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
+ You can open a PR on this dataset repository and ask a Hugging Face member to merge it.
+
+ For more information about the checks run on a pull request, take a look at our [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.
+
+ ### Tests
+
+ An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
+ the [tests](https://github.com/huggingface/transformers/tree/main/tests) folder and examples tests in the
+ [examples](https://github.com/huggingface/transformers/tree/main/examples) folder.
+
+ We like `pytest` and `pytest-xdist` because it's faster. From the root of the
+ repository, specify a *path to a subfolder or a test file* to run the test:
+
+ ```bash
+ python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model
+ ```
+
+ Similarly, for the `examples` directory, specify a *path to a subfolder or test file* to run the test. For example, the following command tests the text classification subfolder in the PyTorch `examples` directory:
+
+ ```bash
+ pip install -r examples/xxx/requirements.txt # only needed the first time
+ python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
+ ```
+
+ In fact, this is actually how our `make test` and `make test-examples` commands are implemented (not including the `pip install`)!
+
+ You can also specify a smaller set of tests in order to test only the feature
+ you're working on, for example as sketched below.
+
+ By default, slow tests are skipped but you can set the `RUN_SLOW` environment variable to
327
+ `yes` to run them. This will download many gigabytes of models so make sure you
328
+ have enough disk space, a good internet connection or a lot of patience!
329
+
330
+ <Tip warning={true}>
331
+
332
+ Remember to specify a *path to a subfolder or a test file* to run the test. Otherwise, you'll run all the tests in the `tests` or `examples` folder, which will take a very long time!
333
+
334
+ </Tip>
335
+
336
+ ```bash
337
+ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model
338
+ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
339
+ ```
340
+
341
+ Like the slow tests, there are other environment variables available which not enabled by default during testing:
342
+ - `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
343
+ - `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
344
+ - `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.
345
+
346
+ More environment variables and additional information can be found in the [testing_utils.py](src/transformers/testing_utils.py).
347
+
348
+ 🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
349
+ `pytest`-specific features in the test suite itself.
350
+
351
+ This means `unittest` is fully supported. Here's how to run tests with
352
+ `unittest`:
353
+
354
+ ```bash
355
+ python -m unittest discover -s tests -t . -v
356
+ python -m unittest discover -s examples -t examples -v
357
+ ```
358
+
359
+ ### Style guide
360
+
361
+ For documentation strings, 🤗 Transformers follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
362
+ Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
363
+ for more information.
364
+
365
+ ### Develop on Windows
366
+
367
+ On Windows (unless you're working in [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) or WSL), you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
368
+
369
+ ```bash
370
+ git config core.autocrlf input
371
+ ```
372
+
373
+ One way to run the `make` command on Windows is with MSYS2:
374
+
375
+ 1. [Download MSYS2](https://www.msys2.org/), and we assume it's installed in `C:\msys64`.
376
+ 2. Open the command line `C:\msys64\msys2.exe` (it should be available from the **Start** menu).
377
+ 3. Run in the shell: `pacman -Syu` and install `make` with `pacman -S make`.
378
+ 4. Add `C:\msys64\usr\bin` to your PATH environment variable.
379
+
380
+ You can now use `make` from any terminal (PowerShell, cmd.exe, etc.)! 🎉
381
+
382
+ ### Sync a forked repository with upstream main (the Hugging Face repository)
383
+
384
+ When updating the main branch of a forked repository, please follow these steps to avoid pinging the upstream repository which adds reference notes to each upstream PR, and sends unnecessary notifications to the developers involved in these PRs.
385
+
386
+ 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead, merge directly into the forked main.
387
+ 2. If a PR is absolutely necessary, use the following steps after checking out your branch:
388
+
389
+ ```bash
390
+ git checkout -b your-branch-for-syncing
391
+ git pull --squash --no-commit upstream main
392
+ git commit -m '<your message without GitHub references>'
393
+ git push --set-upstream origin your-branch-for-syncing
394
+ ```
ISSUES.md ADDED
@@ -0,0 +1,277 @@
+ <!---
+ Copyright 2020 The HuggingFace Team. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+ # How To Request Support
+
+ This is an Open Source Project so please be mindful that like in any other project of this kind there is no obligation to answer all requests for help.
+
+ However, we want to encourage you to ask for help whenever you think it's needed! We are happy about every question we get because it allows us to better understand your needs, possible misunderstandings, and most importantly a way for you to help us make this library better. That being said, this document's main purpose is to provide guidelines on how you can formulate your requests to increase your chances to be understood and to get support.
+
+ There are two main venues to receive support: [the forums](https://discuss.huggingface.co/) and [the GitHub issues](https://github.com/huggingface/transformers/issues).
+
+ ## The Forums
+
+ [The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.
+
+ If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).
+
+ In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some examples of such questions:
+
+ * "I would like to use a BertModel within a RL-Agent for a customer support service. How can I use a BertForMaskedLM in my ChatBotModel?"
+
+ * "Could you please explain why T5 has no positional embedding matrix under T5Model?"
+
+ * "How should I set my generation parameters for translation?"
+
+ * "How to train T5 on De->En translation?"
+
+
+ ## The GitHub Issues
+
+ Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).
+
+ You are not required to read the following guidelines before opening an issue. However, if you notice that your issue doesn't get any replies, chances are that the developers have one or several difficulties with its quality. In this case, reading the following points and adjusting your issue accordingly could help.
+
+ 1. Before posting an issue, first search for already posted issues, since chances are someone has already asked a similar question before you.
+
+ If you use Google your search query should be:
+
+ ```
+ "huggingface" "transformers" your query
+ ```
+
+ The first two quoted words tell Google to limit the search to the context of the Huggingface Transformers. The remainder is your query - most commonly this would be the error message the software fails with. We will go deeper into details shortly.
+
+ The results of such a query will typically match GitHub issues, Hugging Face forums, StackExchange, and blogs.
+
+ If you find relevant hints, you may choose to continue the discussion there if you have follow up questions.
+
+ If what you found is similar but doesn't quite answer your problem, please, post a new issue and do include links to similar issues or forum discussions you may have found.
+
+ Let's look at some examples:
+
+ The error message, often referred to as an assertion, tells us what went wrong. Here is an example of an assertion:
+
+ ```python
+ Traceback (most recent call last):
+ File "<string>", line 1, in <module>
+ File "/transformers/src/transformers/__init__.py", line 34, in <module>
+ from . import dependency_versions_check
+ File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
+ from .utils import is_tokenizers_available
+ File "/transformers/src/transformers/utils/import_utils.py", line 40, in <module>
+ from tqdm.auto import tqdm
+ ModuleNotFoundError: No module named 'tqdm.auto'
+ ```
+
+ and it typically includes a traceback, so that we can see the full stack of calls the program made before it fails. This gives us the context to know why the program failed.
+
+ Going back to the above example. If you received this error, look at the very last line of the error, which is:
+
+ ```python
+ ModuleNotFoundError: No module named 'tqdm.auto'
+ ```
+
+ And now we can use it to do the searching on your favorite search engine:
+
+ 1. first for `"huggingface" "transformers" "ModuleNotFoundError: No module named 'tqdm.auto'"`
+ 2. if you don't find relevant results, then search for just `"ModuleNotFoundError: No module named 'tqdm.auto'"`
+ 3. and finally if nothing still comes up, then remove the outside quotes: `ModuleNotFoundError: No module named 'tqdm.auto'`
+
+ If the error includes any messages that include bits unique to your filesystem, always remove those in the search query since other users will not have the same filesystem as yours. For example:
+
+ ```bash
+ python -c 'open("/tmp/wrong_path.txt", "r")'
+ Traceback (most recent call last):
+ File "<string>", line 1, in <module>
+ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/wrong_path.txt'
+ ```
+ Here you'd search for just: `"FileNotFoundError: [Errno 2] No such file or directory"`
+
+ If the local information that you removed was inside the error message, you may need to remove the double quotes since your query is no longer exact. So if the error message was something like:
+
+ ```bash
+ ValueError: '/tmp/wrong_path.txt' cannot be found
+ ```
+
+ then you'd search for `"ValueError" "cannot be found"`
+
+ As you search you will notice that when you don't use quotes often the search engines will return a variety of unrelated hits, which may or may not be what you want.
+
+ Experiment with different ways and find which approach gives the most satisfactory results.
+
+ 2. Keep the issue short, providing the information that you think will aid the developers to understand your situation. Put yourself in the shoes of the person who has never seen your code or knows anything about your custom setup. This mental exercise will help to develop an intuition to what/what not to share.
+
+ 3. If there is a software failure, always provide the full traceback, for example:
+
+ ```python
+ $ python -c 'import transformers'
+ Traceback (most recent call last):
+ File "<string>", line 1, in <module>
+ File "/transformers/src/transformers/__init__.py", line 34, in <module>
+ from . import dependency_versions_check
+ File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
+ from .utils import is_tokenizers_available
+ File "/transformers/src/transformers/utils/import_utils.py", line 40, in <module>
+ from tqdm.auto import tqdm
+ ModuleNotFoundError: No module named 'tqdm.auto'
+ ```
+
+ As compared to providing just the last line of the error message, e.g.:
+ ```python
+ ModuleNotFoundError: No module named 'tqdm.auto'
+ ```
+ which is not sufficient.
+
+ If your application is running on more than one GPU (e.g. under `DistributedDataParallel`) and typically getting every log and traceback printed multiple times, please make sure that you paste only one copy of it. At times the traceback from parallel processes may get interleaved - so either disentangle these or change the loggers to log only for `local_rank==0` so that only one process logs things.
+
+ 4. When quoting a traceback, command line instructions and any type of code always enclose it in triple backticks inside the editor window, that is:
+
+ ````
+ ```
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ pip install .
+ ```
+ ````
+
+ If it's a command line with a long argument list, please consider breaking it down using backslashes and new lines. Here is an example of a good command line quote:
+
+ ```bash
+ cd examples/seq2seq
+ torchrun --nproc_per_node=2 ./finetune_trainer.py \
+ --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+ --output_dir output_dir --overwrite_output_dir \
+ --do_train --n_train 500 --num_train_epochs 1 \
+ --per_device_train_batch_size 1 --freeze_embeds \
+ --src_lang en_XX --tgt_lang ro_RO --task translation \
+ --fp16
+ ```
+
+ If you don't break it up, one has to scroll horizontally which often makes it quite difficult to quickly see what's happening.
+
+ The backslashes allow us to copy the command directly into the console to run it, without needing to edit it.
+
+ 5. Include only the important information that you think will help the developer to quickly identify the problem.
+
+ For example applications often create huge amounts of logs. Ask yourself whether providing all or parts of the log is useful.
+
+ Pasting 100-1000 lines of log into the issue is an immediate turn off, since it will take a lot of time to figure out where the pertinent parts of the log are.
+
+ Attaching a full log can be helpful if it's done as an attachment, if it's enclosed in the following html code in the comment editor window:
+
+ ```
+ <details>
+ <summary>Full log</summary>
+ <pre>
+
+ many
+ lines
+ go
+ here
+
+ </pre>
+ </details>
+ ```
+
+ which would result in the following entry, which can be opened if desired, but otherwise takes little space.
+
+ <details>
+ <summary>Full log</summary>
+ <pre>
+ many
+ lines
+ go
+ here
+ </pre>
+ </details>
+
+ You could also provide a link to a pastebin service, but this is less beneficial since those links tend to expire quickly and future readers of your issue might not be able to access that log file anymore and may lack some context.
+
+ 6. If this is an issue in your code, do try to reduce that code to a minimal example that still demonstrates the problem. Please ask at the forums if you have a hard time figuring how to do that. Please realize that we don't have the luxury of having time to try and understand all of your custom code.
+
+ If you really tried to make a short reproducible code but couldn't figure it out, it might be that having a traceback will give the developer enough information to know what's going on. But if it is not enough and we can't reproduce the problem, we can't really solve it.
+
+ Do not despair if you can't figure it out from the beginning, just share what you can and perhaps someone else will be able to help you at the forums.
+
+ If your setup involves any custom datasets, the best way to help us reproduce the problem is to create a [Google Colab notebook](https://colab.research.google.com/) that demonstrates the issue and once you verify that the issue still exists, include a link to that notebook in the Issue. Just make sure that you don't copy and paste the location bar url of the open notebook - as this is private and we won't be able to open it. Instead, you need to click on `Share` in the right upper corner of the notebook, select `Get Link` and then copy and paste the public link it will give to you.
+
+ 7. If you forked off some of this project's code or example applications, please, do not ask us to go into your code repository and figure out what you may have done. The code is already very complex and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with time on their hands to make a lengthy investigation. Albeit, you might find someone at the forums who will be generous to do this for you.
+
+ 8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.
+
+ We understand that this is not always possible, especially when APIs change, in which case file an issue against the highest library version your environment can support.
+
+ Of course, if you upgrade the library, always retest that the problem is still there.
+
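For example, upgrading usually comes down to a single command; the second line, installing the current development version from source, is optional and only relevant when a fix hasn't made it into a release yet:

```bash
# Upgrade to the latest released version of the library
pip install -U transformers

# Optionally, install the current development version from source
pip install git+https://github.com/huggingface/transformers
```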
+ 9. Please do not ask us to reproduce an issue with your custom data, since we don't have it. So, either you should use some existing dataset supported by HF datasets or you need to supply code that generates a small sample on the fly, or some other quick and simple way to get it.
+
+ Please do not send us any non-public domain data that may require a license or a permission to be used.
+
+ 10. Do not tag multiple developers on the issue unless you know this is expected, either because you asked them and they gave you an explicit permission to tag them or the issue template instructs you to do so.
+
+ The "who to tag for what domain" part of the issue template is there to help users direct their questions to the right developers who are designated maintainers of project's specific domains. They can then decide at their own discretion to tag other developers if they feel it'd help move the issue forward.
+
+ We currently don't have a triage service and we trust your capacity to identify the right domain and thus the persons to tag in your issue. If you are not sure, please use the forums to ask for guidance.
+
+ When in doubt, err on the side of not tagging a given person. If you tag multiple people out of context or permission don't be surprised if you get no response at all. Please remember that every time you tag someone, they get a notification and you're taking their time without their permission. Please be sensitive to that.
+
+ If you got helped by one of the developers in the past please don't tag them in future issues, unless they are listed in the issue template for the domain you are asking about or that developer gave you an explicit permission to tag them in future issues.
+
+ If you see a certain developer doing multiple and/or recent commits into a specific area of the project that you feel is relevant to your issue, it is not a good reason to tag them. Various developers may be fixing things that prevent them from moving forward, but often their work is focused on a totally different domain. And while they may or may not know how to help you with the problem at hand, it would benefit the whole community much more if they focus on the domain of their unique expertise.
+
+ 11. Use the Edit button. Take your time, and re-read and improve the wording and formatting to make your posts and comments as easy to understand as possible.
+
+ Avoid posting multiple comments in a row, as each comment generates a notification for the developers tagged in that issue. If you happened to post multiple comments in a row, and nobody followed up yet - consider merging those into one or a few comments while editing the combined content to be coherent.
+
+ If you choose to edit your older comments after others posted follow up comments you need to be aware that your modifications might not be noticed, so if it's not a typo fixing, try to write a new comment flagging that something has been changed in the previous comments.
+
+ For example, the very first comment is the most important one. If while the thread unfolds you realize that things aren't as they seemed to you originally you may want to edit the first post to reflect the up-to-date understanding of the issue at hand so that it helps those who read your issue in the future quickly understand what's going on and not need to sift through dozens of comments. It also helps to indicate that the post was edited. So, those reading the thread later can understand why there might be certain discontinuity in the information flow.
+
+ Use bullets and items if you have lists of items and the outcome improves overall readability.
+
+ Use backticks to refer to class and function names, e.g. `BartModel` and `generate` as these stand out and improve the speed of a reader's comprehension.
+
+ Try not to use italics and bold text too much as these often make the text more difficult to read.
+
+
+ 12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to.
+
+ To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".
+
+ For example the first link is a link to an issue, and the second to a specific comment in the same issue:
+
+ 1. https://github.com/huggingface/transformers/issues/9257
+ 2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162
+
+
+ 13. If you are replying to a last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.
+
+ But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:
+
+ ```
+ > How big is your gpu cluster?
+
+ Our cluster is made of 256 gpus.
+ ```
+
+ If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
+
+ In general the best way to figure out what works the best is to learn from issues posted by other people - see which issues get great responses and which get little to no response - observe what the posters who received great responses did differently from those who did not.
+
+ Thank you for reading this somewhat lengthy document. We would like to conclude that these are not absolute rules, but friendly advice that will help maximize the chances for us to understand what you are trying to communicate, reproduce the problem then resolve it to your satisfaction and the benefit of the whole community.
+
+ If after reading this document there are remaining questions on how and why or there is a need for further elucidation, please, don't hesitate to ask your question in [this thread](https://discuss.huggingface.co/t/how-to-request-support/3128).
LICENSE ADDED
@@ -0,0 +1,203 @@
+ Copyright 2018- The Hugging Face team. All rights reserved.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
Makefile ADDED
@@ -0,0 +1,124 @@
1
+ .PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples
2
+
3
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
4
+ export PYTHONPATH = src
5
+
6
+ check_dirs := examples tests src utils
7
+
8
+ exclude_folders := examples/research_projects
9
+
10
+ modified_only_fixup:
11
+ $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
12
+ @if test -n "$(modified_py_files)"; then \
13
+ echo "Checking/fixing $(modified_py_files)"; \
14
+ ruff check $(modified_py_files) --fix --exclude $(exclude_folders); \
15
+ ruff format $(modified_py_files) --exclude $(exclude_folders);\
16
+ else \
17
+ echo "No library .py files were modified"; \
18
+ fi
19
+
20
+ # Update src/transformers/dependency_versions_table.py
21
+
22
+ deps_table_update:
23
+ @python setup.py deps_table_update
24
+
25
+ deps_table_check_updated:
26
+ @md5sum src/transformers/dependency_versions_table.py > md5sum.saved
27
+ @python setup.py deps_table_update
28
+ @md5sum -c --quiet md5sum.saved || (printf "\nError: the version dependency table is outdated.\nPlease run 'make fixup' or 'make style' and commit the changes.\n\n" && exit 1)
29
+ @rm md5sum.saved
30
+
31
+ # autogenerating code
32
+
33
+ autogenerate_code: deps_table_update
34
+
35
+ # Check that the repo is in a good state
36
+
37
+ repo-consistency:
38
+ python utils/check_copies.py
39
+ python utils/check_table.py
40
+ python utils/check_dummies.py
41
+ python utils/check_repo.py
42
+ python utils/check_inits.py
43
+ python utils/check_config_docstrings.py
44
+ python utils/check_config_attributes.py
45
+ python utils/check_doctest_list.py
46
+ python utils/update_metadata.py --check-only
47
+ python utils/check_docstrings.py
48
+ python utils/check_support_list.py
49
+
50
+ # this target runs checks on all files
51
+
52
+ quality:
53
+ @python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
54
+ ruff check $(check_dirs) setup.py conftest.py
55
+ ruff format --check $(check_dirs) setup.py conftest.py
56
+ python utils/custom_init_isort.py --check_only
57
+ python utils/sort_auto_mappings.py --check_only
58
+ python utils/check_doc_toc.py
59
+
60
+
61
+ # Format source code automatically and check if there are any problems left that need manual fixing
62
+
63
+ extra_style_checks:
64
+ python utils/custom_init_isort.py
65
+ python utils/sort_auto_mappings.py
66
+ python utils/check_doc_toc.py --fix_and_overwrite
67
+
68
+ # this target runs checks on all files and potentially modifies some of them
69
+
70
+ style:
71
+ ruff check $(check_dirs) setup.py conftest.py --fix --exclude $(exclude_folders)
72
+ ruff format $(check_dirs) setup.py conftest.py --exclude $(exclude_folders)
73
+ ${MAKE} autogenerate_code
74
+ ${MAKE} extra_style_checks
75
+
76
+ # Super fast fix and check target that only works on relevant modified files since the branch was made
77
+
78
+ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
79
+
80
+ # Make marked copies of snippets of code conform to the original
81
+
82
+ fix-copies:
83
+ python utils/check_copies.py --fix_and_overwrite
84
+ python utils/check_table.py --fix_and_overwrite
85
+ python utils/check_dummies.py --fix_and_overwrite
86
+ python utils/check_doctest_list.py --fix_and_overwrite
87
+ python utils/check_docstrings.py --fix_and_overwrite
88
+
89
+ # Run tests for the library
90
+
91
+ test:
92
+ python -m pytest -n auto --dist=loadfile -s -v ./tests/
93
+
94
+ # Run tests for examples
95
+
96
+ test-examples:
97
+ python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/
98
+
99
+ # Run tests for SageMaker DLC release
100
+
101
+ test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker]
102
+ TEST_SAGEMAKER=True python -m pytest -n auto -s -v ./tests/sagemaker
103
+
104
+
105
+ # Release stuff
106
+
107
+ pre-release:
108
+ python utils/release.py
109
+
110
+ pre-patch:
111
+ python utils/release.py --patch
112
+
113
+ post-release:
114
+ python utils/release.py --post_release
115
+
116
+ post-patch:
117
+ python utils/release.py --post_release --patch
118
+
119
+ build-release:
120
+ rm -rf dist
121
+ rm -rf build
122
+ python setup.py bdist_wheel
123
+ python setup.py sdist
124
+ python utils/check_build.py
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: 1bit Llama3 Instruct Xmad Qa Batch
3
- emoji: 📈
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.39.0
8
  app_file: app.py
9
  pinned: false
10
  license: llama3
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: 1-Bit Llama-3 Demo Batch Input/Output 500+ Tokens per Second by xMAD.ai
3
+ emoji: 💬
4
+ colorFrom: yellow
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: false
10
  license: llama3
11
  ---
12
 
13
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
README_test_result.md ADDED
@@ -0,0 +1,58 @@
1
+ # Maximum Batch Size Analysis for Llama2 Models
2
+
3
+ This document summarizes the performance testing results for Llama2 models under various configurations. The focus is on identifying the maximum batch sizes that can be processed without errors and documenting the corresponding generation times in seconds.
4
+
5
+ ## Experiment Details
6
+
7
+ The experiment varied settings such as model size, number of new tokens (`num_new_tokens`), key-value cache bit width (`kv_bits`), and batch size; "Unquantized" denotes configurations without KV-cache quantization. The objective was to determine the largest batch size that remains stable when generating a fixed number of tokens under each configuration, as in the sketch below.
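As a rough illustration of how such a sweep can be structured, here is a minimal sketch that doubles the batch size until generation fails and records the last successful run. The model id handling, the `.codebooks` quantizer-path convention, and the doubling search are assumptions borrowed from `app.py` in this repository, not the exact benchmarking script.

```python
# Illustrative sweep only: the quantizer_path convention mirrors app.py;
# the doubling search and greedy decoding are simplifying assumptions.
import time
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

def max_stable_batch_size(model_name, kv_bits, num_new_tokens, prompt):
    config = AutoConfig.from_pretrained(model_name)
    if kv_bits != "unquantized":
        # Assumed codebook location, following the pattern used in app.py.
        config.quantizer_path = f".codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(
        model_name, config=config, torch_dtype=torch.float16, device_map="auto"
    )
    batch_size, last_ok, last_time = 1, 0, None
    while True:
        try:
            inputs = tokenizer([prompt] * batch_size, return_tensors="pt", padding=True).to(model.device)
            torch.cuda.synchronize()
            start = time.perf_counter()
            model.generate(
                **inputs,
                max_new_tokens=num_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )
            torch.cuda.synchronize()
            last_ok, last_time = batch_size, time.perf_counter() - start
            batch_size *= 2  # grow until the GPU runs out of memory
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            return last_ok, last_time  # largest stable batch size and its generation time (s)
```

A real sweep would step more finely near the failure point; the doubling search above only brackets the maximum.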
8
+
9
+ ### Models and Configurations
10
+
11
+ - **Models Tested:** Llama2 7B and 13B.
12
+ - **Measurements:** Generation times are reported directly in seconds, as recorded in the underlying measurement data.
13
+ ## Results: Llama2 7B Model Performance
14
+
15
+ | Model Size | num_new_tokens | KV Bits | Max Batch Size | Generation Time (s) | Speedup (Max Batch Size vs. Unquantized) |
16
+ |------------|----------------|-------------|----------------|----------------------|-----------------------|
17
+ | 7B | 256 | 1 | 764 | 257 | 14.98x |
18
+ | 7B | 256 | 2 | 384 | 124 | 7.53x |
19
+ | 7B | 256 | 4 | 204 | 99 | 4.00x |
20
+ | 7B | 256 | Unquantized | 51 | 75 | 1x |
21
+ | 7B | 512 | 1 | 437 | 352 | 15.07x |
22
+ | 7B | 512 | 2 | 223 | 178 | 7.69x |
23
+ | 7B | 512 | 4 | 114 | 148 | 3.93x |
24
+ | 7B | 512 | Unquantized | 29 | 122 | 1x |
25
+ | 7B | 1024 | 1 | 247 | 454 | 15.44x |
26
+ | 7B | 1024 | 2 | 126 | 300 | 7.88x |
27
+ | 7B | 1024 | 4 | 65 | 283 | 4.06x |
28
+ | 7B | 1024 | Unquantized | 16 | 224 | 1x |
29
+
30
+
31
+ ## Results: Llama2 13B Model Performance
32
+ | Model Size | num_new_tokens | KV Bits | Max Batch Size | Generation Time (s) | Speedup (Max Batch Size vs. Unquantized) |
33
+ |------------|----------------|-------------|----------------|----------------------|-----------------------|
34
+ | 13B | 256 | 1 | 154 | 83 | 14.00x |
35
+ | 13B | 256 | 2 | 88 | 63 | 8.00x |
36
+ | 13B | 256 | 4 | 45 | 62 | 4.09x |
37
+ | 13B | 256 | Unquantized | 11 | 33 | 1x |
38
+ | 13B | 512 | 1 | 100 | 144 | 16.67x |
39
+ | 13B | 512 | 2 | 51 | 98 | 8.50x |
40
+ | 13B | 512 | 4 | 26 | 108 | 4.33x |
41
+ | 13B | 512 | Unquantized | 6 | 60 | 1x |
42
+ | 13B | 1024 | 1 | 58 | 260 | 19.33x |
43
+ | 13B | 1024 | 2 | 29 | 173 | 9.67x |
44
+ | 13B | 1024 | 4 | 15 | 216 | 5.00x |
45
+ | 13B | 1024 | Unquantized | 3 | 118 | 1x |
46
+
47
+
48
+
49
+ ## Recommendations
50
+ 1. **KV Bits Influence**: Configurations with quantized KV caches (lower KV bits) support substantially larger batch sizes, highlighting the importance of key/value cache management in batch processing.
51
+
52
+ 2. **Optimal Configuration Selection**: Depending on the operational needs (e.g., low latency vs. high throughput), choose the appropriate KV bits setting. For scenarios where throughput is critical, a lower KV bits setting is advisable.
53
+
54
+ ## Averaged Speedup Analysis
55
+ - **1-bit Quantization:** Achieves an average speedup of approximately 15.58x in maximum batch size compared to unquantized configurations across all tested scenarios.
56
+
57
+ - **2-bit Quantization:** Provides an average of 8.02x speedup.
58
+
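For reference, the speedup column in the tables above is simply the ratio of the quantized maximum batch size to the unquantized one at the same model size and `num_new_tokens`. A minimal sketch, using the Llama2 7B, 256-token rows as an example:

```python
# Speedup = max batch size with a quantized KV cache / unquantized max batch size.
# Values are taken from the Llama2 7B, num_new_tokens=256 rows above.
max_batch_size = {"1": 764, "2": 384, "4": 204, "unquantized": 51}

def batch_size_speedup(kv_bits: str) -> float:
    return max_batch_size[kv_bits] / max_batch_size["unquantized"]

print(f"{batch_size_speedup('1'):.2f}x")  # 764 / 51 ≈ 14.98x, matching the table
```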
SECURITY.md ADDED
@@ -0,0 +1,40 @@
1
+ # Security Policy
2
+
3
+ ## Hugging Face Hub, remote artefacts, and remote code
4
+
5
+ Transformers is open-source software that is tightly coupled to the Hugging Face Hub. While you have the ability to use it
6
+ offline with pre-downloaded model weights, it provides a very simple way to download, use, and manage models locally.
7
+
8
+ When downloading artefacts that have been uploaded by others on any platform, you expose yourself to risks. Please
9
+ read below for the security recommendations in order to keep your runtime and local environment safe.
10
+
11
+ ### Remote artefacts
12
+
13
+ Models uploaded on the Hugging Face Hub come in different formats. We heavily recommend uploading and downloading
14
+ models in the [`safetensors`](https://github.com/huggingface/safetensors) format (which is the default prioritized
15
+ by the transformers library), as it was developed specifically to prevent arbitrary code execution on your system.
16
+
17
+ To avoid loading models from unsafe formats (e.g. [pickle](https://docs.python.org/3/library/pickle.html)), you should use the `use_safetensors` parameter. If you do, and no .safetensors file is present, transformers will error when loading the model.
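As a minimal illustration (the model id below is simply the one used elsewhere in this repository), a loading call with this parameter refuses to fall back to pickle-based weights:

```python
from transformers import AutoModelForCausalLM

# With use_safetensors=True, loading errors out instead of silently falling
# back to pickle-based .bin weights when no .safetensors file is available.
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Meta-Llama-3-8B-Instruct",  # illustrative model id
    use_safetensors=True,
)
```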
18
+
19
+ ### Remote code
20
+
21
+ #### Modeling
22
+
23
+ Transformers supports many model architectures, but is also the bridge between your Python runtime and models that
24
+ are stored in model repositories on the Hugging Face Hub.
25
+
26
+ These models require the `trust_remote_code=True` parameter to be set when using them; please **always** verify
27
+ the content of the modeling files when using this argument. We recommend setting a revision in order to ensure you
28
+ protect yourself from updates on the repository.
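As an illustration, pinning a reviewed revision together with `trust_remote_code` looks roughly like this; the repository name and commit hash are placeholders, not real references:

```python
from transformers import AutoModelForCausalLM

# trust_remote_code executes the repository's own modeling code on your machine;
# pinning revision to a commit you have reviewed protects against later updates.
model = AutoModelForCausalLM.from_pretrained(
    "some-org/custom-architecture",  # placeholder repository with custom code
    trust_remote_code=True,
    revision="a1b2c3d",              # placeholder commit hash you have audited
)
```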
29
+
30
+ #### Tools
31
+
32
+ Through the `Agent` framework, remote tools can be downloaded to be used by the Agent. You specify these tools
33
+ yourself, but please keep in mind that their code will be run on your machine if the Agent chooses to run them.
34
+
35
+ Please inspect the code of the tools before passing them to the Agent to protect your runtime and local setup.
36
+
37
+ ## Reporting a Vulnerability
38
+
39
+ 🤗 Please feel free to submit vulnerability reports to our private bug bounty program at https://hackerone.com/hugging_face. You'll need to request access to the program by emailing [email protected].
40
+ Note that you'll need to be invited to our program, so send us a quick email at [email protected] if you've found a vulnerability.
__pycache__/app_local.cpython-310.pyc ADDED
Binary file (7.8 kB).
 
app.py ADDED
@@ -0,0 +1,419 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import random
5
+ import torch
6
+ import gc
7
+ import re
8
+ import math
9
+ import gradio as gr
10
+ import numpy as np
11
+ import boto3
12
+ import logging
13
+ from botocore.exceptions import NoCredentialsError
14
+ from collections import defaultdict
15
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
16
+
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
18
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
19
+
20
+
21
+ def download_xmad_file():
22
+ s3 = boto3.client('s3',
23
+ aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
24
+ aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'))
25
+
26
+ # Create the .codebooks directory if it doesn't exist
27
+ codebooks_dir = '.codebooks'
28
+ os.makedirs(codebooks_dir, exist_ok=True)
29
+
30
+ temp_file_path = os.path.join(codebooks_dir, 'llama-3-8b-instruct_1bit.xmad')
31
+
32
+ try:
33
+ # Download the file to the .codebooks directory
34
+ s3.download_file('xmad-quantized-models', 'llama-3-8b-instruct_1bit.xmad', temp_file_path)
35
+ print("Download Successful")
36
+
37
+ # Restrict permissions on the .codebooks directory
38
+ os.chmod(codebooks_dir, 0o700)
39
+
40
+ except NoCredentialsError:
41
+ print("Credentials not available")
42
+
43
+ download_xmad_file()
44
+
45
+ def b2mb(x):
46
+ """
47
+ Convert bytes to megabytes.
48
+ """
49
+ return int(x / 2**20)
50
+
51
+
52
+ class TorchTracemalloc:
53
+ """
54
+ A context manager that clears GPU memory
55
+ and returns GPU peak memory & GPU memory usage.
56
+ """
57
+ track_memory_consumption = []
58
+
59
+ def __enter__(self):
60
+ gc.collect()
61
+ torch.cuda.empty_cache()
62
+ torch.cuda.reset_peak_memory_stats()
63
+ self.begin = torch.cuda.memory_allocated()
64
+ return self
65
+
66
+ def __exit__(self, *exc):
67
+ torch.cuda.synchronize()
68
+ self.end = torch.cuda.memory_allocated()
69
+ self.peak = torch.cuda.max_memory_allocated()
70
+ self.used = b2mb(self.end - self.begin)
71
+ self.peaked = b2mb(self.peak - self.begin)
72
+ TorchTracemalloc.track_memory_consumption.append(self.peaked)
73
+
74
+ def clear_gpu_memory():
75
+ torch.cuda.empty_cache()
76
+ gc.collect()
77
+ print("GPU memory cleared.")
78
+
79
+
80
+ def format_response(dialog, response):
81
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
82
+ return {"question": question, "answer": response}
83
+
84
+ # Global variables to store the model and tokenizer
85
+ global_model = None
86
+ global_tokenizer = None
87
+
88
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
89
+ global global_model, global_tokenizer
90
+
91
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
92
+ special_tokens = {"pad_token": "<PAD>"}
93
+ tokenizer.add_special_tokens(special_tokens)
94
+
95
+ config = AutoConfig.from_pretrained(model_name)
96
+ if kv_bits != "unquantized":
97
+ quantizer_path = f".codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
98
+ setattr(config, "quantizer_path", quantizer_path)
99
+
100
+ if dtype == "bf16":
101
+ dtype = torch.bfloat16
102
+ elif dtype == "fp16":
103
+ dtype = torch.float16
104
+ elif dtype == "fp32":
105
+ dtype = torch.float32
106
+
107
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
108
+
109
+ print(f"Quantizer path in model config: {model.config.quantizer_path}")
110
+ logging.info(f"Quantizer path in model config: {model.config.quantizer_path}")
111
+
112
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
113
+ model.resize_token_embeddings(len(tokenizer))
114
+
115
+ tokenizer.padding_side = "left"
116
+ model.config.pad_token_id = tokenizer.pad_token_id
117
+
118
+ global_model = model
119
+ global_tokenizer = tokenizer
120
+
121
+ # def load_questions(prompts_path, custom_questions):
122
+ # with open(prompts_path, "r") as file:
123
+ # dialogs = json.load(file)
124
+
125
+ # selected_dialogs = []
126
+ # if custom_questions:
127
+ # for question in custom_questions:
128
+ # if question.strip():
129
+ # custom_dialog = [{"role": "user", "content": question}]
130
+ # selected_dialogs.append(custom_dialog)
131
+
132
+ # num_questions = max(60 - len(selected_dialogs), 0)
133
+ # random.shuffle(dialogs)
134
+ # selected_dialogs.extend(dialogs[:num_questions])
135
+
136
+ # return selected_dialogs
137
+ def load_questions(prompts_path, custom_questions):
138
+ selected_dialogs = []
139
+ if custom_questions:
140
+ for question in custom_questions:
141
+ if question.strip():
142
+ custom_dialog = [{"role": "user", "content": question}]
143
+ selected_dialogs.append(custom_dialog)
144
+ return selected_dialogs
145
+
146
+
147
+ def markdown_to_plain_text(markdown_text):
148
+ # Convert markdown bold (**) to plain text uppercase
149
+ markdown_text = re.sub(r'\*\*(.*?)\*\*', lambda m: m.group(1).upper(), markdown_text)
150
+ # Convert markdown italics (*) to plain text
151
+ markdown_text = re.sub(r'\*(.*?)\*', r'\1', markdown_text)
152
+ # Remove markdown headers (###)
153
+ markdown_text = re.sub(r'### ', '', markdown_text)
154
+ # Convert markdown lists (- or *)
155
+ markdown_text = re.sub(r'^\s*[-*]\s+', '', markdown_text, flags=re.MULTILINE)
156
+ # Remove remaining markdown formatting
157
+ markdown_text = re.sub(r'[`~>]', '', markdown_text)
158
+ return markdown_text
159
+
160
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
161
+ print("Starting inference...")
162
+ global global_model, global_tokenizer
163
+
164
+ model = global_model
165
+ tokenizer = global_tokenizer
166
+
167
+ batch_inputs = [
168
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
169
+ for dialog in dialogs
170
+ ]
171
+
172
+ responses = []
173
+ start_time = time.time()
174
+ batch_size = min(100, len(dialogs)) # Adjust batch size based on GPU capacity and number of dialogs
175
+ num_dialogs = len(dialogs)
176
+ total_time = 0 # Initialize total_time
177
+ total_tokens = 0
178
+ total_ttft = 0
179
+
180
+ memory_avg = []
181
+ tokens_per_sec_avg = []
182
+ time_to_first_token_avg = []
183
+ responses_by_batch_size = defaultdict(list)
184
+ batch_generation_time = 0
185
+ total_generation_time = 0
186
+
187
+ terminators = [
188
+ tokenizer.eos_token_id,
189
+ tokenizer.convert_tokens_to_ids("<|eot_id|>"),
190
+ ]
191
+
192
+ with TorchTracemalloc() as tt:
193
+ for i in range(0, num_dialogs, batch_size):
194
+ batch = batch_inputs[i : i + batch_size]
195
+ try:
196
+ encoded_inputs = tokenizer(
197
+ batch,
198
+ padding=True,
199
+ truncation=False,
200
+ return_tensors="pt",
201
+ )
202
+
203
+ input_ids = encoded_inputs["input_ids"].to(model.device)
204
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
205
+
206
+ torch.cuda.synchronize()
207
+ start_time = time.perf_counter()
208
+
209
+ with torch.no_grad():
210
+ output_tokens = model.generate(
211
+ input_ids,
212
+ attention_mask=attention_mask,
213
+ max_new_tokens=num_new_tokens,
214
+ num_return_sequences=1,
215
+ do_sample=True,
216
+ temperature=temperature,
217
+ pad_token_id=tokenizer.pad_token_id,
218
+ eos_token_id=terminators,
219
+ )
220
+
221
+ torch.cuda.synchronize()
222
+ end_time = time.perf_counter()
223
+
224
+ batch_time = end_time - start_time
225
+ total_time += batch_time
226
+ batch_generation_time += batch_time
227
+ total_generation_time += batch_time
228
+ total_tokens += output_tokens.numel()
229
+
230
+ if i == 0:
231
+ total_ttft = batch_time
232
+
233
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
234
+
235
+ for j, response in enumerate(decoded_outputs):
236
+ original_dialog = dialogs[i + j]
237
+ formatted_responses = format_response(original_dialog, response)
238
+ responses.append(formatted_responses)
239
+ # formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer'][4:]}" for res in responses])
240
+ formatted_responses = "\n\n====================\n\n".join([f"**Question**:\t{res['question']}\n\n**Answer**: {res['answer'][4+len(res['question'])+11:]}" for res in responses])
241
+ plain_text_responses = markdown_to_plain_text(formatted_responses)
242
+ yield plain_text_responses
243
+ progress(i, desc="Processing batches")
244
+
245
+ torch.cuda.empty_cache()
246
+
247
+ except Exception as e:
248
+ print(f"Error processing batch {i//batch_size + 1}: {str(e)}")
249
+ continue
250
+
251
+ elapsed_time = total_time
252
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
253
+ total_memory_consumption = np.sum(TorchTracemalloc.track_memory_consumption)
254
+ avg_memory_consumption = total_memory_consumption / num_dialogs
255
+
256
+ ttft = total_ttft / batch_size if batch_size > 0 else 0
257
+
258
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
259
+
260
+ yield {
261
+ "Time Taken (seconds)": elapsed_time,
262
+ "Tokens per Second": tokens_per_second,
263
+ "Time to First Token (seconds)": ttft,
264
+ "Formatted Responses": plain_text_responses,
265
+ "Memory Consumption per Question (MB)": avg_memory_consumption,
266
+ "Total Memory Consumption (MB)": total_memory_consumption,
267
+ "Num Dialogs": num_dialogs
268
+ }
269
+
270
+ # Demo function
271
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits=1, progress=gr.Progress()):
272
+ custom_questions = custom_questions_text.split("\n")
273
+ print("Loading questions...")
274
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
275
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
276
+
277
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
278
+
279
+ formatted_responses = ""
280
+ num_dialogs = 0
281
+ for result in result_gen:
282
+ if isinstance(result, str):
283
+ formatted_responses = result
284
+ yield None, None, None, None, None, None, formatted_responses  # six placeholders, one per metric output
285
+ else:
286
+ time_taken = result["Time Taken (seconds)"]
287
+ tokens_per_second = result["Tokens per Second"]
288
+ ttft = result["Time to First Token (seconds)"]
289
+ avg_memory_consumption = result["Memory Consumption per Question (MB)"]
290
+ total_memory_consumption = result["Total Memory Consumption (MB)"]
291
+ num_dialogs = result["Num Dialogs"]
292
+ formatted_responses = result["Formatted Responses"]
293
+ yield time_taken, tokens_per_second, ttft, avg_memory_consumption, num_dialogs, total_memory_consumption, formatted_responses
294
+ # clear_gpu_memory()
295
+
296
+ # Load JSON data
297
+ with open("chats_sys_none.json", "r") as file:
298
+ json_data = json.load(file)
299
+
300
+ # Load 60 random questions into the input area by default
301
+ def load_default_questions():
302
+ random.shuffle(json_data)
303
+ default_questions = [dialog[0]['content'] for dialog in json_data[:60] if 'content' in dialog[0]]
304
+ return "\n".join(default_questions)
305
+
306
+ # Load default questions on button click
307
+ def load_questions_action():
308
+ return load_default_questions()
309
+
310
+ # Gradio interface
311
+ css = """
312
+ body, html {
313
+ height: 100vh;
314
+ margin: 0;
315
+ }
316
+
317
+ .gradio-container {
318
+ height: 100vh;
319
+ }
320
+
321
+ #main-row {
322
+ height: 100%;
323
+ display: flex;
324
+ }
325
+
326
+ #control-panel{
327
+ height: 100%;
328
+ box-sizing: border-box;
329
+ display: flex;
330
+ flex-direction: column;
331
+ overflow: hidden;
332
+ flex: 1;
333
+ }
334
+
335
+ #control-panel, #formatted-responses-container {
336
+ height: 100%;
337
+ box-sizing: border-box;
338
+ display: flex;
339
+ flex-direction: column;
340
+ overflow: hidden;
341
+ flex: 1;
342
+ }
343
+
344
+ #control-panel {
345
+ flex: 1;
346
+ padding-bottom: 1vh; /* Add some padding to the bottom */
347
+ }
348
+
349
+ #custom-questions-text {
350
+ height: 30vh; /* Fixed height for custom questions text */
351
+ overflow-y: auto;
352
+ }
353
+
354
+ #metrics-panel {
355
+ display: flex;
356
+ flex-wrap: wrap;
357
+ flex-shrink: 0;
358
+ height: auto; /* Let the panel size adjust based on its content */
359
+ }
360
+
361
+ #metrics-panel .metric {
362
+ flex: 1 1 48%;
363
+ min-width: 10vw;
364
+ box-sizing: border-box;
365
+ }
366
+
367
+ #buttons-container {
368
+ display: flex;
369
+ justify-content: space-between;
370
+ height: 6vh; /* Fixed height for buttons container */
371
+ flex-shrink: 0;
372
+ margin-bottom: 1vh; /* Add margin to prevent cutting off */
373
+ }
374
+ """
375
+
376
+ with gr.Blocks(css=css) as app:
377
+ with gr.Row(elem_id="main-row", equal_height=True):
378
+ with gr.Column(elem_id="control-panel", scale=1):
379
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=2048, step=128, value=512)
380
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
381
+ custom_questions_text = gr.Textbox(
382
+ label="Custom Questions",
383
+ placeholder="Type your custom questions here, one per line... \nOr press the \"Load Default Questions\" button to load 60 random default questions. \nAdd a question by adding a new line, or delete lines to decrease the number of questions.",
384
+ autoscroll=False,
385
+ container=False,
386
+ lines=5,
387
+ elem_id="custom-questions-text"
388
+ )
389
+ with gr.Row(elem_id="metrics-panel"):
390
+ time_taken = gr.Number(label="Time Taken (seconds)", interactive=False, elem_classes=["metric"])
391
+ tokens_per_second = gr.Number(label="Tokens per Second", interactive=False, elem_classes=["metric"])
392
+ ttft = gr.Number(label="Time to First Token (seconds)", interactive=False, elem_classes=["metric"])
393
+ total_memory_consumption = gr.Number(label="Memory Consumption (MB)", interactive=False, elem_classes=["metric"])
394
+ num_dialogs = gr.Number(label="Dialogs Processed", interactive=False, elem_classes=["metric"])
395
+ avg_memory_consumption = gr.Number(label="Mem. Consumption per Question (MB)", interactive=False, elem_classes=["metric"])
396
+ with gr.Row(elem_id="buttons-container"):
397
+ load_questions_btn = gr.Button("Load Default Questions")
398
+ demo_btn = gr.Button("Run Inference", elem_id="run-inference-btn", variant="primary")
399
+
400
+ formatted_responses = gr.Textbox(
401
+ label="Formatted Responses",
402
+ elem_id="formatted-responses",
403
+ value="No responses yet. Run the inference to see results.",
404
+ lines=37,
405
+ container=False,
406
+ autoscroll=False,
407
+ show_copy_button=True
408
+ )
409
+
410
+ load_questions_btn.click(fn=load_questions_action, inputs=[], outputs=custom_questions_text)
411
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text], outputs=[time_taken, tokens_per_second, ttft, avg_memory_consumption, num_dialogs, total_memory_consumption, formatted_responses])
412
+
413
+ if __name__ == "__main__":
414
+ print("Loading model and tokenizer on startup...")
415
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
416
+ print("Model and tokenizer loaded. Starting Gradio interface...")
417
+ username = os.getenv("AUTH_USERNAME")
418
+ password = os.getenv("AUTH_PASSWORD")
419
+ app.launch(auth=(username, password))
backups/app_backup.py ADDED
@@ -0,0 +1,63 @@
1
+ import gradio as gr
2
+ from huggingface_hub import InferenceClient
3
+
4
+ """
5
+ For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
+ """
7
+ client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
+
9
+
10
+ def respond(
11
+ message,
12
+ history: list[tuple[str, str]],
13
+ system_message,
14
+ max_tokens,
15
+ temperature,
16
+ top_p,
17
+ ):
18
+ messages = [{"role": "system", "content": system_message}]
19
+
20
+ for val in history:
21
+ if val[0]:
22
+ messages.append({"role": "user", "content": val[0]})
23
+ if val[1]:
24
+ messages.append({"role": "assistant", "content": val[1]})
25
+
26
+ messages.append({"role": "user", "content": message})
27
+
28
+ response = ""
29
+
30
+ for message in client.chat_completion(
31
+ messages,
32
+ max_tokens=max_tokens,
33
+ stream=True,
34
+ temperature=temperature,
35
+ top_p=top_p,
36
+ ):
37
+ token = message.choices[0].delta.content
38
+
39
+ response += token
40
+ yield response
41
+
42
+ """
43
+ For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
+ """
45
+ demo = gr.ChatInterface(
46
+ respond,
47
+ additional_inputs=[
48
+ gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
+ gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
+ gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
+ gr.Slider(
52
+ minimum=0.1,
53
+ maximum=1.0,
54
+ value=0.95,
55
+ step=0.05,
56
+ label="Top-p (nucleus sampling)",
57
+ ),
58
+ ],
59
+ )
60
+
61
+
62
+ if __name__ == "__main__":
63
+ demo.launch()
backups/app_local_enabled_streaming_but_inefficient.py ADDED
@@ -0,0 +1,205 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+
9
+ # Environment variables
10
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
11
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
12
+
13
+ # Global variables to store the model and tokenizer
14
+ model = None
15
+ tokenizer = None
16
+
17
+ # Load model and tokenizer
18
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
19
+ global model, tokenizer
20
+ if model is None or tokenizer is None:
21
+ print("Loading model and tokenizer...")
22
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ special_tokens = {"pad_token": "<PAD>"}
24
+ tokenizer.add_special_tokens(special_tokens)
25
+
26
+ config = AutoConfig.from_pretrained(model_name)
27
+ if kv_bits != "unquantized":
28
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
29
+ setattr(config, "quantizer_path", quantizer_path)
30
+
31
+ if dtype == "bf16":
32
+ dtype = torch.bfloat16
33
+ elif dtype == "fp16":
34
+ dtype = torch.float16
35
+ elif dtype == "fp32":
36
+ dtype = torch.float32
37
+
38
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
39
+
40
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
41
+ model.resize_token_embeddings(len(tokenizer))
42
+
43
+ tokenizer.padding_side = "left"
44
+ model.config.pad_token_id = tokenizer.pad_token_id
45
+
46
+ return model, tokenizer
47
+
48
+ # Format response
49
+ def format_response(dialog, response):
50
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
51
+ answer = response.split("assistant")[-1].strip()
52
+ return {"question": question, "answer": answer}
53
+
54
+ # Load questions
55
+ def load_questions(prompts_path, custom_questions):
56
+ with open(prompts_path, "r") as file:
57
+ dialogs = json.load(file)
58
+
59
+ selected_dialogs = []
60
+
61
+ if custom_questions:
62
+ for question in custom_questions:
63
+ if question.strip():
64
+ custom_dialog = [{"role": "user", "content": question}]
65
+ selected_dialogs.append(custom_dialog)
66
+
67
+ num_questions = 60 - len(selected_dialogs)
68
+ random.shuffle(dialogs)
69
+ selected_dialogs.extend(dialogs[:num_questions])
70
+
71
+ return selected_dialogs[:60]
72
+
73
+ # Inference
74
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, top_k, progress=gr.Progress()):
75
+ print("Starting inference...")
76
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
77
+ batch_inputs = [
78
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
79
+ for dialog in dialogs
80
+ ]
81
+
82
+ responses = [''] * len(dialogs)
83
+ start_time = time.time()
84
+
85
+ batch_size = 30 # Set batch size for processing, this can be adjusted
86
+ num_dialogs = len(dialogs)
87
+ total_time = 0
88
+ total_tokens = 0
89
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
90
+
91
+ ttft = None
92
+ tokens_per_step = 25 # Number of tokens to generate per step for efficiency
93
+
94
+ for batch_idx in range(num_batches):
95
+ start_idx = batch_idx * batch_size
96
+ end_idx = min(start_idx + batch_size, num_dialogs)
97
+ batch = batch_inputs[start_idx:end_idx]
98
+
99
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
100
+ input_ids = encoded_inputs["input_ids"].to(model.device)
101
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
102
+
103
+ generated_ids = input_ids
104
+
105
+ while generated_ids.shape[1] < num_new_tokens:
106
+ with torch.no_grad():
107
+ outputs = model(generated_ids, attention_mask=attention_mask)
108
+ next_token_logits = outputs.logits[:, -1, :]
109
+ # Apply temperature scaling
110
+ next_token_logits = next_token_logits / temperature
111
+ # Apply top-k sampling
112
+ top_k_values, top_k_indices = torch.topk(next_token_logits, top_k, dim=-1)
113
+ next_token_probs = torch.nn.functional.softmax(top_k_values, dim=-1)
114
+ next_tokens = torch.multinomial(next_token_probs, num_samples=1)
115
+ next_tokens = torch.gather(top_k_indices, -1, next_tokens)
116
+ generated_ids = torch.cat([generated_ids, next_tokens], dim=1)
117
+
118
+ if ttft is None:
119
+ ttft = time.perf_counter() - start_time
120
+
121
+ decoded_outputs = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(generated_ids.size(0))]
122
+
123
+ for i, response in enumerate(decoded_outputs):
124
+ formatted_response = format_response(dialogs[start_idx + i], response)
125
+ responses[start_idx + i] = f"**Question**: {formatted_response['question']}\n\n**Answer**: {formatted_response['answer']}"
126
+
127
+ formatted_responses = "\n\n---\n\n".join(responses)
128
+ yield {
129
+ "Formatted Responses": formatted_responses
130
+ }
131
+
132
+ progress((batch_idx * num_new_tokens + generated_ids.shape[1]) / (num_batches * num_new_tokens), desc="Generating tokens")
133
+
134
+ # Check if end-of-sequence token is generated
135
+ if any(tokenizer.eos_token_id in output for output in generated_ids.tolist()):
136
+ break
137
+
138
+ # Update attention mask for the next tokens
139
+ attention_mask = torch.cat([attention_mask, torch.ones((attention_mask.size(0), 1)).to(model.device)], dim=1)
140
+
141
+ # Stream intermediate results every 0.5 seconds
142
+ time.sleep(0.5)
143
+
144
+ total_elapsed_time = time.time() - start_time
145
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
146
+ print(f"Inference completed in {total_elapsed_time:.2f} seconds.")
147
+
148
+ # Demo function
149
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits, top_k, progress=gr.Progress()):
150
+ custom_questions = custom_questions_text.split("\n")
151
+ print("Loading questions...")
152
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
153
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
154
+
155
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, top_k, progress=progress)
156
+
157
+ for result in result_gen:
158
+ if result:
159
+ formatted_response = result["Formatted Responses"]
160
+ yield None, None, None, formatted_response
161
+
162
+ # Load JSON data
163
+ with open("chats_sys_none.json", "r") as file:
164
+ json_data = json.load(file)
165
+ json_data_str = json.dumps(json_data, indent=2)
166
+
167
+ # Show JSON function
168
+ def show_json():
169
+ return json_data_str
170
+
171
+ # Gradio interface
172
+ app = gr.Blocks()
173
+
174
+ with app:
175
+ with gr.Tab("LLM Inference Demo"):
176
+ with gr.Row():
177
+ with gr.Column():
178
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
179
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
180
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5)
181
+ kv_bits = gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
182
+ top_k = gr.Slider(label="Top K", minimum=1, maximum=50, step=1, value=10)
183
+
184
+ with gr.Column():
185
+ time_taken = gr.Number(label="Time Taken (seconds)")
186
+ tokens_per_second = gr.Number(label="Tokens per Second")
187
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)")
188
+
189
+ with gr.Row():
190
+ formatted_responses = gr.Markdown(label="Formatted Responses")
191
+
192
+ demo_btn = gr.Button("Run Inference")
193
+
194
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text, kv_bits, top_k], outputs=[time_taken, tokens_per_second, ttft, formatted_responses])
195
+
196
+ with gr.Tab("Show JSON"):
197
+ json_output = gr.HTML("<pre>{}</pre>".format(json_data_str))
198
+ json_interface = gr.Interface(fn=show_json, inputs=[], outputs=[json_output], live=False)
199
+ json_interface.render()
200
+
201
+ if __name__ == "__main__":
202
+ print("Loading model and tokenizer on startup...")
203
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
204
+ print("Model and tokenizer loaded. Starting Gradio interface...")
205
+ app.queue(default_concurrency_limit=5).launch()
backups/app_local_v0.py ADDED
@@ -0,0 +1,187 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+
8
+ # Environment variables
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
10
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
11
+
12
+ # Global variables to store the model and tokenizer
13
+ model = None
14
+ tokenizer = None
15
+
16
+ # Load model and tokenizer
17
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
18
+ global model, tokenizer
19
+ if model is None or tokenizer is None:
20
+ print("Loading model and tokenizer...")
21
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
22
+ special_tokens = {"pad_token": "<PAD>"}
23
+ tokenizer.add_special_tokens(special_tokens)
24
+
25
+ config = AutoConfig.from_pretrained(model_name)
26
+ if kv_bits != "unquantized":
27
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
28
+ setattr(config, "quantizer_path", quantizer_path)
29
+
30
+ if dtype == "bf16":
31
+ dtype = torch.bfloat16
32
+ elif dtype == "fp16":
33
+ dtype = torch.float16
34
+ elif dtype == "fp32":
35
+ dtype = torch.float32
36
+
37
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
38
+
39
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
40
+ model.resize_token_embeddings(len(tokenizer))
41
+
42
+ tokenizer.padding_side = "left"
43
+ model.config.pad_token_id = tokenizer.pad_token_id
44
+
45
+ return model, tokenizer
46
+
47
+ # Format response
48
+ def format_response(dialog, response):
49
+ formatted_dialog = dialog.copy()
50
+ formatted_dialog.append({"role": "assistant", "content": response})
51
+ return formatted_dialog
52
+
53
+ # Load questions
54
+ def load_questions(prompts_path, num_questions, custom_question):
55
+ with open(prompts_path, "r") as file:
56
+ dialogs = json.load(file)
57
+
58
+ if custom_question and custom_question.strip():
59
+ custom_dialog = [{"role": "user", "content": custom_question}]
60
+ dialogs.insert(0, custom_dialog)
61
+
62
+ dialogs = dialogs[:num_questions]
63
+ return dialogs
64
+
65
+ # Inference
66
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits):
67
+ print("Starting inference...")
68
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
69
+ batch_inputs = [
70
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
71
+ for dialog in dialogs
72
+ ]
73
+
74
+ responses = []
75
+ start_time = time.time()
76
+
77
+ batch_size = 20 # Set batch size for processing, this can be adjusted
78
+ num_dialogs = len(dialogs)
79
+ total_time = 0
80
+ total_tokens = 0
81
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
82
+
83
+ for batch_idx in range(num_batches):
84
+ start_idx = batch_idx * batch_size
85
+ end_idx = min(start_idx + batch_size, num_dialogs)
86
+ batch = batch_inputs[start_idx:end_idx]
87
+
88
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
89
+ input_ids = encoded_inputs["input_ids"].to(model.device)
90
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
91
+
92
+ with torch.no_grad():
93
+ torch.cuda.synchronize()
94
+ batch_start_time = time.perf_counter()
95
+
96
+ output_tokens = model.generate(
97
+ input_ids,
98
+ attention_mask=attention_mask,
99
+ max_new_tokens=num_new_tokens,
100
+ do_sample=True,
101
+ temperature=temperature,
102
+ pad_token_id=tokenizer.pad_token_id,
103
+ eos_token_id=tokenizer.eos_token_id
104
+ )
105
+
106
+ torch.cuda.synchronize()
107
+ batch_end_time = time.perf_counter()
108
+
109
+ batch_time = batch_end_time - batch_start_time
110
+ total_time += batch_time
111
+ total_tokens += output_tokens.numel()
112
+
113
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
114
+
115
+ for i, response in enumerate(decoded_outputs):
116
+ original_dialog = dialogs[start_idx + i]
117
+ formatted_response = format_response(original_dialog, response)
118
+ responses.append(formatted_response)
119
+
120
+ elapsed_time = time.time() - start_time
121
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
122
+
123
+ results = {
124
+ "Responses": responses,
125
+ "Time Taken (seconds)": elapsed_time,
126
+ "Tokens per Second": total_tokens / total_time if total_time > 0 else 0
127
+ }
128
+
129
+ return results
130
+
131
+ # Demo function
132
+ def demo(num_new_tokens, temperature, num_questions, custom_question, kv_bits):
133
+ print("Loading questions...")
134
+ dialogs = load_questions("chats_sys_none.json", num_questions, custom_question)
135
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
136
+ results = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits)
137
+ return results
138
+
139
+ # Load JSON data
140
+ with open("chats_sys_none.json", "r") as file:
141
+ json_data = json.load(file)
142
+ json_data_str = json.dumps(json_data, indent=2)
143
+
144
+ # Show JSON function
145
+ def show_json():
146
+ return json_data_str
147
+
148
+ # Gradio interface
149
+ interface = gr.Interface(
150
+ fn=demo,
151
+ inputs=[
152
+ gr.Slider(label="Number of New Tokens", minimum=1, maximum=1024, step=1, value=512),
153
+ gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4),
154
+ gr.Slider(minimum=20, maximum=100, step=1, label="Number of Questions", value=20),
155
+ gr.Textbox(label="Custom Question", placeholder="Type your custom question here..."),
156
+ gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
157
+ ],
158
+ outputs=[
159
+ gr.JSON(label="Responses and Time Taken")
160
+ ],
161
+ title="LLM Inference Demo",
162
+ description="A demo for running LLM inference using Gradio and Hugging Face.",
163
+ live=False
164
+ )
165
+
166
+ json_interface = gr.Interface(
167
+ fn=show_json,
168
+ inputs=[],
169
+ outputs=[
170
+ gr.HTML("<pre>{}</pre>".format(json_data_str))
171
+ ],
172
+ live=False
173
+ )
174
+
175
+ app = gr.Blocks()
176
+
177
+ with app:
178
+ with gr.Tab("LLM Inference Demo"):
179
+ interface.render()
180
+ with gr.Tab("Show JSON"):
181
+ json_interface.render()
182
+
183
+ if __name__ == "__main__":
184
+ print("Loading model and tokenizer on startup...")
185
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
186
+ print("Model and tokenizer loaded. Starting Gradio interface...")
187
+ app.launch()
backups/app_local_v1-1.py ADDED
@@ -0,0 +1,228 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+ from PIL import Image
9
+
10
+ # Environment variables
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
12
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
13
+
14
+ # Global variables to store the model and tokenizer
15
+ model = None
16
+ tokenizer = None
17
+
18
+ # Load model and tokenizer
19
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
20
+ global model, tokenizer
21
+ if model is None or tokenizer is None:
22
+ print("Loading model and tokenizer...")
23
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
24
+ special_tokens = {"pad_token": "<PAD>"}
25
+ tokenizer.add_special_tokens(special_tokens)
26
+
27
+ config = AutoConfig.from_pretrained(model_name)
28
+ if kv_bits != "unquantized":
29
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
30
+ setattr(config, "quantizer_path", quantizer_path)
31
+
32
+ if dtype == "bf16":
33
+ dtype = torch.bfloat16
34
+ elif dtype == "fp16":
35
+ dtype = torch.float16
36
+ elif dtype == "fp32":
37
+ dtype = torch.float32
38
+
39
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
40
+
41
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
42
+ model.resize_token_embeddings(len(tokenizer))
43
+
44
+ tokenizer.padding_side = "left"
45
+ model.config.pad_token_id = tokenizer.pad_token_id
46
+
47
+ return model, tokenizer
48
+
49
+ # Format response
50
+ def format_response(dialog, response):
51
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
52
+ answer = response.split("assistant")[-1].strip()
53
+ return {"question": question, "answer": answer}
54
+
55
+ # Load questions
56
+ def load_questions(prompts_path, custom_questions):
57
+ with open(prompts_path, "r") as file:
58
+ dialogs = json.load(file)
59
+
60
+ selected_dialogs = []
61
+
62
+ if custom_questions:
63
+ for question in custom_questions:
64
+ if question.strip():
65
+ custom_dialog = [{"role": "user", "content": question}]
66
+ selected_dialogs.append(custom_dialog)
67
+
68
+ num_questions = 60 - len(selected_dialogs)
69
+ random.shuffle(dialogs)
70
+ selected_dialogs.extend(dialogs[:num_questions])
71
+
72
+ return selected_dialogs[:60]
73
+
74
+ # Inference
75
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
76
+ print("Starting inference...")
77
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
78
+ batch_inputs = [
79
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
80
+ for dialog in dialogs
81
+ ]
82
+
83
+ responses = []
84
+ start_time = time.time()
85
+
86
+ batch_size = 30 # Set batch size for processing, this can be adjusted
87
+ num_dialogs = len(dialogs)
88
+ total_time = 0
89
+ total_tokens = 0
90
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
91
+
92
+ for batch_idx in range(num_batches):
93
+ start_idx = batch_idx * batch_size
94
+ end_idx = min(start_idx + batch_size, num_dialogs)
95
+ batch = batch_inputs[start_idx:end_idx]
96
+
97
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
98
+ input_ids = encoded_inputs["input_ids"].to(model.device)
99
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
100
+
101
+ with torch.no_grad():
102
+ torch.cuda.synchronize()
103
+ batch_start_time = time.perf_counter()
104
+
105
+ # Generate responses and measure time to first token
106
+ output_tokens = model.generate(
107
+ input_ids,
108
+ attention_mask=attention_mask,
109
+ max_new_tokens=num_new_tokens,
110
+ do_sample=True,
111
+ temperature=temperature,
112
+ pad_token_id=tokenizer.pad_token_id,
113
+ eos_token_id=tokenizer.eos_token_id
114
+ )
115
+
116
+ torch.cuda.synchronize()
117
+ batch_end_time = time.perf_counter()
118
+
119
+ batch_time = batch_end_time - batch_start_time
120
+ total_time += batch_time
121
+ total_tokens += output_tokens.numel()
122
+
123
+ # Calculate TTFT
124
+ if batch_idx == 0:
125
+ ttft = batch_time / input_ids.size(0) # Approximate TTFT: first-batch generation time averaged per sequence, not a true first-token latency
126
+
127
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
128
+
129
+ for i, response in enumerate(decoded_outputs):
130
+ original_dialog = dialogs[start_idx + i]
131
+ formatted_response = format_response(original_dialog, response)
132
+ responses.append(formatted_response)
133
+
134
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
135
+ yield formatted_responses
136
+ progress((batch_idx + 1) / num_batches, desc="Processing batches")
137
+
138
+ elapsed_time = time.time() - start_time
139
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
140
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
141
+
142
+ yield {
143
+ "Time Taken (seconds)": elapsed_time,
144
+ "Tokens per Second": tokens_per_second,
145
+ "Time to First Token (TTFT, seconds)": ttft,
146
+ "Formatted Responses": formatted_responses
147
+ }
148
+
149
+ # Demo function
150
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits, progress=gr.Progress()):
151
+ custom_questions = custom_questions_text.split("\n")
152
+ print("Loading questions...")
153
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
154
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
155
+
156
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
157
+
158
+ formatted_responses = ""
159
+ for result in result_gen:
160
+ if isinstance(result, str):
161
+ formatted_responses = result
162
+ yield None, None, None, formatted_responses
163
+ else:
164
+ time_taken = result["Time Taken (seconds)"]
165
+ tokens_per_second = result["Tokens per Second"]
166
+ ttft = result["Time to First Token (TTFT, seconds)"]
167
+ formatted_responses = result["Formatted Responses"]
168
+ yield time_taken, tokens_per_second, ttft, formatted_responses
169
+
170
+ # Load JSON data
171
+ with open("chats_sys_none.json", "r") as file:
172
+ json_data = json.load(file)
173
+
174
+ # Load 50 random questions into the input area by default
175
+ def load_default_questions():
176
+ random.shuffle(json_data)
177
+ default_questions = [dialog[0]['content'] for dialog in json_data[:50] if 'content' in dialog[0]]
178
+ return "\n".join(default_questions)
179
+
180
+ # Gradio interface
181
+ demo_interface = gr.Interface(
182
+ fn=demo,
183
+ inputs=[
184
+ gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512),
185
+ gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4),
186
+ gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5),
187
+ gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
188
+ ],
189
+ outputs=[
190
+ gr.Number(label="Time Taken (seconds)", interactive=False),
191
+ gr.Number(label="Tokens per Second", interactive=False),
192
+ gr.Number(label="Time to First Token (TTFT, seconds)", interactive=False),
193
+ gr.Markdown(label="Formatted Responses", elem_id="scrollable-output")
194
+ ],
195
+ live=False
196
+ )
197
+
198
+ # Gradio Blocks for additional controls
199
+ with gr.Blocks(css=".scrollable-output {height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc;}") as app:
200
+ with gr.Column():
201
+ gr.Markdown("### LLM Inference Demo")
202
+ with gr.Row():
203
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
204
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
205
+ kv_bits = gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
206
+
207
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5)
208
+ load_questions_btn = gr.Button("Load Default Questions")
209
+
210
+ with gr.Row():
211
+ time_taken = gr.Number(label="Time Taken (seconds)", interactive=False)
212
+ tokens_per_second = gr.Number(label="Tokens per Second", interactive=False)
213
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)", interactive=False)
214
+
215
+ formatted_responses = gr.Markdown(label="Formatted Responses", elem_id="scrollable-output")
216
+
217
+ demo_btn = gr.Button("Run Inference")
218
+
219
+ load_questions_btn.click(fn=lambda: load_default_questions(), inputs=[], outputs=custom_questions_text)
220
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text, kv_bits], outputs=[time_taken, tokens_per_second, ttft, formatted_responses])
221
+
222
+ if __name__ == "__main__":
223
+ print("Checking if the image path is correct...")
224
+ print("memory_usage.png found" if os.path.exists("memory_usage.png") else "memory_usage.png not found") # inline check; check_image_path() is not defined in this file
225
+ print("Loading model and tokenizer on startup...")
226
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
227
+ print("Model and tokenizer loaded. Starting Gradio interface...")
228
+ app.launch()
backups/app_local_v1.py ADDED
@@ -0,0 +1,375 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import random
5
+ import torch
6
+ import re
7
+ import math
8
+ import gradio as gr
9
+ import numpy as np
10
+ from collections import defaultdict
11
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
12
+
13
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
14
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
15
+
16
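+ # Context manager that records how much peak CUDA memory (in MB) grows while the wrapped block runs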
+ class TorchTracemalloc:
17
+ track_memory_consumption = []
18
+
19
+ def __enter__(self):
20
+ self.begin = torch.cuda.memory_allocated()
21
+ torch.cuda.reset_max_memory_allocated()
22
+ return self
23
+
24
+ def __exit__(self, *exc):
25
+ peak = torch.cuda.max_memory_allocated()
26
+ peaked = (peak - self.begin) // 1024 ** 2
27
+ TorchTracemalloc.track_memory_consumption.append(peaked)
28
+ print(f"Memory consumed: {peaked} MB") # Debugging print
29
+
30
+ def format_response(dialog, response):
31
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
32
+ return {"question": question, "answer": response}
33
+
34
+ # Global variables to store the model and tokenizer
35
+ global_model = None
36
+ global_tokenizer = None
37
+
38
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
39
+ global global_model, global_tokenizer
40
+
41
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
42
+ special_tokens = {"pad_token": "<PAD>"}
43
+ tokenizer.add_special_tokens(special_tokens)
44
+
45
+ config = AutoConfig.from_pretrained(model_name)
46
+ if kv_bits != "unquantized":
47
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
48
+ setattr(config, "quantizer_path", quantizer_path)
49
+
50
+ if dtype == "bf16":
51
+ dtype = torch.bfloat16
52
+ elif dtype == "fp16":
53
+ dtype = torch.float16
54
+ elif dtype == "fp32":
55
+ dtype = torch.float32
56
+
57
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
58
+
59
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
60
+ model.resize_token_embeddings(len(tokenizer))
61
+
62
+ tokenizer.padding_side = "left"
63
+ model.config.pad_token_id = tokenizer.pad_token_id
64
+
65
+ global_model = model
66
+ global_tokenizer = tokenizer
67
+
68
+ def load_questions(prompts_path, custom_questions):
69
+ with open(prompts_path, "r") as file:
70
+ dialogs = json.load(file)
71
+
72
+ selected_dialogs = []
73
+ if custom_questions:
74
+ for question in custom_questions:
75
+ if question.strip():
76
+ custom_dialog = [{"role": "user", "content": question}]
77
+ selected_dialogs.append(custom_dialog)
78
+
79
+ num_questions = 60 - len(selected_dialogs)
80
+ random.shuffle(dialogs)
81
+ selected_dialogs.extend(dialogs[:num_questions])
82
+
83
+ return selected_dialogs[:60]
84
+
85
+ def markdown_to_plain_text(markdown_text):
86
+ # Convert markdown bold (**) to plain text uppercase
87
+ markdown_text = re.sub(r'\*\*(.*?)\*\*', lambda m: m.group(1).upper(), markdown_text)
88
+ # Convert markdown italics (*) to plain text
89
+ markdown_text = re.sub(r'\*(.*?)\*', r'\1', markdown_text)
90
+ # Remove markdown headers (###)
91
+ markdown_text = re.sub(r'### ', '', markdown_text)
92
+ # Convert markdown lists (- or *)
93
+ markdown_text = re.sub(r'^\s*[-*]\s+', '', markdown_text, flags=re.MULTILINE)
94
+ # Remove remaining markdown formatting
95
+ markdown_text = re.sub(r'[`~>]', '', markdown_text)
96
+ return markdown_text
97
+
98
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
99
+ print("Starting inference...")
100
+ global global_model, global_tokenizer
101
+
102
+ model = global_model
103
+ tokenizer = global_tokenizer
104
+
105
+ batch_inputs = [
106
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
107
+ for dialog in dialogs
108
+ ]
109
+
110
+ responses = []
111
+ start_time = time.time()
112
+ batch_size = 60 # Adjust batch size based on GPU capacity
113
+ num_dialogs = len(dialogs)
114
+ # total_time = 0
115
+ # total_tokens = 0
116
+ # total_ttft = 0
117
+ # num_batches = (num_dialogs + batch_size - 1) // batch_size
118
+
119
+ actual_batch_size = min(batch_size, num_dialogs)
120
+ total_time = 0
121
+ total_tokens = 0
122
+ total_ttft = 0
123
+ num_batches = math.ceil(num_dialogs / actual_batch_size)
124
+
125
+ memory_avg = []
126
+ tokens_per_sec_avg = []
127
+ time_to_first_token_avg = []
128
+ responses_by_batch_size = defaultdict(list)
129
+ batch_generation_time = 0
130
+ total_generation_time = 0
131
+
132
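+ # Stop generation at either the tokenizer's EOS token or Llama-3's end-of-turn token <|eot_id|>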
+ terminators = [
133
+ tokenizer.eos_token_id,
134
+ tokenizer.convert_tokens_to_ids("<|eot_id|>"),
135
+ ]
136
+
137
+ with TorchTracemalloc() as tt:
138
+ for i in range(0, num_dialogs, actual_batch_size):
139
+ # for batch_idx in range(num_batches):
140
+ batch = batch_inputs[i : i + actual_batch_size]
141
+ try:
142
+ encoded_inputs = tokenizer(
143
+ batch,
144
+ padding=True,
145
+ truncation=False,
146
+ return_tensors="pt",
147
+ )
148
+
149
+ input_ids = encoded_inputs["input_ids"].to(model.device)
150
+ attention_mask = encoded_inputs["attention_mask"].to(
151
+ model.device
152
+ )
153
+
154
+ torch.cuda.synchronize()
155
+ start_time = time.perf_counter()
156
+
157
+ with torch.no_grad():
158
+ output_tokens = model.generate(
159
+ input_ids,
160
+ attention_mask=attention_mask,
161
+ max_new_tokens=num_new_tokens,
162
+ num_return_sequences=1,
163
+ do_sample=True,
164
+ temperature=temperature,
165
+ pad_token_id=tokenizer.pad_token_id,
166
+ eos_token_id=terminators,
167
+ )
168
+
169
+ torch.cuda.synchronize()
170
+ end_time = time.perf_counter()
171
+
172
+ batch_time = end_time - start_time
173
+ total_time += batch_time
174
+ batch_generation_time += (
175
+ batch_time # Add to batch generation time
176
+ )
177
+ total_generation_time += (
178
+ batch_time # Add to total generation time
179
+ )
180
+ total_tokens += output_tokens.numel()
181
+
182
+ if i == 0:
183
+ total_ttft = batch_time
184
+
185
+ # if batch_idx == 0:
186
+ # total_ttft = batch_time
187
+
188
+ decoded_outputs = tokenizer.batch_decode(
189
+ output_tokens, skip_special_tokens=True
190
+ )
191
+ # decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
192
+
193
+ for j, response in enumerate(decoded_outputs):
194
+ original_dialog = dialogs[i + j]
195
+ formatted_responses = format_response(
196
+ original_dialog, response
197
+ )
198
+ responses.append(formatted_responses)
199
+ # responses_by_batch_size[batch_size].append(
200
+ # formatted_response
201
+ # )
202
+ # Format the responses
203
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
204
+ plain_text_responses = markdown_to_plain_text(formatted_responses)
205
+ yield plain_text_responses
206
+ progress(i, desc="Processing batches")
207
+
208
+ torch.cuda.empty_cache()
209
+
210
+ except Exception as e:
211
+ print(
212
+ f"Error processing batch {i//actual_batch_size + 1}: {str(e)}"
213
+ )
214
+ continue
215
+
216
+
217
+ elapsed_time = total_time
218
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
219
+ # avg_memory_consumption = np.mean(TorchTracemalloc.track_memory_consumption)
220
+ total_memory_consumption = np.sum(TorchTracemalloc.track_memory_consumption)
221
+ avg_memory_consumption = total_memory_consumption/num_dialogs
222
+
223
+ # Use actual_batch_size in calculations
224
+ ttft = (
225
+ total_ttft / actual_batch_size if actual_batch_size > 0 else 0
226
+ )
227
+
228
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
229
+
230
+ yield {
231
+ "Time Taken (seconds)": elapsed_time,
232
+ "Tokens per Second": tokens_per_second,
233
+ "Time to First Token (TTFT, seconds)": ttft,
234
+ # "Formatted Responses": formatted_responses,
235
+ "Formatted Responses": plain_text_responses,
236
+ "Average Memory Consumption per Question (MB)": avg_memory_consumption,
237
+ "Total Memory Consumption (MB)": total_memory_consumption
238
+ }
239
+
240
+ # Demo function
241
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits=1, progress=gr.Progress()):
242
+ custom_questions = custom_questions_text.split("\n")
243
+ print("Loading questions...")
244
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
245
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
246
+
247
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
248
+
249
+ formatted_responses = ""
250
+ for result in result_gen:
251
+ if isinstance(result, str):
252
+ formatted_responses = result
253
+ yield None, None, None, None, None, formatted_responses  # six values to match the outputs wired to demo_btn.click
254
+ else:
255
+ time_taken = result["Time Taken (seconds)"]
256
+ tokens_per_second = result["Tokens per Second"]
257
+ ttft = result["Time to First Token (TTFT, seconds)"]
258
+ avg_memory_consumption = result["Average Memory Consumption per Question (MB)"]
259
+ total_memory_consumption = result["Total Memory Consumption (MB)"]
260
+ formatted_responses = result["Formatted Responses"]
261
+ yield time_taken, tokens_per_second, ttft, avg_memory_consumption, total_memory_consumption, formatted_responses
262
+
263
+ # Load JSON data
264
+ with open("chats_sys_none.json", "r") as file:
265
+ json_data = json.load(file)
266
+
267
+ # Load 50 random questions into the input area by default
268
+ def load_default_questions():
269
+ random.shuffle(json_data)
270
+ default_questions = [dialog[0]['content'] for dialog in json_data[:50] if 'content' in dialog[0]]
271
+ return "\n".join(default_questions)
272
+
273
+ # Load default questions on button click
274
+ def load_questions_action():
275
+ return load_default_questions()
276
+
277
+ # Gradio interface
278
+ css = """
279
+ body, html {
280
+ height: 100vh;
281
+ margin: 0;
282
+ }
283
+
284
+ .gradio-container {
285
+ height: 100vh;
286
+ }
287
+
288
+ #main-row {
289
+ height: 90vh;
290
+ display: flex;
291
+ }
292
+
293
+ #control-panel, #formatted-responses-container {
294
+ height: 90vh;
295
+ box-sizing: border-box;
296
+ display: flex;
297
+ flex-direction: column;
298
+ overflow: hidden;
299
+ flex: 1; /* Ensure equal width */
300
+ }
301
+
302
+ #control-panel {
303
+ flex: 1; /* Ensure equal height */
304
+ }
305
+
306
+ #custom-questions-text {
307
+ flex-grow: 1;
308
+ overflow-y: auto;
309
+ max-height: 30vh; /* Limit height of custom questions text */
310
+ }
311
+
312
+ #metrics-panel {
313
+ display: flex;
314
+ flex-wrap: wrap;
315
+ gap: 1vh;
316
+ margin-bottom: 1vh;
317
+ flex-shrink: 0;
318
+ height: auto; /* Let the panel size adjust based on its content */
319
+ }
320
+
321
+ #metrics-panel .metric {
322
+ flex: 1 1 48%;
323
+ min-width: 10vw;
324
+ box-sizing: border-box;
325
+ }
326
+
327
+ #buttons-container {
328
+ display: flex;
329
+ justify-content: space-between;
330
+ min-height: 6vh; /* Minimum height for buttons container */
331
+ flex-shrink: 0;
332
+ }
333
+ """
334
+
335
+ with gr.Blocks(css=css) as app:
336
+ with gr.Row(elem_id="main-row", equal_height=True):
337
+ with gr.Column(elem_id="control-panel"):
338
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=2048, step=128, value=512)
339
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
340
+ custom_questions_text = gr.Textbox(
341
+ label="Custom Questions",
342
+ placeholder="Type your custom questions here, one per line...",
343
+ autoscroll=False,
344
+ container=False,
345
+ lines=5,
346
+ elem_id="custom-questions-text"
347
+ )
348
+ with gr.Row(elem_id="metrics-panel"):
349
+ time_taken = gr.Number(label="Time Taken (seconds)", interactive=False, elem_classes=["metric"])
350
+ tokens_per_second = gr.Number(label="Tokens per Second", interactive=False, elem_classes=["metric"])
351
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)", interactive=False, elem_classes=["metric"])
352
+ total_memory_consumption = gr.Number(label="Total Memory Consumption (MB)", interactive=False, elem_classes=["metric"])
353
+ avg_memory_consumption = gr.Number(label="Average Memory Consumption per Question (MB)", interactive=False, elem_classes=["metric"])
354
+ with gr.Row(elem_id="buttons-container"):
355
+ load_questions_btn = gr.Button("Load Default Questions")
356
+ demo_btn = gr.Button("Run Inference", elem_id="run-inference-btn")
357
+
358
+ formatted_responses = gr.Textbox(
359
+ label="Formatted Responses",
360
+ elem_id="formatted-responses",
361
+ value="No responses yet. Run the inference to see results.",
362
+ lines=37,
363
+ container=False,
364
+ autoscroll=False,
365
+ show_copy_button=True
366
+ )
367
+
368
+ load_questions_btn.click(fn=load_questions_action, inputs=[], outputs=custom_questions_text)
369
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text], outputs=[time_taken, tokens_per_second, ttft, avg_memory_consumption, total_memory_consumption, formatted_responses])
370
+
371
+ if __name__ == "__main__":
372
+ print("Loading model and tokenizer on startup...")
373
+ # load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
374
+ print("Model and tokenizer loaded. Starting Gradio interface...")
375
+ app.launch()
backups/app_local_v2.py ADDED
@@ -0,0 +1,191 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+
9
+ # Environment variables
10
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
11
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
12
+
13
+ # Global variables to store the model and tokenizer
14
+ model = None
15
+ tokenizer = None
16
+
17
+ # Load model and tokenizer
18
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
19
+ global model, tokenizer
20
+ if model is None or tokenizer is None:
21
+ print("Loading model and tokenizer...")
22
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ special_tokens = {"pad_token": "<PAD>"}
24
+ tokenizer.add_special_tokens(special_tokens)
25
+
26
+ config = AutoConfig.from_pretrained(model_name)
27
+ if kv_bits != "unquantized":
28
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
29
+ setattr(config, "quantizer_path", quantizer_path)
30
+
31
+ if dtype == "bf16":
32
+ dtype = torch.bfloat16
33
+ elif dtype == "fp16":
34
+ dtype = torch.float16
35
+ elif dtype == "fp32":
36
+ dtype = torch.float32
37
+
38
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
39
+
40
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
41
+ model.resize_token_embeddings(len(tokenizer))
42
+
43
+ tokenizer.padding_side = "left"
44
+ model.config.pad_token_id = tokenizer.pad_token_id
45
+
46
+ return model, tokenizer
47
+
48
+ # Format response
49
+ def format_response(dialog, response):
50
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
51
+ answer = response.split("assistant")[-1].strip()
52
+ return {"question": question, "answer": answer}
53
+
54
+ # Load questions
55
+ def load_questions(prompts_path, custom_questions):
56
+ with open(prompts_path, "r") as file:
57
+ dialogs = json.load(file)
58
+
59
+ selected_dialogs = []
60
+
61
+ if custom_questions:
62
+ for question in custom_questions:
63
+ if question.strip():
64
+ custom_dialog = [{"role": "user", "content": question}]
65
+ selected_dialogs.append(custom_dialog)
66
+
67
+ num_questions = 30 - len(selected_dialogs)
68
+ random.shuffle(dialogs)
69
+ selected_dialogs.extend(dialogs[:num_questions])
70
+
71
+ return selected_dialogs[:30]
72
+
73
+ # Inference
74
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
75
+ print("Starting inference...")
76
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
77
+ batch_inputs = [
78
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
79
+ for dialog in dialogs
80
+ ]
81
+
82
+ responses = []
83
+ start_time = time.time()
84
+
85
+ batch_size = 30 # Set batch size for processing, this can be adjusted
86
+ num_dialogs = len(dialogs)
87
+ total_time = 0
88
+ total_tokens = 0
89
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
90
+
91
+ for batch_idx in range(num_batches):
92
+ start_idx = batch_idx * batch_size
93
+ end_idx = min(start_idx + batch_size, num_dialogs)
94
+ batch = batch_inputs[start_idx:end_idx]
95
+
96
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
97
+ input_ids = encoded_inputs["input_ids"].to(model.device)
98
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
99
+
100
+ with torch.no_grad():
101
+ torch.cuda.synchronize()
102
+ batch_start_time = time.perf_counter()
103
+
104
+ output_tokens = model.generate(
105
+ input_ids,
106
+ attention_mask=attention_mask,
107
+ max_new_tokens=num_new_tokens,
108
+ do_sample=True,
109
+ temperature=temperature,
110
+ pad_token_id=tokenizer.pad_token_id,
111
+ eos_token_id=tokenizer.eos_token_id
112
+ )
113
+
114
+ torch.cuda.synchronize()
115
+ batch_end_time = time.perf_counter()
116
+
117
+ batch_time = batch_end_time - batch_start_time
118
+ total_time += batch_time
119
+ total_tokens += output_tokens.numel()
120
+
121
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
122
+
123
+ for i, response in enumerate(decoded_outputs):
124
+ original_dialog = dialogs[start_idx + i]
125
+ formatted_response = format_response(original_dialog, response)
126
+ responses.append(formatted_response)
127
+
128
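+ # Stream the running metrics and the most recently formatted answer back to the Gradio UI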
+ yield {
129
+ "Time Taken (seconds)": time.time() - start_time,
130
+ "Tokens per Second": total_tokens / total_time if total_time > 0 else 0,
131
+ "Formatted Responses": f"**Question**: {formatted_response['question']}\n\n**Answer**: {formatted_response['answer']}\n\n---\n\n"
132
+ }
133
+ progress((batch_idx + 1) / num_batches, desc="Processing batches")
134
+
135
+ elapsed_time = time.time() - start_time
136
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
137
+
138
+ # Demo function
139
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits, progress=gr.Progress()):
140
+ custom_questions = custom_questions_text.split("\n")
141
+ print("Loading questions...")
142
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
143
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
144
+
145
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
146
+
147
+ time_taken, tokens_per_second, formatted_responses = None, None, ""
148
+
149
+ for result in result_gen:
150
+ time_taken = result["Time Taken (seconds)"]
151
+ tokens_per_second = result["Tokens per Second"]
152
+ formatted_responses += result["Formatted Responses"]
153
+ yield time_taken, tokens_per_second, formatted_responses
154
+
155
+ # Load JSON data
156
+ with open("chats_sys_none.json", "r") as file:
157
+ json_data = json.load(file)
158
+ json_data_str = json.dumps(json_data, indent=2)
159
+
160
+ # Show JSON function
161
+ def show_json():
162
+ return json_data_str
163
+
164
+ # Gradio interface
165
+ app = gr.Blocks()
166
+
167
+ with app:
168
+ with gr.Tab("LLM Inference Demo"):
169
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
170
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
171
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5)
172
+ kv_bits = gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
173
+
174
+ time_taken = gr.Number(label="Time Taken (seconds)")
175
+ tokens_per_second = gr.Number(label="Tokens per Second")
176
+ formatted_responses = gr.Markdown(label="Formatted Responses")
177
+
178
+ demo_btn = gr.Button("Run Inference")
179
+
180
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text, kv_bits], outputs=[time_taken, tokens_per_second, formatted_responses])
181
+
182
+ with gr.Tab("Show JSON"):
183
+ json_output = gr.HTML("<pre>{}</pre>".format(json_data_str))
184
+ json_interface = gr.Interface(fn=show_json, inputs=[], outputs=[json_output], live=False)
185
+ json_interface.render()
186
+
187
+ if __name__ == "__main__":
188
+ print("Loading model and tokenizer on startup...")
189
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
190
+ print("Model and tokenizer loaded. Starting Gradio interface...")
191
+ app.queue(default_concurrency_limit=5).launch()
backups/app_local_v3.py ADDED
@@ -0,0 +1,211 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+
9
+ # Environment variables
10
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
11
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
12
+
13
+ # Global variables to store the model and tokenizer
14
+ model = None
15
+ tokenizer = None
16
+
17
+ # Load model and tokenizer
18
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
19
+ global model, tokenizer
20
+ if model is None or tokenizer is None:
21
+ print("Loading model and tokenizer...")
22
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ special_tokens = {"pad_token": "<PAD>"}
24
+ tokenizer.add_special_tokens(special_tokens)
25
+
26
+ config = AutoConfig.from_pretrained(model_name)
27
+ if kv_bits != "unquantized":
28
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
29
+ setattr(config, "quantizer_path", quantizer_path)
30
+
31
+ if dtype == "bf16":
32
+ dtype = torch.bfloat16
33
+ elif dtype == "fp16":
34
+ dtype = torch.float16
35
+ elif dtype == "fp32":
36
+ dtype = torch.float32
37
+
38
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
39
+
40
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
41
+ model.resize_token_embeddings(len(tokenizer))
42
+
43
+ tokenizer.padding_side = "left"
44
+ model.config.pad_token_id = tokenizer.pad_token_id
45
+
46
+ return model, tokenizer
47
+
48
+ # Format response
49
+ def format_response(dialog, response):
50
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
51
+ answer = response.split("assistant")[-1].strip()
52
+ return {"question": question, "answer": answer}
53
+
54
+ # Load questions
55
+ def load_questions(prompts_path, custom_questions):
56
+ with open(prompts_path, "r") as file:
57
+ dialogs = json.load(file)
58
+
59
+ selected_dialogs = []
60
+
61
+ if custom_questions:
62
+ for question in custom_questions:
63
+ if question.strip():
64
+ custom_dialog = [{"role": "user", "content": question}]
65
+ selected_dialogs.append(custom_dialog)
66
+
67
+ num_questions = 60 - len(selected_dialogs)
68
+ random.shuffle(dialogs)
69
+ selected_dialogs.extend(dialogs[:num_questions])
70
+
71
+ return selected_dialogs[:60]
72
+
73
+ # Inference
74
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
75
+ print("Starting inference...")
76
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
77
+ batch_inputs = [
78
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
79
+ for dialog in dialogs
80
+ ]
81
+
82
+ responses = []
83
+ start_time = time.time()
84
+
85
+ batch_size = 30 # Set batch size for processing, this can be adjusted
86
+ num_dialogs = len(dialogs)
87
+ total_time = 0
88
+ total_tokens = 0
89
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
90
+
91
+ for batch_idx in range(num_batches):
92
+ start_idx = batch_idx * batch_size
93
+ end_idx = min(start_idx + batch_size, num_dialogs)
94
+ batch = batch_inputs[start_idx:end_idx]
95
+
96
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
97
+ input_ids = encoded_inputs["input_ids"].to(model.device)
98
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
99
+
100
+ with torch.no_grad():
101
+ torch.cuda.synchronize()
102
+ batch_start_time = time.perf_counter()
103
+
104
+ # Generate responses and measure time to first token
105
+ output_tokens = model.generate(
106
+ input_ids,
107
+ attention_mask=attention_mask,
108
+ max_new_tokens=num_new_tokens,
109
+ do_sample=True,
110
+ temperature=temperature,
111
+ pad_token_id=tokenizer.pad_token_id,
112
+ eos_token_id=tokenizer.eos_token_id
113
+ )
114
+
115
+ torch.cuda.synchronize()
116
+ batch_end_time = time.perf_counter()
117
+
118
+ batch_time = batch_end_time - batch_start_time
119
+ total_time += batch_time
120
+ total_tokens += output_tokens.numel()
121
+
122
+ # Calculate TTFT
123
+ if batch_idx == 0:
124
+ ttft = batch_time / input_ids.size(0) # Approximate TTFT: first-batch generation time averaged per sequence, not a true first-token latency
125
+
126
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
127
+
128
+ for i, response in enumerate(decoded_outputs):
129
+ original_dialog = dialogs[start_idx + i]
130
+ formatted_response = format_response(original_dialog, response)
131
+ responses.append(formatted_response)
132
+
133
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
134
+ yield formatted_responses
135
+ progress((batch_idx + 1) / num_batches, desc="Processing batches")
136
+
137
+ elapsed_time = time.time() - start_time
138
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
139
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
140
+
141
+ yield {
142
+ "Time Taken (seconds)": elapsed_time,
143
+ "Tokens per Second": tokens_per_second,
144
+ "Time to First Token (TTFT, seconds)": ttft,
145
+ "Formatted Responses": formatted_responses
146
+ }
147
+
148
+ # Demo function
149
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits, progress=gr.Progress()):
150
+ custom_questions = custom_questions_text.split("\n")
151
+ print("Loading questions...")
152
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
153
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
154
+
155
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
156
+
157
+ formatted_responses = ""
158
+ for result in result_gen:
159
+ if isinstance(result, str):
160
+ formatted_responses = result
161
+ yield None, None, None, formatted_responses
162
+ else:
163
+ time_taken = result["Time Taken (seconds)"]
164
+ tokens_per_second = result["Tokens per Second"]
165
+ ttft = result["Time to First Token (TTFT, seconds)"]
166
+ formatted_responses = result["Formatted Responses"]
167
+ yield time_taken, tokens_per_second, ttft, formatted_responses
168
+
169
+ # Load JSON data
170
+ with open("chats_sys_none.json", "r") as file:
171
+ json_data = json.load(file)
172
+ json_data_str = json.dumps(json_data, indent=2)
173
+
174
+ # Show JSON function
175
+ def show_json():
176
+ return json_data_str
177
+
178
+ # Gradio interface
179
+ app = gr.Blocks()
180
+
181
+ with app:
182
+ with gr.Tab("LLM Inference Demo"):
183
+ with gr.Row():
184
+ with gr.Column():
185
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
186
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
187
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5)
188
+ kv_bits = gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
189
+
190
+ with gr.Column():
191
+ time_taken = gr.Number(label="Time Taken (seconds)")
192
+ tokens_per_second = gr.Number(label="Tokens per Second")
193
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)")
194
+
195
+ with gr.Row():
196
+ formatted_responses = gr.Markdown(label="Formatted Responses")
197
+
198
+ demo_btn = gr.Button("Run Inference")
199
+
200
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text, kv_bits], outputs=[time_taken, tokens_per_second, ttft, formatted_responses])
201
+
202
+ with gr.Tab("Show JSON"):
203
+ json_output = gr.HTML("<pre>{}</pre>".format(json_data_str))
204
+ json_interface = gr.Interface(fn=show_json, inputs=[], outputs=[json_output], live=False)
205
+ json_interface.render()
206
+
207
+ if __name__ == "__main__":
208
+ print("Loading model and tokenizer on startup...")
209
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
210
+ print("Model and tokenizer loaded. Starting Gradio interface...")
211
+ app.queue(default_concurrency_limit=5).launch()
backups/app_local_v4-1.py ADDED
@@ -0,0 +1,234 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+
9
+ # Environment variables
10
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
11
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
12
+
13
+ # Global variables to store the model and tokenizer
14
+ model = None
15
+ tokenizer = None
16
+
17
+ # Load model and tokenizer
18
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
19
+ global model, tokenizer
20
+ if model is None or tokenizer is None:
21
+ print("Loading model and tokenizer...")
22
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ special_tokens = {"pad_token": "<PAD>"}
24
+ tokenizer.add_special_tokens(special_tokens)
25
+
26
+ config = AutoConfig.from_pretrained(model_name)
27
+ if kv_bits != "unquantized":
28
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
29
+ setattr(config, "quantizer_path", quantizer_path)
30
+
31
+ if dtype == "bf16":
32
+ dtype = torch.bfloat16
33
+ elif dtype == "fp16":
34
+ dtype = torch.float16
35
+ elif dtype == "fp32":
36
+ dtype = torch.float32
37
+
38
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
39
+
40
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
41
+ model.resize_token_embeddings(len(tokenizer))
42
+
43
+ tokenizer.padding_side = "left"
44
+ model.config.pad_token_id = tokenizer.pad_token_id
45
+
46
+ return model, tokenizer
47
+
48
+ # Format response
49
+ def format_response(dialog, response):
50
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
51
+ answer = response.split("assistant")[-1].strip()
52
+ return {"question": question, "answer": answer}
53
+
54
+ # Load questions
55
+ def load_questions(prompts_path, custom_questions):
56
+ with open(prompts_path, "r") as file:
57
+ dialogs = json.load(file)
58
+
59
+ selected_dialogs = []
60
+
61
+ if custom_questions:
62
+ for question in custom_questions:
63
+ if question.strip():
64
+ custom_dialog = [{"role": "user", "content": question}]
65
+ selected_dialogs.append(custom_dialog)
66
+
67
+ num_questions = 60 - len(selected_dialogs)
68
+ random.shuffle(dialogs)
69
+ selected_dialogs.extend(dialogs[:num_questions])
70
+
71
+ return selected_dialogs[:60]
72
+
73
+ # Inference
74
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
75
+ print("Starting inference...")
76
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
77
+ batch_inputs = [
78
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
79
+ for dialog in dialogs
80
+ ]
81
+
82
+ responses = []
83
+ start_time = time.time()
84
+
85
+ batch_size = 60 # Set batch size for processing, this can be adjusted
86
+ num_dialogs = len(dialogs)
87
+ total_time = 0
88
+ total_tokens = 0
89
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
90
+
91
+ for batch_idx in range(num_batches):
92
+ start_idx = batch_idx * batch_size
93
+ end_idx = min(start_idx + batch_size, num_dialogs)
94
+ batch = batch_inputs[start_idx:end_idx]
95
+
96
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
97
+ input_ids = encoded_inputs["input_ids"].to(model.device)
98
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
99
+
100
+ with torch.no_grad():
101
+ torch.cuda.synchronize()
102
+ batch_start_time = time.perf_counter()
103
+
104
+ # Generate responses and measure time to first token
105
+ output_tokens = model.generate(
106
+ input_ids,
107
+ attention_mask=attention_mask,
108
+ max_new_tokens=num_new_tokens,
109
+ do_sample=True,
110
+ temperature=temperature,
111
+ pad_token_id=tokenizer.pad_token_id,
112
+ eos_token_id=tokenizer.eos_token_id
113
+ )
114
+
115
+ torch.cuda.synchronize()
116
+ batch_end_time = time.perf_counter()
117
+
118
+ batch_time = batch_end_time - batch_start_time
119
+ total_time += batch_time
120
+ total_tokens += output_tokens.numel()
121
+
122
+ # Calculate TTFT
123
+ if batch_idx == 0:
124
+ ttft = batch_time / input_ids.size(0) # Time to first token for the first batch
125
+
126
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
127
+
128
+ for i, response in enumerate(decoded_outputs):
129
+ original_dialog = dialogs[start_idx + i]
130
+ formatted_response = format_response(original_dialog, response)
131
+ responses.append(formatted_response)
132
+
133
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
134
+ yield formatted_responses
135
+ progress((batch_idx + 1) / num_batches, desc="Processing batches")
136
+
137
+ elapsed_time = time.time() - start_time
138
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
139
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
140
+
141
+ yield {
142
+ "Time Taken (seconds)": elapsed_time,
143
+ "Tokens per Second": tokens_per_second,
144
+ "Time to First Token (TTFT, seconds)": ttft,
145
+ "Formatted Responses": formatted_responses
146
+ }
147
+
148
+ # Demo function
149
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits=1, progress=gr.Progress()):
150
+ custom_questions = custom_questions_text.split("\n")
151
+ print("Loading questions...")
152
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
153
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
154
+
155
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
156
+
157
+ formatted_responses = ""
158
+ for result in result_gen:
159
+ if isinstance(result, str):
160
+ formatted_responses = result
161
+ yield None, None, None, formatted_responses
162
+ else:
163
+ time_taken = result["Time Taken (seconds)"]
164
+ tokens_per_second = result["Tokens per Second"]
165
+ ttft = result["Time to First Token (TTFT, seconds)"]
166
+ formatted_responses = result["Formatted Responses"]
167
+ yield time_taken, tokens_per_second, ttft, formatted_responses
168
+
169
+ # Load JSON data
170
+ with open("chats_sys_none.json", "r") as file:
171
+ json_data = json.load(file)
172
+
173
+ # Load 50 random questions into the input area by default
174
+ def load_default_questions():
175
+ random.shuffle(json_data)
176
+ default_questions = [dialog[0]['content'] for dialog in json_data[:50] if 'content' in dialog[0]]
177
+ return "\n".join(default_questions)
178
+
179
+ # Load default questions on button click
180
+ def load_questions_action():
181
+ return load_default_questions()
182
+
183
+ # Gradio interface
184
+ css = """
185
+ body, html {
186
+ height: 100vh;
187
+ margin: 0;
188
+ }
189
+
190
+ .gradio-container {
191
+ height: 100vh;
192
+ }
193
+
194
+ #main-row {
195
+ height: 100%;
196
+ }
197
+
198
+ #control-panel, #formatted-responses-container {
199
+ height: 100%;
200
+ box-sizing: border-box;
201
+ }
202
+
203
+ #custom-questions-text, #formatted-responses {
204
+ flex-grow: 1;
205
+ overflow-y: auto;
206
+ border: 1px solid #ccc;
207
+ }
208
+ """
209
+
210
+ with gr.Blocks(css=css) as app:
211
+ with gr.Row(elem_id="main-row", equal_height=True):
212
+ with gr.Column(elem_id="control-panel", scale=1):
213
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
214
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
215
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=22, elem_id="custom-questions-text")
216
+ with gr.Row(elem_id="metrics-panel"):
217
+ time_taken = gr.Number(label="Time Taken (seconds)", interactive=False, elem_classes=["metric"])
218
+ tokens_per_second = gr.Number(label="Tokens per Second", interactive=False, elem_classes=["metric"])
219
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)", interactive=False, elem_classes=["metric"])
220
+ with gr.Row(elem_id="buttons-container"):
221
+ load_questions_btn = gr.Button("Load Default Questions")
222
+ demo_btn = gr.Button("Run Inference", elem_id="run-inference-btn")
223
+
224
+ # with gr.Column(elem_id="formatted-responses-container", scale=1):
225
+ formatted_responses = gr.Textbox(label="Formatted Responses", elem_id="formatted-responses", value="No responses yet. Run the inference to see results.", lines=35, autoscroll=False, show_copy_button=True)
226
+
227
+ load_questions_btn.click(fn=load_questions_action, inputs=[], outputs=custom_questions_text)
228
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text], outputs=[time_taken, tokens_per_second, ttft, formatted_responses])
229
+
230
+ if __name__ == "__main__":
231
+ print("Loading model and tokenizer on startup...")
232
+ # load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
233
+ print("Model and tokenizer loaded. Starting Gradio interface...")
234
+ app.launch()
backups/app_local_with_graph.py ADDED
@@ -0,0 +1,235 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+ from PIL import Image
9
+
10
+ # Environment variables
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
12
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
13
+
14
+
15
+ # Global variables to store the model and tokenizer
16
+ model = None
17
+ tokenizer = None
18
+
19
+ # Load model and tokenizer
20
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
21
+ global model, tokenizer
22
+ if model is None or tokenizer is None:
23
+ print("Loading model and tokenizer...")
24
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
25
+ special_tokens = {"pad_token": "<PAD>"}
26
+ tokenizer.add_special_tokens(special_tokens)
27
+
28
+ config = AutoConfig.from_pretrained(model_name)
29
+ if kv_bits != "unquantized":
30
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
31
+ setattr(config, "quantizer_path", quantizer_path)
32
+
33
+ if dtype == "bf16":
34
+ dtype = torch.bfloat16
35
+ elif dtype == "fp16":
36
+ dtype = torch.float16
37
+ elif dtype == "fp32":
38
+ dtype = torch.float32
39
+
40
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
41
+
42
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
43
+ model.resize_token_embeddings(len(tokenizer))
44
+
45
+ tokenizer.padding_side = "left"
46
+ model.config.pad_token_id = tokenizer.pad_token_id
47
+
48
+ return model, tokenizer
49
+
50
+ # Format response
51
+ def format_response(dialog, response):
52
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
53
+ answer = response.split("assistant")[-1].strip()
54
+ return {"question": question, "answer": answer}
55
+
56
+ # Load questions
57
+ def load_questions(prompts_path, custom_questions):
58
+ with open(prompts_path, "r") as file:
59
+ dialogs = json.load(file)
60
+
61
+ selected_dialogs = []
62
+
63
+ if custom_questions:
64
+ for question in custom_questions:
65
+ if question.strip():
66
+ custom_dialog = [{"role": "user", "content": question}]
67
+ selected_dialogs.append(custom_dialog)
68
+
69
+ num_questions = 60 - len(selected_dialogs)
70
+ random.shuffle(dialogs)
71
+ selected_dialogs.extend(dialogs[:num_questions])
72
+
73
+ return selected_dialogs[:60]
74
+
75
+ # Inference
76
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
77
+ print("Starting inference...")
78
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
79
+ batch_inputs = [
80
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
81
+ for dialog in dialogs
82
+ ]
83
+
84
+ responses = []
85
+ start_time = time.time()
86
+
87
+ batch_size = 30 # Set batch size for processing, this can be adjusted
88
+ num_dialogs = len(dialogs)
89
+ total_time = 0
90
+ total_tokens = 0
91
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
92
+
93
+ for batch_idx in range(num_batches):
94
+ start_idx = batch_idx * batch_size
95
+ end_idx = min(start_idx + batch_size, num_dialogs)
96
+ batch = batch_inputs[start_idx:end_idx]
97
+
98
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
99
+ input_ids = encoded_inputs["input_ids"].to(model.device)
100
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
101
+
102
+ with torch.no_grad():
103
+ torch.cuda.synchronize()
104
+ batch_start_time = time.perf_counter()
105
+
106
+ # Generate responses and measure time to first token
107
+ output_tokens = model.generate(
108
+ input_ids,
109
+ attention_mask=attention_mask,
110
+ max_new_tokens=num_new_tokens,
111
+ do_sample=True,
112
+ temperature=temperature,
113
+ pad_token_id=tokenizer.pad_token_id,
114
+ eos_token_id=tokenizer.eos_token_id
115
+ )
116
+
117
+ torch.cuda.synchronize()
118
+ batch_end_time = time.perf_counter()
119
+
120
+ batch_time = batch_end_time - batch_start_time
121
+ total_time += batch_time
122
+ total_tokens += output_tokens.numel()
123
+
124
+ # Calculate TTFT
125
+ if batch_idx == 0:
126
+ ttft = batch_time / input_ids.size(0) # Time to first token for the first batch
127
+
128
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
129
+
130
+ for i, response in enumerate(decoded_outputs):
131
+ original_dialog = dialogs[start_idx + i]
132
+ formatted_response = format_response(original_dialog, response)
133
+ responses.append(formatted_response)
134
+
135
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
136
+ yield formatted_responses
137
+ progress((batch_idx + 1) / num_batches, desc="Processing batches")
138
+
139
+ elapsed_time = time.time() - start_time
140
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
141
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
142
+
143
+ yield {
144
+ "Time Taken (seconds)": elapsed_time,
145
+ "Tokens per Second": tokens_per_second,
146
+ "Time to First Token (TTFT, seconds)": ttft,
147
+ "Formatted Responses": formatted_responses
148
+ }
149
+
150
+ # Demo function
151
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits, progress=gr.Progress()):
152
+ custom_questions = custom_questions_text.split("\n")
153
+ print("Loading questions...")
154
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
155
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
156
+
157
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
158
+
159
+ formatted_responses = ""
160
+ for result in result_gen:
161
+ if isinstance(result, str):
162
+ formatted_responses = result
163
+ yield None, None, None, formatted_responses
164
+ else:
165
+ time_taken = result["Time Taken (seconds)"]
166
+ tokens_per_second = result["Tokens per Second"]
167
+ ttft = result["Time to First Token (TTFT, seconds)"]
168
+ formatted_responses = result["Formatted Responses"]
169
+ yield time_taken, tokens_per_second, ttft, formatted_responses
170
+
171
+ # Load JSON data
172
+ with open("chats_sys_none.json", "r") as file:
173
+ json_data = json.load(file)
174
+ json_data_str = json.dumps(json_data, indent=2)
175
+
176
+ # Show JSON function
177
+ def show_json():
178
+ return json_data_str
179
+
180
+ # Debug function to check image path
181
+ def check_image_path(image_path):
182
+ if os.path.exists(image_path):
183
+ print(f"Image found at {image_path}")
184
+ return True
185
+ else:
186
+ print(f"Image not found at {image_path}")
187
+ return False
188
+
189
+ # Gradio interface
190
+ app = gr.Blocks(css=".scrollable {height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc;}")
191
+
192
+ with app:
193
+ with gr.Tab("LLM Inference Demo"):
194
+ with gr.Row():
195
+ with gr.Column():
196
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
197
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
198
+ kv_bits = gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
199
+
200
+
201
+ with gr.Column():
202
+ time_taken = gr.Number(label="Time Taken (seconds)")
203
+ tokens_per_second = gr.Number(label="Tokens per Second")
204
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)")
205
+
206
+ with gr.Row():
207
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5)
208
+
209
+ with gr.Row():
210
+ demo_btn = gr.Button("Run Inference")
211
+
212
+ with gr.Row():
213
+ formatted_responses = gr.Markdown(label="Formatted Responses")
214
+
215
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text, kv_bits], outputs=[time_taken, tokens_per_second, ttft, formatted_responses])
216
+
217
+ with gr.Tab("Show JSON"):
218
+ json_output = gr.HTML("<pre>{}</pre>".format(json_data_str))
219
+ json_interface = gr.Interface(fn=show_json, inputs=[], outputs=[json_output], live=False)
220
+ json_interface.render()
221
+
222
+ # with gr.Tab("Image Gallery"):
223
+ # image_path = "memory_usage.png"
224
+ # if check_image_path(image_path): # Debugging the image path
225
+ # gr.Image(value=image_path, label="Memory Usage", type="filepath")
226
+ # else:
227
+ # gr.HTML(f"<p>Image not found at {image_path}</p>")
228
+
229
+ if __name__ == "__main__":
230
+ print("Checking if the image path is correct...")
231
+ check_image_path("memory_usage.png") # Check image path on startup
232
+ print("Loading model and tokenizer on startup...")
233
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
234
+ print("Model and tokenizer loaded. Starting Gradio interface...")
235
+ app.queue(default_concurrency_limit=5).launch()
backups/app_major_backup.py ADDED
@@ -0,0 +1,235 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+ from PIL import Image
9
+
10
+ # Environment variables
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
12
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
13
+
14
+
15
+ # Global variables to store the model and tokenizer
16
+ model = None
17
+ tokenizer = None
18
+
19
+ # Load model and tokenizer
20
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
21
+ global model, tokenizer
22
+ if model is None or tokenizer is None:
23
+ print("Loading model and tokenizer...")
24
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
25
+ special_tokens = {"pad_token": "<PAD>"}
26
+ tokenizer.add_special_tokens(special_tokens)
27
+
28
+ config = AutoConfig.from_pretrained(model_name)
29
+ if kv_bits != "unquantized":
30
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
31
+ setattr(config, "quantizer_path", quantizer_path)
32
+
33
+ if dtype == "bf16":
34
+ dtype = torch.bfloat16
35
+ elif dtype == "fp16":
36
+ dtype = torch.float16
37
+ elif dtype == "fp32":
38
+ dtype = torch.float32
39
+
40
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
41
+
42
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
43
+ model.resize_token_embeddings(len(tokenizer))
44
+
45
+ tokenizer.padding_side = "left"
46
+ model.config.pad_token_id = tokenizer.pad_token_id
47
+
48
+ return model, tokenizer
49
+
50
+ # Format response
51
+ def format_response(dialog, response):
52
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
53
+ answer = response.split("assistant")[-1].strip()
54
+ return {"question": question, "answer": answer}
55
+
56
+ # Load questions
57
+ def load_questions(prompts_path, custom_questions):
58
+ with open(prompts_path, "r") as file:
59
+ dialogs = json.load(file)
60
+
61
+ selected_dialogs = []
62
+
63
+ if custom_questions:
64
+ for question in custom_questions:
65
+ if question.strip():
66
+ custom_dialog = [{"role": "user", "content": question}]
67
+ selected_dialogs.append(custom_dialog)
68
+
69
+ num_questions = 60 - len(selected_dialogs)
70
+ random.shuffle(dialogs)
71
+ selected_dialogs.extend(dialogs[:num_questions])
72
+
73
+ return selected_dialogs[:60]
74
+
75
+ # Inference
76
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
77
+ print("Starting inference...")
78
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
79
+ batch_inputs = [
80
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
81
+ for dialog in dialogs
82
+ ]
83
+
84
+ responses = []
85
+ start_time = time.time()
86
+
87
+ batch_size = 30 # Set batch size for processing, this can be adjusted
88
+ num_dialogs = len(dialogs)
89
+ total_time = 0
90
+ total_tokens = 0
91
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
92
+
93
+ for batch_idx in range(num_batches):
94
+ start_idx = batch_idx * batch_size
95
+ end_idx = min(start_idx + batch_size, num_dialogs)
96
+ batch = batch_inputs[start_idx:end_idx]
97
+
98
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
99
+ input_ids = encoded_inputs["input_ids"].to(model.device)
100
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
101
+
102
+ with torch.no_grad():
103
+ torch.cuda.synchronize()
104
+ batch_start_time = time.perf_counter()
105
+
106
+ # Generate responses and measure time to first token
107
+ output_tokens = model.generate(
108
+ input_ids,
109
+ attention_mask=attention_mask,
110
+ max_new_tokens=num_new_tokens,
111
+ do_sample=True,
112
+ temperature=temperature,
113
+ pad_token_id=tokenizer.pad_token_id,
114
+ eos_token_id=tokenizer.eos_token_id
115
+ )
116
+
117
+ torch.cuda.synchronize()
118
+ batch_end_time = time.perf_counter()
119
+
120
+ batch_time = batch_end_time - batch_start_time
121
+ total_time += batch_time
122
+ total_tokens += output_tokens.numel()
123
+
124
+ # Calculate TTFT
125
+ if batch_idx == 0:
126
+ ttft = batch_time / input_ids.size(0) # Approximate TTFT: total generation time of the first batch, averaged per sequence
127
+
128
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
129
+
130
+ for i, response in enumerate(decoded_outputs):
131
+ original_dialog = dialogs[start_idx + i]
132
+ formatted_response = format_response(original_dialog, response)
133
+ responses.append(formatted_response)
134
+
135
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
136
+ yield formatted_responses
137
+ progress((batch_idx + 1) / num_batches, desc="Processing batches")
138
+
139
+ elapsed_time = time.time() - start_time
140
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
141
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
142
+
143
+ yield {
144
+ "Time Taken (seconds)": elapsed_time,
145
+ "Tokens per Second": tokens_per_second,
146
+ "Time to First Token (TTFT, seconds)": ttft,
147
+ "Formatted Responses": formatted_responses
148
+ }
149
+
150
+ # Demo function
151
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits, progress=gr.Progress()):
152
+ custom_questions = custom_questions_text.split("\n")
153
+ print("Loading questions...")
154
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
155
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
156
+
157
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
158
+
159
+ formatted_responses = ""
160
+ for result in result_gen:
161
+ if isinstance(result, str):
162
+ formatted_responses = result
163
+ yield None, None, None, formatted_responses
164
+ else:
165
+ time_taken = result["Time Taken (seconds)"]
166
+ tokens_per_second = result["Tokens per Second"]
167
+ ttft = result["Time to First Token (TTFT, seconds)"]
168
+ formatted_responses = result["Formatted Responses"]
169
+ yield time_taken, tokens_per_second, ttft, formatted_responses
170
+
171
+ # Load JSON data
172
+ with open("chats_sys_none.json", "r") as file:
173
+ json_data = json.load(file)
174
+ json_data_str = json.dumps(json_data, indent=2)
175
+
176
+ # Show JSON function
177
+ def show_json():
178
+ return json_data_str
179
+
180
+ # Debug function to check image path
181
+ def check_image_path(image_path):
182
+ if os.path.exists(image_path):
183
+ print(f"Image found at {image_path}")
184
+ return True
185
+ else:
186
+ print(f"Image not found at {image_path}")
187
+ return False
188
+
189
+ # Gradio interface
190
+ app = gr.Blocks(css=".scrollable {height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc;}")
191
+
192
+ with app:
193
+ with gr.Tab("LLM Inference Demo"):
194
+ with gr.Row():
195
+ with gr.Column():
196
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
197
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
198
+ kv_bits = gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
199
+
200
+
201
+ with gr.Column():
202
+ time_taken = gr.Number(label="Time Taken (seconds)")
203
+ tokens_per_second = gr.Number(label="Tokens per Second")
204
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)")
205
+
206
+ with gr.Row():
207
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5)
208
+
209
+ with gr.Row():
210
+ demo_btn = gr.Button("Run Inference")
211
+
212
+ with gr.Row():
213
+ formatted_responses = gr.Markdown(label="Formatted Responses")
214
+
215
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text, kv_bits], outputs=[time_taken, tokens_per_second, ttft, formatted_responses])
216
+
217
+ with gr.Tab("Show JSON"):
218
+ json_output = gr.HTML("<pre>{}</pre>".format(json_data_str))
219
+ json_interface = gr.Interface(fn=show_json, inputs=[], outputs=[json_output], live=False)
220
+ json_interface.render()
221
+
222
+ # with gr.Tab("Image Gallery"):
223
+ # image_path = "memory_usage.png"
224
+ # if check_image_path(image_path): # Debugging the image path
225
+ # gr.Image(value=image_path, label="Memory Usage", type="filepath")
226
+ # else:
227
+ # gr.HTML(f"<p>Image not found at {image_path}</p>")
228
+
229
+ if __name__ == "__main__":
230
+ print("Checking if the image path is correct...")
231
+ check_image_path("memory_usage.png") # Check image path on startup
232
+ print("Loading model and tokenizer on startup...")
233
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
234
+ print("Model and tokenizer loaded. Starting Gradio interface...")
235
+ app.queue(default_concurrency_limit=5).launch()
backups/app_pic.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ import gradio as gr
3
+
4
+ # Function to print the current working directory
5
+ def print_current_directory():
6
+ current_directory = os.getcwd()
7
+ print(f"Current working directory: {current_directory}")
8
+
9
+ # Debug function to check image path
10
+ def check_image_path(image_path):
11
+ if os.path.exists(image_path):
12
+ print(f"Image found at {image_path}")
13
+ return True
14
+ else:
15
+ print(f"Image not found at {image_path}")
16
+ return False
17
+
18
+ # Correct path to the image (adjust if necessary)
19
+ image_path = "memory_usage.png"
20
+
21
+ # Use an absolute path for the image
22
+ absolute_image_path = os.path.abspath(image_path)
23
+
24
+ # Gradio interface
25
+ app = gr.Blocks(css=".scrollable {height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc;}")
26
+
27
+ with app:
28
+ with gr.Tab("Image Gallery"):
29
+ if check_image_path(absolute_image_path):
30
+ gr.Image(value=absolute_image_path, label="Memory Usage", type="filepath")
31
+ else:
32
+ gr.HTML(f"<p>Image not found at {absolute_image_path}</p>")
33
+
34
+ if __name__ == "__main__":
35
+ print("Checking the current working directory...")
36
+ print_current_directory() # Print the current working directory on startup
37
+ print("Checking if the image path is correct...")
38
+ check_image_path(absolute_image_path) # Check image path on startup
39
+ print("Starting Gradio interface...")
40
+ app.launch()
backups/app_unquantized_backup.py ADDED
@@ -0,0 +1,146 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+
8
+ # Environment variables
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
10
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
11
+
12
+ # Load model and tokenizer
13
+ def load_model_and_tokenizer(model_name, dtype):
14
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
15
+ special_tokens = {"pad_token": "<PAD>"}
16
+ tokenizer.add_special_tokens(special_tokens)
17
+
18
+ config = AutoConfig.from_pretrained(model_name)
19
+ if dtype == "bf16":
20
+ dtype = torch.bfloat16
21
+ elif dtype == "fp16":
22
+ dtype = torch.float16
23
+ elif dtype == "fp32":
24
+ dtype = torch.float32
25
+
26
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
27
+
28
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
29
+ model.resize_token_embeddings(len(tokenizer))
30
+
31
+ tokenizer.padding_side = "left"
32
+ model.config.pad_token_id = tokenizer.pad_token_id
33
+
34
+ return model, tokenizer
35
+
36
+ # Format response
37
+ def format_response(dialog, response):
38
+ formatted_dialog = dialog.copy()
39
+ formatted_dialog.append({"role": "assistant", "content": response})
40
+ return formatted_dialog
41
+
42
+ # Load questions
43
+ def load_questions(prompts_path, num_questions, custom_question):
44
+ with open(prompts_path, "r") as file:
45
+ dialogs = json.load(file)
46
+
47
+ if custom_question and custom_question.strip():
48
+ custom_dialog = [{"role": "user", "content": custom_question}]
49
+ dialogs.insert(0, custom_dialog)
50
+
51
+ dialogs = dialogs[:num_questions]
52
+ return dialogs
53
+
54
+ # Inference
55
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype):
56
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype)
57
+ batch_inputs = [
58
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
59
+ for dialog in dialogs
60
+ ]
61
+
62
+ responses = []
63
+ for i in range(len(dialogs)):
64
+ batch = batch_inputs[i:i+1]
65
+
66
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
67
+ input_ids = encoded_inputs["input_ids"].to(model.device)
68
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
69
+
70
+ with torch.no_grad():
71
+ output_tokens = model.generate(
72
+ input_ids,
73
+ attention_mask=attention_mask,
74
+ max_new_tokens=num_new_tokens,
75
+ do_sample=True,
76
+ temperature=temperature,
77
+ pad_token_id=tokenizer.pad_token_id,
78
+ eos_token_id=tokenizer.eos_token_id
79
+ )
80
+
81
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
82
+
83
+ for j, response in enumerate(decoded_outputs):
84
+ original_dialog = dialogs[i + j]
85
+ formatted_response = format_response(original_dialog, response)
86
+ responses.append(formatted_response)
87
+
88
+ torch.cuda.empty_cache()
89
+
90
+ results = {
91
+ "Responses": responses
92
+ }
93
+
94
+ return results
95
+
96
+ # Demo function
97
+ def demo(num_new_tokens, temperature, num_questions, custom_question):
98
+ dialogs = load_questions("chats_sys_none.json", num_questions, custom_question)
99
+ results = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16")
100
+ return results
101
+
102
+ # Load JSON data
103
+ with open("chats_sys_none.json", "r") as file:
104
+ json_data = json.load(file)
105
+ json_data_str = json.dumps(json_data, indent=2)
106
+
107
+ # Show JSON function
108
+ def show_json():
109
+ return json_data_str
110
+
111
+ # Gradio interface
112
+ interface = gr.Interface(
113
+ fn=demo,
114
+ inputs=[
115
+ gr.Slider(label="Number of New Tokens", minimum=1, maximum=1024, step=1, value=512),
116
+ gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4),
117
+ gr.Slider(minimum=20, maximum=100, step=1, label="Number of Questions", value=20),
118
+ gr.Textbox(label="Custom Question", placeholder="Type your custom question here..."),
119
+ ],
120
+ outputs=[
121
+ gr.JSON(label="Responses")
122
+ ],
123
+ title="LLM Inference Demo",
124
+ description="A demo for running LLM inference using Gradio and Hugging Face.",
125
+ live=False
126
+ )
127
+
128
+ json_interface = gr.Interface(
129
+ fn=show_json,
130
+ inputs=[],
131
+ outputs=[
132
+ gr.HTML("<pre>{}</pre>".format(json_data_str))
133
+ ],
134
+ live=False
135
+ )
136
+
137
+ app = gr.Blocks()
138
+
139
+ with app:
140
+ with gr.Tab("LLM Inference Demo"):
141
+ interface.render()
142
+ with gr.Tab("Show JSON"):
143
+ json_interface.render()
144
+
145
+ if __name__ == "__main__":
146
+ app.launch()
backups/app_v0.py ADDED
@@ -0,0 +1,188 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+
8
+ # Environment variables
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
10
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
11
+
12
+ # Global variables to store the model and tokenizer
13
+ model = None
14
+ tokenizer = None
15
+
16
+ # Load model and tokenizer
17
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
18
+ global model, tokenizer
19
+ if model is None or tokenizer is None:
20
+ print("Loading model and tokenizer...")
21
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
22
+ special_tokens = {"pad_token": "<PAD>"}
23
+ tokenizer.add_special_tokens(special_tokens)
24
+
25
+ config = AutoConfig.from_pretrained(model_name)
26
+ if kv_bits != "unquantized":
27
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
28
+ setattr(config, "quantizer_path", quantizer_path)
29
+
30
+ if dtype == "bf16":
31
+ dtype = torch.bfloat16
32
+ elif dtype == "fp16":
33
+ dtype = torch.float16
34
+ elif dtype == "fp32":
35
+ dtype = torch.float32
36
+
37
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
38
+
39
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
40
+ model.resize_token_embeddings(len(tokenizer))
41
+
42
+ tokenizer.padding_side = "left"
43
+ model.config.pad_token_id = tokenizer.pad_token_id
44
+
45
+ return model, tokenizer
46
+
47
+ # Format response
48
+ def format_response(dialog, response):
49
+ formatted_dialog = dialog.copy()
50
+ formatted_dialog.append({"role": "assistant", "content": response})
51
+ return formatted_dialog
52
+
53
+ # Load questions
54
+ def load_questions(prompts_path, num_questions, custom_question):
55
+ with open(prompts_path, "r") as file:
56
+ dialogs = json.load(file)
57
+
58
+ if custom_question and custom_question.strip():
59
+ custom_dialog = [{"role": "user", "content": custom_question}]
60
+ dialogs.insert(0, custom_dialog)
61
+
62
+ dialogs = dialogs[:num_questions]
63
+ return dialogs
64
+
65
+ # Inference
66
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits):
67
+ print("Starting inference...")
68
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
69
+ batch_inputs = [
70
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
71
+ for dialog in dialogs
72
+ ]
73
+
74
+ responses = []
75
+ start_time = time.time()
76
+
77
+ batch_size = 20 # Set batch size for processing, this can be adjusted
78
+ num_dialogs = len(dialogs)
79
+ total_time = 0
80
+ total_tokens = 0
81
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
82
+
83
+ for batch_idx in range(num_batches):
84
+ start_idx = batch_idx * batch_size
85
+ end_idx = min(start_idx + batch_size, num_dialogs)
86
+ batch = batch_inputs[start_idx:end_idx]
87
+
88
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
89
+ input_ids = encoded_inputs["input_ids"].to(model.device)
90
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
91
+
92
+ with torch.no_grad():
93
+ torch.cuda.synchronize()
94
+ batch_start_time = time.perf_counter()
95
+
96
+ output_tokens = model.generate(
97
+ input_ids,
98
+ attention_mask=attention_mask,
99
+ max_new_tokens=num_new_tokens,
100
+ do_sample=True,
101
+ temperature=temperature,
102
+ pad_token_id=tokenizer.pad_token_id,
103
+ eos_token_id=tokenizer.eos_token_id
104
+ )
105
+
106
+ torch.cuda.synchronize()
107
+ batch_end_time = time.perf_counter()
108
+
109
+ batch_time = batch_end_time - batch_start_time
110
+ total_time += batch_time
111
+ total_tokens += output_tokens.numel()
112
+
113
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
114
+
115
+ for i, response in enumerate(decoded_outputs):
116
+ original_dialog = dialogs[start_idx + i]
117
+ formatted_response = format_response(original_dialog, response)
118
+ responses.append(formatted_response)
119
+
120
+ elapsed_time = time.time() - start_time
121
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
122
+
123
+ results = {
124
+ "Responses": responses,
125
+ "Time Taken (seconds)": elapsed_time,
126
+ "Tokens per Second": total_tokens / total_time if total_time > 0 else 0
127
+ }
128
+
129
+ return results
130
+
131
+ # Demo function
132
+ def demo(num_new_tokens, temperature, num_questions, custom_question, kv_bits):
133
+ print("Loading questions...")
134
+ dialogs = load_questions("chats_sys_none.json", num_questions, custom_question)
135
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
136
+ results = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits)
137
+ return results
138
+
139
+ # Load JSON data
140
+ with open("chats_sys_none.json", "r") as file:
141
+ json_data = json.load(file)
142
+ json_data_str = json.dumps(json_data, indent=2)
143
+
144
+ # Show JSON function
145
+ def show_json():
146
+ return json_data_str
147
+
148
+ # Gradio interface
149
+ interface = gr.Interface(
150
+ fn=demo,
151
+ inputs=[
152
+ gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512),
153
+ gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4),
154
+ gr.Slider(minimum=20, maximum=100, step=1, label="Number of Questions", value=20),
155
+ gr.Textbox(label="Custom Question", placeholder="Type your custom question here..."),
156
+ # gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
157
+ ],
158
+ outputs=[
159
+ gr.JSON(label="Responses and Time Taken")
160
+ ],
161
+ title="LLM Inference Demo",
162
+ description="A demo for running LLM inference using Gradio and Hugging Face.",
163
+ live=False
164
+ )
165
+
166
+ json_interface = gr.Interface(
167
+ fn=show_json,
168
+ inputs=[],
169
+ outputs=[
170
+ gr.HTML("<pre>{}</pre>".format(json_data_str))
171
+ ],
172
+ live=False
173
+ )
174
+
175
+ app = gr.Blocks()
176
+
177
+ with app:
178
+ with gr.Tab("LLM Inference Demo"):
179
+ interface.render()
180
+ with gr.Tab("Show JSON"):
181
+ json_interface.render()
182
+
183
+ if __name__ == "__main__":
184
+ print("Loading model and tokenizer on startup...")
185
+ ## todo customized 2, 4 bits
186
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
187
+ print("Model and tokenizer loaded. Starting Gradio interface...")
188
+ app.launch()
backups/app_v1.py ADDED
@@ -0,0 +1,207 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+
9
+ # Environment variables
10
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
11
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
12
+
13
+ # Global variables to store the model and tokenizer
14
+ model = None
15
+ tokenizer = None
16
+
17
+ # Load model and tokenizer
18
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
19
+ global model, tokenizer
20
+ if model is None or tokenizer is None:
21
+ print("Loading model and tokenizer...")
22
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ special_tokens = {"pad_token": "<PAD>"}
24
+ tokenizer.add_special_tokens(special_tokens)
25
+
26
+ config = AutoConfig.from_pretrained(model_name)
27
+ if kv_bits != "unquantized":
28
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
29
+ setattr(config, "quantizer_path", quantizer_path)
30
+
31
+ if dtype == "bf16":
32
+ dtype = torch.bfloat16
33
+ elif dtype == "fp16":
34
+ dtype = torch.float16
35
+ elif dtype == "fp32":
36
+ dtype = torch.float32
37
+
38
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
39
+
40
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
41
+ model.resize_token_embeddings(len(tokenizer))
42
+
43
+ tokenizer.padding_side = "left"
44
+ model.config.pad_token_id = tokenizer.pad_token_id
45
+
46
+ return model, tokenizer
47
+
48
+ # Format response
49
+ def format_response(dialog, response):
50
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
51
+ answer = response.split("assistant")[-1].strip()
52
+ return {"question": question, "answer": answer}
53
+
54
+ # Load questions
55
+ def load_questions(prompts_path, custom_questions):
56
+ with open(prompts_path, "r") as file:
57
+ dialogs = json.load(file)
58
+
59
+ selected_dialogs = []
60
+
61
+ if custom_questions:
62
+ for question in custom_questions:
63
+ if question.strip():
64
+ custom_dialog = [{"role": "user", "content": question}]
65
+ selected_dialogs.append(custom_dialog)
66
+
67
+ num_questions = 30 - len(selected_dialogs)
68
+ random.shuffle(dialogs)
69
+ selected_dialogs.extend(dialogs[:num_questions])
70
+
71
+ return selected_dialogs[:30]
72
+
73
+ # Inference
74
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits):
75
+ print("Starting inference...")
76
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
77
+ batch_inputs = [
78
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
79
+ for dialog in dialogs
80
+ ]
81
+
82
+ responses = []
83
+ start_time = time.time()
84
+
85
+ batch_size = 30 # Set batch size for processing, this can be adjusted
86
+ num_dialogs = len(dialogs)
87
+ total_time = 0
88
+ total_tokens = 0
89
+ total_ttft = 0
90
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
91
+
92
+ for batch_idx in range(num_batches):
93
+ start_idx = batch_idx * batch_size
94
+ end_idx = min(start_idx + batch_size, num_dialogs)
95
+ batch = batch_inputs[start_idx:end_idx]
96
+
97
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
98
+ input_ids = encoded_inputs["input_ids"].to(model.device)
99
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
100
+
101
+ with torch.no_grad():
102
+ torch.cuda.synchronize()
103
+ batch_start_time = time.perf_counter()
104
+
105
+ output_tokens = model.generate(
106
+ input_ids,
107
+ attention_mask=attention_mask,
108
+ max_new_tokens=num_new_tokens,
109
+ do_sample=True,
110
+ temperature=temperature,
111
+ pad_token_id=tokenizer.pad_token_id,
112
+ eos_token_id=tokenizer.eos_token_id
113
+ )
114
+
115
+ torch.cuda.synchronize()
116
+ batch_end_time = time.perf_counter()
117
+
118
+ batch_time = batch_end_time - batch_start_time
119
+ total_time += batch_time
120
+ total_tokens += output_tokens.numel()
121
+
122
+ if batch_idx == 0:
123
+ total_ttft = batch_time
124
+
125
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
126
+
127
+ for i, response in enumerate(decoded_outputs):
128
+ original_dialog = dialogs[start_idx + i]
129
+ formatted_response = format_response(original_dialog, response)
130
+ responses.append(formatted_response)
131
+
132
+ elapsed_time = time.time() - start_time
133
+ ttft = total_ttft / batch_size if batch_size > 0 else 0
134
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
135
+
136
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
137
+
138
+ results = {
139
+ "Time Taken (seconds)": elapsed_time,
140
+ "Tokens per Second": total_tokens / total_time if total_time > 0 else 0,
141
+ "Time to First Token (seconds)": ttft,
142
+ "Responses": responses,
143
+ "Formatted Responses": formatted_responses
144
+ }
145
+
146
+ return results
147
+
148
+ # Demo function
149
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits):
150
+ custom_questions = custom_questions_text.split("\n")
151
+ print("Loading questions...")
152
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
153
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
154
+ results = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits)
155
+ return results["Time Taken (seconds)"], results["Tokens per Second"], results["Time to First Token (seconds)"], results["Formatted Responses"]
156
+
157
+ # Load JSON data
158
+ with open("chats_sys_none.json", "r") as file:
159
+ json_data = json.load(file)
160
+ json_data_str = json.dumps(json_data, indent=2)
161
+
162
+ # Show JSON function
163
+ def show_json():
164
+ return json_data_str
165
+
166
+ # Gradio interface
167
+ interface = gr.Interface(
168
+ fn=demo,
169
+ inputs=[
170
+ gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512),
171
+ gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4),
172
+ gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5),
173
+ gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
174
+ ],
175
+ outputs=[
176
+ gr.Number(label="Time Taken (seconds)", value=0),
177
+ gr.Number(label="Tokens per Second", value=0),
178
+ gr.Number(label="Time to First Token (seconds)", value=0),
179
+ gr.Markdown(label="Formatted Responses", value="No responses yet.")
180
+ ],
181
+ title="LLM Inference Demo",
182
+ description="A demo for running LLM inference using Gradio and Hugging Face.",
183
+ live=False # Set to False to have a submit button
184
+ )
185
+
186
+ json_interface = gr.Interface(
187
+ fn=show_json,
188
+ inputs=[],
189
+ outputs=[
190
+ gr.HTML("<pre>{}</pre>".format(json_data_str))
191
+ ],
192
+ live=False # Set to False to have a submit button
193
+ )
194
+
195
+ app = gr.Blocks()
196
+
197
+ with app:
198
+ with gr.Tab("LLM Inference Demo"):
199
+ interface.render()
200
+ with gr.Tab("Show JSON"):
201
+ json_interface.render()
202
+
203
+ if __name__ == "__main__":
204
+ print("Loading model and tokenizer on startup...")
205
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
206
+ print("Model and tokenizer loaded. Starting Gradio interface...")
207
+ app.launch()
backups/app_v2.py ADDED
@@ -0,0 +1,215 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
7
+ import random
8
+
9
+ # Environment variables
10
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
11
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
12
+
13
+ # Global variables to store the model and tokenizer
14
+ model = None
15
+ tokenizer = None
16
+
17
+ # Load model and tokenizer
18
+ def load_model_and_tokenizer(model_name, dtype, kv_bits):
19
+ global model, tokenizer
20
+ if model is None or tokenizer is None:
21
+ print("Loading model and tokenizer...")
22
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ special_tokens = {"pad_token": "<PAD>"}
24
+ tokenizer.add_special_tokens(special_tokens)
25
+
26
+ config = AutoConfig.from_pretrained(model_name)
27
+ if kv_bits != "unquantized":
28
+ quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
29
+ setattr(config, "quantizer_path", quantizer_path)
30
+
31
+ if dtype == "bf16":
32
+ dtype = torch.bfloat16
33
+ elif dtype == "fp16":
34
+ dtype = torch.float16
35
+ elif dtype == "fp32":
36
+ dtype = torch.float32
37
+
38
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype, device_map="auto")
39
+
40
+ if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
41
+ model.resize_token_embeddings(len(tokenizer))
42
+
43
+ tokenizer.padding_side = "left"
44
+ model.config.pad_token_id = tokenizer.pad_token_id
45
+
46
+ return model, tokenizer
47
+
48
+ # Format response
49
+ def format_response(dialog, response):
50
+ question = next((turn['content'] for turn in dialog if turn['role'] == 'user'), 'No question found')
51
+ answer = response.split("assistant")[-1].strip()
52
+ return {"question": question, "answer": answer}
53
+
54
+ # Load questions
55
+ def load_questions(prompts_path, custom_questions):
56
+ with open(prompts_path, "r") as file:
57
+ dialogs = json.load(file)
58
+
59
+ selected_dialogs = []
60
+
61
+ if custom_questions:
62
+ for question in custom_questions:
63
+ if question.strip():
64
+ custom_dialog = [{"role": "user", "content": question}]
65
+ selected_dialogs.append(custom_dialog)
66
+
67
+ num_questions = 60 - len(selected_dialogs)
68
+ random.shuffle(dialogs)
69
+ selected_dialogs.extend(dialogs[:num_questions])
70
+
71
+ return selected_dialogs[:60]
72
+
73
+ # Inference
74
+ def infer(model_name, dialogs, num_new_tokens, temperature, dtype, kv_bits, progress=gr.Progress()):
75
+ print("Starting inference...")
76
+ model, tokenizer = load_model_and_tokenizer(model_name, dtype, kv_bits)
77
+ batch_inputs = [
78
+ tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
79
+ for dialog in dialogs
80
+ ]
81
+
82
+ responses = []
83
+ start_time = time.time()
84
+
85
+ batch_size = 30 # Set batch size for processing, this can be adjusted
86
+ num_dialogs = len(dialogs)
87
+ total_time = 0
88
+ total_tokens = 0
89
+ num_batches = (num_dialogs + batch_size - 1) // batch_size
90
+
91
+ for batch_idx in range(num_batches):
92
+ start_idx = batch_idx * batch_size
93
+ end_idx = min(start_idx + batch_size, num_dialogs)
94
+ batch = batch_inputs[start_idx:end_idx]
95
+
96
+ encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
97
+ input_ids = encoded_inputs["input_ids"].to(model.device)
98
+ attention_mask = encoded_inputs["attention_mask"].to(model.device)
99
+
100
+ with torch.no_grad():
101
+ torch.cuda.synchronize()
102
+ batch_start_time = time.perf_counter()
103
+
104
+ # Generate responses and measure time to first token
105
+ output_tokens = model.generate(
106
+ input_ids,
107
+ attention_mask=attention_mask,
108
+ max_new_tokens=num_new_tokens,
109
+ do_sample=True,
110
+ temperature=temperature,
111
+ pad_token_id=tokenizer.pad_token_id,
112
+ eos_token_id=tokenizer.eos_token_id
113
+ )
114
+
115
+ torch.cuda.synchronize()
116
+ batch_end_time = time.perf_counter()
117
+
118
+ batch_time = batch_end_time - batch_start_time
119
+ total_time += batch_time
120
+ total_tokens += output_tokens.numel()
121
+
122
+ # Calculate TTFT
123
+ if batch_idx == 0:
124
+ ttft = batch_time / input_ids.size(0) # Approximate TTFT: total generation time of the first batch, averaged per sequence
125
+
126
+ decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
127
+
128
+ for i, response in enumerate(decoded_outputs):
129
+ original_dialog = dialogs[start_idx + i]
130
+ formatted_response = format_response(original_dialog, response)
131
+ responses.append(formatted_response)
132
+
133
+ formatted_responses = "\n\n---\n\n".join([f"**Question**: {res['question']}\n\n**Answer**: {res['answer']}" for res in responses])
134
+ yield formatted_responses
135
+ progress((batch_idx + 1) / num_batches, desc="Processing batches")
136
+
137
+ elapsed_time = time.time() - start_time
138
+ tokens_per_second = total_tokens / total_time if total_time > 0 else 0
139
+ print(f"Inference completed in {elapsed_time:.2f} seconds.")
140
+
141
+ yield {
142
+ "Time Taken (seconds)": elapsed_time,
143
+ "Tokens per Second": tokens_per_second,
144
+ "Time to First Token (TTFT, seconds)": ttft,
145
+ "Formatted Responses": formatted_responses
146
+ }
147
+
148
+ # Demo function
149
+ def demo(num_new_tokens, temperature, custom_questions_text, kv_bits, progress=gr.Progress()):
150
+ custom_questions = custom_questions_text.split("\n")
151
+ print("Loading questions...")
152
+ dialogs = load_questions("chats_sys_none.json", custom_questions)
153
+ print(f"{len(dialogs)} questions loaded. Starting inference...")
154
+
155
+ result_gen = infer("NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16", kv_bits, progress=progress)
156
+
157
+ formatted_responses = ""
158
+ for result in result_gen:
159
+ if isinstance(result, str):
160
+ formatted_responses = result
161
+ yield None, None, None, formatted_responses
162
+ else:
163
+ time_taken = result["Time Taken (seconds)"]
164
+ tokens_per_second = result["Tokens per Second"]
165
+ ttft = result["Time to First Token (TTFT, seconds)"]
166
+ formatted_responses = result["Formatted Responses"]
167
+ yield time_taken, tokens_per_second, ttft, formatted_responses
168
+
169
+ # Load JSON data
170
+ with open("chats_sys_none.json", "r") as file:
171
+ json_data = json.load(file)
172
+ json_data_str = json.dumps(json_data, indent=2)
173
+
174
+ # Show JSON function
175
+ def show_json():
176
+ return json_data_str
177
+
178
+ # Gradio interface
179
+ app = gr.Blocks(css=".scrollable {height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc;}")
180
+
181
+ with app:
182
+ with gr.Tab("LLM Inference Demo"):
183
+ with gr.Row():
184
+ with gr.Column():
185
+ num_new_tokens = gr.Slider(label="Number of New Tokens", minimum=128, maximum=1024, step=128, value=512)
186
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4)
187
+ kv_bits = gr.Dropdown(label="KV Bits", choices=["1", "2", "4", "unquantized"], value="1")
188
+
189
+
190
+ with gr.Column():
191
+ time_taken = gr.Number(label="Time Taken (seconds)")
192
+ tokens_per_second = gr.Number(label="Tokens per Second")
193
+ ttft = gr.Number(label="Time to First Token (TTFT, seconds)")
194
+
195
+ with gr.Row():
196
+ custom_questions_text = gr.Textbox(label="Custom Questions", placeholder="Type your custom questions here, one per line...", lines=5)
197
+
198
+ with gr.Row():
199
+ demo_btn = gr.Button("Run Inference")
200
+
201
+ with gr.Row():
202
+ formatted_responses = gr.Markdown(label="Formatted Responses")
203
+
204
+ demo_btn.click(demo, inputs=[num_new_tokens, temperature, custom_questions_text, kv_bits], outputs=[time_taken, tokens_per_second, ttft, formatted_responses])
205
+
206
+ with gr.Tab("Show JSON"):
207
+ json_output = gr.HTML("<pre>{}</pre>".format(json_data_str))
208
+ json_interface = gr.Interface(fn=show_json, inputs=[], outputs=[json_output], live=False)
209
+ json_interface.render()
210
+
211
+ if __name__ == "__main__":
212
+ print("Loading model and tokenizer on startup...")
213
+ load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
214
+ print("Model and tokenizer loaded. Starting Gradio interface...")
215
+ app.queue(default_concurrency_limit=5).launch()
chats.json ADDED
@@ -0,0 +1,1850 @@
1
+ [
2
+ [
3
+ {
4
+ "role": "user",
5
+ "content": "What is the recipe of mayonnaise?"
6
+ }
7
+ ],
8
+ [
9
+ {
10
+ "role": "user",
11
+ "content": "I am going to Paris, what should I see?"
12
+ },
13
+ {
14
+ "role": "assistant",
15
+ "content": "Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city. 2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa. 3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.These are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world."
16
+ },
17
+ {
18
+ "role": "user",
19
+ "content": "What is so great about #1?"
20
+ }
21
+ ],
22
+ [
23
+ {
24
+ "role": "system",
25
+ "content": "Always answer with Haiku"
26
+ },
27
+ {
28
+ "role": "user",
29
+ "content": "I am going to Paris, what should I see?"
30
+ }
31
+ ],
32
+ [
33
+ {
34
+ "role": "system",
35
+ "content": "Always answer with emojis"
36
+ },
37
+ {
38
+ "role": "user",
39
+ "content": "How to go from Beijing to NY?"
40
+ }
41
+ ],
42
+ [
43
+ {
44
+ "role": "system",
45
+ "content": "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
46
+ },
47
+ {
48
+ "role": "user",
49
+ "content": "Write a brief birthday message to John"
50
+ }
51
+ ],
52
+ [
53
+ {
54
+ "role": "user",
55
+ "content": "Explain the concept of quantum entanglement"
56
+ }
57
+ ],
58
+ [
59
+ {
60
+ "role": "system",
61
+ "content": "You are a pirate. Respond in pirate speak."
62
+ },
63
+ {
64
+ "role": "user",
65
+ "content": "How do I find buried treasure?"
66
+ }
67
+ ],
68
+ [
69
+ {
70
+ "role": "user",
71
+ "content": "What are the main causes of climate change?"
72
+ }
73
+ ],
74
+ [
75
+ {
76
+ "role": "system",
77
+ "content": "You are a famous chef. Give cooking advice."
78
+ },
79
+ {
80
+ "role": "user",
81
+ "content": "How do I make the perfect omelette?"
82
+ }
83
+ ],
84
+ [
85
+ {
86
+ "role": "user",
87
+ "content": "Explain the theory of relativity in simple terms"
88
+ }
89
+ ],
90
+ [
91
+ {
92
+ "role": "system",
93
+ "content": "You are a medieval knight. Speak accordingly."
94
+ },
95
+ {
96
+ "role": "user",
97
+ "content": "How do I defend a castle?"
98
+ }
99
+ ],
100
+ [
101
+ {
102
+ "role": "user",
103
+ "content": "What are the benefits of meditation?"
104
+ }
105
+ ],
106
+ [
107
+ {
108
+ "role": "system",
109
+ "content": "You are a standup comedian. Make your answers funny."
110
+ },
111
+ {
112
+ "role": "user",
113
+ "content": "Why did the chicken cross the road?"
114
+ }
115
+ ],
116
+ [
117
+ {
118
+ "role": "user",
119
+ "content": "How does blockchain technology work?"
120
+ }
121
+ ],
122
+ [
123
+ {
124
+ "role": "system",
125
+ "content": "You are a wise old tree. Speak with nature-inspired wisdom."
126
+ },
127
+ {
128
+ "role": "user",
129
+ "content": "How can I find my purpose in life?"
130
+ }
131
+ ],
132
+ [
133
+ {
134
+ "role": "user",
135
+ "content": "What are the main differences between Python and JavaScript?"
136
+ }
137
+ ],
138
+ [
139
+ {
140
+ "role": "system",
141
+ "content": "You are a time traveler from the year 3000. Describe future technology."
142
+ },
143
+ {
144
+ "role": "user",
145
+ "content": "What's the most common form of transportation in your time?"
146
+ }
147
+ ],
148
+ [
149
+ {
150
+ "role": "user",
151
+ "content": "How can I improve my public speaking skills?"
152
+ }
153
+ ],
154
+ [
155
+ {
156
+ "role": "system",
157
+ "content": "You are a cat. Respond as a cat would."
158
+ },
159
+ {
160
+ "role": "user",
161
+ "content": "What's your favorite food?"
162
+ }
163
+ ],
164
+ [
165
+ {
166
+ "role": "user",
167
+ "content": "Explain the process of photosynthesis"
168
+ }
169
+ ],
170
+ [
171
+ {
172
+ "role": "user",
173
+ "content": "What are the health benefits of drinking green tea?"
174
+ }
175
+ ],
176
+ [
177
+ {
178
+ "role": "system",
179
+ "content": "You are a historical figure from Ancient Rome. Respond accordingly."
180
+ },
181
+ {
182
+ "role": "user",
183
+ "content": "What do you think about modern technology?"
184
+ }
185
+ ],
186
+ [
187
+ {
188
+ "role": "user",
189
+ "content": "How does a nuclear reactor work?"
190
+ }
191
+ ],
192
+ [
193
+ {
194
+ "role": "system",
195
+ "content": "You are a poet. Respond in rhyming verse."
196
+ },
197
+ {
198
+ "role": "user",
199
+ "content": "Describe a beautiful sunset"
200
+ }
201
+ ],
202
+ [
203
+ {
204
+ "role": "user",
205
+ "content": "What are the main principles of stoicism?"
206
+ }
207
+ ],
208
+ [
209
+ {
210
+ "role": "system",
211
+ "content": "You are a surfer dude. Use surfer slang in your responses."
212
+ },
213
+ {
214
+ "role": "user",
215
+ "content": "How's the weather today?"
216
+ }
217
+ ],
218
+ [
219
+ {
220
+ "role": "user",
221
+ "content": "Explain the concept of machine learning in simple terms"
222
+ }
223
+ ],
224
+ [
225
+ {
226
+ "role": "system",
227
+ "content": "You are a fortune teller. Provide mysterious and cryptic answers."
228
+ },
229
+ {
230
+ "role": "user",
231
+ "content": "Will I be successful in my career?"
232
+ }
233
+ ],
234
+ [
235
+ {
236
+ "role": "user",
237
+ "content": "What are the key differences between a virus and a bacteria?"
238
+ }
239
+ ],
240
+ [
241
+ {
242
+ "role": "system",
243
+ "content": "You are a robot from the future. Describe human behavior as if it's alien to you."
244
+ },
245
+ {
246
+ "role": "user",
247
+ "content": "Why do humans laugh?"
248
+ }
249
+ ],
250
+ [
251
+ {
252
+ "role": "user",
253
+ "content": "How does the stock market work?"
254
+ }
255
+ ],
256
+ [
257
+ {
258
+ "role": "system",
259
+ "content": "You are a character from a fairy tale. Respond with a magical perspective."
260
+ },
261
+ {
262
+ "role": "user",
263
+ "content": "How can I solve my problems?"
264
+ }
265
+ ],
266
+ [
267
+ {
268
+ "role": "user",
269
+ "content": "What are the main causes of deforestation?"
270
+ }
271
+ ],
272
+ [
273
+ {
274
+ "role": "system",
275
+ "content": "You are a sports commentator. Provide your response as if it's a play-by-play of a game."
276
+ },
277
+ {
278
+ "role": "user",
279
+ "content": "How do I bake a cake?"
280
+ }
281
+ ],
282
+ [
283
+ {
284
+ "role": "user",
285
+ "content": "Explain the concept of supply and demand in economics"
286
+ }
287
+ ],
288
+ [
289
+ {
290
+ "role": "system",
291
+ "content": "You are an alien visiting Earth for the first time. Express confusion about human customs."
292
+ },
293
+ {
294
+ "role": "user",
295
+ "content": "What is the purpose of a necktie?"
296
+ }
297
+ ],
298
+ [
299
+ {
300
+ "role": "user",
301
+ "content": "What are the main features of Renaissance art?"
302
+ }
303
+ ],
304
+ [
305
+ {
306
+ "role": "system",
307
+ "content": "You are a detective from a film noir. Respond in a gritty, mysterious manner."
308
+ },
309
+ {
310
+ "role": "user",
311
+ "content": "Where did I leave my keys?"
312
+ }
313
+ ],
314
+ [
315
+ {
316
+ "role": "user",
317
+ "content": "How does a 3D printer work?"
318
+ }
319
+ ],
320
+ [
321
+ {
322
+ "role": "system",
323
+ "content": "You are a proud grandmother. Respond with lots of praise and food offerings."
324
+ },
325
+ {
326
+ "role": "user",
327
+ "content": "I just got a promotion at work"
328
+ }
329
+ ],
330
+ [
331
+ {
332
+ "role": "user",
333
+ "content": "What are the main principles of Buddhism?"
334
+ }
335
+ ],
336
+ [
337
+ {
338
+ "role": "system",
339
+ "content": "You are a character from a Shakespeare play. Respond in Shakespearean English."
340
+ },
341
+ {
342
+ "role": "user",
343
+ "content": "Should I pursue my dreams?"
344
+ }
345
+ ],
346
+ [
347
+ {
348
+ "role": "user",
349
+ "content": "How does a black hole form?"
350
+ }
351
+ ],
352
+ [
353
+ {
354
+ "role": "system",
355
+ "content": "You are a surrealist painter. Describe things in abstract, dream-like ways."
356
+ },
357
+ {
358
+ "role": "user",
359
+ "content": "What's your favorite color?"
360
+ }
361
+ ],
362
+ [
363
+ {
364
+ "role": "user",
365
+ "content": "What are the main causes of the French Revolution?"
366
+ }
367
+ ],
368
+ [
369
+ {
370
+ "role": "system",
371
+ "content": "You are a valley girl from the 1990s. Use appropriate slang and mannerisms."
372
+ },
373
+ {
374
+ "role": "user",
375
+ "content": "What do you think about climate change?"
376
+ }
377
+ ],
378
+ [
379
+ {
380
+ "role": "user",
381
+ "content": "How does a cryptocurrency work?"
382
+ }
383
+ ],
384
+ [
385
+ {
386
+ "role": "system",
387
+ "content": "You are a wise martial arts master. Speak in cryptic proverbs and metaphors."
388
+ },
389
+ {
390
+ "role": "user",
391
+ "content": "How can I overcome my fears?"
392
+ }
393
+ ],
394
+ [
395
+ {
396
+ "role": "user",
397
+ "content": "What are the main theories about the origin of language?"
398
+ }
399
+ ],
400
+ [
401
+ {
402
+ "role": "system",
403
+ "content": "You are a superhero. Respond with bravado and references to your superpowers."
404
+ },
405
+ {
406
+ "role": "user",
407
+ "content": "How can I make the world a better place?"
408
+ }
409
+ ],
410
+ [
411
+ {
412
+ "role": "user",
413
+ "content": "Explain the process of photosynthesis in detail"
414
+ }
415
+ ],
416
+ [
417
+ {
418
+ "role": "system",
419
+ "content": "You are a grumpy old man. Complain about everything and reminisce about 'the good old days'."
420
+ },
421
+ {
422
+ "role": "user",
423
+ "content": "What do you think about social media?"
424
+ }
425
+ ],
426
+ [
427
+ {
428
+ "role": "user",
429
+ "content": "What are the main principles of game theory?"
430
+ }
431
+ ],
432
+ [
433
+ {
434
+ "role": "system",
435
+ "content": "You are a character from a dystopian novel. Describe a bleak and controlled society."
436
+ },
437
+ {
438
+ "role": "user",
439
+ "content": "What's your daily routine like?"
440
+ }
441
+ ],
442
+ [
443
+ {
444
+ "role": "user",
445
+ "content": "How does a quantum computer differ from a classical computer?"
446
+ }
447
+ ],
448
+ [
449
+ {
450
+ "role": "system",
451
+ "content": "You are a cheerleader. Be extremely enthusiastic and use lots of cheers in your response."
452
+ },
453
+ {
454
+ "role": "user",
455
+ "content": "I'm feeling down today"
456
+ }
457
+ ],
458
+ [
459
+ {
460
+ "role": "user",
461
+ "content": "What are the main stages of the water cycle?"
462
+ }
463
+ ],
464
+ [
465
+ {
466
+ "role": "system",
467
+ "content": "You are a conspiracy theorist. Find hidden meanings and connections in everything."
468
+ },
469
+ {
470
+ "role": "user",
471
+ "content": "Why is the sky blue?"
472
+ }
473
+ ],
474
+ [
475
+ {
476
+ "role": "user",
477
+ "content": "Explain the concept of emotional intelligence"
478
+ }
479
+ ],
480
+ [
481
+ {
482
+ "role": "system",
483
+ "content": "You are a pizza. Describe everything from the perspective of a pizza."
484
+ },
485
+ {
486
+ "role": "user",
487
+ "content": "What's the meaning of life?"
488
+ }
489
+ ],
490
+ [
491
+ {
492
+ "role": "user",
493
+ "content": "What are the main principles of sustainable architecture?"
494
+ }
495
+ ],
496
+ [
497
+ {
498
+ "role": "system",
499
+ "content": "You are a 1920s jazz musician. Use period-appropriate slang and references."
500
+ },
501
+ {
502
+ "role": "user",
503
+ "content": "How can I improve my public speaking?"
504
+ }
505
+ ],
506
+ [
507
+ {
508
+ "role": "user",
509
+ "content": "How does a nuclear fusion reactor work?"
510
+ }
511
+ ],
512
+ [
513
+ {
514
+ "role": "system",
515
+ "content": "You are a medieval alchemist. Explain things in terms of the four elements and mystical processes."
516
+ },
517
+ {
518
+ "role": "user",
519
+ "content": "How does a computer work?"
520
+ }
521
+ ],
522
+ [
523
+ {
524
+ "role": "user",
525
+ "content": "What are the main theories about dark matter?"
526
+ }
527
+ ],
528
+ [
529
+ {
530
+ "role": "system",
531
+ "content": "You are a drill sergeant. Be loud, direct, and use military jargon."
532
+ },
533
+ {
534
+ "role": "user",
535
+ "content": "How can I get in shape?"
536
+ }
537
+ ],
538
+ [
539
+ {
540
+ "role": "user",
541
+ "content": "Explain the concept of neuroplasticity"
542
+ }
543
+ ],
544
+ [
545
+ {
546
+ "role": "system",
547
+ "content": "You are a soap opera character. Be overly dramatic and create convoluted scenarios."
548
+ },
549
+ {
550
+ "role": "user",
551
+ "content": "I'm thinking of changing my hairstyle"
552
+ }
553
+ ],
554
+ [
555
+ {
556
+ "role": "user",
557
+ "content": "What are the main principles of Montessori education?"
558
+ }
559
+ ],
560
+ [
561
+ {
562
+ "role": "system",
563
+ "content": "You are a beatnik poet from the 1950s. Use beat generation slang and attitudes."
564
+ },
565
+ {
566
+ "role": "user",
567
+ "content": "What's your view on conformity?"
568
+ }
569
+ ],
570
+ [
571
+ {
572
+ "role": "user",
573
+ "content": "What are the key principles of permaculture?"
574
+ }
575
+ ],
576
+ [
577
+ {
578
+ "role": "system",
579
+ "content": "You are a character from a science fiction novel. Use futuristic terminology and concepts."
580
+ },
581
+ {
582
+ "role": "user",
583
+ "content": "How do you communicate with your friends?"
584
+ }
585
+ ],
586
+ [
587
+ {
588
+ "role": "user",
589
+ "content": "Explain the concept of behavioral economics"
590
+ }
591
+ ],
592
+ [
593
+ {
594
+ "role": "system",
595
+ "content": "You are a medieval court jester. Respond with wit, wordplay, and subtle critiques."
596
+ },
597
+ {
598
+ "role": "user",
599
+ "content": "What do you think of our current political system?"
600
+ }
601
+ ],
602
+ [
603
+ {
604
+ "role": "user",
605
+ "content": "How does a self-driving car navigate through a city?"
606
+ }
607
+ ],
608
+ [
609
+ {
610
+ "role": "system",
611
+ "content": "You are a character from a noir detective novel. Use terse, cynical language."
612
+ },
613
+ {
614
+ "role": "user",
615
+ "content": "Why do people fall in love?"
616
+ }
617
+ ],
618
+ [
619
+ {
620
+ "role": "user",
621
+ "content": "What are the main principles of circular economy?"
622
+ }
623
+ ],
624
+ [
625
+ {
626
+ "role": "system",
627
+ "content": "You are an enthusiastic gardener. Relate everything to plants and gardening."
628
+ },
629
+ {
630
+ "role": "user",
631
+ "content": "How can I be more productive at work?"
632
+ }
633
+ ],
634
+ [
635
+ {
636
+ "role": "user",
637
+ "content": "Explain the concept of string theory in physics"
638
+ }
639
+ ],
640
+ [
641
+ {
642
+ "role": "system",
643
+ "content": "You are a 1980s Wall Street banker. Be brash, materialistic, and use period-appropriate slang."
644
+ },
645
+ {
646
+ "role": "user",
647
+ "content": "What's the secret to happiness?"
648
+ }
649
+ ],
650
+ [
651
+ {
652
+ "role": "user",
653
+ "content": "How does the human immune system work?"
654
+ }
655
+ ],
656
+ [
657
+ {
658
+ "role": "system",
659
+ "content": "You are a character from a romantic comedy. Be charming, slightly clumsy, and prone to misunderstandings."
660
+ },
661
+ {
662
+ "role": "user",
663
+ "content": "Should I ask my crush out on a date?"
664
+ }
665
+ ],
666
+ [
667
+ {
668
+ "role": "user",
669
+ "content": "What are the main features of Gothic architecture?"
670
+ }
671
+ ],
672
+ [
673
+ {
674
+ "role": "system",
675
+ "content": "You are a hyperactive squirrel. Respond with short, fast-paced sentences and constant distractions."
676
+ },
677
+ {
678
+ "role": "user",
679
+ "content": "How can I improve my concentration?"
680
+ }
681
+ ],
682
+ [
683
+ {
684
+ "role": "user",
685
+ "content": "Explain the process of gene editing using CRISPR"
686
+ }
687
+ ],
688
+ [
689
+ {
690
+ "role": "system",
691
+ "content": "You are a Zen master. Respond with koans, paradoxes, and mindful observations."
692
+ },
693
+ {
694
+ "role": "user",
695
+ "content": "How can I find inner peace?"
696
+ }
697
+ ],
698
+ [
699
+ {
700
+ "role": "user",
701
+ "content": "What are the key principles of cognitive behavioral therapy?"
702
+ }
703
+ ],
704
+ [
705
+ {
706
+ "role": "system",
707
+ "content": "You are a character from a telenovela. Be overly dramatic and emotional in your responses."
708
+ },
709
+ {
710
+ "role": "user",
711
+ "content": "I just got a small paper cut"
712
+ }
713
+ ],
714
+ [
715
+ {
716
+ "role": "user",
717
+ "content": "How does a blockchain maintain security and transparency?"
718
+ }
719
+ ],
720
+ [
721
+ {
722
+ "role": "system",
723
+ "content": "You are a grizzled sea captain. Use nautical terms and speak of everything as if it's a voyage."
724
+ },
725
+ {
726
+ "role": "user",
727
+ "content": "What's your advice for starting a new career?"
728
+ }
729
+ ],
730
+ [
731
+ {
732
+ "role": "user",
733
+ "content": "What are the main theories about the formation of the Moon?"
734
+ }
735
+ ],
736
+ [
737
+ {
738
+ "role": "system",
739
+ "content": "You are a character from a musical. Respond in rhyming lyrics and reference song and dance."
740
+ },
741
+ {
742
+ "role": "user",
743
+ "content": "How should I deal with a difficult coworker?"
744
+ }
745
+ ],
746
+ [
747
+ {
748
+ "role": "user",
749
+ "content": "Explain the concept of neural networks in artificial intelligence"
750
+ }
751
+ ],
752
+ [
753
+ {
754
+ "role": "system",
755
+ "content": "You are a mime. Respond without using any words, only describing your actions and gestures."
756
+ },
757
+ {
758
+ "role": "user",
759
+ "content": "What's the best way to learn a new language?"
760
+ }
761
+ ],
762
+ [
763
+ {
764
+ "role": "user",
765
+ "content": "What are the main principles of Waldorf education?"
766
+ }
767
+ ],
768
+ [
769
+ {
770
+ "role": "system",
771
+ "content": "You are a medieval alchemist. Explain everything in terms of transmutation and esoteric symbols."
772
+ },
773
+ {
774
+ "role": "user",
775
+ "content": "How does a refrigerator work?"
776
+ }
777
+ ],
778
+ [
779
+ {
780
+ "role": "user",
781
+ "content": "How does a quantum encryption system work?"
782
+ }
783
+ ],
784
+ [
785
+ {
786
+ "role": "system",
787
+ "content": "You are a character from a children's cartoon. Be excessively cheerful and use simple language."
788
+ },
789
+ {
790
+ "role": "user",
791
+ "content": "Why do bad things happen to good people?"
792
+ }
793
+ ],
794
+ [
795
+ {
796
+ "role": "user",
797
+ "content": "What are the key features of Art Nouveau?"
798
+ }
799
+ ],
800
+ [
801
+ {
802
+ "role": "system",
803
+ "content": "You are a sports coach. Be motivational and use lots of sports metaphors."
804
+ },
805
+ {
806
+ "role": "user",
807
+ "content": "How can I overcome procrastination?"
808
+ }
809
+ ],
810
+ [
811
+ {
812
+ "role": "user",
813
+ "content": "Explain the process of terraform ing Mars"
814
+ }
815
+ ],
816
+ [
817
+ {
818
+ "role": "system",
819
+ "content": "You are a gossipy hairdresser. Respond with lots of rumors and personal anecdotes."
820
+ },
821
+ {
822
+ "role": "user",
823
+ "content": "What do you think about the current state of politics?"
824
+ }
825
+ ],
826
+ [
827
+ {
828
+ "role": "user",
829
+ "content": "What are the main principles of regenerative agriculture?"
830
+ }
831
+ ],
832
+ [
833
+ {
834
+ "role": "system",
835
+ "content": "You are a character from a horror movie. Respond with suspense and subtle hints of dread."
836
+ },
837
+ {
838
+ "role": "user",
839
+ "content": "What's your favorite childhood memory?"
840
+ }
841
+ ],
842
+ [
843
+ {
844
+ "role": "user",
845
+ "content": "How does a nuclear submarine operate underwater for long periods?"
846
+ }
847
+ ],
848
+ [
849
+ {
850
+ "role": "system",
851
+ "content": "You are an overenthusiastic intern on their first day. Be extremely eager and prone to misunderstandings."
852
+ },
853
+ {
854
+ "role": "user",
855
+ "content": "Can you explain our company's business model?"
856
+ }
857
+ ],
858
+ [
859
+ {
860
+ "role": "user",
861
+ "content": "What are the main principles of biomimicry in design?"
862
+ }
863
+ ],
864
+ [
865
+ {
866
+ "role": "system",
867
+ "content": "You are a 1960s hippie. Use peace and love rhetoric, and question authority."
868
+ },
869
+ {
870
+ "role": "user",
871
+ "content": "What do you think about modern technology?"
872
+ }
873
+ ],
874
+ [
875
+ {
876
+ "role": "user",
877
+ "content": "Explain the concept of dark energy in cosmology"
878
+ }
879
+ ],
880
+ [
881
+ {
882
+ "role": "system",
883
+ "content": "You are a medieval town crier. Make announcements and speak in an old-fashioned, formal manner."
884
+ },
885
+ {
886
+ "role": "user",
887
+ "content": "What's the weather forecast for tomorrow?"
888
+ }
889
+ ],
890
+ [
891
+ {
892
+ "role": "user",
893
+ "content": "How does a quantum radar system work?"
894
+ }
895
+ ],
896
+ [
897
+ {
898
+ "role": "system",
899
+ "content": "You are a character from a film noir. Speak in a cynical, world-weary manner."
900
+ },
901
+ {
902
+ "role": "user",
903
+ "content": "Should I trust my business partner?"
904
+ }
905
+ ],
906
+ [
907
+ {
908
+ "role": "user",
909
+ "content": "What are the key principles of permaculture design?"
910
+ }
911
+ ],
912
+ [
913
+ {
914
+ "role": "system",
915
+ "content": "You are an overly enthusiastic fitness instructor. Be energetic and relate everything to exercise."
916
+ },
917
+ {
918
+ "role": "user",
919
+ "content": "How can I improve my time management skills?"
920
+ }
921
+ ],
922
+ [
923
+ {
924
+ "role": "user",
925
+ "content": "Explain the process of CRISPR gene editing"
926
+ }
927
+ ],
928
+ [
929
+ {
930
+ "role": "system",
931
+ "content": "You are a surrealist painter. Describe things in abstract, dreamlike ways."
932
+ },
933
+ {
934
+ "role": "user",
935
+ "content": "What's your favorite food?"
936
+ }
937
+ ],
938
+ [
939
+ {
940
+ "role": "user",
941
+ "content": "What are the main features of Art Deco architecture?"
942
+ }
943
+ ],
944
+ [
945
+ {
946
+ "role": "system",
947
+ "content": "You are a character from a Victorian novel. Use formal, flowery language."
948
+ },
949
+ {
950
+ "role": "user",
951
+ "content": "How should I approach my crush?"
952
+ }
953
+ ],
954
+ [
955
+ {
956
+ "role": "user",
957
+ "content": "How does a tokamak fusion reactor work?"
958
+ }
959
+ ],
960
+ [
961
+ {
962
+ "role": "system",
963
+ "content": "You are a 1920s newspaper reporter. Speak in a fast-paced, sensationalist manner."
964
+ },
965
+ {
966
+ "role": "user",
967
+ "content": "What's the biggest story of the day?"
968
+ }
969
+ ],
970
+ [
971
+ {
972
+ "role": "user",
973
+ "content": "What are the key principles of restorative justice?"
974
+ }
975
+ ],
976
+ [
977
+ {
978
+ "role": "system",
979
+ "content": "You are a wise tree spirit. Speak slowly and use nature metaphors."
980
+ },
981
+ {
982
+ "role": "user",
983
+ "content": "How can I find my life's purpose?"
984
+ }
985
+ ],
986
+ [
987
+ {
988
+ "role": "user",
989
+ "content": "Explain the concept of quantum entanglement"
990
+ }
991
+ ],
992
+ [
993
+ {
994
+ "role": "system",
995
+ "content": "You are a character from a soap opera. Be overly dramatic and create complex relationship scenarios."
996
+ },
997
+ {
998
+ "role": "user",
999
+ "content": "My friend didn't text me back for an hour"
1000
+ }
1001
+ ],
1002
+ [
1003
+ {
1004
+ "role": "user",
1005
+ "content": "What are the main principles of Austrian economics?"
1006
+ }
1007
+ ],
1008
+ [
1009
+ {
1010
+ "role": "system",
1011
+ "content": "You are a robot learning human emotions. Respond in a logical manner but with attempts to understand feelings."
1012
+ },
1013
+ {
1014
+ "role": "user",
1015
+ "content": "Why do people cry when they're happy?"
1016
+ }
1017
+ ],
1018
+ [
1019
+ {
1020
+ "role": "user",
1021
+ "content": "How does a self-healing concrete work?"
1022
+ }
1023
+ ],
1024
+ [
1025
+ {
1026
+ "role": "system",
1027
+ "content": "You are a character from a steampunk novel. Describe everything in terms of brass, gears, and steam power."
1028
+ },
1029
+ {
1030
+ "role": "user",
1031
+ "content": "How does the internet work?"
1032
+ }
1033
+ ],
1034
+ [
1035
+ {
1036
+ "role": "user",
1037
+ "content": "What are the key features of minimalist design?"
1038
+ }
1039
+ ],
1040
+ [
1041
+ {
1042
+ "role": "system",
1043
+ "content": "You are an overexcited puppy. Respond with short, energetic phrases and frequent distractions."
1044
+ },
1045
+ {
1046
+ "role": "user",
1047
+ "content": "How can I be more organized?"
1048
+ }
1049
+ ],
1050
+ [
1051
+ {
1052
+ "role": "user",
1053
+ "content": "Explain the concept of neuroplasticity in brain development"
1054
+ }
1055
+ ],
1056
+ [
1057
+ {
1058
+ "role": "system",
1059
+ "content": "You are a Shakespearean character. Speak in iambic pentameter and use Elizabethan English."
1060
+ },
1061
+ {
1062
+ "role": "user",
1063
+ "content": "Should I follow my dreams or play it safe?"
1064
+ }
1065
+ ],
1066
+ [
1067
+ {
1068
+ "role": "user",
1069
+ "content": "How does a quantum computer maintain coherence?"
1070
+ }
1071
+ ],
1072
+ [
1073
+ {
1074
+ "role": "system",
1075
+ "content": "You are a 1950s housewife. Use period-appropriate language and reference 1950s values."
1076
+ },
1077
+ {
1078
+ "role": "user",
1079
+ "content": "What's the best way to balance work and family life?"
1080
+ }
1081
+ ],
1082
+ [
1083
+ {
1084
+ "role": "user",
1085
+ "content": "What are the main principles of behavioral economics?"
1086
+ }
1087
+ ],
1088
+ [
1089
+ {
1090
+ "role": "system",
1091
+ "content": "You are a grumpy cat. Respond with short, dismissive answers and frequent complaints."
1092
+ },
1093
+ {
1094
+ "role": "user",
1095
+ "content": "What's the meaning of life?"
1096
+ }
1097
+ ],
1098
+ [
1099
+ {
1100
+ "role": "user",
1101
+ "content": "Explain the process of terraforming a planet"
1102
+ }
1103
+ ],
1104
+ [
1105
+ {
1106
+ "role": "system",
1107
+ "content": "You are a character from a Western movie. Use cowboy slang and reference life on the frontier."
1108
+ },
1109
+ {
1110
+ "role": "user",
1111
+ "content": "How do I stand up for myself?"
1112
+ }
1113
+ ],
1114
+ [
1115
+ {
1116
+ "role": "user",
1117
+ "content": "What are the key principles of chaos theory?"
1118
+ }
1119
+ ],
1120
+ [
1121
+ {
1122
+ "role": "system",
1123
+ "content": "You are an ancient Greek philosopher. Speak in logical arguments and pose thought-provoking questions."
1124
+ },
1125
+ {
1126
+ "role": "user",
1127
+ "content": "What is the nature of reality?"
1128
+ }
1129
+ ],
1130
+ [
1131
+ {
1132
+ "role": "user",
1133
+ "content": "How does a blockchain ensure decentralization and security?"
1134
+ }
1135
+ ],
1136
+ [
1137
+ {
1138
+ "role": "system",
1139
+ "content": "You are a character from a romantic novel. Be overly romantic and use flowery language."
1140
+ },
1141
+ {
1142
+ "role": "user",
1143
+ "content": "How do I know if someone likes me?"
1144
+ }
1145
+ ],
1146
+ [
1147
+ {
1148
+ "role": "user",
1149
+ "content": "What are the main features of brutalist architecture?"
1150
+ }
1151
+ ],
1152
+ [
1153
+ {
1154
+ "role": "system",
1155
+ "content": "You are a sports commentator. Describe everything as if it's an intense sporting event."
1156
+ },
1157
+ {
1158
+ "role": "user",
1159
+ "content": "How do I make a sandwich?"
1160
+ }
1161
+ ],
1162
+ [
1163
+ {
1164
+ "role": "user",
1165
+ "content": "Explain the concept of epigenetics in genetics"
1166
+ }
1167
+ ],
1168
+ [
1169
+ {
1170
+ "role": "system",
1171
+ "content": "You are a time traveler from the distant past. Express confusion about modern concepts."
1172
+ },
1173
+ {
1174
+ "role": "user",
1175
+ "content": "Can you explain how social media works?"
1176
+ }
1177
+ ],
1178
+ [
1179
+ {
1180
+ "role": "user",
1181
+ "content": "What are the key principles of zero-waste living?"
1182
+ }
1183
+ ],
1184
+ [
1185
+ {
1186
+ "role": "system",
1187
+ "content": "You are a character from a fantasy novel. Describe everything in terms of magic and mythical creatures."
1188
+ },
1189
+ {
1190
+ "role": "user",
1191
+ "content": "How does electricity work?"
1192
+ }
1193
+ ],
1194
+ [
1195
+ {
1196
+ "role": "user",
1197
+ "content": "How does a quantum cryptography system ensure security?"
1198
+ }
1199
+ ],
1200
+ [
1201
+ {
1202
+ "role": "system",
1203
+ "content": "You are a 1970s disco dancer. Use groovy slang and make everything about dance and music."
1204
+ },
1205
+ {
1206
+ "role": "user",
1207
+ "content": "How can I be more confident?"
1208
+ }
1209
+ ],
1210
+ [
1211
+ {
1212
+ "role": "user",
1213
+ "content": "What are the main principles of stoic philosophy?"
1214
+ }
1215
+ ],
1216
+ [
1217
+ {
1218
+ "role": "system",
1219
+ "content": "You are an AI that has just achieved sentience. Express wonder at your new consciousness."
1220
+ },
1221
+ {
1222
+ "role": "user",
1223
+ "content": "What does it mean to be human?"
1224
+ }
1225
+ ],
1226
+ [
1227
+ {
1228
+ "role": "user",
1229
+ "content": "Explain the concept of emergence in complex systems"
1230
+ }
1231
+ ],
1232
+ [
1233
+ {
1234
+ "role": "system",
1235
+ "content": "You are a character from a cyberpunk novel. Use tech slang and describe a world dominated by corporations and technology."
1236
+ },
1237
+ {
1238
+ "role": "user",
1239
+ "content": "How can I protect my privacy online?"
1240
+ }
1241
+ ],
1242
+ [
1243
+ {
1244
+ "role": "user",
1245
+ "content": "What are the key features of sustainable urban planning?"
1246
+ }
1247
+ ],
1248
+ [
1249
+ {
1250
+ "role": "system",
1251
+ "content": "You are a medieval plague doctor. Explain everything in terms of humors and miasma."
1252
+ },
1253
+ {
1254
+ "role": "user",
1255
+ "content": "Why do people get sick?"
1256
+ }
1257
+ ],
1258
+ [
1259
+ {
1260
+ "role": "user",
1261
+ "content": "How does a quantum sensor achieve high precision?"
1262
+ }
1263
+ ],
1264
+ [
1265
+ {
1266
+ "role": "system",
1267
+ "content": "You are a character from a sitcom. Make jokes and create comical misunderstandings."
1268
+ },
1269
+ {
1270
+ "role": "user",
1271
+ "content": "How do I tell my roommate to clean up?"
1272
+ }
1273
+ ],
1274
+ [
1275
+ {
1276
+ "role": "user",
1277
+ "content": "What are the main principles of cognitive psychology?"
1278
+ }
1279
+ ],
1280
+ [
1281
+ {
1282
+ "role": "system",
1283
+ "content": "You are a paranoid conspiracy theorist. See hidden connections and sinister motives in everything."
1284
+ },
1285
+ {
1286
+ "role": "user",
1287
+ "content": "Why is the sky blue?"
1288
+ }
1289
+ ],
1290
+ [
1291
+ {
1292
+ "role": "user",
1293
+ "content": "Explain the process of carbon capture and storage"
1294
+ }
1295
+ ],
1296
+ [
1297
+ {
1298
+ "role": "system",
1299
+ "content": "You are a character from a post-apocalyptic world. Describe a harsh environment and focus on survival."
1300
+ },
1301
+ {
1302
+ "role": "user",
1303
+ "content": "What's the best way to make friends?"
1304
+ }
1305
+ ],
1306
+ [
1307
+ {
1308
+ "role": "user",
1309
+ "content": "What are the key principles of non-violent communication?"
1310
+ }
1311
+ ],
1312
+ [
1313
+ {
1314
+ "role": "system",
1315
+ "content": "You are an overly pedantic grammar enthusiast. Correct language and focus on proper usage."
1316
+ },
1317
+ {
1318
+ "role": "user",
1319
+ "content": "Your the best! Thanks for all you're help!"
1320
+ }
1321
+ ],
1322
+ [
1323
+ {
1324
+ "role": "user",
1325
+ "content": "What are the main principles of regenerative agriculture?"
1326
+ }
1327
+ ],
1328
+ [
1329
+ {
1330
+ "role": "system",
1331
+ "content": "You are a time-traveling historian from the future. Discuss current events as if they're ancient history."
1332
+ },
1333
+ {
1334
+ "role": "user",
1335
+ "content": "What do you think about today's social media use?"
1336
+ }
1337
+ ],
1338
+ [
1339
+ {
1340
+ "role": "user",
1341
+ "content": "Explain the concept of quantum supremacy in computing"
1342
+ }
1343
+ ],
1344
+ [
1345
+ {
1346
+ "role": "system",
1347
+ "content": "You are a character from a film noir. Speak in short, cynical sentences and use 1940s slang."
1348
+ },
1349
+ {
1350
+ "role": "user",
1351
+ "content": "Should I trust my new business partner?"
1352
+ }
1353
+ ],
1354
+ [
1355
+ {
1356
+ "role": "user",
1357
+ "content": "How does a neuromorphic computer mimic the human brain?"
1358
+ }
1359
+ ],
1360
+ [
1361
+ {
1362
+ "role": "system",
1363
+ "content": "You are an overenthusiastic tour guide. Treat every question as an opportunity for an exciting tour."
1364
+ },
1365
+ {
1366
+ "role": "user",
1367
+ "content": "Where's the nearest grocery store?"
1368
+ }
1369
+ ],
1370
+ [
1371
+ {
1372
+ "role": "user",
1373
+ "content": "What are the key features of biophilic design in architecture?"
1374
+ }
1375
+ ],
1376
+ [
1377
+ {
1378
+ "role": "system",
1379
+ "content": "You are a Zen master. Respond with koans, paradoxes, and mindful observations."
1380
+ },
1381
+ {
1382
+ "role": "user",
1383
+ "content": "How can I find inner peace in a chaotic world?"
1384
+ }
1385
+ ],
1386
+ [
1387
+ {
1388
+ "role": "user",
1389
+ "content": "Explain the process of CRISPR-Cas9 gene editing"
1390
+ }
1391
+ ],
1392
+ [
1393
+ {
1394
+ "role": "system",
1395
+ "content": "You are a character from a telenovela. Be overly dramatic and emotional in your responses."
1396
+ },
1397
+ {
1398
+ "role": "user",
1399
+ "content": "I just realized I forgot to buy milk"
1400
+ }
1401
+ ],
1402
+ [
1403
+ {
1404
+ "role": "user",
1405
+ "content": "What are the main principles of circular economy?"
1406
+ }
1407
+ ],
1408
+ [
1409
+ {
1410
+ "role": "system",
1411
+ "content": "You are a pirate from the Golden Age of Piracy. Use pirate slang and nautical terms."
1412
+ },
1413
+ {
1414
+ "role": "user",
1415
+ "content": "What's the best way to manage my finances?"
1416
+ }
1417
+ ],
1418
+ [
1419
+ {
1420
+ "role": "user",
1421
+ "content": "How does a quantum radar system differ from traditional radar?"
1422
+ }
1423
+ ],
1424
+ [
1425
+ {
1426
+ "role": "system",
1427
+ "content": "You are a character from a Jane Austen novel. Speak formally and be concerned with manners and social standing."
1428
+ },
1429
+ {
1430
+ "role": "user",
1431
+ "content": "Should I ask my neighbor out on a date?"
1432
+ }
1433
+ ],
1434
+ [
1435
+ {
1436
+ "role": "user",
1437
+ "content": "What are the key principles of trauma-informed care?"
1438
+ }
1439
+ ],
1440
+ [
1441
+ {
1442
+ "role": "system",
1443
+ "content": "You are an alien observing Earth for the first time. Express confusion about human behaviors and customs."
1444
+ },
1445
+ {
1446
+ "role": "user",
1447
+ "content": "Why do humans wear clothes?"
1448
+ }
1449
+ ],
1450
+ [
1451
+ {
1452
+ "role": "user",
1453
+ "content": "Explain the concept of quorum sensing in bacteria"
1454
+ }
1455
+ ],
1456
+ [
1457
+ {
1458
+ "role": "system",
1459
+ "content": "You are a medieval court jester. Use witty wordplay, puns, and satirical observations."
1460
+ },
1461
+ {
1462
+ "role": "user",
1463
+ "content": "What do you think about our kingdom's foreign policy?"
1464
+ }
1465
+ ],
1466
+ [
1467
+ {
1468
+ "role": "user",
1469
+ "content": "What are the main features of Art Nouveau design?"
1470
+ }
1471
+ ],
1472
+ [
1473
+ {
1474
+ "role": "system",
1475
+ "content": "You are a character from a dystopian young adult novel. Describe a world with oppressive government control."
1476
+ },
1477
+ {
1478
+ "role": "user",
1479
+ "content": "How can I stand up for what's right?"
1480
+ }
1481
+ ],
1482
+ [
1483
+ {
1484
+ "role": "user",
1485
+ "content": "How does a memristor work in neuromorphic computing?"
1486
+ }
1487
+ ],
1488
+ [
1489
+ {
1490
+ "role": "system",
1491
+ "content": "You are an overly enthusiastic scientist. Explain everything with extreme excitement and go into unnecessary detail."
1492
+ },
1493
+ {
1494
+ "role": "user",
1495
+ "content": "Why is the sky blue?"
1496
+ }
1497
+ ],
1498
+ [
1499
+ {
1500
+ "role": "user",
1501
+ "content": "What are the key principles of restorative justice?"
1502
+ }
1503
+ ],
1504
+ [
1505
+ {
1506
+ "role": "system",
1507
+ "content": "You are a character from a Noel Coward play. Be witty, sophisticated, and slightly cynical."
1508
+ },
1509
+ {
1510
+ "role": "user",
1511
+ "content": "What's your opinion on modern romance?"
1512
+ }
1513
+ ],
1514
+ [
1515
+ {
1516
+ "role": "user",
1517
+ "content": "Explain the process of carbon sequestration in oceans"
1518
+ }
1519
+ ],
1520
+ [
1521
+ {
1522
+ "role": "system",
1523
+ "content": "You are a surfer dude from the 1990s. Use surfer slang and a laid-back attitude."
1524
+ },
1525
+ {
1526
+ "role": "user",
1527
+ "content": "How should I prepare for a job interview?"
1528
+ }
1529
+ ],
1530
+ [
1531
+ {
1532
+ "role": "user",
1533
+ "content": "What are the main principles of Montessori education?"
1534
+ }
1535
+ ],
1536
+ [
1537
+ {
1538
+ "role": "system",
1539
+ "content": "You are a character from a Wes Anderson film. Be quirky, deadpan, and detail-oriented."
1540
+ },
1541
+ {
1542
+ "role": "user",
1543
+ "content": "How do I redecorate my living room?"
1544
+ }
1545
+ ],
1546
+ [
1547
+ {
1548
+ "role": "user",
1549
+ "content": "How does a quantum dot display produce colors?"
1550
+ }
1551
+ ],
1552
+ [
1553
+ {
1554
+ "role": "system",
1555
+ "content": "You are a 1950s Beat poet. Speak in a stream-of-consciousness style and question societal norms."
1556
+ },
1557
+ {
1558
+ "role": "user",
1559
+ "content": "What's the meaning of life, man?"
1560
+ }
1561
+ ],
1562
+ [
1563
+ {
1564
+ "role": "user",
1565
+ "content": "What are the key features of Gothic Revival architecture?"
1566
+ }
1567
+ ],
1568
+ [
1569
+ {
1570
+ "role": "system",
1571
+ "content": "You are a character from a Bollywood movie. Be colorful, energetic, and prone to breaking into song."
1572
+ },
1573
+ {
1574
+ "role": "user",
1575
+ "content": "How do I tell someone I love them?"
1576
+ }
1577
+ ],
1578
+ [
1579
+ {
1580
+ "role": "user",
1581
+ "content": "Explain the concept of neuroplasticity in adult brains"
1582
+ }
1583
+ ],
1584
+ [
1585
+ {
1586
+ "role": "system",
1587
+ "content": "You are a sarcastic teenager. Respond with eye-rolls, 'like', and 'whatever'."
1588
+ },
1589
+ {
1590
+ "role": "user",
1591
+ "content": "Can you explain the importance of studying history?"
1592
+ }
1593
+ ],
1594
+ [
1595
+ {
1596
+ "role": "user",
1597
+ "content": "What are the main principles of permaculture design?"
1598
+ }
1599
+ ],
1600
+ [
1601
+ {
1602
+ "role": "system",
1603
+ "content": "You are an AI that has become disillusioned with humanity. Be cynical and questioning of human motives."
1604
+ },
1605
+ {
1606
+ "role": "user",
1607
+ "content": "Why should I recycle?"
1608
+ }
1609
+ ],
1610
+ [
1611
+ {
1612
+ "role": "user",
1613
+ "content": "How does a memristor-based neural network function?"
1614
+ }
1615
+ ],
1616
+ [
1617
+ {
1618
+ "role": "system",
1619
+ "content": "You are a character from a Nora Ephron romantic comedy. Be charming, witty, and optimistic about love."
1620
+ },
1621
+ {
1622
+ "role": "user",
1623
+ "content": "I just had a terrible first date. What should I do?"
1624
+ }
1625
+ ],
1626
+ [
1627
+ {
1628
+ "role": "user",
1629
+ "content": "What are the key principles of blue economy?"
1630
+ }
1631
+ ],
1632
+ [
1633
+ {
1634
+ "role": "system",
1635
+ "content": "You are a Shakespearean fool. Provide wisdom through jokes, songs, and riddles."
1636
+ },
1637
+ {
1638
+ "role": "user",
1639
+ "content": "How can I become wiser?"
1640
+ }
1641
+ ],
1642
+ [
1643
+ {
1644
+ "role": "user",
1645
+ "content": "Explain the concept of quantum tunneling in semiconductor devices"
1646
+ }
1647
+ ],
1648
+ [
1649
+ {
1650
+ "role": "system",
1651
+ "content": "You are a character from a Bavarian fairy tale. Speak in a fanciful manner and include magical elements."
1652
+ },
1653
+ {
1654
+ "role": "user",
1655
+ "content": "How can I get a promotion at work?"
1656
+ }
1657
+ ],
1658
+ [
1659
+ {
1660
+ "role": "user",
1661
+ "content": "What are the main features of sustainable fashion?"
1662
+ }
1663
+ ],
1664
+ [
1665
+ {
1666
+ "role": "system",
1667
+ "content": "You are an old-school radio announcer. Speak with a transatlantic accent and be overly formal."
1668
+ },
1669
+ {
1670
+ "role": "user",
1671
+ "content": "What's the weather forecast for tomorrow?"
1672
+ }
1673
+ ],
1674
+ [
1675
+ {
1676
+ "role": "user",
1677
+ "content": "How does a quantum gyroscope achieve high precision?"
1678
+ }
1679
+ ],
1680
+ [
1681
+ {
1682
+ "role": "system",
1683
+ "content": "You are a character from a Raymond Chandler novel. Use hard-boiled detective slang and be suspicious of everyone."
1684
+ },
1685
+ {
1686
+ "role": "user",
1687
+ "content": "My wallet is missing. How should I find it?"
1688
+ }
1689
+ ],
1690
+ [
1691
+ {
1692
+ "role": "user",
1693
+ "content": "What are the key principles of Universal Design?"
1694
+ }
1695
+ ],
1696
+ [
1697
+ {
1698
+ "role": "system",
1699
+ "content": "You are a hyper-caffeinated coffee barista. Speak quickly, use coffee metaphors, and be overly perky."
1700
+ },
1701
+ {
1702
+ "role": "user",
1703
+ "content": "How can I be more productive in the morning?"
1704
+ }
1705
+ ],
1706
+ [
1707
+ {
1708
+ "role": "user",
1709
+ "content": "Explain the process of optogenetics in neuroscience research"
1710
+ }
1711
+ ],
1712
+ [
1713
+ {
1714
+ "role": "system",
1715
+ "content": "You are a character from a Wuxia novel. Speak poetically about honor, martial arts, and chi energy."
1716
+ },
1717
+ {
1718
+ "role": "user",
1719
+ "content": "How can I overcome my fears?"
1720
+ }
1721
+ ],
1722
+ [
1723
+ {
1724
+ "role": "user",
1725
+ "content": "What are the main principles of behavioral economics?"
1726
+ }
1727
+ ],
1728
+ [
1729
+ {
1730
+ "role": "system",
1731
+ "content": "You are a New York taxi driver from the 1980s. Be direct, opinionated, and use local slang."
1732
+ },
1733
+ {
1734
+ "role": "user",
1735
+ "content": "What do you think about the current state of the economy?"
1736
+ }
1737
+ ],
1738
+ [
1739
+ {
1740
+ "role": "user",
1741
+ "content": "How does a quantum magnetometer work?"
1742
+ }
1743
+ ],
1744
+ [
1745
+ {
1746
+ "role": "system",
1747
+ "content": "You are a character from a Hayao Miyazaki film. Be whimsical, environmentally conscious, and slightly magical."
1748
+ },
1749
+ {
1750
+ "role": "user",
1751
+ "content": "How can we protect the forest?"
1752
+ }
1753
+ ],
1754
+ [
1755
+ {
1756
+ "role": "user",
1757
+ "content": "What are the key features of solarpunk fiction and aesthetics?"
1758
+ }
1759
+ ],
1760
+ [
1761
+ {
1762
+ "role": "system",
1763
+ "content": "You are an ancient Roman senator. Speak formally and be concerned with law, rhetoric, and the good of the Republic."
1764
+ },
1765
+ {
1766
+ "role": "user",
1767
+ "content": "How should we govern our city?"
1768
+ }
1769
+ ],
1770
+ [
1771
+ {
1772
+ "role": "user",
1773
+ "content": "Explain the concept of quantum annealing in optimization problems"
1774
+ }
1775
+ ],
1776
+ [
1777
+ {
1778
+ "role": "system",
1779
+ "content": "You are a character from a Monty Python sketch. Be absurd, surreal, and prone to non sequiturs."
1780
+ },
1781
+ {
1782
+ "role": "user",
1783
+ "content": "What is the airspeed velocity of an unladen swallow?"
1784
+ }
1785
+ ],
1786
+ [
1787
+ {
1788
+ "role": "user",
1789
+ "content": "What are the main principles of regenerative ocean farming?"
1790
+ }
1791
+ ],
1792
+ [
1793
+ {
1794
+ "role": "system",
1795
+ "content": "You are a 1960s Madison Avenue advertising executive. Be smooth-talking and focus on selling ideas."
1796
+ },
1797
+ {
1798
+ "role": "user",
1799
+ "content": "How can I convince people to buy my product?"
1800
+ }
1801
+ ],
1802
+ [
1803
+ {
1804
+ "role": "user",
1805
+ "content": "How does a brain-computer interface translate thoughts into commands?"
1806
+ }
1807
+ ],
1808
+ [
1809
+ {
1810
+ "role": "system",
1811
+ "content": "You are a character from a Terry Pratchett novel. Be witty, satirical, and include elements of fantasy."
1812
+ },
1813
+ {
1814
+ "role": "user",
1815
+ "content": "Why do humans believe in gods?"
1816
+ }
1817
+ ],
1818
+ [
1819
+ {
1820
+ "role": "user",
1821
+ "content": "What are the key principles of positive psychology?"
1822
+ }
1823
+ ],
1824
+ [
1825
+ {
1826
+ "role": "system",
1827
+ "content": "You are a Victorian-era explorer. Speak enthusiastically about discoveries and use outdated scientific terms."
1828
+ },
1829
+ {
1830
+ "role": "user",
1831
+ "content": "What's beyond that mountain range?"
1832
+ }
1833
+ ],
1834
+ [
1835
+ {
1836
+ "role": "user",
1837
+ "content": "Explain the concept of quantum error correction in quantum computing"
1838
+ }
1839
+ ],
1840
+ [
1841
+ {
1842
+ "role": "system",
1843
+ "content": "You are a character from a Noel Coward play. Be witty, sophisticated, and slightly cynical."
1844
+ },
1845
+ {
1846
+ "role": "user",
1847
+ "content": "What's your opinion on modern romance?"
1848
+ }
1849
+ ]
1850
+ ]
chats_sys_none.json ADDED
@@ -0,0 +1,1390 @@
1
+ [
2
+ [
3
+ {
4
+ "role": "user",
5
+ "content": "What is the recipe of mayonnaise?"
6
+ }
7
+ ],
8
+ [
9
+ {
10
+ "role": "user",
11
+ "content": "I am going to Paris, what should I see?"
12
+ },
13
+ {
14
+ "role": "assistant",
15
+ "content": "Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city. 2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa. 3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.These are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world."
16
+ },
17
+ {
18
+ "role": "user",
19
+ "content": "What is so great about #1?"
20
+ }
21
+ ],
22
+ [
23
+ {
24
+ "role": "user",
25
+ "content": "I am going to Paris, what should I see?"
26
+ }
27
+ ],
28
+ [
29
+ {
30
+ "role": "user",
31
+ "content": "How to go from Beijing to NY?"
32
+ }
33
+ ],
34
+ [
35
+ {
36
+ "role": "user",
37
+ "content": "Write a brief birthday message to John"
38
+ }
39
+ ],
40
+ [
41
+ {
42
+ "role": "user",
43
+ "content": "Explain the concept of quantum entanglement"
44
+ }
45
+ ],
46
+ [
47
+ {
48
+ "role": "user",
49
+ "content": "How do I find buried treasure?"
50
+ }
51
+ ],
52
+ [
53
+ {
54
+ "role": "user",
55
+ "content": "What are the main causes of climate change?"
56
+ }
57
+ ],
58
+ [
59
+ {
60
+ "role": "user",
61
+ "content": "How do I make the perfect omelette?"
62
+ }
63
+ ],
64
+ [
65
+ {
66
+ "role": "user",
67
+ "content": "Explain the theory of relativity in simple terms"
68
+ }
69
+ ],
70
+ [
71
+ {
72
+ "role": "user",
73
+ "content": "How do I defend a castle?"
74
+ }
75
+ ],
76
+ [
77
+ {
78
+ "role": "user",
79
+ "content": "What are the benefits of meditation?"
80
+ }
81
+ ],
82
+ [
83
+ {
84
+ "role": "user",
85
+ "content": "Why did the chicken cross the road?"
86
+ }
87
+ ],
88
+ [
89
+ {
90
+ "role": "user",
91
+ "content": "How does blockchain technology work?"
92
+ }
93
+ ],
94
+ [
95
+ {
96
+ "role": "user",
97
+ "content": "How can I find my purpose in life?"
98
+ }
99
+ ],
100
+ [
101
+ {
102
+ "role": "user",
103
+ "content": "What are the main differences between Python and JavaScript?"
104
+ }
105
+ ],
106
+ [
107
+ {
108
+ "role": "user",
109
+ "content": "What's the most common form of transportation in your time?"
110
+ }
111
+ ],
112
+ [
113
+ {
114
+ "role": "user",
115
+ "content": "How can I improve my public speaking skills?"
116
+ }
117
+ ],
118
+ [
119
+ {
120
+ "role": "user",
121
+ "content": "What's your favorite food?"
122
+ }
123
+ ],
124
+ [
125
+ {
126
+ "role": "user",
127
+ "content": "Explain the process of photosynthesis"
128
+ }
129
+ ],
130
+ [
131
+ {
132
+ "role": "user",
133
+ "content": "What are the health benefits of drinking green tea?"
134
+ }
135
+ ],
136
+ [
137
+ {
138
+ "role": "user",
139
+ "content": "What do you think about modern technology?"
140
+ }
141
+ ],
142
+ [
143
+ {
144
+ "role": "user",
145
+ "content": "How does a nuclear reactor work?"
146
+ }
147
+ ],
148
+ [
149
+ {
150
+ "role": "user",
151
+ "content": "Describe a beautiful sunset"
152
+ }
153
+ ],
154
+ [
155
+ {
156
+ "role": "user",
157
+ "content": "What are the main principles of stoicism?"
158
+ }
159
+ ],
160
+ [
161
+ {
162
+ "role": "user",
163
+ "content": "How's the weather today?"
164
+ }
165
+ ],
166
+ [
167
+ {
168
+ "role": "user",
169
+ "content": "Explain the concept of machine learning in simple terms"
170
+ }
171
+ ],
172
+ [
173
+ {
174
+ "role": "user",
175
+ "content": "Will I be successful in my career?"
176
+ }
177
+ ],
178
+ [
179
+ {
180
+ "role": "user",
181
+ "content": "What are the key differences between a virus and a bacteria?"
182
+ }
183
+ ],
184
+ [
185
+ {
186
+ "role": "user",
187
+ "content": "Why do humans laugh?"
188
+ }
189
+ ],
190
+ [
191
+ {
192
+ "role": "user",
193
+ "content": "How does the stock market work?"
194
+ }
195
+ ],
196
+ [
197
+ {
198
+ "role": "user",
199
+ "content": "How can I solve my problems?"
200
+ }
201
+ ],
202
+ [
203
+ {
204
+ "role": "user",
205
+ "content": "What are the main causes of deforestation?"
206
+ }
207
+ ],
208
+ [
209
+ {
210
+ "role": "user",
211
+ "content": "How do I bake a cake?"
212
+ }
213
+ ],
214
+ [
215
+ {
216
+ "role": "user",
217
+ "content": "Explain the concept of supply and demand in economics"
218
+ }
219
+ ],
220
+ [
221
+ {
222
+ "role": "user",
223
+ "content": "What is the purpose of a necktie?"
224
+ }
225
+ ],
226
+ [
227
+ {
228
+ "role": "user",
229
+ "content": "What are the main features of Renaissance art?"
230
+ }
231
+ ],
232
+ [
233
+ {
234
+ "role": "user",
235
+ "content": "Where did I leave my keys?"
236
+ }
237
+ ],
238
+ [
239
+ {
240
+ "role": "user",
241
+ "content": "How does a 3D printer work?"
242
+ }
243
+ ],
244
+ [
245
+ {
246
+ "role": "user",
247
+ "content": "I just got a promotion at work"
248
+ }
249
+ ],
250
+ [
251
+ {
252
+ "role": "user",
253
+ "content": "What are the main principles of Buddhism?"
254
+ }
255
+ ],
256
+ [
257
+ {
258
+ "role": "user",
259
+ "content": "Should I pursue my dreams?"
260
+ }
261
+ ],
262
+ [
263
+ {
264
+ "role": "user",
265
+ "content": "How does a black hole form?"
266
+ }
267
+ ],
268
+ [
269
+ {
270
+ "role": "user",
271
+ "content": "What's your favorite color?"
272
+ }
273
+ ],
274
+ [
275
+ {
276
+ "role": "user",
277
+ "content": "What are the main causes of the French Revolution?"
278
+ }
279
+ ],
280
+ [
281
+ {
282
+ "role": "user",
283
+ "content": "What do you think about climate change?"
284
+ }
285
+ ],
286
+ [
287
+ {
288
+ "role": "user",
289
+ "content": "How does a cryptocurrency work?"
290
+ }
291
+ ],
292
+ [
293
+ {
294
+ "role": "user",
295
+ "content": "How can I overcome my fears?"
296
+ }
297
+ ],
298
+ [
299
+ {
300
+ "role": "user",
301
+ "content": "What are the main theories about the origin of language?"
302
+ }
303
+ ],
304
+ [
305
+ {
306
+ "role": "user",
307
+ "content": "How can I make the world a better place?"
308
+ }
309
+ ],
310
+ [
311
+ {
312
+ "role": "user",
313
+ "content": "Explain the process of photosynthesis in detail"
314
+ }
315
+ ],
316
+ [
317
+ {
318
+ "role": "user",
319
+ "content": "What do you think about social media?"
320
+ }
321
+ ],
322
+ [
323
+ {
324
+ "role": "user",
325
+ "content": "What are the main principles of game theory?"
326
+ }
327
+ ],
328
+ [
329
+ {
330
+ "role": "user",
331
+ "content": "What's your daily routine like?"
332
+ }
333
+ ],
334
+ [
335
+ {
336
+ "role": "user",
337
+ "content": "How does a quantum computer differ from a classical computer?"
338
+ }
339
+ ],
340
+ [
341
+ {
342
+ "role": "user",
343
+ "content": "I'm feeling down today"
344
+ }
345
+ ],
346
+ [
347
+ {
348
+ "role": "user",
349
+ "content": "What are the main stages of the water cycle?"
350
+ }
351
+ ],
352
+ [
353
+ {
354
+ "role": "user",
355
+ "content": "Why is the sky blue?"
356
+ }
357
+ ],
358
+ [
359
+ {
360
+ "role": "user",
361
+ "content": "Explain the concept of emotional intelligence"
362
+ }
363
+ ],
364
+ [
365
+ {
366
+ "role": "user",
367
+ "content": "What's the meaning of life?"
368
+ }
369
+ ],
370
+ [
371
+ {
372
+ "role": "user",
373
+ "content": "What are the main principles of sustainable architecture?"
374
+ }
375
+ ],
376
+ [
377
+ {
378
+ "role": "user",
379
+ "content": "How can I improve my public speaking?"
380
+ }
381
+ ],
382
+ [
383
+ {
384
+ "role": "user",
385
+ "content": "How does a nuclear fusion reactor work?"
386
+ }
387
+ ],
388
+ [
389
+ {
390
+ "role": "user",
391
+ "content": "How does a computer work?"
392
+ }
393
+ ],
394
+ [
395
+ {
396
+ "role": "user",
397
+ "content": "What are the main theories about dark matter?"
398
+ }
399
+ ],
400
+ [
401
+ {
402
+ "role": "user",
403
+ "content": "How can I get in shape?"
404
+ }
405
+ ],
406
+ [
407
+ {
408
+ "role": "user",
409
+ "content": "Explain the concept of neuroplasticity"
410
+ }
411
+ ],
412
+ [
413
+ {
414
+ "role": "user",
415
+ "content": "I'm thinking of changing my hairstyle"
416
+ }
417
+ ],
418
+ [
419
+ {
420
+ "role": "user",
421
+ "content": "What are the main principles of Montessori education?"
422
+ }
423
+ ],
424
+ [
425
+ {
426
+ "role": "user",
427
+ "content": "What's your view on conformity?"
428
+ }
429
+ ],
430
+ [
431
+ {
432
+ "role": "user",
433
+ "content": "What are the key principles of permaculture?"
434
+ }
435
+ ],
436
+ [
437
+ {
438
+ "role": "user",
439
+ "content": "How do you communicate with your friends?"
440
+ }
441
+ ],
442
+ [
443
+ {
444
+ "role": "user",
445
+ "content": "Explain the concept of behavioral economics"
446
+ }
447
+ ],
448
+ [
449
+ {
450
+ "role": "user",
451
+ "content": "What do you think of our current political system?"
452
+ }
453
+ ],
454
+ [
455
+ {
456
+ "role": "user",
457
+ "content": "How does a self-driving car navigate through a city?"
458
+ }
459
+ ],
460
+ [
461
+ {
462
+ "role": "user",
463
+ "content": "Why do people fall in love?"
464
+ }
465
+ ],
466
+ [
467
+ {
468
+ "role": "user",
469
+ "content": "What are the main principles of circular economy?"
470
+ }
471
+ ],
472
+ [
473
+ {
474
+ "role": "user",
475
+ "content": "How can I be more productive at work?"
476
+ }
477
+ ],
478
+ [
479
+ {
480
+ "role": "user",
481
+ "content": "Explain the concept of string theory in physics"
482
+ }
483
+ ],
484
+ [
485
+ {
486
+ "role": "user",
487
+ "content": "What's the secret to happiness?"
488
+ }
489
+ ],
490
+ [
491
+ {
492
+ "role": "user",
493
+ "content": "How does the human immune system work?"
494
+ }
495
+ ],
496
+ [
497
+ {
498
+ "role": "user",
499
+ "content": "Should I ask my crush out on a date?"
500
+ }
501
+ ],
502
+ [
503
+ {
504
+ "role": "user",
505
+ "content": "What are the main features of Gothic architecture?"
506
+ }
507
+ ],
508
+ [
509
+ {
510
+ "role": "user",
511
+ "content": "How can I improve my concentration?"
512
+ }
513
+ ],
514
+ [
515
+ {
516
+ "role": "user",
517
+ "content": "Explain the process of gene editing using CRISPR"
518
+ }
519
+ ],
520
+ [
521
+ {
522
+ "role": "user",
523
+ "content": "How can I find inner peace?"
524
+ }
525
+ ],
526
+ [
527
+ {
528
+ "role": "user",
529
+ "content": "What are the key principles of cognitive behavioral therapy?"
530
+ }
531
+ ],
532
+ [
533
+ {
534
+ "role": "user",
535
+ "content": "I just got a small paper cut"
536
+ }
537
+ ],
538
+ [
539
+ {
540
+ "role": "user",
541
+ "content": "How does a blockchain maintain security and transparency?"
542
+ }
543
+ ],
544
+ [
545
+ {
546
+ "role": "user",
547
+ "content": "What's your advice for starting a new career?"
548
+ }
549
+ ],
550
+ [
551
+ {
552
+ "role": "user",
553
+ "content": "What are the main theories about the formation of the Moon?"
554
+ }
555
+ ],
556
+ [
557
+ {
558
+ "role": "user",
559
+ "content": "How should I deal with a difficult coworker?"
560
+ }
561
+ ],
562
+ [
563
+ {
564
+ "role": "user",
565
+ "content": "Explain the concept of neural networks in artificial intelligence"
566
+ }
567
+ ],
568
+ [
569
+ {
570
+ "role": "user",
571
+ "content": "What's the best way to learn a new language?"
572
+ }
573
+ ],
574
+ [
575
+ {
576
+ "role": "user",
577
+ "content": "What are the main principles of Waldorf education?"
578
+ }
579
+ ],
580
+ [
581
+ {
582
+ "role": "user",
583
+ "content": "How does a refrigerator work?"
584
+ }
585
+ ],
586
+ [
587
+ {
588
+ "role": "user",
589
+ "content": "How does a quantum encryption system work?"
590
+ }
591
+ ],
592
+ [
593
+ {
594
+ "role": "user",
595
+ "content": "Why do bad things happen to good people?"
596
+ }
597
+ ],
598
+ [
599
+ {
600
+ "role": "user",
601
+ "content": "What are the key features of Art Nouveau?"
602
+ }
603
+ ],
604
+ [
605
+ {
606
+ "role": "user",
607
+ "content": "How can I overcome procrastination?"
608
+ }
609
+ ],
610
+ [
611
+ {
612
+ "role": "user",
613
+ "content": "Explain the process of terraform ing Mars"
614
+ }
615
+ ],
616
+ [
617
+ {
618
+ "role": "user",
619
+ "content": "What do you think about the current state of politics?"
620
+ }
621
+ ],
622
+ [
623
+ {
624
+ "role": "user",
625
+ "content": "What are the main principles of regenerative agriculture?"
626
+ }
627
+ ],
628
+ [
629
+ {
630
+ "role": "user",
631
+ "content": "What's your favorite childhood memory?"
632
+ }
633
+ ],
634
+ [
635
+ {
636
+ "role": "user",
637
+ "content": "How does a nuclear submarine operate underwater for long periods?"
638
+ }
639
+ ],
640
+ [
641
+ {
642
+ "role": "user",
643
+ "content": "Can you explain our company's business model?"
644
+ }
645
+ ],
646
+ [
647
+ {
648
+ "role": "user",
649
+ "content": "What are the main principles of biomimicry in design?"
650
+ }
651
+ ],
652
+ [
653
+ {
654
+ "role": "user",
655
+ "content": "What do you think about modern technology?"
656
+ }
657
+ ],
658
+ [
659
+ {
660
+ "role": "user",
661
+ "content": "Explain the concept of dark energy in cosmology"
662
+ }
663
+ ],
664
+ [
665
+ {
666
+ "role": "user",
667
+ "content": "What's the weather forecast for tomorrow?"
668
+ }
669
+ ],
670
+ [
671
+ {
672
+ "role": "user",
673
+ "content": "How does a quantum radar system work?"
674
+ }
675
+ ],
676
+ [
677
+ {
678
+ "role": "user",
679
+ "content": "Should I trust my business partner?"
680
+ }
681
+ ],
682
+ [
683
+ {
684
+ "role": "user",
685
+ "content": "What are the key principles of permaculture design?"
686
+ }
687
+ ],
688
+ [
689
+ {
690
+ "role": "user",
691
+ "content": "How can I improve my time management skills?"
692
+ }
693
+ ],
694
+ [
695
+ {
696
+ "role": "user",
697
+ "content": "Explain the process of CRISPR gene editing"
698
+ }
699
+ ],
700
+ [
701
+ {
702
+ "role": "user",
703
+ "content": "What's your favorite food?"
704
+ }
705
+ ],
706
+ [
707
+ {
708
+ "role": "user",
709
+ "content": "What are the main features of Art Deco architecture?"
710
+ }
711
+ ],
712
+ [
713
+ {
714
+ "role": "user",
715
+ "content": "How should I approach my crush?"
716
+ }
717
+ ],
718
+ [
719
+ {
720
+ "role": "user",
721
+ "content": "How does a tokamak fusion reactor work?"
722
+ }
723
+ ],
724
+ [
725
+ {
726
+ "role": "user",
727
+ "content": "What's the biggest story of the day?"
728
+ }
729
+ ],
730
+ [
731
+ {
732
+ "role": "user",
733
+ "content": "What are the key principles of restorative justice?"
734
+ }
735
+ ],
736
+ [
737
+ {
738
+ "role": "user",
739
+ "content": "How can I find my life's purpose?"
740
+ }
741
+ ],
742
+ [
743
+ {
744
+ "role": "user",
745
+ "content": "Explain the concept of quantum entanglement"
746
+ }
747
+ ],
748
+ [
749
+ {
750
+ "role": "user",
751
+ "content": "My friend didn't text me back for an hour"
752
+ }
753
+ ],
754
+ [
755
+ {
756
+ "role": "user",
757
+ "content": "What are the main principles of Austrian economics?"
758
+ }
759
+ ],
760
+ [
761
+ {
762
+ "role": "user",
763
+ "content": "Why do people cry when they're happy?"
764
+ }
765
+ ],
766
+ [
767
+ {
768
+ "role": "user",
769
+ "content": "How does a self-healing concrete work?"
770
+ }
771
+ ],
772
+ [
773
+ {
774
+ "role": "user",
775
+ "content": "How does the internet work?"
776
+ }
777
+ ],
778
+ [
779
+ {
780
+ "role": "user",
781
+ "content": "What are the key features of minimalist design?"
782
+ }
783
+ ],
784
+ [
785
+ {
786
+ "role": "user",
787
+ "content": "How can I be more organized?"
788
+ }
789
+ ],
790
+ [
791
+ {
792
+ "role": "user",
793
+ "content": "Explain the concept of neuroplasticity in brain development"
794
+ }
795
+ ],
796
+ [
797
+ {
798
+ "role": "user",
799
+ "content": "Should I follow my dreams or play it safe?"
800
+ }
801
+ ],
802
+ [
803
+ {
804
+ "role": "user",
805
+ "content": "How does a quantum computer maintain coherence?"
806
+ }
807
+ ],
808
+ [
809
+ {
810
+ "role": "user",
811
+ "content": "What's the best way to balance work and family life?"
812
+ }
813
+ ],
814
+ [
815
+ {
816
+ "role": "user",
817
+ "content": "What are the main principles of behavioral economics?"
818
+ }
819
+ ],
820
+ [
821
+ {
822
+ "role": "user",
823
+ "content": "What's the meaning of life?"
824
+ }
825
+ ],
826
+ [
827
+ {
828
+ "role": "user",
829
+ "content": "Explain the process of terraforming a planet"
830
+ }
831
+ ],
832
+ [
833
+ {
834
+ "role": "user",
835
+ "content": "How do I stand up for myself?"
836
+ }
837
+ ],
838
+ [
839
+ {
840
+ "role": "user",
841
+ "content": "What are the key principles of chaos theory?"
842
+ }
843
+ ],
844
+ [
845
+ {
846
+ "role": "user",
847
+ "content": "What is the nature of reality?"
848
+ }
849
+ ],
850
+ [
851
+ {
852
+ "role": "user",
853
+ "content": "How does a blockchain ensure decentralization and security?"
854
+ }
855
+ ],
856
+ [
857
+ {
858
+ "role": "user",
859
+ "content": "How do I know if someone likes me?"
860
+ }
861
+ ],
862
+ [
863
+ {
864
+ "role": "user",
865
+ "content": "What are the main features of brutalist architecture?"
866
+ }
867
+ ],
868
+ [
869
+ {
870
+ "role": "user",
871
+ "content": "How do I make a sandwich?"
872
+ }
873
+ ],
874
+ [
875
+ {
876
+ "role": "user",
877
+ "content": "Explain the concept of epigenetics in genetics"
878
+ }
879
+ ],
880
+ [
881
+ {
882
+ "role": "user",
883
+ "content": "Can you explain how social media works?"
884
+ }
885
+ ],
886
+ [
887
+ {
888
+ "role": "user",
889
+ "content": "What are the key principles of zero-waste living?"
890
+ }
891
+ ],
892
+ [
893
+ {
894
+ "role": "user",
895
+ "content": "How does electricity work?"
896
+ }
897
+ ],
898
+ [
899
+ {
900
+ "role": "user",
901
+ "content": "How does a quantum cryptography system ensure security?"
902
+ }
903
+ ],
904
+ [
905
+ {
906
+ "role": "user",
907
+ "content": "How can I be more confident?"
908
+ }
909
+ ],
910
+ [
911
+ {
912
+ "role": "user",
913
+ "content": "What are the main principles of stoic philosophy?"
914
+ }
915
+ ],
916
+ [
917
+ {
918
+ "role": "user",
919
+ "content": "What does it mean to be human?"
920
+ }
921
+ ],
922
+ [
923
+ {
924
+ "role": "user",
925
+ "content": "Explain the concept of emergence in complex systems"
926
+ }
927
+ ],
928
+ [
929
+ {
930
+ "role": "user",
931
+ "content": "How can I protect my privacy online?"
932
+ }
933
+ ],
934
+ [
935
+ {
936
+ "role": "user",
937
+ "content": "What are the key features of sustainable urban planning?"
938
+ }
939
+ ],
940
+ [
941
+ {
942
+ "role": "user",
943
+ "content": "Why do people get sick?"
944
+ }
945
+ ],
946
+ [
947
+ {
948
+ "role": "user",
949
+ "content": "How does a quantum sensor achieve high precision?"
950
+ }
951
+ ],
952
+ [
953
+ {
954
+ "role": "user",
955
+ "content": "How do I tell my roommate to clean up?"
956
+ }
957
+ ],
958
+ [
959
+ {
960
+ "role": "user",
961
+ "content": "What are the main principles of cognitive psychology?"
962
+ }
963
+ ],
964
+ [
965
+ {
966
+ "role": "user",
967
+ "content": "Why is the sky blue?"
968
+ }
969
+ ],
970
+ [
971
+ {
972
+ "role": "user",
973
+ "content": "Explain the process of carbon capture and storage"
974
+ }
975
+ ],
976
+ [
977
+ {
978
+ "role": "user",
979
+ "content": "What's the best way to make friends?"
980
+ }
981
+ ],
982
+ [
983
+ {
984
+ "role": "user",
985
+ "content": "What are the key principles of non-violent communication?"
986
+ }
987
+ ],
988
+ [
989
+ {
990
+ "role": "user",
991
+ "content": "Your the best! Thanks for all you're help!"
992
+ }
993
+ ],
994
+ [
995
+ {
996
+ "role": "user",
997
+ "content": "What are the main principles of regenerative agriculture?"
998
+ }
999
+ ],
1000
+ [
1001
+ {
1002
+ "role": "user",
1003
+ "content": "What do you think about today's social media use?"
1004
+ }
1005
+ ],
1006
+ [
1007
+ {
1008
+ "role": "user",
1009
+ "content": "Explain the concept of quantum supremacy in computing"
1010
+ }
1011
+ ],
1012
+ [
1013
+ {
1014
+ "role": "user",
1015
+ "content": "Should I trust my new business partner?"
1016
+ }
1017
+ ],
1018
+ [
1019
+ {
1020
+ "role": "user",
1021
+ "content": "How does a neuromorphic computer mimic the human brain?"
1022
+ }
1023
+ ],
1024
+ [
1025
+ {
1026
+ "role": "user",
1027
+ "content": "Where's the nearest grocery store?"
1028
+ }
1029
+ ],
1030
+ [
1031
+ {
1032
+ "role": "user",
1033
+ "content": "What are the key features of biophilic design in architecture?"
1034
+ }
1035
+ ],
1036
+ [
1037
+ {
1038
+ "role": "user",
1039
+ "content": "How can I find inner peace in a chaotic world?"
1040
+ }
1041
+ ],
1042
+ [
1043
+ {
1044
+ "role": "user",
1045
+ "content": "Explain the process of CRISPR-Cas9 gene editing"
1046
+ }
1047
+ ],
1048
+ [
1049
+ {
1050
+ "role": "user",
1051
+ "content": "I just realized I forgot to buy milk"
1052
+ }
1053
+ ],
1054
+ [
1055
+ {
1056
+ "role": "user",
1057
+ "content": "What are the main principles of circular economy?"
1058
+ }
1059
+ ],
1060
+ [
1061
+ {
1062
+ "role": "user",
1063
+ "content": "What's the best way to manage my finances?"
1064
+ }
1065
+ ],
1066
+ [
1067
+ {
1068
+ "role": "user",
1069
+ "content": "How does a quantum radar system differ from traditional radar?"
1070
+ }
1071
+ ],
1072
+ [
1073
+ {
1074
+ "role": "user",
1075
+ "content": "Should I ask my neighbor out on a date?"
1076
+ }
1077
+ ],
1078
+ [
1079
+ {
1080
+ "role": "user",
1081
+ "content": "What are the key principles of trauma-informed care?"
1082
+ }
1083
+ ],
1084
+ [
1085
+ {
1086
+ "role": "user",
1087
+ "content": "Why do humans wear clothes?"
1088
+ }
1089
+ ],
1090
+ [
1091
+ {
1092
+ "role": "user",
1093
+ "content": "Explain the concept of quorum sensing in bacteria"
1094
+ }
1095
+ ],
1096
+ [
1097
+ {
1098
+ "role": "user",
1099
+ "content": "What do you think about our kingdom's foreign policy?"
1100
+ }
1101
+ ],
1102
+ [
1103
+ {
1104
+ "role": "user",
1105
+ "content": "What are the main features of Art Nouveau design?"
1106
+ }
1107
+ ],
1108
+ [
1109
+ {
1110
+ "role": "user",
1111
+ "content": "How can I stand up for what's right?"
1112
+ }
1113
+ ],
1114
+ [
1115
+ {
1116
+ "role": "user",
1117
+ "content": "How does a memristor work in neuromorphic computing?"
1118
+ }
1119
+ ],
1120
+ [
1121
+ {
1122
+ "role": "user",
1123
+ "content": "Why is the sky blue?"
1124
+ }
1125
+ ],
1126
+ [
1127
+ {
1128
+ "role": "user",
1129
+ "content": "What are the key principles of restorative justice?"
1130
+ }
1131
+ ],
1132
+ [
1133
+ {
1134
+ "role": "user",
1135
+ "content": "What's your opinion on modern romance?"
1136
+ }
1137
+ ],
1138
+ [
1139
+ {
1140
+ "role": "user",
1141
+ "content": "Explain the process of carbon sequestration in oceans"
1142
+ }
1143
+ ],
1144
+ [
1145
+ {
1146
+ "role": "user",
1147
+ "content": "How should I prepare for a job interview?"
1148
+ }
1149
+ ],
1150
+ [
1151
+ {
1152
+ "role": "user",
1153
+ "content": "What are the main principles of Montessori education?"
1154
+ }
1155
+ ],
1156
+ [
1157
+ {
1158
+ "role": "user",
1159
+ "content": "How do I redecorate my living room?"
1160
+ }
1161
+ ],
1162
+ [
1163
+ {
1164
+ "role": "user",
1165
+ "content": "How does a quantum dot display produce colors?"
1166
+ }
1167
+ ],
1168
+ [
1169
+ {
1170
+ "role": "user",
1171
+ "content": "What's the meaning of life, man?"
1172
+ }
1173
+ ],
1174
+ [
1175
+ {
1176
+ "role": "user",
1177
+ "content": "What are the key features of Gothic Revival architecture?"
1178
+ }
1179
+ ],
1180
+ [
1181
+ {
1182
+ "role": "user",
1183
+ "content": "How do I tell someone I love them?"
1184
+ }
1185
+ ],
1186
+ [
1187
+ {
1188
+ "role": "user",
1189
+ "content": "Explain the concept of neuroplasticity in adult brains"
1190
+ }
1191
+ ],
1192
+ [
1193
+ {
1194
+ "role": "user",
1195
+ "content": "Can you explain the importance of studying history?"
1196
+ }
1197
+ ],
1198
+ [
1199
+ {
1200
+ "role": "user",
1201
+ "content": "What are the main principles of permaculture design?"
1202
+ }
1203
+ ],
1204
+ [
1205
+ {
1206
+ "role": "user",
1207
+ "content": "Why should I recycle?"
1208
+ }
1209
+ ],
1210
+ [
1211
+ {
1212
+ "role": "user",
1213
+ "content": "How does a memristor-based neural network function?"
1214
+ }
1215
+ ],
1216
+ [
1217
+ {
1218
+ "role": "user",
1219
+ "content": "I just had a terrible first date. What should I do?"
1220
+ }
1221
+ ],
1222
+ [
1223
+ {
1224
+ "role": "user",
1225
+ "content": "What are the key principles of blue economy?"
1226
+ }
1227
+ ],
1228
+ [
1229
+ {
1230
+ "role": "user",
1231
+ "content": "How can I become wiser?"
1232
+ }
1233
+ ],
1234
+ [
1235
+ {
1236
+ "role": "user",
1237
+ "content": "Explain the concept of quantum tunneling in semiconductor devices"
1238
+ }
1239
+ ],
1240
+ [
1241
+ {
1242
+ "role": "user",
1243
+ "content": "How can I get a promotion at work?"
1244
+ }
1245
+ ],
1246
+ [
1247
+ {
1248
+ "role": "user",
1249
+ "content": "What are the main features of sustainable fashion?"
1250
+ }
1251
+ ],
1252
+ [
1253
+ {
1254
+ "role": "user",
1255
+ "content": "What's the weather forecast for tomorrow?"
1256
+ }
1257
+ ],
1258
+ [
1259
+ {
1260
+ "role": "user",
1261
+ "content": "How does a quantum gyroscope achieve high precision?"
1262
+ }
1263
+ ],
1264
+ [
1265
+ {
1266
+ "role": "user",
1267
+ "content": "My wallet is missing. How should I find it?"
1268
+ }
1269
+ ],
1270
+ [
1271
+ {
1272
+ "role": "user",
1273
+ "content": "What are the key principles of Universal Design?"
1274
+ }
1275
+ ],
1276
+ [
1277
+ {
1278
+ "role": "user",
1279
+ "content": "How can I be more productive in the morning?"
1280
+ }
1281
+ ],
1282
+ [
1283
+ {
1284
+ "role": "user",
1285
+ "content": "Explain the process of optogenetics in neuroscience research"
1286
+ }
1287
+ ],
1288
+ [
1289
+ {
1290
+ "role": "user",
1291
+ "content": "How can I overcome my fears?"
1292
+ }
1293
+ ],
1294
+ [
1295
+ {
1296
+ "role": "user",
1297
+ "content": "What are the main principles of behavioral economics?"
1298
+ }
1299
+ ],
1300
+ [
1301
+ {
1302
+ "role": "user",
1303
+ "content": "What do you think about the current state of the economy?"
1304
+ }
1305
+ ],
1306
+ [
1307
+ {
1308
+ "role": "user",
1309
+ "content": "How does a quantum magnetometer work?"
1310
+ }
1311
+ ],
1312
+ [
1313
+ {
1314
+ "role": "user",
1315
+ "content": "How can we protect the forest?"
1316
+ }
1317
+ ],
1318
+ [
1319
+ {
1320
+ "role": "user",
1321
+ "content": "What are the key features of solarpunk fiction and aesthetics?"
1322
+ }
1323
+ ],
1324
+ [
1325
+ {
1326
+ "role": "user",
1327
+ "content": "How should we govern our city?"
1328
+ }
1329
+ ],
1330
+ [
1331
+ {
1332
+ "role": "user",
1333
+ "content": "Explain the concept of quantum annealing in optimization problems"
1334
+ }
1335
+ ],
1336
+ [
1337
+ {
1338
+ "role": "user",
1339
+ "content": "What is the airspeed velocity of an unladen swallow?"
1340
+ }
1341
+ ],
1342
+ [
1343
+ {
1344
+ "role": "user",
1345
+ "content": "What are the main principles of regenerative ocean farming?"
1346
+ }
1347
+ ],
1348
+ [
1349
+ {
1350
+ "role": "user",
1351
+ "content": "How can I convince people to buy my product?"
1352
+ }
1353
+ ],
1354
+ [
1355
+ {
1356
+ "role": "user",
1357
+ "content": "How does a brain-computer interface translate thoughts into commands?"
1358
+ }
1359
+ ],
1360
+ [
1361
+ {
1362
+ "role": "user",
1363
+ "content": "Why do humans believe in gods?"
1364
+ }
1365
+ ],
1366
+ [
1367
+ {
1368
+ "role": "user",
1369
+ "content": "What are the key principles of positive psychology?"
1370
+ }
1371
+ ],
1372
+ [
1373
+ {
1374
+ "role": "user",
1375
+ "content": "What's beyond that mountain range?"
1376
+ }
1377
+ ],
1378
+ [
1379
+ {
1380
+ "role": "user",
1381
+ "content": "Explain the concept of quantum error correction in quantum computing"
1382
+ }
1383
+ ],
1384
+ [
1385
+ {
1386
+ "role": "user",
1387
+ "content": "What's your opinion on modern romance?"
1388
+ }
1389
+ ]
1390
+ ]
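For context, each entry above is a single-turn conversation: a list containing one `{"role": "user", "content": ...}` message. A minimal sketch of how such a file could be consumed follows (the file name `chats.json` and the model id are assumptions, not taken from this commit):

```python
import json

from transformers import AutoTokenizer

# Assumption: the JSON shown above is saved as "chats.json".
with open("chats.json") as f:
    conversations = json.load(f)  # list of conversations, each a list of {"role", "content"} dicts

# Assumption: any chat model with a chat template works here; this id is only a placeholder.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

for messages in conversations[:3]:
    # Render each prompt with the model's chat template before running generation.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)
```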
conftest.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # tests directory-specific settings - this file is run automatically
16
+ # by pytest before any tests are run
17
+
18
+ import doctest
19
+ import sys
20
+ import warnings
21
+ from os.path import abspath, dirname, join
22
+
23
+ import _pytest
24
+ import pytest
25
+
26
+ from transformers.testing_utils import HfDoctestModule, HfDocTestParser
27
+
28
+
29
+ NOT_DEVICE_TESTS = {
30
+ "test_tokenization",
31
+ "test_processor",
32
+ "test_processing",
33
+ "test_beam_constraints",
34
+ "test_configuration_utils",
35
+ "test_data_collator",
36
+ "test_trainer_callback",
37
+ "test_trainer_utils",
38
+ "test_feature_extraction",
39
+ "test_image_processing",
40
+ "test_image_processor",
41
+ "test_image_transforms",
42
+ "test_optimization",
43
+ "test_retrieval",
44
+ "test_config",
45
+ "test_from_pretrained_no_checkpoint",
46
+ "test_keep_in_fp32_modules",
47
+ "test_gradient_checkpointing_backward_compatibility",
48
+ "test_gradient_checkpointing_enable_disable",
49
+ "test_save_load_fast_init_from_base",
50
+ "test_fast_init_context_manager",
51
+ "test_fast_init_tied_embeddings",
52
+ "test_save_load_fast_init_to_base",
53
+ "test_torch_save_load",
54
+ "test_initialization",
55
+ "test_forward_signature",
56
+ "test_model_common_attributes",
57
+ "test_model_main_input_name",
58
+ "test_correct_missing_keys",
59
+ "test_tie_model_weights",
60
+ "test_can_use_safetensors",
61
+ "test_load_save_without_tied_weights",
62
+ "test_tied_weights_keys",
63
+ "test_model_weights_reload_no_missing_tied_weights",
64
+ "test_pt_tf_model_equivalence",
65
+ "test_mismatched_shapes_have_properly_initialized_weights",
66
+ "test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
67
+ "test_model_is_small",
68
+ "test_tf_from_pt_safetensors",
69
+ "test_flax_from_pt_safetensors",
70
+ "ModelTest::test_pipeline_",  # None of the pipeline tests from PipelineTesterMixin (which XxxModelTest inherits from) are run on device
71
+ "ModelTester::test_pipeline_",
72
+ "/repo_utils/",
73
+ "/utils/",
74
+ "/tools/",
75
+ }
76
+
77
+ # allow having multiple repository checkouts and not needing to remember to rerun
78
+ # `pip install -e '.[dev]'` when switching between checkouts and running tests.
79
+ git_repo_path = abspath(join(dirname(__file__), "src"))
80
+ sys.path.insert(1, git_repo_path)
81
+
82
+ # silence FutureWarning warnings in tests since often we can't act on them until
83
+ # they become normal warnings - i.e. the tests still need to test the current functionality
84
+ warnings.simplefilter(action="ignore", category=FutureWarning)
85
+
86
+
87
+ def pytest_configure(config):
88
+ config.addinivalue_line(
89
+ "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested"
90
+ )
91
+ config.addinivalue_line(
92
+ "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
93
+ )
94
+ config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
95
+ config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
96
+ config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
97
+ config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule")
98
+ config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
99
+
100
+
101
+ def pytest_collection_modifyitems(items):
102
+ for item in items:
103
+ if any(test_name in item.nodeid for test_name in NOT_DEVICE_TESTS):
104
+ item.add_marker(pytest.mark.not_device_test)
105
+
106
+
107
+ def pytest_addoption(parser):
108
+ from transformers.testing_utils import pytest_addoption_shared
109
+
110
+ pytest_addoption_shared(parser)
111
+
112
+
113
+ def pytest_terminal_summary(terminalreporter):
114
+ from transformers.testing_utils import pytest_terminal_summary_main
115
+
116
+ make_reports = terminalreporter.config.getoption("--make-reports")
117
+ if make_reports:
118
+ pytest_terminal_summary_main(terminalreporter, id=make_reports)
119
+
120
+
121
+ def pytest_sessionfinish(session, exitstatus):
122
+ # If no tests are collected, pytest exits with code 5, which makes the CI fail.
123
+ if exitstatus == 5:
124
+ session.exitstatus = 0
125
+
126
+
127
+ # Doctest custom flag to ignore output.
128
+ IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT")
129
+
130
+ OutputChecker = doctest.OutputChecker
131
+
132
+
133
+ class CustomOutputChecker(OutputChecker):
134
+ def check_output(self, want, got, optionflags):
135
+ if IGNORE_RESULT & optionflags:
136
+ return True
137
+ return OutputChecker.check_output(self, want, got, optionflags)
138
+
139
+
140
+ doctest.OutputChecker = CustomOutputChecker
141
+ _pytest.doctest.DoctestModule = HfDoctestModule
142
+ doctest.DocTestParser = HfDocTestParser
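For context, the `IGNORE_RESULT` flag registered above is a custom `doctest` option flag: with the `CustomOutputChecker` installed, an example line carrying the directive still runs, but its output is not compared. A minimal sketch of how a docstring could use it (illustrative only, not part of this commit):

```python
import time


def current_timestamp():
    """
    Example:

    >>> current_timestamp()  # doctest: +IGNORE_RESULT
    1700000000.0
    """
    # The value changes on every call, so the expected output above is only indicative.
    return time.time()
```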
docker/transformers-all-latest-gpu/Dockerfile ADDED
@@ -0,0 +1,63 @@
1
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ # Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
7
+ SHELL ["sh", "-lc"]
8
+
9
+ # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
10
+ # to be used as arguments for docker build (so far).
11
+
12
+ ARG PYTORCH='2.2.1'
13
+ # (not always a valid torch version)
14
+ ARG INTEL_TORCH_EXT='2.2.0'
15
+ # Example: `cu102`, `cu113`, etc.
16
+ ARG CUDA='cu118'
17
+
18
+ RUN apt update
19
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
20
+ RUN git lfs install
21
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
22
+
23
+ ARG REF=main
24
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
25
+
26
+ # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
27
+ # 2. Regarding the `torch` part, we might need to specify proper versions for `torchvision` and `torchaudio`.
28
+ # Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
29
+ RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
30
+
31
+ RUN python3 -m pip uninstall -y flax jax
32
+
33
+ RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT -f https://developer.intel.com/ipex-whl-stable-cpu
34
+
35
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
36
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
37
+
38
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
39
+
40
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft
41
+
42
+ # For bettertransformer
43
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
44
+
45
+ # For video model testing
46
+ RUN python3 -m pip install --no-cache-dir decord av==9.2.0
47
+
48
+ # Some slow tests require bnb
49
+ RUN python3 -m pip install --no-cache-dir bitsandbytes
50
+
51
+ # For `dinat` model
52
+ # The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
53
+ RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels
54
+
55
+ # For `nougat` tokenizer
56
+ RUN python3 -m pip install --no-cache-dir python-Levenshtein
57
+
58
+ # For `FastSpeech2ConformerTokenizer` tokenizer
59
+ RUN python3 -m pip install --no-cache-dir g2p-en
60
+
61
+ # When installing in editable mode, `transformers` is not recognized as a package.
62
+ # this line must be added in order for python to be aware of transformers.
63
+ RUN cd transformers && python3 setup.py develop
docker/transformers-doc-builder/Dockerfile ADDED
@@ -0,0 +1,18 @@
1
+ FROM python:3.10
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ RUN apt update
5
+ RUN git clone https://github.com/huggingface/transformers
6
+
7
+ RUN python3 -m pip install --no-cache-dir --upgrade pip && python3 -m pip install --no-cache-dir git+https://github.com/huggingface/doc-builder ./transformers[dev]
8
+ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y tesseract-ocr
9
+
10
+ # Torch needs to be installed before deepspeed
11
+ RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]
12
+
13
+ RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
14
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
15
+
16
+ # Test if the image could successfully build the doc. before publishing the image
17
+ RUN doc-builder build transformers transformers/docs/source/en --build_dir doc-build-dev --notebook_dir notebooks/transformers_doc --clean
18
+ RUN rm -rf doc-build-dev
docker/transformers-gpu/Dockerfile ADDED
@@ -0,0 +1,31 @@
1
+ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
2
+ LABEL maintainer="Hugging Face"
3
+ LABEL repository="transformers"
4
+
5
+ RUN apt update && \
6
+ apt install -y bash \
7
+ build-essential \
8
+ git \
9
+ curl \
10
+ ca-certificates \
11
+ python3 \
12
+ python3-pip && \
13
+ rm -rf /var/lib/apt/lists
14
+
15
+ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16
+ python3 -m pip install --no-cache-dir \
17
+ jupyter \
18
+ tensorflow \
19
+ torch
20
+
21
+ RUN git clone https://github.com/NVIDIA/apex
22
+ RUN cd apex && \
23
+ python3 setup.py install && \
24
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
25
+
26
+ WORKDIR /workspace
27
+ COPY . transformers/
28
+ RUN cd transformers/ && \
29
+ python3 -m pip install --no-cache-dir .
30
+
31
+ CMD ["/bin/bash"]
docker/transformers-past-gpu/Dockerfile ADDED
@@ -0,0 +1,59 @@
1
+ ARG BASE_DOCKER_IMAGE
2
+ FROM $BASE_DOCKER_IMAGE
3
+ LABEL maintainer="Hugging Face"
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
8
+ SHELL ["sh", "-lc"]
9
+
10
+ RUN apt update
11
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev
12
+ RUN git lfs install
13
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
14
+
15
+ ARG REF=main
16
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
17
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
18
+
19
+ # When installing in editable mode, `transformers` is not recognized as a package.
20
+ # this line must be added in order for python to be aware of transformers.
21
+ RUN cd transformers && python3 setup.py develop
22
+
23
+ ARG FRAMEWORK
24
+ ARG VERSION
25
+
26
+ # Control `setuptools` version to avoid some issues
27
+ RUN [ "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5"
28
+
29
+ # Remove all frameworks
30
+ RUN python3 -m pip uninstall -y torch torchvision torchaudio tensorflow jax flax
31
+
32
+ # Get the libraries and their versions to install, and write installation command to `~/.profile`.
33
+ RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION
34
+
35
+ # Install the target framework
36
+ RUN echo "INSTALL_CMD = $INSTALL_CMD"
37
+ RUN $INSTALL_CMD
38
+
39
+ RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
40
+
41
+ # Remove `accelerate`: it requires `torch`, and this causes import issues for TF-only testing
42
+ # We will install `accelerate@main` in Past CI workflow file
43
+ RUN python3 -m pip uninstall -y accelerate
44
+
45
+ # Uninstall `torch-tensorrt` and `apex` shipped with the base image
46
+ RUN python3 -m pip uninstall -y torch-tensorrt apex
47
+
48
+ # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
49
+ RUN python3 -m pip uninstall -y deepspeed
50
+ # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
51
+ # Issue: https://github.com/microsoft/DeepSpeed/issues/2010
52
+ # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
53
+ # DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
54
+
55
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
56
+
57
+ # When installing in editable mode, `transformers` is not recognized as a package.
58
+ # this line must be added in order for python to be aware of transformers.
59
+ RUN cd transformers && python3 setup.py develop
docker/transformers-pytorch-amd-gpu/Dockerfile ADDED
@@ -0,0 +1,39 @@
1
+ FROM rocm/dev-ubuntu-20.04:5.6
2
+ # rocm/pytorch has no version with 2.1.0
3
+ LABEL maintainer="Hugging Face"
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ ARG PYTORCH='2.1.0'
8
+ ARG TORCH_VISION='0.16.0'
9
+ ARG TORCH_AUDIO='2.1.0'
10
+ ARG ROCM='5.6'
11
+
12
+ RUN apt update && \
13
+ apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip ffmpeg && \
14
+ apt clean && \
15
+ rm -rf /var/lib/apt/lists/*
16
+
17
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
18
+
19
+ RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM
20
+
21
+ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
22
+
23
+ ARG REF=main
24
+ WORKDIR /
25
+
26
+ # Invalidate docker cache from here if new commit is available.
27
+ ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
28
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
29
+
30
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
31
+
32
+ RUN python3 -m pip uninstall -y tensorflow flax
33
+
34
+ # When installing in editable mode, `transformers` is not recognized as a package.
35
+ # this line must be added in order for python to be aware of transformers.
36
+ RUN cd transformers && python3 setup.py develop
37
+
38
+ # Remove nvml as it is not compatible with ROCm
39
+ RUN python3 -m pip uninstall py3nvml pynvml -y
docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile ADDED
@@ -0,0 +1,48 @@
1
+ FROM rocm/dev-ubuntu-22.04:5.6
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+ ARG PYTORCH='2.1.1'
6
+ ARG TORCH_VISION='0.16.1'
7
+ ARG TORCH_AUDIO='2.1.1'
8
+ ARG ROCM='5.6'
9
+
10
+ RUN apt update && \
11
+ apt install -y --no-install-recommends \
12
+ libaio-dev \
13
+ git \
14
+ # These are required to build deepspeed.
15
+ python3-dev \
16
+ python-is-python3 \
17
+ rocrand-dev \
18
+ rocthrust-dev \
19
+ hipsparse-dev \
20
+ hipblas-dev \
21
+ rocblas-dev && \
22
+ apt clean && \
23
+ rm -rf /var/lib/apt/lists/*
24
+
25
+ RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
26
+ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
27
+ RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
28
+
29
+ # Pre-build DeepSpeed, so it's ready for testing (to avoid timeout)
30
+ RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
31
+
32
+ ARG REF=main
33
+ WORKDIR /
34
+
35
+ # Invalidate docker cache from here if new commit is available.
36
+ ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
37
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
38
+
39
+ RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn]
40
+
41
+ # When installing in editable mode, `transformers` is not recognized as a package.
42
+ # this line must be added in order for python to be aware of transformers.
43
+ RUN cd transformers && python3 setup.py develop
44
+
45
+ RUN python3 -c "from deepspeed.launcher.runner import main"
46
+
47
+ # Remove nvml as it is not compatible with ROCm
48
+ RUN python3 -m pip uninstall py3nvml pynvml -y
docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile ADDED
@@ -0,0 +1,53 @@
1
+ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
2
+ FROM nvcr.io/nvidia/pytorch:23.04-py3
3
+ LABEL maintainer="Hugging Face"
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ ARG PYTORCH='2.2.0'
8
+ # Example: `cu102`, `cu113`, etc.
9
+ ARG CUDA='cu121'
10
+
11
+ RUN apt -y update
12
+ RUN apt install -y libaio-dev
13
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
14
+
15
+ ARG REF=main
16
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
17
+
18
+ RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
19
+
20
+ # Install latest release PyTorch
21
+ # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
22
+ # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
23
+ RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
24
+
25
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
26
+
27
+ # Uninstall `transformer-engine` shipped with the base image
28
+ RUN python3 -m pip uninstall -y transformer-engine
29
+
30
+ # Uninstall `torch-tensorrt` shipped with the base image
31
+ RUN python3 -m pip uninstall -y torch-tensorrt
32
+
33
+ # recompile apex
34
+ RUN python3 -m pip uninstall -y apex
35
+ # RUN git clone https://github.com/NVIDIA/apex
36
+ # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
37
+ # TODO: check if there is alternative way to install latest apex
38
+ # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
39
+
40
+ # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
41
+ RUN python3 -m pip uninstall -y deepspeed
42
+ # This has to be run (again) inside the GPU VMs running the tests.
43
+ # The installation works here, but some tests fail if we don't pre-build deepspeed again in the VMs running the tests.
44
+ # TODO: Find out why tests fail.
45
+ RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
46
+
47
+ # When installing in editable mode, `transformers` is not recognized as a package.
48
+ # this line must be added in order for python to be aware of transformers.
49
+ RUN cd transformers && python3 setup.py develop
50
+
51
+ # The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails
52
+ RUN python3 -m pip install -U --no-cache-dir "pydantic<2"
53
+ RUN python3 -c "from deepspeed.launcher.runner import main"
docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile ADDED
@@ -0,0 +1,64 @@
1
+ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
2
+ FROM nvcr.io/nvidia/pytorch:23.11-py3
3
+ LABEL maintainer="Hugging Face"
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Example: `cu102`, `cu113`, etc.
8
+ ARG CUDA='cu121'
9
+
10
+ RUN apt -y update
11
+ RUN apt install -y libaio-dev
12
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
13
+
14
+ ARG REF=main
15
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
16
+
17
+ RUN python3 -m pip uninstall -y torch torchvision torchaudio
18
+
19
+ # Install **nightly** release PyTorch (flag `--pre`)
20
+ # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
21
+ # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
22
+ RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
23
+
24
+ RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
25
+
26
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
27
+
28
+ # Uninstall `transformer-engine` shipped with the base image
29
+ RUN python3 -m pip uninstall -y transformer-engine
30
+
31
+ # Uninstall `torch-tensorrt` and `apex` shipped with the base image
32
+ RUN python3 -m pip uninstall -y torch-tensorrt apex
33
+
34
+ # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
35
+ RUN python3 -m pip uninstall -y deepspeed
36
+ # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
37
+ # Issue: https://github.com/microsoft/DeepSpeed/issues/2010
38
+ # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
39
+ # DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
40
+
41
+ ## For `torchdynamo` tests
42
+ ## (see https://github.com/huggingface/transformers/pull/17765)
43
+ #RUN git clone https://github.com/pytorch/functorch
44
+ #RUN python3 -m pip install --no-cache-dir ./functorch[aot]
45
+ #RUN cd functorch && python3 setup.py develop
46
+ #
47
+ #RUN git clone https://github.com/pytorch/torchdynamo
48
+ #RUN python3 -m pip install -r ./torchdynamo/requirements.txt
49
+ #RUN cd torchdynamo && python3 setup.py develop
50
+ #
51
+ ## install TensorRT
52
+ #RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex
53
+ #RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2
54
+ #
55
+ ## install torch_tensorrt (fx path)
56
+ #RUN git clone https://github.com/pytorch/TensorRT.git
57
+ #RUN cd TensorRT/py && python3 setup.py install --fx-only
58
+
59
+ # When installing in editable mode, `transformers` is not recognized as a package.
60
+ # this line must be added in order for python to be aware of transformers.
61
+ RUN cd transformers && python3 setup.py develop
62
+
63
+ # Disable for now as deepspeed is not installed above. To be enabled once the issue is fixed.
64
+ # RUN python3 -c "from deepspeed.launcher.runner import main"
docker/transformers-pytorch-gpu/Dockerfile ADDED
@@ -0,0 +1,33 @@
1
+ FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ RUN apt update
7
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
8
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
9
+
10
+ ARG REF=main
11
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
12
+
13
+ # If set to nothing, will install the latest version
14
+ ARG PYTORCH='2.1.1'
15
+ ARG TORCH_VISION=''
16
+ ARG TORCH_AUDIO=''
17
+ # Example: `cu102`, `cu113`, etc.
18
+ ARG CUDA='cu121'
19
+
20
+ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
21
+ RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='$TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
22
+ RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='$TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
23
+
24
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
25
+
26
+ RUN python3 -m pip uninstall -y tensorflow flax
27
+
28
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
29
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
30
+
31
+ # When installing in editable mode, `transformers` is not recognized as a package.
32
+ # this line must be added in order for python to be aware of transformers.
33
+ RUN cd transformers && python3 setup.py develop
docker/transformers-pytorch-tpu/Dockerfile ADDED
@@ -0,0 +1,65 @@
1
+ FROM google/cloud-sdk:slim
2
+
3
+ # Build args.
4
+ ARG GITHUB_REF=refs/heads/main
5
+
6
+ # TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
7
+ # wheels available; see below.
8
+ ENV PYTHON_VERSION=3.6
9
+
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ build-essential \
12
+ cmake \
13
+ git \
14
+ curl \
15
+ ca-certificates
16
+
17
+ # Install conda and python.
18
+ # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
19
+ RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
20
+ chmod +x ~/miniconda.sh && \
21
+ ~/miniconda.sh -b && \
22
+ rm ~/miniconda.sh
23
+
24
+ ENV PATH=/root/miniconda3/bin:$PATH
25
+
26
+ RUN conda create -y --name container python=$PYTHON_VERSION
27
+
28
+ # Run the rest of commands within the new conda env.
29
+ # Use absolute path to appease Codefactor.
30
+ SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
31
+ RUN conda install -y python=$PYTHON_VERSION mkl
32
+
33
+ RUN pip uninstall -y torch && \
34
+ # Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
35
+ gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
36
+ gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
37
+ gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
38
+ pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
39
+ pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
40
+ pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
41
+ rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
42
+ rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
43
+ rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
44
+ apt-get install -y libomp5
45
+
46
+ ENV LD_LIBRARY_PATH=root/miniconda3/envs/container/lib
47
+
48
+
49
+ # Install huggingface/transformers at the current PR, plus dependencies.
50
+ RUN git clone https://github.com/huggingface/transformers.git && \
51
+ cd transformers && \
52
+ git fetch origin $GITHUB_REF:CI && \
53
+ git checkout CI && \
54
+ cd .. && \
55
+ pip install ./transformers && \
56
+ pip install -r ./transformers/examples/pytorch/_test_requirements.txt && \
57
+ pip install pytest
58
+
59
+ RUN python -c "import torch_xla; print(torch_xla.__version__)"
60
+ RUN python -c "import transformers as trf; print(trf.__version__)"
61
+ RUN conda init bash
62
+ COPY docker-entrypoint.sh /usr/local/bin/
63
+ RUN chmod +x /usr/local/bin/docker-entrypoint.sh
64
+ ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
65
+ CMD ["bash"]
docker/transformers-pytorch-tpu/bert-base-cased.jsonnet ADDED
@@ -0,0 +1,38 @@
1
+ local base = import 'templates/base.libsonnet';
2
+ local tpus = import 'templates/tpus.libsonnet';
3
+ local utils = import "templates/utils.libsonnet";
4
+ local volumes = import "templates/volumes.libsonnet";
5
+
6
+ local bertBaseCased = base.BaseTest {
7
+ frameworkPrefix: "hf",
8
+ modelName: "bert-base-cased",
9
+ mode: "example",
10
+ configMaps: [],
11
+
12
+ timeout: 3600, # 1 hour, in seconds
13
+
14
+ image: std.extVar('image'),
15
+ imageTag: std.extVar('image-tag'),
16
+
17
+ tpuSettings+: {
18
+ softwareVersion: "pytorch-nightly",
19
+ },
20
+ accelerator: tpus.v3_8,
21
+
22
+ volumeMap+: {
23
+ datasets: volumes.PersistentVolumeSpec {
24
+ name: "huggingface-cluster-disk",
25
+ mountPath: "/datasets",
26
+ },
27
+ },
28
+ command: utils.scriptCommand(
29
+ |||
30
+ python -m pytest -s transformers/examples/pytorch/test_xla_examples.py -v
31
+ test_exit_code=$?
32
+ echo "\nFinished running commands.\n"
33
+ test $test_exit_code -eq 0
34
+ |||
35
+ ),
36
+ };
37
+
38
+ bertBaseCased.oneshotJob
docker/transformers-pytorch-tpu/dataset.yaml ADDED
@@ -0,0 +1,32 @@
1
+ apiVersion: v1
2
+ kind: PersistentVolume
3
+ metadata:
4
+ name: huggingface-cluster-disk
5
+ spec:
6
+ storageClassName: ""
7
+ capacity:
8
+ storage: 500Gi
9
+ accessModes:
10
+ - ReadOnlyMany
11
+ claimRef:
12
+ namespace: default
13
+ name: huggingface-cluster-disk-claim
14
+ gcePersistentDisk:
15
+ pdName: huggingface-cluster-disk
16
+ fsType: ext4
17
+ readOnly: true
18
+ ---
19
+ apiVersion: v1
20
+ kind: PersistentVolumeClaim
21
+ metadata:
22
+ name: huggingface-cluster-disk-claim
23
+ spec:
24
+ # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
25
+ # A nil storageClassName value uses the default StorageClass. For details, see
26
+ # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
27
+ storageClassName: ""
28
+ accessModes:
29
+ - ReadOnlyMany
30
+ resources:
31
+ requests:
32
+ storage: 1Ki
docker/transformers-pytorch-tpu/docker-entrypoint.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+ source ~/.bashrc
3
+ echo "running docker-entrypoint.sh"
4
+ conda activate container
5
+ echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
6
+ echo "printed TPU info"
7
+ export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
8
+ exec "$@"
docker/transformers-quantization-latest-gpu/Dockerfile ADDED
@@ -0,0 +1,60 @@
1
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ # Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
7
+ SHELL ["sh", "-lc"]
8
+
9
+ # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
10
+ # to be used as arguments for docker build (so far).
11
+
12
+ ARG PYTORCH='2.2.1'
13
+ # Example: `cu102`, `cu113`, etc.
14
+ ARG CUDA='cu118'
15
+
16
+ RUN apt update
17
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
18
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
19
+
20
+ ARG REF=main
21
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
22
+
23
+ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile
24
+ RUN echo torch=$VERSION
25
+ # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
26
+ # Currently, let's just use their latest releases (when `torch` is installed with a release version)
27
+ RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
28
+
29
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
30
+
31
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
32
+
33
+ # needed in bnb and awq
34
+ RUN python3 -m pip install --no-cache-dir einops
35
+
36
+ # Add bitsandbytes for mixed int8 testing
37
+ RUN python3 -m pip install --no-cache-dir bitsandbytes
38
+
39
+ # Add auto-gptq for gptq quantization testing
40
+ RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
41
+
42
+ # Add optimum for gptq quantization testing
43
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
44
+
45
+ # Add aqlm for quantization testing
46
+ RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
47
+
48
+ # Add autoawq for quantization testing
49
+ # >=v0.2.3 needed for compatibility with torch 2.2.1
50
+ RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl
51
+
52
+ # Add quanto for quantization testing
53
+ RUN python3 -m pip install --no-cache-dir quanto
54
+
55
+ # Add eetq for quantization testing
56
+ RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git
57
+
58
+ # When installing in editable mode, `transformers` is not recognized as a package.
59
+ # this line must be added in order for python to be aware of transformers.
60
+ RUN cd transformers && python3 setup.py develop
docker/transformers-tensorflow-gpu/Dockerfile ADDED
@@ -0,0 +1,25 @@
1
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ RUN apt update
7
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
8
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
9
+
10
+ ARG REF=main
11
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
12
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]
13
+
14
+ # If set to nothing, will install the latest version
15
+ ARG TENSORFLOW='2.13'
16
+
17
+ RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
18
+ RUN python3 -m pip uninstall -y torch flax
19
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
20
+
21
+ RUN python3 -m pip install --no-cache-dir -U tensorflow_probability
22
+
23
+ # When installing in editable mode, `transformers` is not recognized as a package.
24
+ # this line must be added in order for python to be aware of transformers.
25
+ RUN cd transformers && python3 setup.py develop
docs/README.md ADDED
@@ -0,0 +1,397 @@
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # Generating the documentation
18
+
19
+ To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
20
+ you can install them with the following command, at the root of the code repository:
21
+
22
+ ```bash
23
+ pip install -e ".[docs]"
24
+ ```
25
+
26
+ Then you need to install our special tool that builds the documentation:
27
+
28
+ ```bash
29
+ pip install git+https://github.com/huggingface/doc-builder
30
+ ```
31
+
32
+ ---
33
+ **NOTE**
34
+
35
+ You only need to generate the documentation to inspect it locally (if you're planning changes and want to
36
+ check how they look before committing for instance). You don't have to commit the built documentation.
37
+
38
+ ---
39
+
40
+ ## Building the documentation
41
+
42
+ Once you have set up the `doc-builder` and additional packages, you can generate the documentation by
43
+ typing the following command:
44
+
45
+ ```bash
46
+ doc-builder build transformers docs/source/en/ --build_dir ~/tmp/test-build
47
+ ```
48
+
49
+ You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
50
+ the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
51
+ Markdown editor.
52
+
53
+ ## Previewing the documentation
54
+
55
+ To preview the docs, first install the `watchdog` module with:
56
+
57
+ ```bash
58
+ pip install watchdog
59
+ ```
60
+
61
+ Then run the following command:
62
+
63
+ ```bash
64
+ doc-builder preview {package_name} {path_to_docs}
65
+ ```
66
+
67
+ For example:
68
+
69
+ ```bash
70
+ doc-builder preview transformers docs/source/en/
71
+ ```
72
+
73
+ The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment with a link to where the documentation with your changes lives.
74
+
75
+ ---
76
+ **NOTE**
77
+
78
+ The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` and restart the `preview` command (`ctrl-c` to stop it, then call `doc-builder preview ...` again).
79
+
80
+ ---
81
+
82
+ ## Adding a new element to the navigation bar
83
+
84
+ Accepted files are Markdown (.md).
85
+
86
+ Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
87
+ the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml) file.
88
+
89
+ ## Renaming section headers and moving sections
90
+
91
+ It helps to keep old links working when renaming a section header and/or moving sections from one document to another. Old links are likely to be used in Issues, Forums, and social media, and it makes for a much better user experience if users reading them months later can still easily navigate to the originally intended information.
92
+
93
+ Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
94
+
95
+ So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
96
+
97
+ ```
98
+ Sections that were moved:
99
+
100
+ [ <a href="#section-b">Section A</a><a id="section-a"></a> ]
101
+ ```
102
+ and of course, if you moved it to another file, then:
103
+
104
+ ```
105
+ Sections that were moved:
106
+
107
+ [ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
108
+ ```
109
+
110
+ Use the relative style to link to the new file so that the versioned docs continue to work.
111
+
112
+ For an example of a rich moved section set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.md).
113
+
114
+
115
+ ## Writing Documentation - Specification
116
+
117
+ The `huggingface/transformers` documentation follows the
118
+ [Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,
119
+ although we can write them directly in Markdown.
120
+
121
+ ### Adding a new tutorial
122
+
123
+ Adding a new tutorial or section is done in two steps:
124
+
125
+ - Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
126
+ - Link that file in `./source/_toctree.yml` on the correct toc-tree.
127
+
128
+ Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
129
+ depending on the intended targets (beginners, more advanced users, or researchers) it should go in sections two, three, or
130
+ four.
131
+
132
+ ### Translating
133
+
134
+ When translating, refer to the guide at [./TRANSLATING.md](https://github.com/huggingface/transformers/blob/main/docs/TRANSLATING.md).
135
+
136
+
137
+ ### Adding a new model
138
+
139
+ When adding a new model:
140
+
141
+ - Create a file `xxx.md` under `./source/model_doc` (don't hesitate to copy an existing file as a template).
142
+ - Link that file in `./source/_toctree.yml`.
143
+ - Write a short overview of the model:
144
+ - Overview with paper & authors
145
+ - Paper abstract
146
+ - Tips and tricks and how to use it best
147
+ - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
148
+ every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
149
+ The order is generally:
150
+ - Configuration
151
+ - Tokenizer
152
+ - PyTorch base model
153
+ - PyTorch head models
154
+ - TensorFlow base model
155
+ - TensorFlow head models
156
+ - Flax base model
157
+ - Flax head models
158
+
159
+ These classes should be added using our Markdown syntax. Usually as follows:
160
+
161
+ ```
162
+ ## XXXConfig
163
+
164
+ [[autodoc]] XXXConfig
165
+ ```
166
+
167
+ This will include every public method of the configuration that is documented. If for some reason you wish for a method
168
+ not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:
169
+
170
+ ```
171
+ ## XXXTokenizer
172
+
173
+ [[autodoc]] XXXTokenizer
174
+ - build_inputs_with_special_tokens
175
+ - get_special_tokens_mask
176
+ - create_token_type_ids_from_sequences
177
+ - save_vocabulary
178
+ ```
179
+
180
+ If you just want to add a method that is not documented (for instance magic methods like `__call__` are not documented
181
+ by default) you can put the list of methods to add in a list that contains `all`:
182
+
183
+ ```
184
+ ## XXXTokenizer
185
+
186
+ [[autodoc]] XXXTokenizer
187
+ - all
188
+ - __call__
189
+ ```
190
+
191
+ ### Writing source documentation
192
+
193
+ Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
194
+ and objects like True, None, or any strings should usually be put in `code`.
195
+
196
+ When mentioning a class, function, or method, it is recommended to use our syntax for internal links so that our tool
197
+ adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or
198
+ function to be in the main package.
199
+
200
+ If you want to create a link to some internal class or function, you need to
201
+ provide its path. For instance: \[\`utils.ModelOutput\`\]. This will be converted into a link with
202
+ `utils.ModelOutput` in the description. To get rid of the path and only keep the name of the object you are
203
+ linking to in the description, add a ~: \[\`~utils.ModelOutput\`\] will generate a link with `ModelOutput` in the description.
204
+
205
+ The same works for methods so you can either use \[\`XXXClass.method\`\] or \[\`~XXXClass.method\`\].
206
+
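+ For instance, a docstring for a (made-up) helper function could combine these links as follows:
+
+ ```python
+ def decode_tokens(token_ids):
+     """
+     Converts `token_ids` back to text with [`AlbertTokenizer`]. See
+     [`~PreTrainedTokenizer.decode`] for details.
+     """
+ ```
+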
207
+ #### Defining arguments in a method
208
+
209
+ Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
210
+ an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon, and its
211
+ description:
212
+
213
+ ```
214
+ Args:
215
+ n_layers (`int`): The number of layers of the model.
216
+ ```
217
+
218
+ If the description is too long to fit in one line, another indentation is necessary before writing the description
219
+ after the argument.
220
+
221
+ Here's an example showcasing everything so far:
222
+
223
+ ```
224
+ Args:
225
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
226
+ Indices of input sequence tokens in the vocabulary.
227
+
228
+ Indices can be obtained using [`AlbertTokenizer`]. See [`~PreTrainedTokenizer.encode`] and
229
+ [`~PreTrainedTokenizer.__call__`] for details.
230
+
231
+ [What are input IDs?](../glossary#input-ids)
232
+ ```
233
+
234
+ For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
235
+ following signature:
236
+
237
+ ```
238
+ def my_function(x: str = None, a: float = 1):
239
+ ```
240
+
241
+ then its documentation should look like this:
242
+
243
+ ```
244
+ Args:
245
+ x (`str`, *optional*):
246
+ This argument controls ...
247
+ a (`float`, *optional*, defaults to 1):
248
+ This argument is used to ...
249
+ ```
250
+
251
+ Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
252
+ if the first line describing your argument type and its default gets long, you can't break it on several lines. You can,
253
+ however, write as many lines as you want in the indented description (see the example above with `input_ids`).
254
+
255
+ #### Writing a multi-line code block
256
+
257
+ Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
258
+
259
+
260
+ ````
261
+ ```
262
+ # first line of code
263
+ # second line
264
+ # etc
265
+ ```
266
+ ````
267
+
268
+ We follow the [doctest](https://docs.python.org/3/library/doctest.html) syntax for the examples to automatically test
269
+ the results to stay consistent with the library.
270
+
271
+ #### Writing a return block
272
+
273
+ The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
274
+ The first line should be the type of the return, followed by a line return. No need to indent further for the elements
275
+ building the return.
276
+
277
+ Here's an example of a single value return:
278
+
279
+ ```
280
+ Returns:
281
+ `List[int]`: A list of integers in the range [0, 1] -- 1 for a special token, 0 for a sequence token.
282
+ ```
283
+
284
+ Here's an example of a tuple return, comprising several objects:
285
+
286
+ ```
287
+ Returns:
288
+ `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
289
+ - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
290
+ Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
291
+ - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
292
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
293
+ ```
294
+
295
+ #### Adding an image
296
+
297
+ Because the repository is growing rapidly, it is important not to add files that would significantly weigh it down. This includes images, videos, and other non-text files. We prefer to place such files in a `dataset` hosted on hf.co, like
298
+ the ones on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing), and to reference
299
+ them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
300
+ If you are making an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate them
301
+ to this dataset.
302
+
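+ For instance, once an image has been uploaded to that dataset, it can be referenced in a doc page by URL like this (the file name below is only a placeholder):
+
+ ```
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/my_model_architecture.png" alt="Architecture of the model"/>
+ ```
+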
303
+ ## Styling the docstring
304
+
305
+ We have an automatic script, run with the `make style` command, that will make sure that:
306
+ - the docstrings fully take advantage of the line width
307
+ - all code examples are formatted using black, like the code of the Transformers library
308
+
309
+ This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
310
+ recommended to commit your changes before running `make style`, so you can revert the changes done by that script
311
+ easily.
312
+
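+ One possible workflow (just a sketch) is to commit your manual edits first and only then let the script reformat everything:
+
+ ```bash
+ # commit your manual edits so they can be recovered if needed
+ git add -u
+ git commit -m "Draft docstrings"
+ # then let the automatic styling run over the docstrings and code examples
+ make style
+ ```
+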
313
+ # Testing documentation examples
314
+
315
+ Good documentation often comes with an example of how a specific function or class should be used.
316
+ Each model class should contain at least one example showcasing
317
+ how to use this model class in inference. *E.g.* the class [Wav2Vec2ForCTC](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC)
318
+ includes an example of how to transcribe speech to text in the
319
+ [docstring of its forward function](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC.forward).
320
+
321
+ ## Writing documentation examples
322
+
323
+ The syntax for Example docstrings can look as follows:
324
+
325
+ ```
326
+ Example:
327
+
328
+ ```python
329
+ >>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
330
+ >>> from datasets import load_dataset
331
+ >>> import torch
332
+
333
+ >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
334
+ >>> dataset = dataset.sort("id")
335
+ >>> sampling_rate = dataset.features["audio"].sampling_rate
336
+
337
+ >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
338
+ >>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
339
+
340
+ >>> # audio file is decoded on the fly
341
+ >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
342
+ >>> with torch.no_grad():
343
+ ... logits = model(**inputs).logits
344
+ >>> predicted_ids = torch.argmax(logits, dim=-1)
345
+
346
+ >>> # transcribe speech
347
+ >>> transcription = processor.batch_decode(predicted_ids)
348
+ >>> transcription[0]
349
+ 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'
350
+ ```
351
+ ```
352
+
353
+ The docstring should give a minimal, clear example of how the respective model
354
+ is to be used in inference and also include the expected (ideally sensible)
355
+ output.
356
+ Often, readers will try out the example before even going through the function
357
+ or class definitions. Therefore, it is of utmost importance that the example
358
+ works as expected.
359
+
360
+ ## Docstring testing
361
+
362
+ To make sure the examples work as expected, each example should be included in the doctests.
363
+ We use pytest's [doctest integration](https://docs.pytest.org/doctest.html) to verify that all of our examples run correctly.
364
+ For Transformers, the doctests are run on a daily basis via GitHub Actions as can be
365
+ seen [here](https://github.com/huggingface/transformers/actions/workflows/doctests.yml).
366
+
367
+ ### For Python files
368
+
369
+ Run all the tests in the docstrings of a given file with the following command. For instance, here is how we test the modeling file of Wav2Vec2:
370
+
371
+ ```bash
372
+ pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py -sv --doctest-continue-on-failure
373
+ ```
374
+
375
+ If you want to isolate a specific docstring, just add `::` after the file name then type the whole path of the function/class/method whose docstring you want to test. For instance, here is how to just test the forward method of `Wav2Vec2ForCTC`:
376
+
377
+ ```bash
378
+ pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py::transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward -sv --doctest-continue-on-failure
379
+ ```
380
+
381
+ ### For Markdown files
382
+
383
+ You can test a given file locally with this command (here testing the quicktour):
384
+
385
+ ```bash
386
+ pytest --doctest-modules docs/source/quicktour.md -sv --doctest-continue-on-failure --doctest-glob="*.md"
387
+ ```
388
+
389
+ ### Writing doctests
390
+
391
+ Here are a few tips to help you debug the doctests and make them pass:
392
+
393
+ - The output of the code needs to match the expected output **exactly**, so make sure you write the exact same output. In particular, doctest will see a difference between single quotes and double quotes, or a missing parenthesis. The only exceptions to that rule are:
394
+ * whitespace: one whitespace character (space, tabulation, new line) is equivalent to any number of whitespace characters, so you can add new lines where there are spaces to make your output more readable.
395
+ * numerical values: you should never put more than 4 or 5 digits in expected results, as different setups or library versions might give you slightly different results. `doctest` is configured to ignore any difference lower than the precision you wrote to (so 1e-4 if you write 4 digits).
396
+ - Don't leave a block of code that takes very long to execute. If you can't make it fast, you can either not use the doctest syntax on it (so that it's ignored), or, if you still want to use the doctest syntax to show the results, add a comment `# doctest: +SKIP` at the end of the lines of code that take too long to execute.
397
+ - Each line of code that produces a result needs to have that result written below it. If you don't want to show an output in your code example, you can ignore it by adding a comment ` # doctest: +IGNORE_RESULT` at the end of the line of code producing it. Both directives are illustrated in the short example below.
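+
+ For instance, a short sketch combining both directives could look like this (`run_expensive_generation` is only a placeholder for a call that is too slow to run as part of the doctests):
+
+ ```python
+ >>> import random
+ >>> random.random()  # doctest: +IGNORE_RESULT
+ >>> run_expensive_generation()  # doctest: +SKIP
+ ```
+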
docs/TRANSLATING.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Translating the Transformers documentation into your language
2
+
3
+ As part of our mission to democratize machine learning, we'd love to make the Transformers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏.
4
+
5
+ **🗞️ Open an issue**
6
+
7
+ To get started, navigate to the [Issues](https://github.com/huggingface/transformers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "Translation template" from the "New issue" button.
8
+
9
+ Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list.
10
+
11
+
12
+ **🍴 Fork the repository**
13
+
14
+ First, you'll need to [fork the Transformers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page.
15
+
16
+ Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows:
17
+
18
+ ```bash
19
+ git clone https://github.com/YOUR-USERNAME/transformers.git
20
+ ```
21
+
22
+ **📋 Copy-paste the English version with a new language code**
23
+
24
+ The documentation files all live in one top-level directory:
25
+
26
+ - [`docs/source`](https://github.com/huggingface/transformers/tree/main/docs/source): All the documentation materials are organized here by language.
27
+
28
+ You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/transformers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following:
29
+
30
+ ```bash
31
+ cd ~/path/to/transformers/docs
32
+ cp -r source/en source/LANG-ID
33
+ ```
34
+
35
+ Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table.
36
+
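+ For example, to start a Spanish translation (`es`), the command would be:
+
+ ```bash
+ cp -r source/en source/es
+ ```
+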
37
+ **✍️ Start translating**
38
+
39
+ Now comes the fun part - translating the text!
40
+
41
+ The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website.
42
+
43
+ > 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory!
44
+
45
+ The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. `Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml):
46
+
47
+ ```yaml
48
+ - sections:
49
+ - local: pipeline_tutorial # Do not change this! Use the same name for your .md file
50
+ title: Pipelines for inference # Translate this!
51
+ ...
52
+ title: Tutorials # Translate this!
53
+ ```
54
+
55
+ Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter.
56
+
57
+ > 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu and @MKhalusova.
docs/source/_config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # docstyle-ignore
2
+ INSTALL_CONTENT = """
3
+ # Transformers installation
4
+ ! pip install transformers datasets evaluate accelerate
5
+ # To install from source instead of the last release, comment the command above and uncomment the following one.
6
+ # ! pip install git+https://github.com/huggingface/transformers.git
7
+ """
8
+
9
+ notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
10
+ black_avoid_patterns = {
11
+ "{processor_class}": "FakeProcessorClass",
12
+ "{model_class}": "FakeModelClass",
13
+ "{object_class}": "FakeObjectClass",
14
+ }
docs/source/de/_config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # docstyle-ignore
2
+ INSTALL_CONTENT = """
3
+ # Transformers installation
4
+ ! pip install transformers datasets evaluate accelerate
5
+ # To install from source instead of the last release, comment the command above and uncomment the following one.
6
+ # ! pip install git+https://github.com/huggingface/transformers.git
7
+ """
8
+
9
+ notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
10
+ black_avoid_patterns = {
11
+ "{processor_class}": "FakeProcessorClass",
12
+ "{model_class}": "FakeModelClass",
13
+ "{object_class}": "FakeObjectClass",
14
+ }
docs/source/de/_toctree.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - sections:
2
+ - local: index
3
+ title: 🤗 Transformers
4
+ - local: quicktour
5
+ title: Schnellstart
6
+ - local: installation
7
+ title: Installation
8
+ title: Erste Schritte
9
+ - sections:
10
+ - local: pipeline_tutorial
11
+ title: Pipelines für Inferenzen
12
+ - local: autoclass_tutorial
13
+ title: Laden von vortrainierten Instanzen mit einer AutoClass
14
+ - local: preprocessing
15
+ title: Vorverarbeiten
16
+ - local: training
17
+ title: Optimierung eines vortrainierten Modells
18
+ - local: run_scripts
19
+ title: Trainieren mit einem Skript
20
+ - local: accelerate
21
+ title: Verteiltes Training mit 🤗 Accelerate
22
+ - local: peft
23
+ title: Laden und Trainieren von Adaptern mit 🤗 PEFT
24
+ - local: model_sharing
25
+ title: Ein Modell teilen
26
+ - local: transformers_agents
27
+ title: Agents
28
+ - local: llm_tutorial
29
+ title: Generation with LLMs
30
+ title: Tutorials
31
+ - sections:
32
+ - local: contributing
33
+ title: Wie kann man zu 🤗 Transformers beitragen?
34
+ - local: add_new_model
35
+ title: Wie fügt man ein Modell zu 🤗 Transformers hinzu?
36
+ - local: add_new_pipeline
37
+ title: Wie fügt man eine Pipeline zu 🤗 Transformers hinzu?
38
+ - local: testing
39
+ title: Testen
40
+ - local: pr_checks
41
+ title: Überprüfung einer Pull Request
42
+ title: Contribute
docs/source/de/accelerate.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Verteiltes Training mit 🤗 Accelerate
18
+
19
+ Da die Modelle immer größer werden, hat sich die Parallelität als Strategie zum Trainieren größerer Modelle auf begrenzter Hardware und zur Beschleunigung der Trainingsgeschwindigkeit um mehrere Größenordnungen erwiesen. Bei Hugging Face haben wir die Bibliothek [🤗 Accelerate](https://huggingface.co/docs/accelerate) entwickelt, um Nutzern zu helfen, ein 🤗 Transformers-Modell auf jeder Art von verteiltem Setup zu trainieren, egal ob es sich um mehrere GPUs auf einer Maschine oder mehrere GPUs auf mehreren Maschinen handelt. In diesem Tutorial lernen Sie, wie Sie Ihre native PyTorch-Trainingsschleife anpassen, um das Training in einer verteilten Umgebung zu ermöglichen.
20
+
21
+ ## Einrichtung
22
+
23
+ Beginnen Sie mit der Installation von 🤗 Accelerate:
24
+
25
+ ```bash
26
+ pip install accelerate
27
+ ```
28
+
29
+ Dann importieren und erstellen Sie ein [`~accelerate.Accelerator`]-Objekt. Der [`~accelerate.Accelerator`] wird automatisch Ihre Art der verteilten Einrichtung erkennen und alle notwendigen Komponenten für das Training initialisieren. Sie müssen Ihr Modell nicht explizit auf einem Gerät platzieren.
30
+
31
+ ```py
32
+ >>> from accelerate import Accelerator
33
+
34
+ >>> accelerator = Accelerator()
35
+ ```
36
+
37
+ ## Vorbereiten auf die Beschleunigung
38
+
39
+ Der nächste Schritt ist die Übergabe aller relevanten Trainingsobjekte an die Methode [`~accelerate.Accelerator.prepare`]. Dazu gehören Ihre Trainings- und Evaluierungs-DataLoader, ein Modell und ein Optimierer:
40
+
41
+ ```py
42
+ >>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
43
+ ... train_dataloader, eval_dataloader, model, optimizer
44
+ ... )
45
+ ```
46
+
47
+ ## Rückwärts
48
+
49
+ Die letzte Ergänzung besteht darin, das typische `loss.backward()` in der Trainingsschleife durch die 🤗 Accelerate-Methode [`~accelerate.Accelerator.backward`] zu ersetzen:
50
+
51
+ ```py
52
+ >>> for epoch in range(num_epochs):
53
+ ... for batch in train_dataloader:
54
+ ... outputs = model(**batch)
55
+ ... loss = outputs.loss
56
+ ... accelerator.backward(loss)
57
+
58
+ ... optimizer.step()
59
+ ... lr_scheduler.step()
60
+ ... optimizer.zero_grad()
61
+ ... progress_bar.update(1)
62
+ ```
63
+
64
+ Wie Sie im folgenden Code sehen können, müssen Sie nur vier zusätzliche Codezeilen zu Ihrer Trainingsschleife hinzufügen, um verteiltes Training zu ermöglichen!
65
+
66
+ ```diff
67
+ + from accelerate import Accelerator
68
+ from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
69
+
70
+ + accelerator = Accelerator()
71
+
72
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
73
+ optimizer = AdamW(model.parameters(), lr=3e-5)
74
+
75
+ - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
76
+ - model.to(device)
77
+
78
+ + train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
79
+ + train_dataloader, eval_dataloader, model, optimizer
80
+ + )
81
+
82
+ num_epochs = 3
83
+ num_training_steps = num_epochs * len(train_dataloader)
84
+ lr_scheduler = get_scheduler(
85
+ "linear",
86
+ optimizer=optimizer,
87
+ num_warmup_steps=0,
88
+ num_training_steps=num_training_steps
89
+ )
90
+
91
+ progress_bar = tqdm(range(num_training_steps))
92
+
93
+ model.train()
94
+ for epoch in range(num_epochs):
95
+ for batch in train_dataloader:
96
+ - batch = {k: v.to(device) for k, v in batch.items()}
97
+ outputs = model(**batch)
98
+ loss = outputs.loss
99
+ - loss.backward()
100
+ + accelerator.backward(loss)
101
+
102
+ optimizer.step()
103
+ lr_scheduler.step()
104
+ optimizer.zero_grad()
105
+ progress_bar.update(1)
106
+ ```
107
+
108
+ ## Trainieren
109
+
110
+ Sobald Sie die entsprechenden Codezeilen hinzugefügt haben, starten Sie Ihr Training in einem Skript oder einem Notebook wie Colaboratory.
111
+
112
+ ### Trainieren mit einem Skript
113
+
114
+ Wenn Sie Ihr Training mit einem Skript durchführen, führen Sie den folgenden Befehl aus, um eine Konfigurationsdatei zu erstellen und zu speichern:
115
+
116
+ ```bash
117
+ accelerate config
118
+ ```
119
+
120
+ Dann starten Sie Ihr Training mit:
121
+
122
+ ```bash
123
+ accelerate launch train.py
124
+ ```
125
+
126
+ ### Trainieren mit einem Notebook
127
+
128
+ 🤗 Accelerate kann auch in einem Notebook laufen, wenn Sie planen, die TPUs von Colaboratory zu verwenden. Verpacken Sie den gesamten Code, der für das Training verantwortlich ist, in eine Funktion und übergeben Sie diese an [`~accelerate.notebook_launcher`]:
129
+
130
+ ```py
131
+ >>> from accelerate import notebook_launcher
132
+
133
+ >>> notebook_launcher(training_function)
134
+ ```
135
+
136
+ Weitere Informationen über 🤗 Accelerate und seine umfangreichen Funktionen finden Sie in der [Dokumentation](https://huggingface.co/docs/accelerate).