Sierra-W117
committed · Commit 7daf5bf · 0 Parent(s)
Initial commit
- .gitattributes +2 -0
- .gitignore +10 -0
- .python-version +1 -0
- README.md +0 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/dataset_info.json +3 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-test.arrow +3 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00000-of-00005.arrow +3 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00001-of-00005.arrow +3 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00002-of-00005.arrow +3 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00003-of-00005.arrow +3 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00004-of-00005.arrow +3 -0
- data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517_builder.lock +0 -0
- data/_home_sierra_Desktop_schoolx_ml_math-gpt2_data_AI-MO___numina_math-co_t_default_0.0.0_9d8d210c9f6a36c8f3cd84045668c9b7800ef517.lock +0 -0
- data/test.txt +3 -0
- data/test_dataset/data-00000-of-00001.arrow +3 -0
- data/test_dataset/dataset_info.json +3 -0
- data/test_dataset/state.json +3 -0
- data/train.txt +3 -0
- data/train_dataset/data-00000-of-00012.arrow +3 -0
- data/train_dataset/data-00001-of-00012.arrow +3 -0
- data/train_dataset/data-00002-of-00012.arrow +3 -0
- data/train_dataset/data-00003-of-00012.arrow +3 -0
- data/train_dataset/data-00004-of-00012.arrow +3 -0
- data/train_dataset/data-00005-of-00012.arrow +3 -0
- data/train_dataset/data-00006-of-00012.arrow +3 -0
- data/train_dataset/data-00007-of-00012.arrow +3 -0
- data/train_dataset/data-00008-of-00012.arrow +3 -0
- data/train_dataset/data-00009-of-00012.arrow +3 -0
- data/train_dataset/data-00010-of-00012.arrow +3 -0
- data/train_dataset/data-00011-of-00012.arrow +3 -0
- data/train_dataset/dataset_info.json +3 -0
- data/train_dataset/state.json +3 -0
- git-guide.txt +28 -0
- models/math-gpt2-v0/checkpoint-64/config.json +3 -0
- models/math-gpt2-v0/checkpoint-64/generation_config.json +3 -0
- models/math-gpt2-v0/checkpoint-64/model.safetensors +3 -0
- models/math-gpt2-v0/checkpoint-64/optimizer.pt +3 -0
- models/math-gpt2-v0/checkpoint-64/rng_state.pth +3 -0
- models/math-gpt2-v0/checkpoint-64/scheduler.pt +3 -0
- models/math-gpt2-v0/checkpoint-64/trainer_state.json +3 -0
- models/math-gpt2-v0/checkpoint-64/training_args.bin +3 -0
- pyproject.toml +15 -0
- training.ipynb +482 -0
- using.ipynb +156 -0
- uv.lock +0 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
+data/**/* filter=lfs diff=lfs merge=lfs -text
+models/**/* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
.python-version
ADDED
@@ -0,0 +1 @@
+3.12
README.md
ADDED
File without changes
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3e56bb3dc9a357e233a9fb7f8e4e8d78cf632912ee7cdb82ac7309ae13a444d
+size 1841
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-test.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6100e0a2b7a8da10e1826327f03572f062c8692094e6f79c80a9a0fea0ffe3c2
+size 283680
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00000-of-00005.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02602e4416333875f19cc121f04e3f59d13fe6281fe9172684dd326cb63d90e9
+size 501967576
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00001-of-00005.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96c471c057f51439eed145f3adb76a0199a769e0d04b09d1c307002b64631d04
+size 502355600
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00002-of-00005.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6976fd4b3ea2373fb4db96de99459a0afefa4eaa2a60df6e04ce84f9ddb37d4f
+size 502181200
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00003-of-00005.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f52c13815176e107801de98c486340647c0cfdee24d0384cc1b52bc29a32a1d
+size 501469240
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00004-of-00005.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019ca18217d6570a600ee3c3fec9789ea6480dae2bb247c38ae89032b27245c1
+size 487732184
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517_builder.lock
ADDED
File without changes
data/_home_sierra_Desktop_schoolx_ml_math-gpt2_data_AI-MO___numina_math-co_t_default_0.0.0_9d8d210c9f6a36c8f3cd84045668c9b7800ef517.lock
ADDED
File without changes
data/test.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe68549af30f20ef71cb8e885305e80ab40f38ce960a060b301d2f2df1bb9184
+size 140685
data/test_dataset/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f77549e0a8318c2ad7f46f1c3df79f9a5e4bd3b2812a80a7f2b403bd0ea912cb
+size 1125192
data/test_dataset/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee8e05b4381db6d6cc7427c83f0536c1da684169b0b57e251a76d23a3d084670
+size 431
data/test_dataset/state.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb96174acb2f77d300ea1fdc971d183ba564d79853319a2ec41c19615af41301
+size 298
data/train.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1d5690c7a443730e822249ea2e0b37799c330851f6c87342c0dcd5d77fc1126
+size 1243204995
data/train_dataset/data-00000-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b06f1e008b17e8ee80546673206e4ade8842dc0b840f3352db50d09d90c7072
+size 473206928
data/train_dataset/data-00001-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5036abbfeaf78a0b696fc26833d80c2b44ca32bd57f0a62eb1760ebf58261485
+size 473246368
data/train_dataset/data-00002-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45186315c884dbb0b03aefc8d7ebc4a43fe416cb1991acdc6643f8038806aea1
+size 472882424
data/train_dataset/data-00003-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4489cea42c1478171f9ed986fae4da66833daa184353b6d2638cd463d9e0d6e
+size 473004376
data/train_dataset/data-00004-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:572836a964eef213e9570d19b5abf74d8b9d99591e25e498d9f4141a8a845b4b
+size 473022936
data/train_dataset/data-00005-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a3d18f5c5a879daa106b1b596493471ca89075023d4a9aa3335c7aa866e7d06
+size 472883784
data/train_dataset/data-00006-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e38a00712bcb23bdd1a93a1c5299969607a25319542b3c24a0d0e26eadc36d1
+size 472906568
data/train_dataset/data-00007-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e6366921b4e83156591b110b91a85629cce7a56a2fc16922603f29da1b7e2da
+size 473162936
data/train_dataset/data-00008-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a99404147a03ed183bf5dad5c834de9a83bcd2dc2dc6531c0b8ad31fe5a113f1
+size 473061832
data/train_dataset/data-00009-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9593e6cfc36949e9107db0b17b06ad1f183cd9d3e0bb8afdc98b8f64bcf5765
+size 472899088
data/train_dataset/data-00010-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71bb48eed735d31e8c1e4e3503140fd3c7137c5aec0cec4dca2eb2ac12faf59d
+size 473032744
data/train_dataset/data-00011-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04884fad18f5ab356bebce10fbf996407f78f432d12d14e693cf18fcdcb7f700
+size 472870416
data/train_dataset/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee8e05b4381db6d6cc7427c83f0536c1da684169b0b57e251a76d23a3d084670
+size 431
data/train_dataset/state.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cfacc7784b691edb44b06521c361be2d888ed9cb961eb70584bab9303760293
+size 947
git-guide.txt
ADDED
@@ -0,0 +1,28 @@
+xdg-open ~/.git-credentials
+xdg-open ~/.gitconfig
+xdg-open ~/.cache/huggingface
+
+huggingface-cli login
+huggingface-cli repo create "repo-name"
+
+git config --global user.email "[email protected]"
+git config --global user.name "username"
+
+uv init
+* git lfs track sends large files to a dedicated storage; what gets pushed is not the files themselves but pointers to them in that storage
+git lfs track "data/**/*" (the data directory; ** matches all subdirectories, * matches all files)
+git lfs track "models/**/*"
+
+git add .gitattributes
+git add . (add every file in this directory)
+
+git commit -m "Initial commit"
+
+git add git-guide.txt
+git commit --amend --no-edit (if you changed a file after making the commit and don't want to create a new one)
+
+git remote add origin https://huggingface.co/Sierra-W117/math-gpt2
+(git remote remove origin)
+
+git branch -M main
+git push -u origin main
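
A hedged addition to the guide, not part of the commit: after setting up the track patterns above, two commands confirm the LFS setup before the first push.

git lfs ls-files (lists every file currently stored as an LFS pointer)
git lfs status (shows which staged files will go through the LFS filter)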
models/math-gpt2-v0/checkpoint-64/config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:285d59ee8e224c185ad2239766d59b8c8c60fc4911b1fc98000ed47c06749151
+size 907
models/math-gpt2-v0/checkpoint-64/generation_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c831d77f4ccb7e22873162a06bdb0b846a588f6586890c6aab7cec1b7ef9a60e
+size 119
models/math-gpt2-v0/checkpoint-64/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12b69d3d4944bac6ac070c5912020130f1a85aac5557f469a55e3170f7ab338a
+size 497774208
models/math-gpt2-v0/checkpoint-64/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b03bc453c349d0eb2faba453ac1ab28097404a05d27a942294b8269e8894f1a6
+size 995642298
models/math-gpt2-v0/checkpoint-64/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be2f08e0a34d93e3742746f4ecb1a5a6653f3b1da6db4b41f5f39cf3ae48a6f7
+size 14244
models/math-gpt2-v0/checkpoint-64/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d0278b77bd0a6a0013d1641e2fe9dd8de3641b1d4a7ec8656f671cec7965afc
+size 1064
models/math-gpt2-v0/checkpoint-64/trainer_state.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf0bf27b94c3ebb86dcdf29d67753a85cd2510fcfbb14ef590bc7046bbe49eff
+size 744
models/math-gpt2-v0/checkpoint-64/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb4c73e15e28f4fa556da7bde75ca259de5e13418df731c6ac6b42092336fde5
+size 5304
pyproject.toml
ADDED
@@ -0,0 +1,15 @@
+[project]
+name = "math-gpt2"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "accelerate>=1.2.1",
+    "datasets>=3.2.0",
+    "ipykernel>=6.29.5",
+    "ipywidgets>=8.1.5",
+    "numpy>=2.2.1",
+    "torch>=2.5.1",
+    "transformers>=4.47.1",
+]
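
A hedged aside, not part of the commit: given this pyproject.toml and the uv.lock below, running uv sync should recreate the .venv kernel that both notebooks reference, installing the pinned versions from the lockfile.

uv sync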
training.ipynb
ADDED
@@ -0,0 +1,482 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "torch.cuda.is_available()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from datasets import load_dataset, Dataset, concatenate_datasets\n",
+    "from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, GPT2LMHeadModel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['source', 'problem', 'solution', 'messages'],\n",
+       "        num_rows: 859494\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['source', 'problem', 'solution', 'messages'],\n",
+       "        num_rows: 100\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cache_dir = os.path.join(os.getcwd(), \"data\")\n",
+    "\n",
+    "dataset = load_dataset(\"AI-MO/NuminaMath-CoT\", cache_dir = cache_dir)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Problem: Find all solutions to the equation $\\displaystyle\\sqrt[3]{3 - \\frac{x}{3}} = -2$. \n",
+      "Solution: Start by isolating the cube root:\n",
+      "$$ \\sqrt[3]{3 - \\frac{x}{3}} = -2 $$\n",
+      "\n",
+      "Cube both sides to eliminate the cube root:\n",
+      "$$ 3 - \\frac{x}{3} = (-2)^3 $$\n",
+      "$$ 3 - \\frac{x}{3} = -8 $$\n",
+      "\n",
+      "Solve for $x$:\n",
+      "$$ 3 + 8 = \\frac{x}{3} $$\n",
+      "$$ 11 = \\frac{x}{3} $$\n",
+      "$$ x = 33 $$\n",
+      "\n",
+      "Thus, the solution to the equation is:\n",
+      "$$ \\boxed{x = 33} $$\n"
+     ]
+    }
+   ],
+   "source": [
+    "constanta = 4\n",
+    "\n",
+    "problem = dataset['train']['problem'][constanta]\n",
+    "solution = dataset['train']['solution'][constanta]\n",
+    "\n",
+    "print(f\"Problem: {problem} \\nSolution: {solution}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def format_text(item):\n",
+    "    return f\"Problem: {item['problem']}\\nSolution: {item['solution']}\\n\\n\"\n",
+    "\n",
+    "def prepare_dataset(dataset_dict, file_name, cache_dir):\n",
+    "\n",
+    "    file = os.path.join(cache_dir, f\"{file_name}.txt\")\n",
+    "\n",
+    "    if os.path.exists(file):\n",
+    "        return file\n",
+    "    \n",
+    "    with open(file, 'w', encoding='utf-8') as f:\n",
+    "        for item in dataset_dict:\n",
+    "            f.write(format_text(item))\n",
+    "    \n",
+    "    return file\n",
+    "\n",
+    "train_file = prepare_dataset(\n",
+    "    dataset_dict = dataset['train'],\n",
+    "    file_name = \"train\",\n",
+    "    cache_dir = cache_dir\n",
+    ")\n",
+    "\n",
+    "test_file = prepare_dataset(\n",
+    "    dataset_dict = dataset['test'],\n",
+    "    file_name = \"test\",\n",
+    "    cache_dir = cache_dir\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
+    "tokenizer.pad_token = tokenizer.eos_token"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "26997273"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "with open(train_file, 'r') as f:\n",
+    "    num_lines = sum(1 for line in f)\n",
+    "num_lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_function(examples, tokenizer):\n",
+    "    item = tokenizer(\n",
+    "        examples['text'],\n",
+    "        truncation = True,\n",
+    "        max_length = 64,\n",
+    "        padding = True,\n",
+    "        return_tensors = 'pt'\n",
+    "    ).to(\"cuda\")\n",
+    "    return item\n",
+    "\n",
+    "\n",
+    "def process_chunk(text_chunk, tokenizer):\n",
+    "    dataset = Dataset.from_dict({\n",
+    "        'text': text_chunk\n",
+    "    }).map(\n",
+    "        lambda x: tokenize_function(x, tokenizer),\n",
+    "        batched = True\n",
+    "    )\n",
+    "    return dataset\n",
+    "\n",
+    "\n",
+    "def create_dataset(file, tokenizer, dataset_name, cache_dir, chunk_size = 750_000, max_chunks = 20):\n",
+    "    dataset_path = os.path.join(cache_dir, dataset_name)\n",
+    "\n",
+    "    if os.path.exists(dataset_path):\n",
+    "        return Dataset.load_from_disk(dataset_path)\n",
+    "    \n",
+    "    datasets = []\n",
+    "    chunks_processed = 0\n",
+    "    \n",
+    "    with open(file, 'r', encoding = 'utf-8') as f:\n",
+    "        while chunks_processed < max_chunks:\n",
+    "            text_chunk = []\n",
+    "            while len(text_chunk) < chunk_size:\n",
+    "                line = f.readline()\n",
+    "                if not line:\n",
+    "                    break\n",
+    "                text_chunk.append(line)\n",
+    "            \n",
+    "            if text_chunk:\n",
+    "                datasets.append(process_chunk(text_chunk, tokenizer))\n",
+    "                chunks_processed += 1\n",
+    "            \n",
+    "\n",
+    "    the_dataset = concatenate_datasets(datasets)\n",
+    "    the_dataset.save_to_disk(dataset_path)\n",
+    "    \n",
+    "    return the_dataset\n",
+    "\n",
+    "\n",
+    "train_dataset = create_dataset(\n",
+    "    file = train_file, \n",
+    "    tokenizer = tokenizer,\n",
+    "    dataset_name = \"train_dataset\",\n",
+    "    cache_dir = cache_dir\n",
+    "    )\n",
+    "\n",
+    "test_dataset = create_dataset(\n",
+    "    file = test_file, \n",
+    "    tokenizer = tokenizer,\n",
+    "    dataset_name = \"test_dataset\",\n",
+    "    cache_dir = cache_dir,\n",
+    "    max_chunks = 1\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_collator = DataCollatorForLanguageModeling(\n",
+    "    tokenizer = tokenizer,\n",
+    "    mlm = False\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'text': 'Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?\\n', 'input_ids': [40781, 25, 39200, 326, 720, 70, 7, 87, 8, 796, 642, 87, 532, 513, 35307, 1867, 318, 720, 70, 36796, 12, 16, 92, 7, 70, 36796, 12, 16, 92, 7, 1415, 4008, 3, 30, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(train_dataset[13])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?\n",
+      "<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tokenizer.decode(train_dataset[13]['input_ids']))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_metrics(eval_pred):\n",
+    "    predictions, labels = eval_pred\n",
+    "    \n",
+    "    loss = torch.nn.CrossEntropyLoss()(\n",
+    "        torch.tensor(predictions).cuda(), \n",
+    "        torch.tensor(labels).cuda()\n",
+    "    ).item()\n",
+    "    \n",
+    "    return loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "15000000"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(train_dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.select(range(len(train_dataset)//15_000))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.makedirs(\"models\", exist_ok = True)\n",
+    "os.makedirs(\"models/math-gpt2-v0\", exist_ok = True)\n",
+    "os.makedirs(\"models/math-gpt2-v0/logs\", exist_ok = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gc\n",
+    "\n",
+    "gc.collect()\n",
+    "torch.cuda.empty_cache()\n",
+    "\n",
+    "# idea - keep the best checkpoint and the two on either side of it\n",
+    "# do I understand correctly that with the current settings only one model will be saved, while all the logs are kept"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = GPT2LMHeadModel.from_pretrained('gpt2')\n",
+    "model.resize_token_embeddings(len(tokenizer))\n",
+    "model = model.to(\"cuda\")\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "\n",
+    "    output_dir = \"models/math-gpt2-v0\", # directory where the model is saved\n",
+    "    logging_dir = \"models/math-gpt2-v0/logs\", # where training logs are written, in TensorBoard format\n",
+    "    run_name = \"math-gpt2-v0\", # experiment name in logging systems (e.g., TensorBoard)\n",
+    "    overwrite_output_dir = False,\n",
+    "    save_total_limit = 1, # keep the model with the best score across all steps\n",
+    "    save_strategy = \"steps\",\n",
+    "    \n",
+    "    do_eval = True, # validation\n",
+    "    eval_strategy = \"steps\", # when to run validation: per epoch or per step\n",
+    "    eval_steps = 10000, # number of steps between validations\n",
+    "\n",
+    "    load_best_model_at_end = True,\n",
+    "    metric_for_best_model = \"cross_entropy\", # which metric we evaluate by\n",
+    "    greater_is_better = False, # whether we want to minimize or maximize it\n",
+    "\n",
+    "    num_train_epochs = 4,\n",
+    "    per_device_train_batch_size = 32, # number of training examples per GPU device\n",
+    "    per_device_eval_batch_size = 64, \n",
+    "    gradient_checkpointing = True, # saves GPU memory by recomputing some intermediate activations instead of storing them\n",
+    "    fp16 = True, # use 16-bit floating-point numbers instead of the standard 32-bit (fp32)\n",
+    "\n",
+    "    gradient_accumulation_steps = 2, # accumulates gradients over N batches before updating the weights\n",
+    "    eval_accumulation_steps = 2, # accumulates the results of N batches during validation\n",
+    "\n",
+    "    learning_rate = 5e-5, # EXPERT'S ANSWER: this is the base lr\n",
+    "    warmup_steps = 500, # number of steps used to warm up the learning rate; warmup here means ramping the learning rate up\n",
+    "    weight_decay = 0.01, # the ones below are defaults too, I know\n",
+    "    adam_beta1 = 0.9,\n",
+    "    adam_beta2 = 0.999,\n",
+    "    max_grad_norm = 1.0,\n",
+    "    \n",
+    "    save_steps = 10000, # how often to save the model\n",
+    "    logging_steps = 10000, # how often to log metrics\n",
+    "    push_to_hub = False, # upload the model to HuggingFace\n",
+    ")\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model = model,\n",
+    "    args = training_args,\n",
+    "    data_collator = data_collator,\n",
+    "    train_dataset = train_dataset,\n",
+    "    eval_dataset = test_dataset,\n",
+    "    compute_metrics = compute_metrics\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c4125cf1faae456796edb8f452529acb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/64 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'train_runtime': 27.7282, 'train_samples_per_second': 144.257, 'train_steps_per_second': 2.308, 'train_loss': 6.928213119506836, 'epoch': 4.0}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "TrainOutput(global_step=64, training_loss=6.928213119506836, metrics={'train_runtime': 27.7282, 'train_samples_per_second': 144.257, 'train_steps_per_second': 2.308, 'total_flos': 130646016000000.0, 'train_loss': 6.928213119506836, 'epoch': 4.0})"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trainer.train()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
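
A hedged note on the notebook above, not part of the commit: Trainer expects compute_metrics to return a dict of named metrics (metric_for_best_model = "cross_entropy" is matched against the logged key "eval_cross_entropy"), while the committed compute_metrics returns a bare float and compares unshifted logits to labels. A minimal dict-returning sketch under those assumptions:

import torch

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # logits: (batch, seq, vocab); labels: (batch, seq)
    logits = torch.tensor(logits)
    labels = torch.tensor(labels)
    # Shift so position i predicts token i+1, matching causal-LM training.
    shift_logits = logits[:, :-1, :].reshape(-1, logits.size(-1))
    shift_labels = labels[:, 1:].reshape(-1)
    # -100 marks positions the collator excluded from the loss.
    loss = torch.nn.functional.cross_entropy(shift_logits, shift_labels, ignore_index=-100).item()
    return {"cross_entropy": loss}

Separately, create_dataset would loop forever if the input file ran out of lines before max_chunks, because chunks_processed only advances on a non-empty chunk; breaking out of the outer loop when a chunk comes back empty avoids that.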
using.ipynb
ADDED
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aebdeaaf529549a1ad39ccf838e39e71",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "26da6488303344ce90900207ae33b6bd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1eea77d125a74e00b1f050960dfa42aa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b704ae3896dc46698ca50b7a29230bde",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "42eaaf035fad40f583ee01dac1a86e85",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
+    "\n",
+    "model = GPT2LMHeadModel.from_pretrained(\"models/math-gpt2-v0/checkpoint-64\")\n",
+    "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
+    "model = model.to(\"cuda\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Your math prompt here:\n",
+      "\n",
+      "$ x = (m^3 + k) + (m^4 + k) + (m^5 + k)\n",
+      "\n",
+      "Now, we have:\n",
+      "\n",
+      "$ x = (m^3 +\n"
+     ]
+    }
+   ],
+   "source": [
+    "def generate_text(prompt, max_length=50):\n",
+    "\n",
+    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n",
+    "    \n",
+    "    outputs = model.generate(\n",
+    "        inputs[\"input_ids\"],\n",
+    "        max_length=max_length,\n",
+    "        pad_token_id=tokenizer.eos_token_id,\n",
+    "        num_return_sequences=1,\n",
+    "        do_sample=True,\n",
+    "        temperature=0.7\n",
+    "    )\n",
+    "    \n",
+    "    return tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
+    "\n",
+    "prompt = \"Your math prompt here\"\n",
+    "generated_text = generate_text(prompt)\n",
+    "print(generated_text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
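
A hedged addition, not part of the commit: the stderr warning captured above ("The attention mask is not set...") goes away if the mask returned by the tokenizer is forwarded to generate. A sketch, reusing the model and tokenizer loaded in the first cell:

prompt = "Your math prompt here"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # explicit mask, since pad and eos share an id here
    max_length=50,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.7,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))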
uv.lock
ADDED
The diff for this file is too large to render. See raw diff.