Sierra-W117 committed on
Commit 7daf5bf · 0 Parent(s)

Initial commit

Files changed (45)
  1. .gitattributes +2 -0
  2. .gitignore +10 -0
  3. .python-version +1 -0
  4. README.md +0 -0
  5. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/dataset_info.json +3 -0
  6. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-test.arrow +3 -0
  7. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00000-of-00005.arrow +3 -0
  8. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00001-of-00005.arrow +3 -0
  9. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00002-of-00005.arrow +3 -0
  10. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00003-of-00005.arrow +3 -0
  11. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00004-of-00005.arrow +3 -0
  12. data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517_builder.lock +0 -0
  13. data/_home_sierra_Desktop_schoolx_ml_math-gpt2_data_AI-MO___numina_math-co_t_default_0.0.0_9d8d210c9f6a36c8f3cd84045668c9b7800ef517.lock +0 -0
  14. data/test.txt +3 -0
  15. data/test_dataset/data-00000-of-00001.arrow +3 -0
  16. data/test_dataset/dataset_info.json +3 -0
  17. data/test_dataset/state.json +3 -0
  18. data/train.txt +3 -0
  19. data/train_dataset/data-00000-of-00012.arrow +3 -0
  20. data/train_dataset/data-00001-of-00012.arrow +3 -0
  21. data/train_dataset/data-00002-of-00012.arrow +3 -0
  22. data/train_dataset/data-00003-of-00012.arrow +3 -0
  23. data/train_dataset/data-00004-of-00012.arrow +3 -0
  24. data/train_dataset/data-00005-of-00012.arrow +3 -0
  25. data/train_dataset/data-00006-of-00012.arrow +3 -0
  26. data/train_dataset/data-00007-of-00012.arrow +3 -0
  27. data/train_dataset/data-00008-of-00012.arrow +3 -0
  28. data/train_dataset/data-00009-of-00012.arrow +3 -0
  29. data/train_dataset/data-00010-of-00012.arrow +3 -0
  30. data/train_dataset/data-00011-of-00012.arrow +3 -0
  31. data/train_dataset/dataset_info.json +3 -0
  32. data/train_dataset/state.json +3 -0
  33. git-guide.txt +28 -0
  34. models/math-gpt2-v0/checkpoint-64/config.json +3 -0
  35. models/math-gpt2-v0/checkpoint-64/generation_config.json +3 -0
  36. models/math-gpt2-v0/checkpoint-64/model.safetensors +3 -0
  37. models/math-gpt2-v0/checkpoint-64/optimizer.pt +3 -0
  38. models/math-gpt2-v0/checkpoint-64/rng_state.pth +3 -0
  39. models/math-gpt2-v0/checkpoint-64/scheduler.pt +3 -0
  40. models/math-gpt2-v0/checkpoint-64/trainer_state.json +3 -0
  41. models/math-gpt2-v0/checkpoint-64/training_args.bin +3 -0
  42. pyproject.toml +15 -0
  43. training.ipynb +482 -0
  44. using.ipynb +156 -0
  45. uv.lock +0 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ data/**/* filter=lfs diff=lfs merge=lfs -text
+ models/**/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
README.md ADDED
File without changes
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3e56bb3dc9a357e233a9fb7f8e4e8d78cf632912ee7cdb82ac7309ae13a444d
+ size 1841
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-test.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6100e0a2b7a8da10e1826327f03572f062c8692094e6f79c80a9a0fea0ffe3c2
+ size 283680
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00000-of-00005.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02602e4416333875f19cc121f04e3f59d13fe6281fe9172684dd326cb63d90e9
+ size 501967576
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00001-of-00005.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:96c471c057f51439eed145f3adb76a0199a769e0d04b09d1c307002b64631d04
+ size 502355600
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00002-of-00005.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6976fd4b3ea2373fb4db96de99459a0afefa4eaa2a60df6e04ce84f9ddb37d4f
+ size 502181200
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00003-of-00005.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f52c13815176e107801de98c486340647c0cfdee24d0384cc1b52bc29a32a1d
+ size 501469240
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517/numina_math-co_t-train-00004-of-00005.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:019ca18217d6570a600ee3c3fec9789ea6480dae2bb247c38ae89032b27245c1
+ size 487732184
data/AI-MO___numina_math-co_t/default/0.0.0/9d8d210c9f6a36c8f3cd84045668c9b7800ef517_builder.lock ADDED
File without changes
data/_home_sierra_Desktop_schoolx_ml_math-gpt2_data_AI-MO___numina_math-co_t_default_0.0.0_9d8d210c9f6a36c8f3cd84045668c9b7800ef517.lock ADDED
File without changes
data/test.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe68549af30f20ef71cb8e885305e80ab40f38ce960a060b301d2f2df1bb9184
+ size 140685
data/test_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f77549e0a8318c2ad7f46f1c3df79f9a5e4bd3b2812a80a7f2b403bd0ea912cb
+ size 1125192
data/test_dataset/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee8e05b4381db6d6cc7427c83f0536c1da684169b0b57e251a76d23a3d084670
+ size 431
data/test_dataset/state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb96174acb2f77d300ea1fdc971d183ba564d79853319a2ec41c19615af41301
+ size 298
data/train.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1d5690c7a443730e822249ea2e0b37799c330851f6c87342c0dcd5d77fc1126
+ size 1243204995
data/train_dataset/data-00000-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b06f1e008b17e8ee80546673206e4ade8842dc0b840f3352db50d09d90c7072
+ size 473206928
data/train_dataset/data-00001-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5036abbfeaf78a0b696fc26833d80c2b44ca32bd57f0a62eb1760ebf58261485
+ size 473246368
data/train_dataset/data-00002-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45186315c884dbb0b03aefc8d7ebc4a43fe416cb1991acdc6643f8038806aea1
+ size 472882424
data/train_dataset/data-00003-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4489cea42c1478171f9ed986fae4da66833daa184353b6d2638cd463d9e0d6e
+ size 473004376
data/train_dataset/data-00004-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:572836a964eef213e9570d19b5abf74d8b9d99591e25e498d9f4141a8a845b4b
+ size 473022936
data/train_dataset/data-00005-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a3d18f5c5a879daa106b1b596493471ca89075023d4a9aa3335c7aa866e7d06
+ size 472883784
data/train_dataset/data-00006-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e38a00712bcb23bdd1a93a1c5299969607a25319542b3c24a0d0e26eadc36d1
+ size 472906568
data/train_dataset/data-00007-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e6366921b4e83156591b110b91a85629cce7a56a2fc16922603f29da1b7e2da
+ size 473162936
data/train_dataset/data-00008-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a99404147a03ed183bf5dad5c834de9a83bcd2dc2dc6531c0b8ad31fe5a113f1
+ size 473061832
data/train_dataset/data-00009-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9593e6cfc36949e9107db0b17b06ad1f183cd9d3e0bb8afdc98b8f64bcf5765
+ size 472899088
data/train_dataset/data-00010-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71bb48eed735d31e8c1e4e3503140fd3c7137c5aec0cec4dca2eb2ac12faf59d
+ size 473032744
data/train_dataset/data-00011-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04884fad18f5ab356bebce10fbf996407f78f432d12d14e693cf18fcdcb7f700
+ size 472870416
data/train_dataset/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee8e05b4381db6d6cc7427c83f0536c1da684169b0b57e251a76d23a3d084670
+ size 431
data/train_dataset/state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cfacc7784b691edb44b06521c361be2d888ed9cb961eb70584bab9303760293
+ size 947
git-guide.txt ADDED
@@ -0,0 +1,28 @@
+ xdg-open ~/.git-credentials
+ xdg-open ~/.gitconfig
+ xdg-open ~/.cache/huggingface
+
+ huggingface-cli login
+ huggingface-cli repo create "repo-name"
+
+ git config --global user.email "[email protected]"
+ git config --global user.name "username"
+
+ uv init
+ * git lfs track sends large files to a dedicated storage; what gets pushed is not the files themselves but pointers to them in that storage
+ git lfs track "data/**/*" (the data directory; ** - all subdirectories; * - all files)
+ git lfs track "models/**/*"
+
+ git add .gitattributes
+ git add . (add every file in this directory)
+
+ git commit -m "Initial commit"
+
+ git add git-guide.txt
+ git commit --amend --no-edit (if you changed a file after making a commit and don't want to create a new one)
+
+ git remote add origin https://huggingface.co/Sierra-W117/math-gpt2
+ (git remote remove origin)
+
+ git branch -M main
+ git push -u origin main
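
Note: every path under data/ and models/ in this commit is checked in as a Git LFS pointer (the three-line version/oid/size stanzas in the diffs above and below), not as the actual bytes. A minimal sketch of reading such a pointer in Python; parse_lfs_pointer is a hypothetical helper, written here only to illustrate the pointer layout:

def parse_lfs_pointer(path):
    # A Git LFS pointer file is three "key value" lines:
    #   version https://git-lfs.github.com/spec/v1
    #   oid sha256:<hash of the real file's contents>
    #   size <real file's size in bytes>
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# e.g. parse_lfs_pointer("data/train.txt")
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:a1d5...', 'size': '1243204995'}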
models/math-gpt2-v0/checkpoint-64/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:285d59ee8e224c185ad2239766d59b8c8c60fc4911b1fc98000ed47c06749151
+ size 907
models/math-gpt2-v0/checkpoint-64/generation_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c831d77f4ccb7e22873162a06bdb0b846a588f6586890c6aab7cec1b7ef9a60e
+ size 119
models/math-gpt2-v0/checkpoint-64/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12b69d3d4944bac6ac070c5912020130f1a85aac5557f469a55e3170f7ab338a
+ size 497774208
models/math-gpt2-v0/checkpoint-64/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b03bc453c349d0eb2faba453ac1ab28097404a05d27a942294b8269e8894f1a6
+ size 995642298
models/math-gpt2-v0/checkpoint-64/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be2f08e0a34d93e3742746f4ecb1a5a6653f3b1da6db4b41f5f39cf3ae48a6f7
+ size 14244
models/math-gpt2-v0/checkpoint-64/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d0278b77bd0a6a0013d1641e2fe9dd8de3641b1d4a7ec8656f671cec7965afc
+ size 1064
models/math-gpt2-v0/checkpoint-64/trainer_state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf0bf27b94c3ebb86dcdf29d67753a85cd2510fcfbb14ef590bc7046bbe49eff
+ size 744
models/math-gpt2-v0/checkpoint-64/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb4c73e15e28f4fa556da7bde75ca259de5e13418df731c6ac6b42092336fde5
+ size 5304
pyproject.toml ADDED
@@ -0,0 +1,15 @@
+ [project]
+ name = "math-gpt2"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "accelerate>=1.2.1",
+     "datasets>=3.2.0",
+     "ipykernel>=6.29.5",
+     "ipywidgets>=8.1.5",
+     "numpy>=2.2.1",
+     "torch>=2.5.1",
+     "transformers>=4.47.1",
+ ]
training.ipynb ADDED
@@ -0,0 +1,482 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "True"
+       ]
+      },
+      "execution_count": 1,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "import torch\n",
+     "torch.cuda.is_available()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "from datasets import load_dataset, Dataset, concatenate_datasets\n",
+     "from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, GPT2LMHeadModel"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "DatasetDict({\n",
+        "    train: Dataset({\n",
+        "        features: ['source', 'problem', 'solution', 'messages'],\n",
+        "        num_rows: 859494\n",
+        "    })\n",
+        "    test: Dataset({\n",
+        "        features: ['source', 'problem', 'solution', 'messages'],\n",
+        "        num_rows: 100\n",
+        "    })\n",
+        "})"
+       ]
+      },
+      "execution_count": 3,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "cache_dir = os.path.join(os.getcwd(), \"data\")\n",
+     "\n",
+     "dataset = load_dataset(\"AI-MO/NuminaMath-CoT\", cache_dir = cache_dir)\n",
+     "dataset"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Problem: Find all solutions to the equation $\\displaystyle\\sqrt[3]{3 - \\frac{x}{3}} = -2$. \n",
+       "Solution: Start by isolating the cube root:\n",
+       "$$ \\sqrt[3]{3 - \\frac{x}{3}} = -2 $$\n",
+       "\n",
+       "Cube both sides to eliminate the cube root:\n",
+       "$$ 3 - \\frac{x}{3} = (-2)^3 $$\n",
+       "$$ 3 - \\frac{x}{3} = -8 $$\n",
+       "\n",
+       "Solve for $x$:\n",
+       "$$ 3 + 8 = \\frac{x}{3} $$\n",
+       "$$ 11 = \\frac{x}{3} $$\n",
+       "$$ x = 33 $$\n",
+       "\n",
+       "Thus, the solution to the equation is:\n",
+       "$$ \\boxed{x = 33} $$\n"
+      ]
+     }
+    ],
+    "source": [
+     "constanta = 4\n",
+     "\n",
+     "problem = dataset['train']['problem'][constanta]\n",
+     "solution = dataset['train']['solution'][constanta]\n",
+     "\n",
+     "print(f\"Problem: {problem} \\nSolution: {solution}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def format_text(item):\n",
+     "    return f\"Problem: {item['problem']}\\nSolution: {item['solution']}\\n\\n\"\n",
+     "\n",
+     "def prepare_dataset(dataset_dict, file_name, cache_dir):\n",
+     "\n",
+     "    file = os.path.join(cache_dir, f\"{file_name}.txt\")\n",
+     "\n",
+     "    if os.path.exists(file):\n",
+     "        return file\n",
+     "    \n",
+     "    with open(file, 'w', encoding='utf-8') as f:\n",
+     "        for item in dataset_dict:\n",
+     "            f.write(format_text(item))\n",
+     "    \n",
+     "    return file\n",
+     "\n",
+     "train_file = prepare_dataset(\n",
+     "    dataset_dict = dataset['train'],\n",
+     "    file_name = \"train\",\n",
+     "    cache_dir = cache_dir\n",
+     ")\n",
+     "\n",
+     "test_file = prepare_dataset(\n",
+     "    dataset_dict = dataset['test'],\n",
+     "    file_name = \"test\",\n",
+     "    cache_dir = cache_dir\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
+     "tokenizer.pad_token = tokenizer.eos_token"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "26997273"
+       ]
+      },
+      "execution_count": 7,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "with open(train_file, 'r') as f:\n",
+     "    num_lines = sum(1 for line in f)\n",
+     "num_lines"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 8,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def tokenize_function(examples, tokenizer):\n",
+     "    item = tokenizer(\n",
+     "        examples['text'],\n",
+     "        truncation = True,\n",
+     "        max_length = 64,\n",
+     "        padding = True,\n",
+     "        return_tensors = 'pt'\n",
+     "    ).to(\"cuda\")\n",
+     "    return item\n",
+     "\n",
+     "\n",
+     "def process_chunk(text_chunk, tokenizer):\n",
+     "    dataset = Dataset.from_dict({\n",
+     "        'text': text_chunk\n",
+     "    }).map(\n",
+     "        lambda x: tokenize_function(x, tokenizer),\n",
+     "        batched = True\n",
+     "    )\n",
+     "    return dataset\n",
+     "\n",
+     "\n",
+     "def create_dataset(file, tokenizer, dataset_name, cache_dir, chunk_size = 750_000, max_chunks = 20):\n",
+     "    dataset_path = os.path.join(cache_dir, dataset_name)\n",
+     "\n",
+     "    if os.path.exists(dataset_path):\n",
+     "        return Dataset.load_from_disk(dataset_path)\n",
+     "    \n",
+     "    datasets = []\n",
+     "    chunks_processed = 0\n",
+     "    \n",
+     "    with open(file, 'r', encoding = 'utf-8') as f:\n",
+     "        while chunks_processed < max_chunks:\n",
+     "            text_chunk = []\n",
+     "            while len(text_chunk) < chunk_size:\n",
+     "                line = f.readline()\n",
+     "                if not line:\n",
+     "                    break\n",
+     "                text_chunk.append(line)\n",
+     "            \n",
+     "            if text_chunk:\n",
+     "                datasets.append(process_chunk(text_chunk, tokenizer))\n",
+     "            chunks_processed += 1\n",
+     "    \n",
+     "\n",
+     "    the_dataset = concatenate_datasets(datasets)\n",
+     "    the_dataset.save_to_disk(dataset_path)\n",
+     "    \n",
+     "    return the_dataset\n",
+     "\n",
+     "\n",
+     "train_dataset = create_dataset(\n",
+     "    file = train_file, \n",
+     "    tokenizer = tokenizer,\n",
+     "    dataset_name = \"train_dataset\",\n",
+     "    cache_dir = cache_dir\n",
+     "    )\n",
+     "\n",
+     "test_dataset = create_dataset(\n",
+     "    file = test_file, \n",
+     "    tokenizer = tokenizer,\n",
+     "    dataset_name = \"test_dataset\",\n",
+     "    cache_dir = cache_dir,\n",
+     "    max_chunks = 1\n",
+     "    )"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 9,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "data_collator = DataCollatorForLanguageModeling(\n",
+     "    tokenizer = tokenizer,\n",
+     "    mlm = False\n",
+     "    )"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 10,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'text': 'Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?\\n', 'input_ids': [40781, 25, 39200, 326, 720, 70, 7, 87, 8, 796, 642, 87, 532, 513, 35307, 1867, 318, 720, 70, 36796, 12, 16, 92, 7, 70, 36796, 12, 16, 92, 7, 1415, 4008, 3, 30, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(train_dataset[13])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 11,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?\n",
+       "<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(tokenizer.decode(train_dataset[13]['input_ids']))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 12,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def compute_metrics(eval_pred):\n",
+     "    predictions, labels = eval_pred\n",
+     "    # shift so each position predicts the next token, as GPT-2 does internally\n",
+     "    loss = torch.nn.CrossEntropyLoss()(\n",
+     "        torch.tensor(predictions)[:, :-1].flatten(0, 1).cuda(),\n",
+     "        torch.tensor(labels)[:, 1:].flatten().cuda()\n",
+     "    ).item()\n",
+     "    # Trainer expects a dict; the key must match metric_for_best_model below\n",
+     "    return {\"cross_entropy\": loss}"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 13,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "15000000"
+       ]
+      },
+      "execution_count": 13,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "len(train_dataset)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 14,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "train_dataset = train_dataset.select(range(len(train_dataset)//15_000))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 21,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "os.makedirs(\"models\", exist_ok = True)\n",
+     "os.makedirs(\"models/math-gpt2-v0\", exist_ok = True)\n",
+     "os.makedirs(\"models/math-gpt2-v0/logs\", exist_ok = True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 23,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import gc\n",
+     "\n",
+     "gc.collect()\n",
+     "torch.cuda.empty_cache()\n",
+     "\n",
+     "# idea - keep the best checkpoint and the two on either side of it\n",
+     "# do I understand correctly that with the current settings only one model will be saved, while all the logs are kept?"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 24,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model = GPT2LMHeadModel.from_pretrained('gpt2')\n",
+     "model.resize_token_embeddings(len(tokenizer))\n",
+     "model = model.to(\"cuda\")\n",
+     "\n",
+     "training_args = TrainingArguments(\n",
+     "\n",
+     "    output_dir = \"models/math-gpt2-v0\", # directory where model checkpoints are saved\n",
+     "    logging_dir = \"models/math-gpt2-v0/logs\", # where training logs are written, in TensorBoard format\n",
+     "    run_name = \"math-gpt2-v0\", # experiment name in logging systems (e.g., TensorBoard)\n",
+     "    overwrite_output_dir = False,\n",
+     "    save_total_limit = 1, # keep only the best-scoring checkpoint across all steps\n",
+     "    save_strategy = \"steps\",\n",
+     "    \n",
+     "    do_eval = True, # run evaluation\n",
+     "    eval_strategy = \"steps\", # evaluation schedule: per epoch or every N steps\n",
+     "    eval_steps = 10000, # number of steps between evaluations\n",
+     "\n",
+     "    load_best_model_at_end = True,\n",
+     "    metric_for_best_model = \"cross_entropy\", # which metric to rank checkpoints by\n",
+     "    greater_is_better = False, # whether to maximize or minimize it (here: minimize)\n",
+     "\n",
+     "    num_train_epochs = 4,\n",
+     "    per_device_train_batch_size = 32, # number of training examples per GPU device\n",
+     "    per_device_eval_batch_size = 64, \n",
+     "    gradient_checkpointing = True, # saves GPU memory by recomputing some intermediate activations instead of storing them\n",
+     "    fp16 = True, # use 16-bit floating point instead of the standard 32-bit (fp32)\n",
+     "\n",
+     "    gradient_accumulation_steps = 2, # accumulate gradients over N batches before each weight update\n",
+     "    eval_accumulation_steps = 2, # accumulate the results of N batches during evaluation\n",
+     "\n",
+     "    learning_rate = 5e-5, # EXPERT'S ANSWER: this is the default base lr\n",
+     "    warmup_steps = 500, # number of warmup steps for the learning rate; during warmup the lr is gradually ramped up\n",
+     "    weight_decay = 0.01, # the values below are also defaults, I know\n",
+     "    adam_beta1 = 0.9,\n",
+     "    adam_beta2 = 0.999,\n",
+     "    max_grad_norm = 1.0,\n",
+     "    \n",
+     "    save_steps = 10000, # how often to save the model\n",
+     "    logging_steps = 10000, # how often to log metrics\n",
+     "    push_to_hub = False, # whether to upload the model to the Hugging Face Hub\n",
+     ")\n",
+     "\n",
+     "trainer = Trainer(\n",
+     "    model = model,\n",
+     "    args = training_args,\n",
+     "    data_collator = data_collator,\n",
+     "    train_dataset = train_dataset,\n",
+     "    eval_dataset = test_dataset,\n",
+     "    compute_metrics = compute_metrics\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 25,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "c4125cf1faae456796edb8f452529acb",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/64 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'train_runtime': 27.7282, 'train_samples_per_second': 144.257, 'train_steps_per_second': 2.308, 'train_loss': 6.928213119506836, 'epoch': 4.0}\n"
+      ]
+     },
+     {
+      "data": {
+       "text/plain": [
+        "TrainOutput(global_step=64, training_loss=6.928213119506836, metrics={'train_runtime': 27.7282, 'train_samples_per_second': 144.257, 'train_steps_per_second': 2.308, 'total_flos': 130646016000000.0, 'train_loss': 6.928213119506836, 'epoch': 4.0})"
+       ]
+      },
+      "execution_count": 25,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "trainer.train()"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.12.3"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
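
A quick sanity check on the trainer.train() output above: for a causal language model, perplexity is exp(cross-entropy), so the reported train_loss of ~6.93 corresponds to a perplexity of roughly 1000. That is well below the ~10.8 loss a uniform guess over GPT-2's 50257-token vocabulary would give, but the model is still clearly undertrained (64 optimizer steps on 1000 examples):

import math

train_loss = 6.928213119506836  # from the TrainOutput above
print(math.exp(train_loss))     # ~1020: the model's effective branching factor per token
print(math.log(50257))          # ~10.82: the loss of a uniform guess over GPT-2's vocabulary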
using.ipynb ADDED
@@ -0,0 +1,156 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "aebdeaaf529549a1ad39ccf838e39e71",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "26da6488303344ce90900207ae33b6bd",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "1eea77d125a74e00b1f050960dfa42aa",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "b704ae3896dc46698ca50b7a29230bde",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "42eaaf035fad40f583ee01dac1a86e85",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
+     "\n",
+     "model = GPT2LMHeadModel.from_pretrained(\"models/math-gpt2-v0/checkpoint-64\")\n",
+     "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
+     "model = model.to(\"cuda\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Your math prompt here:\n",
+       "\n",
+       "$ x = (m^3 + k) + (m^4 + k) + (m^5 + k)\n",
+       "\n",
+       "Now, we have:\n",
+       "\n",
+       "$ x = (m^3 +\n"
+      ]
+     }
+    ],
+    "source": [
+     "def generate_text(prompt, max_length=50):\n",
+     "\n",
+     "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n",
+     "    \n",
+     "    outputs = model.generate(\n",
+     "        inputs[\"input_ids\"],\n",
+     "        max_length=max_length,\n",
+     "        pad_token_id=tokenizer.eos_token_id,\n",
+     "        num_return_sequences=1,\n",
+     "        do_sample=True,\n",
+     "        temperature=0.7\n",
+     "    )\n",
+     "    \n",
+     "    return tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
+     "\n",
+     "prompt = \"Your math prompt here\"\n",
+     "generated_text = generate_text(prompt)\n",
+     "print(generated_text)"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.12.3"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
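
The stderr warning in the generation cell above appears because model.generate() receives only input_ids while the pad token equals the eos token, so the attention mask cannot be inferred. A small variant of that cell that passes the tokenizer's mask explicitly; a sketch using the same checkpoint path as above, not code from the repo:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("models/math-gpt2-v0/checkpoint-64").to("cuda")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

inputs = tokenizer("Your math prompt here", return_tensors="pt").to("cuda")
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # explicit mask, since pad token == eos token
    max_length=50,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.7,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))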
uv.lock ADDED
The diff for this file is too large to render.