diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..633a9315f8b3bdcb17200cabb243281fbc09053b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104210813.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104210818.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104210822.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104210941.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104210950.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211013.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211022.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211033.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211043.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211051.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211059.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211102.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211152.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211159.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211209.jpg filter=lfs diff=lfs merge=lfs -text
+Fresh[[:space:]]Produce/IMG20250104211221.jpg filter=lfs diff=lfs merge=lfs -text
+Grocery_images/Bro[[:space:]]date[[:space:]]vunna[[:space:]]pics[[:space:]]theyi.jpg filter=lfs diff=lfs merge=lfs -text
+Grocery_images/Bro[[:space:]]date[[:space:]]vunna[[:space:]]pics[[:space:]]theyi(1).jpg filter=lfs diff=lfs merge=lfs -text
+Grocery_images/Bro[[:space:]]date[[:space:]]vunna[[:space:]]pics[[:space:]]theyi(2).jpg filter=lfs diff=lfs merge=lfs -text
+Grocery_images/Bro[[:space:]]date[[:space:]]vunna[[:space:]]pics[[:space:]]theyi(3).jpg filter=lfs diff=lfs merge=lfs -text
+Grocery_images/Bro[[:space:]]date[[:space:]]vunna[[:space:]]pics[[:space:]]theyi(4).jpg filter=lfs diff=lfs merge=lfs -text
+Grocery_images/Bro[[:space:]]date[[:space:]]vunna[[:space:]]pics[[:space:]]theyi(5).jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/Fresh Produce/IMG20250104210813.jpg b/Fresh Produce/IMG20250104210813.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dc246b824e62aa8f909e55a0b5478f8ce6290f9f
--- /dev/null
+++ b/Fresh Produce/IMG20250104210813.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35970b1fb8e2201f8c1259985a3d838c5f9096b97d729583b9b643d0ffbe3cb7
+size 1371145
diff --git a/Fresh Produce/IMG20250104210818.jpg b/Fresh Produce/IMG20250104210818.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fe1f045b64a3c29443d565fe079947f9d77e6f18
--- /dev/null
+++ b/Fresh Produce/IMG20250104210818.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:168988849d03aa8288a0d864224b4248d086a2cbd291a3bcfbf86b531f0a1f04
+size 1422495
diff --git a/Fresh Produce/IMG20250104210822.jpg b/Fresh Produce/IMG20250104210822.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..824d95142f1fc46261d855f49631e206871e8f83
--- /dev/null
+++ b/Fresh Produce/IMG20250104210822.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12a1d66baf85aa636cb703020206f417a2471855a3861f01f33eb0d4065cf4df
+size 1504660
diff --git a/Fresh Produce/IMG20250104210941.jpg b/Fresh Produce/IMG20250104210941.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..08af00b691c033d6ff398206564bbf9c40812f75
--- /dev/null
+++ b/Fresh Produce/IMG20250104210941.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb3f1a3d4c4b14d2f4d01dc4c434729bde72c7f248ea674ae4a3c961431110a3
+size 1553122
diff --git a/Fresh Produce/IMG20250104210950.jpg b/Fresh Produce/IMG20250104210950.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c1322d5ecdee6ee4b0644928a98bd782d0c17ae6
--- /dev/null
+++ b/Fresh Produce/IMG20250104210950.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e692f6f4fe1050559baf96f8378b2c6ac22ee4dbabdc0515ab2dfe8de5778d3e
+size 1928506
diff --git a/Fresh Produce/IMG20250104211013.jpg b/Fresh Produce/IMG20250104211013.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2af13e9992b692857d72ff0b8c8f87f4f1d4549c
--- /dev/null
+++ b/Fresh Produce/IMG20250104211013.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88a0caa12ced3e8d26d452f0cfcad0ac9cb236f3f2f52c5ff5a3588158c99f6a
+size 1699983
diff --git a/Fresh Produce/IMG20250104211022.jpg b/Fresh Produce/IMG20250104211022.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..333a51c72342d0a38eb124f645222338cb1d191a
--- /dev/null
+++ b/Fresh Produce/IMG20250104211022.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d456d35f365c8deed52d05386ed8ef60f4099070ef67b1f0575900cfa8a76d9c
+size 1566402
diff --git a/Fresh Produce/IMG20250104211033.jpg b/Fresh Produce/IMG20250104211033.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b33cb7b8e8e90f17bbc0f8b8662ab41e46a9920f
--- /dev/null
+++ b/Fresh Produce/IMG20250104211033.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e3dc77b1b31f46653e5aacf9549de1d66bcce4ffff5f444368eb6246e68bc02
+size 1980616
diff --git a/Fresh Produce/IMG20250104211043.jpg b/Fresh Produce/IMG20250104211043.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..44e81e5fca6a090ed62fd8f825102f7e797dbe15
--- /dev/null
+++ b/Fresh Produce/IMG20250104211043.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb0f929e203e75416baf8eb7fb1f60a72c0a8157a5e9da6be1b41b0a8f017ae0
+size 1603087
diff --git a/Fresh Produce/IMG20250104211051.jpg b/Fresh Produce/IMG20250104211051.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e035583395a991450c7a7ac46127d4181583d6c5
--- /dev/null
+++ b/Fresh Produce/IMG20250104211051.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aff02fa88d5d1b0a489c39a7c0a70075053000b12a70ab5ace47197d2d93a92e
+size 1619012
diff --git a/Fresh Produce/IMG20250104211059.jpg b/Fresh Produce/IMG20250104211059.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8a7de144e70682d8519c8be4308926ca5dea5a00
--- /dev/null
+++ b/Fresh Produce/IMG20250104211059.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e607e0bbae62b24f99388dd41183b4b0e96ce04bcb232deab1368eb043627a92
+size 1558061
diff --git a/Fresh Produce/IMG20250104211102.jpg b/Fresh Produce/IMG20250104211102.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..37187beaee3a2a36533b2ba4bf8e84a4b9300df6
--- /dev/null
+++ b/Fresh Produce/IMG20250104211102.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee00d9b266dd8d2a3f47fdcad0b58bb9d7d7cba2ca14f91d82cbc01beebba6a
+size 1619177
diff --git a/Fresh Produce/IMG20250104211152.jpg b/Fresh Produce/IMG20250104211152.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..515e36d6a3e7217a63aac0475164c973adde05ed
--- /dev/null
+++ b/Fresh Produce/IMG20250104211152.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fdc212b905fbbd0ccb3531a91e6f4baf004ffa355755822a922220cc9acccc8
+size 1609153
diff --git a/Fresh Produce/IMG20250104211159.jpg b/Fresh Produce/IMG20250104211159.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ec77b9cc0274d4403c12295fa499b02543d4757f
--- /dev/null
+++ b/Fresh Produce/IMG20250104211159.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e411dae51aba9d2fb4711225893d9cbb0553ab056142545f4de8308ccb11e526
+size 1801429
diff --git a/Fresh Produce/IMG20250104211209.jpg b/Fresh Produce/IMG20250104211209.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d3ea06deeb9a839f2bb13baace172150d366d372
--- /dev/null
+++ b/Fresh Produce/IMG20250104211209.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b64c0258c084cc971a2c510e83644e3e01967d8ad1b0a693227735e2cf7497a3
+size 1513431
diff --git a/Fresh Produce/IMG20250104211221.jpg b/Fresh Produce/IMG20250104211221.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eba9803fb8a18c3c7726d7cd8f28dbe9d83b900d
--- /dev/null
+++ b/Fresh Produce/IMG20250104211221.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3a9531851b0fa0ea92dbe5447f064ee7f9a7f40af73212fab7451d39b93b0d9
+size 1501830
diff --git a/Grocery_images/Bro date vunna pics theyi(1).jpg b/Grocery_images/Bro date vunna pics theyi(1).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..43f62ad8b1589e316d64cb96e0c3d5b2ca365a92
--- /dev/null
+++ b/Grocery_images/Bro date vunna pics theyi(1).jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2411f4d99c7e58cf6693883a638b6493cfc02a55255bd6392f324ffe6be532dd
+size 1869692
diff --git a/Grocery_images/Bro date vunna pics theyi(2).jpg b/Grocery_images/Bro date vunna pics theyi(2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ce464f0e447fbb30b339a42cf2cc2863b13b572f
--- /dev/null
+++ b/Grocery_images/Bro date vunna pics theyi(2).jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f400b93c0cf9d16ec7b0f7722a4282d8720a952a7352c42d528ba9d9520b8231
+size 1952314
diff --git a/Grocery_images/Bro date vunna pics theyi(3).jpg b/Grocery_images/Bro date vunna pics theyi(3).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e789703efd06378de0fac9805bcee285d713a3b9
--- /dev/null
+++ b/Grocery_images/Bro date vunna pics theyi(3).jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a579c50defeec1da8644de0a4f64f7a418b597019279d01d008cbf89f826ba30
+size 1665700
diff --git a/Grocery_images/Bro date vunna pics theyi(4).jpg b/Grocery_images/Bro date vunna pics theyi(4).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ee90405eff02e55bd7df55ab38087771aea8cb9e
--- /dev/null
+++ b/Grocery_images/Bro date vunna pics theyi(4).jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:773fa83dff534aac8d321d310bf06fff0985233f0946c2edcf11ec1f59fbdbfd
+size 2253503
diff --git a/Grocery_images/Bro date vunna pics theyi(5).jpg b/Grocery_images/Bro date vunna pics theyi(5).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b0538ff6c2c89a642cbf80efbfcaac6a5c0354dc
--- /dev/null
+++ b/Grocery_images/Bro date vunna pics theyi(5).jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efa136954cb84f86fba370b78f9b8d90195abdc8282ea73c5bc9034dd242a48d
+size 1676873
diff --git a/Grocery_images/Bro date vunna pics theyi.jpg b/Grocery_images/Bro date vunna pics theyi.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b1ede2aeef27ad1842d4f7facdb8dd062f02e8ac
--- /dev/null
+++ b/Grocery_images/Bro date vunna pics theyi.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5678d071afeca4121021830d3fef4b12d55ec3ca8e5c0afd3cfbc56ce9f4b315
+size 1988962
diff --git a/Grocery_images/Grocery-1.jpg b/Grocery_images/Grocery-1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e92505b4d67c157e75000f6114c859a5a526c8b7
Binary files /dev/null and b/Grocery_images/Grocery-1.jpg differ
diff --git a/Grocery_images/Grocery-10.jpg b/Grocery_images/Grocery-10.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9051010bd058ee9b23f6c57d3cb0448b287d6d91
Binary files /dev/null and b/Grocery_images/Grocery-10.jpg differ
diff --git a/Grocery_images/Grocery-11.jpg b/Grocery_images/Grocery-11.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c6c2b9faece6e4a9200904ab0a624f16b5220da6
Binary files /dev/null and b/Grocery_images/Grocery-11.jpg differ
diff --git a/Grocery_images/Grocery-12.jpg b/Grocery_images/Grocery-12.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cd734e7f75c424f974951d6b40bbf548f8c6d14c
Binary files /dev/null and b/Grocery_images/Grocery-12.jpg differ
diff --git a/Grocery_images/Grocery-13.jpg b/Grocery_images/Grocery-13.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fca786059e55a43c4f7929438a1b84346b4ca750
Binary files /dev/null and b/Grocery_images/Grocery-13.jpg differ
diff --git a/Grocery_images/Grocery-14.jpg b/Grocery_images/Grocery-14.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8bcba75b57142b093ce93adeb418fcf0988ead66
Binary files /dev/null and b/Grocery_images/Grocery-14.jpg differ
diff --git a/Grocery_images/Grocery-15.jpg b/Grocery_images/Grocery-15.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..96fbcad6c5371e84d04a80ad6a564f4ad0f31def
Binary files /dev/null and b/Grocery_images/Grocery-15.jpg differ
diff --git a/Grocery_images/Grocery-16.jpg b/Grocery_images/Grocery-16.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c5370a7e2c4ce730eb4733b95aa7dc6207b34674
Binary files /dev/null and b/Grocery_images/Grocery-16.jpg differ
diff --git a/Grocery_images/Grocery-17.jpg b/Grocery_images/Grocery-17.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3798a16aa8fc2b967d54af0471637f1b3bb2ce30
Binary files /dev/null and b/Grocery_images/Grocery-17.jpg differ
diff --git a/Grocery_images/Grocery-18.jpg b/Grocery_images/Grocery-18.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..43e6d0f204bd8ea59f026ef63016332ce3d4d022
Binary files /dev/null and b/Grocery_images/Grocery-18.jpg differ
diff --git a/Grocery_images/Grocery-2.jpg b/Grocery_images/Grocery-2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..601164d328fa3c5fbc07b6466f0cafae216e890a
Binary files /dev/null and b/Grocery_images/Grocery-2.jpg differ
diff --git a/Grocery_images/Grocery-3.jpg b/Grocery_images/Grocery-3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..92e8d5fad4e262c7d6bf5b1f384a98dfc16fb22d
Binary files /dev/null and b/Grocery_images/Grocery-3.jpg differ
diff --git a/Grocery_images/Grocery-4.jpg b/Grocery_images/Grocery-4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9e3218524835d6745641cd0e33e911b07e92b35b
Binary files /dev/null and b/Grocery_images/Grocery-4.jpg differ
diff --git a/Grocery_images/Grocery-5.jpg b/Grocery_images/Grocery-5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b815c4047db2ef29a14108f0d5bb5587c389f082
Binary files /dev/null and b/Grocery_images/Grocery-5.jpg differ
diff --git a/Grocery_images/Grocery-6.jpg b/Grocery_images/Grocery-6.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0f22e25282bc96e29a94a9dbe3471dace1d1ec54
Binary files /dev/null and b/Grocery_images/Grocery-6.jpg differ
diff --git a/Grocery_images/Grocery-7.jpg b/Grocery_images/Grocery-7.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bcbda36d12b5cf733911f20544e9d51d5860c4a0
Binary files /dev/null and b/Grocery_images/Grocery-7.jpg differ
diff --git a/Grocery_images/Grocery-8.jpg b/Grocery_images/Grocery-8.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..71d3d24300ddd30d4065bac7ec02ecdf0d7eee27
Binary files /dev/null and b/Grocery_images/Grocery-8.jpg differ
diff --git a/Grocery_images/Grocery-9.jpg b/Grocery_images/Grocery-9.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7040c3d417c20c4e459fc534eb83d4350bebb33a
Binary files /dev/null and b/Grocery_images/Grocery-9.jpg differ
diff --git a/Grocery_images/image1.1.jpg b/Grocery_images/image1.1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..aef2b670f2b6a5c4d96aa88496f8c6a888b257d0
Binary files /dev/null and b/Grocery_images/image1.1.jpg differ
diff --git a/Grocery_images/image1.2.jpg b/Grocery_images/image1.2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..32550799ee56c796a3a9c8f60a1b395784e1c28b
Binary files /dev/null and b/Grocery_images/image1.2.jpg differ
diff --git a/Grocery_images/image1.3.jpg b/Grocery_images/image1.3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8c50a00e37264d2ec49be2c7569804e5350ba3a3
Binary files /dev/null and b/Grocery_images/image1.3.jpg differ
diff --git a/Grocery_images/image1.4.jpg b/Grocery_images/image1.4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f207c251cd9ecfa19c3560ee1bc3ac66667657a5
Binary files /dev/null and b/Grocery_images/image1.4.jpg differ
diff --git a/Grocery_images/image_1.jpeg b/Grocery_images/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..568f6ad554c676542101ba170b85ec11459adbef
Binary files /dev/null and b/Grocery_images/image_1.jpeg differ
diff --git a/Grocery_images/img.jpg b/Grocery_images/img.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..00efaf807de672f0e6ad3d67b83fb03cd4cf3e83
Binary files /dev/null and b/Grocery_images/img.jpg differ
diff --git a/Grocery_images/img1.jpg b/Grocery_images/img1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8c6297035ed2463b8e51bfaa5d3b7f2bd0434490
Binary files /dev/null and b/Grocery_images/img1.jpg differ
diff --git a/Grocery_images/img2.jpg b/Grocery_images/img2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..51f18dbe32b2922f9dbd179bab73bf207a059818
Binary files /dev/null and b/Grocery_images/img2.jpg differ
diff --git a/Grocery_images/img3.jpg b/Grocery_images/img3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4bf5bc579ecd646ac8673b159fa69986c4210b81
Binary files /dev/null and b/Grocery_images/img3.jpg differ
diff --git a/Grocery_images/img4.jpg b/Grocery_images/img4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d2378c6c621592ec197ccf6e46b3b99dd30b4601
Binary files /dev/null and b/Grocery_images/img4.jpg differ
diff --git a/Grocery_images/img5.jpg b/Grocery_images/img5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1a80962c8600ba2d99689c0fac1e176d2e95190f
Binary files /dev/null and b/Grocery_images/img5.jpg differ
diff --git a/Grocery_images/img6.jpg b/Grocery_images/img6.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..552ee995486cbcedcefab509f87137a111ecc495
Binary files /dev/null and b/Grocery_images/img6.jpg differ
diff --git a/Grocery_images/img7.jpg b/Grocery_images/img7.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5f5122ba215901c096cc921115198cdd75a42285
Binary files /dev/null and b/Grocery_images/img7.jpg differ
diff --git a/internet-groceryimages/-original-imag9gddgjrcngmx.jpeg b/internet-groceryimages/-original-imag9gddgjrcngmx.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..bcaf25e47bd49104b1cb8c41d2516a5375a5330a
Binary files /dev/null and b/internet-groceryimages/-original-imag9gddgjrcngmx.jpeg differ
diff --git a/internet-groceryimages/-original-imagep3bchzvckxh.jpeg b/internet-groceryimages/-original-imagep3bchzvckxh.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..6dc6256438727eba890d171ad31361a9396758aa
Binary files /dev/null and b/internet-groceryimages/-original-imagep3bchzvckxh.jpeg differ
diff --git a/internet-groceryimages/-original-imagfqkztj5gysag.jpeg b/internet-groceryimages/-original-imagfqkztj5gysag.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..486a9fcea2a69a46a8b30b38a2ba7ba2ebe2c775
Binary files /dev/null and b/internet-groceryimages/-original-imagfqkztj5gysag.jpeg differ
diff --git a/internet-groceryimages/-original-imaghppjutbn73er.jpeg b/internet-groceryimages/-original-imaghppjutbn73er.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..52b8de574e7d3e21d26d44d66d4f6d2cac349bb9
Binary files /dev/null and b/internet-groceryimages/-original-imaghppjutbn73er.jpeg differ
diff --git a/internet-groceryimages/-original-imagk3egzhvwh9fe.jpeg b/internet-groceryimages/-original-imagk3egzhvwh9fe.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..14f3c58eed5d2c7719d3ff5fbe65c26247ec983d
Binary files /dev/null and b/internet-groceryimages/-original-imagk3egzhvwh9fe.jpeg differ
diff --git a/internet-groceryimages/-original-imagkuyygj9npymg.jpeg b/internet-groceryimages/-original-imagkuyygj9npymg.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..82b1da101ec9e8ec29c4716db636d41c795a821c
Binary files /dev/null and b/internet-groceryimages/-original-imagkuyygj9npymg.jpeg differ
diff --git a/internet-groceryimages/-original-imagkycyfxgbemdq.jpeg b/internet-groceryimages/-original-imagkycyfxgbemdq.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..bea37d509dbc32ee111d1f8e919c87b6820a412f
Binary files /dev/null and b/internet-groceryimages/-original-imagkycyfxgbemdq.jpeg differ
diff --git a/internet-groceryimages/-original-imagmw8zfyj8d92v.jpeg b/internet-groceryimages/-original-imagmw8zfyj8d92v.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..a052b729ea35da58b6da839bf874a3132e024e9a
Binary files /dev/null and b/internet-groceryimages/-original-imagmw8zfyj8d92v.jpeg differ
diff --git a/internet-groceryimages/-original-imagmw8zhzfc65nf.jpeg b/internet-groceryimages/-original-imagmw8zhzfc65nf.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..8d426044d5a380e651c5a282f7a76a41f6059382
Binary files /dev/null and b/internet-groceryimages/-original-imagmw8zhzfc65nf.jpeg differ
diff --git a/internet-groceryimages/-original-imagp8werxcvvbt3.jpeg b/internet-groceryimages/-original-imagp8werxcvvbt3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..d14ed8d6235ac7a5e9a2095162b8a4b27d075811
Binary files /dev/null and b/internet-groceryimages/-original-imagp8werxcvvbt3.jpeg differ
diff --git a/internet-groceryimages/-original-imags5ajqzrnhjqn.jpeg b/internet-groceryimages/-original-imags5ajqzrnhjqn.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c8559bfb29306cee41e063af8ba086119fc9ba58
Binary files /dev/null and b/internet-groceryimages/-original-imags5ajqzrnhjqn.jpeg differ
diff --git a/internet-groceryimages/-original-imagtmhtxb6re3tw.jpeg b/internet-groceryimages/-original-imagtmhtxb6re3tw.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..1df7f904219fa711a5f4ecb3c3da56b0910e0245
Binary files /dev/null and b/internet-groceryimages/-original-imagtmhtxb6re3tw.jpeg differ
diff --git a/internet-groceryimages/-original-imagtrzhegfyfrh3.jpeg b/internet-groceryimages/-original-imagtrzhegfyfrh3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..135a2759dee5d61409807ced6448bb141967c5b7
Binary files /dev/null and b/internet-groceryimages/-original-imagtrzhegfyfrh3.jpeg differ
diff --git a/internet-groceryimages/-original-imagttcf9hc4gadh.jpeg b/internet-groceryimages/-original-imagttcf9hc4gadh.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..2231f93ef0cab34b438dde37bd8ba2d1d19211d1
Binary files /dev/null and b/internet-groceryimages/-original-imagttcf9hc4gadh.jpeg differ
diff --git a/internet-groceryimages/-original-imagurkrk2vqgnwx.jpeg b/internet-groceryimages/-original-imagurkrk2vqgnwx.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..910a87269c57b5a7cd3353dee289ea6284705a13
Binary files /dev/null and b/internet-groceryimages/-original-imagurkrk2vqgnwx.jpeg differ
diff --git a/internet-groceryimages/-original-imagyys8uwbchfgd.jpeg b/internet-groceryimages/-original-imagyys8uwbchfgd.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..769af0baecb2181b2ef4fe203784b1bd2878d25d
Binary files /dev/null and b/internet-groceryimages/-original-imagyys8uwbchfgd.jpeg differ
diff --git a/internet-groceryimages/-original-imagzcqzeq7mrgxy.jpeg b/internet-groceryimages/-original-imagzcqzeq7mrgxy.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..2554c800ad9d43272d6b1ee3c79ba64312d08a73
Binary files /dev/null and b/internet-groceryimages/-original-imagzcqzeq7mrgxy.jpeg differ
diff --git a/internet-groceryimages/-original-imah3jbfaxcbygb7.jpeg b/internet-groceryimages/-original-imah3jbfaxcbygb7.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..8835ce6876adbaeffc7b2c06176665a94335060f
Binary files /dev/null and b/internet-groceryimages/-original-imah3jbfaxcbygb7.jpeg differ
diff --git a/internet-groceryimages/-original-imahfder4gy2qubg.jpeg b/internet-groceryimages/-original-imahfder4gy2qubg.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..95fa3e4df873d5d254ec001eafdf25986fda0af6
Binary files /dev/null and b/internet-groceryimages/-original-imahfder4gy2qubg.jpeg differ
diff --git a/internet-groceryimages/-original-imahfru7wywgv6gy.jpeg b/internet-groceryimages/-original-imahfru7wywgv6gy.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..ff5b23fb1a517ba39ba9c32cbf5f0deee0a84018
Binary files /dev/null and b/internet-groceryimages/-original-imahfru7wywgv6gy.jpeg differ
diff --git a/internet-groceryimages/-original-imahyg7qzst3gwz7.jpeg b/internet-groceryimages/-original-imahyg7qzst3gwz7.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5c1773e679082d09718ba69cf7cb03287c4646c2
Binary files /dev/null and b/internet-groceryimages/-original-imahyg7qzst3gwz7.jpeg differ
diff --git a/internet-groceryimages/-original-imahywzp7hzzjfrn.jpeg b/internet-groceryimages/-original-imahywzp7hzzjfrn.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..a234c0f8f5fd66386bfedc01a8bcea27cb00c2b1
Binary files /dev/null and b/internet-groceryimages/-original-imahywzp7hzzjfrn.jpeg differ
diff --git a/internet-groceryimages/1 (2).jpg b/internet-groceryimages/1 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5dffd1d8eb87fe28c44dafcdd5e66b6e779c610a
Binary files /dev/null and b/internet-groceryimages/1 (2).jpg differ
diff --git a/internet-groceryimages/1-mango-pickle-aam-ka-achar-plastic-bottle-1-pickle-swad-original-imahy9jq9egzzuxd.jpeg b/internet-groceryimages/1-mango-pickle-aam-ka-achar-plastic-bottle-1-pickle-swad-original-imahy9jq9egzzuxd.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..a1c4cfb65e82973e9e174b2d17d6ca481ce83506
Binary files /dev/null and b/internet-groceryimages/1-mango-pickle-aam-ka-achar-plastic-bottle-1-pickle-swad-original-imahy9jq9egzzuxd.jpeg differ
diff --git a/internet-groceryimages/1.jpg b/internet-groceryimages/1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..645a54f5d7874f3de578587f9d3ddcf65ce8bb66
Binary files /dev/null and b/internet-groceryimages/1.jpg differ
diff --git a/internet-groceryimages/10 - Copy.jpg b/internet-groceryimages/10 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ccb25bae03b65f203971e0d3fe5609d6482dc63b
Binary files /dev/null and b/internet-groceryimages/10 - Copy.jpg differ
diff --git a/internet-groceryimages/10.jpg b/internet-groceryimages/10.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9b1977ee35cb38e7646629b8bd828c7423fecafe
Binary files /dev/null and b/internet-groceryimages/10.jpg differ
diff --git a/internet-groceryimages/100-hydrate-toothpaste-diabetic-friendly-sugar-free-toothpaste-1-original-imagyh8uystyzs4x.jpeg b/internet-groceryimages/100-hydrate-toothpaste-diabetic-friendly-sugar-free-toothpaste-1-original-imagyh8uystyzs4x.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..e12073056c651bcf7a316024722a53b1c190c2da
Binary files /dev/null and b/internet-groceryimages/100-hydrate-toothpaste-diabetic-friendly-sugar-free-toothpaste-1-original-imagyh8uystyzs4x.jpeg differ
diff --git a/internet-groceryimages/100.jpg b/internet-groceryimages/100.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6692d932a0d10d155810744e17e608b3223aec92
Binary files /dev/null and b/internet-groceryimages/100.jpg differ
diff --git a/internet-groceryimages/101 - Copy.jpg b/internet-groceryimages/101 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eb579476e6db00938d6bb99d0a98a1d12459c75e
Binary files /dev/null and b/internet-groceryimages/101 - Copy.jpg differ
diff --git a/internet-groceryimages/103 (2).jpg b/internet-groceryimages/103 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..423b72614f1b838d4c2ff293ca3c067b8b420da3
Binary files /dev/null and b/internet-groceryimages/103 (2).jpg differ
diff --git a/internet-groceryimages/103.jpg b/internet-groceryimages/103.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..927770a2acd4b382e6538c77a9dec3cac605fe33
Binary files /dev/null and b/internet-groceryimages/103.jpg differ
diff --git a/internet-groceryimages/105 - Copy.jpg b/internet-groceryimages/105 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..55aaa0667eb0aa71ce33d74b67e8513b9a7bf1fe
Binary files /dev/null and b/internet-groceryimages/105 - Copy.jpg differ
diff --git a/internet-groceryimages/107 - Copy.jpg b/internet-groceryimages/107 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cba37e89b5660608f5ecc0b2cfdfb4792a4e93f7
Binary files /dev/null and b/internet-groceryimages/107 - Copy.jpg differ
diff --git a/internet-groceryimages/107.jpg b/internet-groceryimages/107.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e945abdb94cb7a7f813033ab9a7f61a4f230ba53
Binary files /dev/null and b/internet-groceryimages/107.jpg differ
diff --git a/internet-groceryimages/11 (2).jpg b/internet-groceryimages/11 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..64df2a004c7dfccdc6f229d1415611548df142c5
Binary files /dev/null and b/internet-groceryimages/11 (2).jpg differ
diff --git a/internet-groceryimages/11.jpg b/internet-groceryimages/11.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bcf377c0a31ccfc4f59b089466ce838a9aa55bea
Binary files /dev/null and b/internet-groceryimages/11.jpg differ
diff --git a/internet-groceryimages/110 - Copy.jpg b/internet-groceryimages/110 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1b480c64d1d022a903e70b94431af3cf36b80878
Binary files /dev/null and b/internet-groceryimages/110 - Copy.jpg differ
diff --git a/internet-groceryimages/115.jpg b/internet-groceryimages/115.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..16dfb482b1c35ada1fadd57caff9a50bf95f4cce
Binary files /dev/null and b/internet-groceryimages/115.jpg differ
diff --git a/internet-groceryimages/12 (2).jpg b/internet-groceryimages/12 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..902c3ab96053db8e873156365533321c0506cdf8
Binary files /dev/null and b/internet-groceryimages/12 (2).jpg differ
diff --git a/internet-groceryimages/12.jpg b/internet-groceryimages/12.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6229a32bb67771ab547ac0403ea096b1cc9c4745
Binary files /dev/null and b/internet-groceryimages/12.jpg differ
diff --git a/internet-groceryimages/122.jpg b/internet-groceryimages/122.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a31a9a22930f63904af31ed1d31fc9ebda6d529e
Binary files /dev/null and b/internet-groceryimages/122.jpg differ
diff --git a/internet-groceryimages/124.jpg b/internet-groceryimages/124.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fd58897b30aeb6d563e890af3aef433befbd952a
Binary files /dev/null and b/internet-groceryimages/124.jpg differ
diff --git a/internet-groceryimages/127.jpg b/internet-groceryimages/127.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..725463681ce40341c35300ba8520f2710abd8ca8
Binary files /dev/null and b/internet-groceryimages/127.jpg differ
diff --git a/internet-groceryimages/135.jpg b/internet-groceryimages/135.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..64fca3aec738a8472b4b951cfa514e7d8b47e705
Binary files /dev/null and b/internet-groceryimages/135.jpg differ
diff --git a/internet-groceryimages/14.jpg b/internet-groceryimages/14.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f76c56207b7f967c8f2054c7d9f2b2bc7db8b6e1
Binary files /dev/null and b/internet-groceryimages/14.jpg differ
diff --git a/internet-groceryimages/142.jpg b/internet-groceryimages/142.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d25667700baa98904a304f612d1ac57b7c37ef37
Binary files /dev/null and b/internet-groceryimages/142.jpg differ
diff --git a/internet-groceryimages/15 - Copy.jpg b/internet-groceryimages/15 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..54c818e1b55eda17f886c4e52fed7f4e58269f49
Binary files /dev/null and b/internet-groceryimages/15 - Copy.jpg differ
diff --git a/internet-groceryimages/17 - Copy.jpg b/internet-groceryimages/17 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3980181410b0f2b8eefde748ba23103451183a45
Binary files /dev/null and b/internet-groceryimages/17 - Copy.jpg differ
diff --git a/internet-groceryimages/17.jpg b/internet-groceryimages/17.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4ad1cffe88133013a621298852a0e6027e13da77
Binary files /dev/null and b/internet-groceryimages/17.jpg differ
diff --git a/internet-groceryimages/174.jpg b/internet-groceryimages/174.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b92d68436dc72b5931721fd16e6665419c8d67d8
Binary files /dev/null and b/internet-groceryimages/174.jpg differ
diff --git a/internet-groceryimages/175.jpg b/internet-groceryimages/175.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f5206bff7f5864668c45d048e41edfcb970cdffa
Binary files /dev/null and b/internet-groceryimages/175.jpg differ
diff --git a/internet-groceryimages/179.jpg b/internet-groceryimages/179.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..931a5cecde24dbd4919f3ebb92301385305a21cf
Binary files /dev/null and b/internet-groceryimages/179.jpg differ
diff --git a/internet-groceryimages/18.jpg b/internet-groceryimages/18.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..df735d16705b404feb62525a6cac5dea49cccfb1
Binary files /dev/null and b/internet-groceryimages/18.jpg differ
diff --git a/internet-groceryimages/180.jpg b/internet-groceryimages/180.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c28f775a7caee9b91bfda55bc5753e0ff964cc40
Binary files /dev/null and b/internet-groceryimages/180.jpg differ
diff --git a/internet-groceryimages/188.jpg b/internet-groceryimages/188.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e97108cf723c4cbb94786075bffbee416bc3e76e
Binary files /dev/null and b/internet-groceryimages/188.jpg differ
diff --git a/internet-groceryimages/19 - Copy.jpg b/internet-groceryimages/19 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..df6f5dcda8025f4e34080e6dabac16fde6d9a6d3
Binary files /dev/null and b/internet-groceryimages/19 - Copy.jpg differ
diff --git a/internet-groceryimages/190.jpg b/internet-groceryimages/190.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..464ebf3d98ff865bb86b280a485403970b101c12
Binary files /dev/null and b/internet-groceryimages/190.jpg differ
diff --git a/internet-groceryimages/192.jpg b/internet-groceryimages/192.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d83859fe2453bb4fcf37956e62185292baf94b77
Binary files /dev/null and b/internet-groceryimages/192.jpg differ
diff --git a/internet-groceryimages/195.jpg b/internet-groceryimages/195.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a72c663464881436289a208029e3fa8091a43ff9
Binary files /dev/null and b/internet-groceryimages/195.jpg differ
diff --git a/internet-groceryimages/2 (2).jpg b/internet-groceryimages/2 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..44c0d3b7e733759377b77622cd77ca0b676cd289
Binary files /dev/null and b/internet-groceryimages/2 (2).jpg differ
diff --git a/internet-groceryimages/2 (3).jpg b/internet-groceryimages/2 (3).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5348d76a0633561c792796f9b0fbe6700d1b476f
Binary files /dev/null and b/internet-groceryimages/2 (3).jpg differ
diff --git a/internet-groceryimages/2.jpg b/internet-groceryimages/2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2239f19781d5a6c28cbbd8fc4b13152cdbc51222
Binary files /dev/null and b/internet-groceryimages/2.jpg differ
diff --git a/internet-groceryimages/20 - Copy.jpg b/internet-groceryimages/20 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b1330a53d6d203d7e22b45fba366e4454b7cfca8
Binary files /dev/null and b/internet-groceryimages/20 - Copy.jpg differ
diff --git a/internet-groceryimages/200-green-tamarind-pickle-natural-ingredients-premium-quality-original-imah3wq5wnttetzy.jpeg b/internet-groceryimages/200-green-tamarind-pickle-natural-ingredients-premium-quality-original-imah3wq5wnttetzy.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..62f71759159def25184e86e4f3e2ad29bb25c60e
Binary files /dev/null and b/internet-groceryimages/200-green-tamarind-pickle-natural-ingredients-premium-quality-original-imah3wq5wnttetzy.jpeg differ
diff --git a/internet-groceryimages/209.jpg b/internet-groceryimages/209.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b43fb36b4434e64fa2e22e3cf349a0093baf5e4b
Binary files /dev/null and b/internet-groceryimages/209.jpg differ
diff --git a/internet-groceryimages/21.jpg b/internet-groceryimages/21.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..568eab20b91a0d15e87fed823e5604bf717f1612
Binary files /dev/null and b/internet-groceryimages/21.jpg differ
diff --git a/internet-groceryimages/210.jpg b/internet-groceryimages/210.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2ba2fa75742b1473a7e47b0af54cb594b5bb201e
Binary files /dev/null and b/internet-groceryimages/210.jpg differ
diff --git a/internet-groceryimages/216.jpg b/internet-groceryimages/216.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0a0f9c2f9cf4fb6bcaf89fc537c40f59cc997118
Binary files /dev/null and b/internet-groceryimages/216.jpg differ
diff --git a/internet-groceryimages/22 (2).jpg b/internet-groceryimages/22 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2ffc30765bca4b866e7ed563ec2ee550989664e2
Binary files /dev/null and b/internet-groceryimages/22 (2).jpg differ
diff --git a/internet-groceryimages/22.jpg b/internet-groceryimages/22.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cac264d24aa9a349a99d3961b36985c4d74f556b
Binary files /dev/null and b/internet-groceryimages/22.jpg differ
diff --git a/internet-groceryimages/223.jpg b/internet-groceryimages/223.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3226df088f005d5b4ce3980c61415696f28f01c5
Binary files /dev/null and b/internet-groceryimages/223.jpg differ
diff --git a/internet-groceryimages/228.jpg b/internet-groceryimages/228.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eea1753bced74dcfc11e94241ea52d4a2fb2af24
Binary files /dev/null and b/internet-groceryimages/228.jpg differ
diff --git a/internet-groceryimages/24.jpg b/internet-groceryimages/24.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ddbc5f63c5e72941c1a1ac45737004a38168d25c
Binary files /dev/null and b/internet-groceryimages/24.jpg differ
diff --git a/internet-groceryimages/242.jpg b/internet-groceryimages/242.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..79738ac811bf5404686ca276969ce4c54c181b85
Binary files /dev/null and b/internet-groceryimages/242.jpg differ
diff --git a/internet-groceryimages/25 (2).jpg b/internet-groceryimages/25 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..39992cb212081e1e8a39eb71aa84fc48598aea75
Binary files /dev/null and b/internet-groceryimages/25 (2).jpg differ
diff --git a/internet-groceryimages/25 - Copy.jpg b/internet-groceryimages/25 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..066f02517a598748c35b0167b3272053093d5d71
Binary files /dev/null and b/internet-groceryimages/25 - Copy.jpg differ
diff --git a/internet-groceryimages/25.jpg b/internet-groceryimages/25.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f3d47404067f1dc424309c40bc047006b0639a32
Binary files /dev/null and b/internet-groceryimages/25.jpg differ
diff --git a/internet-groceryimages/250-hair-removal-cream-for-women-silky-soft-smoothing-skin-with-original-imagmppy2sgppdbv.jpeg b/internet-groceryimages/250-hair-removal-cream-for-women-silky-soft-smoothing-skin-with-original-imagmppy2sgppdbv.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c195964951b06ccfd5a22fd39fd8cc43751b1d00
Binary files /dev/null and b/internet-groceryimages/250-hair-removal-cream-for-women-silky-soft-smoothing-skin-with-original-imagmppy2sgppdbv.jpeg differ
diff --git a/internet-groceryimages/250-home-style-north-indian-premium-karonda-pickle-plastic-original-imah3my9qgjq7xkd.jpeg b/internet-groceryimages/250-home-style-north-indian-premium-karonda-pickle-plastic-original-imah3my9qgjq7xkd.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..4042a5aada982692e917fceeef7d6297078b32c6
Binary files /dev/null and b/internet-groceryimages/250-home-style-north-indian-premium-karonda-pickle-plastic-original-imah3my9qgjq7xkd.jpeg differ
diff --git a/internet-groceryimages/250-plax-mouthwash-colgate-original-imafz8f5cstwqkdk.jpeg b/internet-groceryimages/250-plax-mouthwash-colgate-original-imafz8f5cstwqkdk.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..9b4e7281936b617ecb18113757747431f2c52ca8
Binary files /dev/null and b/internet-groceryimages/250-plax-mouthwash-colgate-original-imafz8f5cstwqkdk.jpeg differ
diff --git a/internet-groceryimages/251.jpg b/internet-groceryimages/251.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..db00cf5e9098a41664c4893677ab581f5ead3df5
Binary files /dev/null and b/internet-groceryimages/251.jpg differ
diff --git a/internet-groceryimages/258.jpg b/internet-groceryimages/258.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c996388016cb4d9e27e1846b406cf60ac64e5672
Binary files /dev/null and b/internet-groceryimages/258.jpg differ
diff --git a/internet-groceryimages/26 (2).jpg b/internet-groceryimages/26 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ae8ec3ea84a58596721efc4a34cac9447b5bd950
Binary files /dev/null and b/internet-groceryimages/26 (2).jpg differ
diff --git a/internet-groceryimages/26 (3).jpg b/internet-groceryimages/26 (3).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5179ed3966b7826462d69d53ef6c012cbabc3931
Binary files /dev/null and b/internet-groceryimages/26 (3).jpg differ
diff --git a/internet-groceryimages/26 (4).jpg b/internet-groceryimages/26 (4).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..849da3a36d6dbf1db5d50ebfc3a343ea3a515719
Binary files /dev/null and b/internet-groceryimages/26 (4).jpg differ
diff --git a/internet-groceryimages/26.jpg b/internet-groceryimages/26.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d553cffd847e5cac368075ff0f12eb57b7fb9fbe
Binary files /dev/null and b/internet-groceryimages/26.jpg differ
diff --git a/internet-groceryimages/27 (2).jpg b/internet-groceryimages/27 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ecf9a306f93a8cd503eac0f01237b4b81c1c12be
Binary files /dev/null and b/internet-groceryimages/27 (2).jpg differ
diff --git a/internet-groceryimages/27.jpg b/internet-groceryimages/27.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2f2789a769ff55319ace2014150c65912cf31411
Binary files /dev/null and b/internet-groceryimages/27.jpg differ
diff --git a/internet-groceryimages/28.jpg b/internet-groceryimages/28.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b1523c944b4eaa99f337603fdf58d87e2183c31c
Binary files /dev/null and b/internet-groceryimages/28.jpg differ
diff --git a/internet-groceryimages/284.jpg b/internet-groceryimages/284.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5f0baf7d2ded75ef96ddd23c7dea241b06540042
Binary files /dev/null and b/internet-groceryimages/284.jpg differ
diff --git a/internet-groceryimages/285.jpg b/internet-groceryimages/285.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d10fb671e1471a0f0a952547c78ecdca8f01b0c3
Binary files /dev/null and b/internet-groceryimages/285.jpg differ
diff --git a/internet-groceryimages/29 - Copy.jpg b/internet-groceryimages/29 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9108d8bb73b4669419be08021f2035c4206c8247
Binary files /dev/null and b/internet-groceryimages/29 - Copy.jpg differ
diff --git a/internet-groceryimages/29.jpg b/internet-groceryimages/29.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..61bf8ef737485d29b88e76087ca2e66244bb37b3
Binary files /dev/null and b/internet-groceryimages/29.jpg differ
diff --git a/internet-groceryimages/3 (2).jpg b/internet-groceryimages/3 (2).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7477389d948617c83b7a8b4aa563a9d2c03e740d
Binary files /dev/null and b/internet-groceryimages/3 (2).jpg differ
diff --git a/internet-groceryimages/3 (3).jpg b/internet-groceryimages/3 (3).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7b50520a3feb36b222e01bc928b608f6a72be3a3
Binary files /dev/null and b/internet-groceryimages/3 (3).jpg differ
diff --git a/internet-groceryimages/3.jpg b/internet-groceryimages/3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3179cf236bf5f32b658f2c1d67c3cda00bad4027
Binary files /dev/null and b/internet-groceryimages/3.jpg differ
diff --git a/internet-groceryimages/30 - Copy.jpg b/internet-groceryimages/30 - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8c649b64d72392ae5ca9a4274aa27070bbd4849d
Binary files /dev/null and b/internet-groceryimages/30 - Copy.jpg differ
diff --git a/internet-groceryimages/30.jpg b/internet-groceryimages/30.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..29387239afd5a0d621d9038d5053d2dc28aad033
Binary files /dev/null and b/internet-groceryimages/30.jpg differ
diff --git a/internet-groceryimages/31.jpg b/internet-groceryimages/31.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c9a3b42d4da008a692a806ac9ae5c91d606a9ad4
Binary files /dev/null and b/internet-groceryimages/31.jpg differ
diff --git a/unsloth-main/.github/FUNDING.yml b/unsloth-main/.github/FUNDING.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a6cda7d03413a21f4db586305db725f31f2655cc
--- /dev/null
+++ b/unsloth-main/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: unsloth
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/unsloth-main/CONTRIBUTING.md b/unsloth-main/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..58a2652b5e7311cd0de66699ff632e5fce5e56fe
--- /dev/null
+++ b/unsloth-main/CONTRIBUTING.md
@@ -0,0 +1,29 @@
+# 🦥 Contributing to Unsloth
+
+Thank you for not only using Unsloth but also for being interested in helping out! We value all contributions, whether they come in the form of code, ideas, support for others, or simply spreading the word about Unsloth! 💕
+
+- **[Support the Community](https://github.com/unslothai/unsloth/issues)**: Answer questions, review pull requests, or assist others in discussions.
+- **Fix Bugs**: Identify and resolve issues with the existing codebase.
+- **Submit Ideas**: Request new features or share enhancements you'd like to see.
+- **Develop Features**: Implement new functionality or improve existing tools via pull requests (PRs).
+- **[Improve Documentation](https://docs.unsloth.ai/)**: Help by creating guides, FAQs, or enhancing clarity.
+
+One of the best ways to support us is by spreading the word about Unsloth! Share how it’s powering your amazing projects in blog posts or on social media, and inspire others to explore its potential. Even a simple star on our repo goes a long way in showing your support and helping the community grow. 🌟
+
+## Submitting Issues
+If you find a bug or have a feature idea, we’d love to hear from you! Here’s how to make your submission stand out:
+
+### Reporting Bugs
+1. **Search First**: Check if the issue has already been reported using GitHub’s search bar under Issues.
+2. **Details Matter**: Is this on Google Colab, Kaggle, or another platform? Are you using one of Unsloth's official notebooks? Include your OS, Python version, and other relevant details. For bugs, a concise code snippet that reproduces the issue is incredibly helpful.
+3. **Be Thorough**: Attach screenshots, traceback logs, or any additional information that might speed up resolution.
+
+## Spread the Word
+Your support extends beyond code:
+- Spread the word by writing about Unsloth in blogs or social media.
+- Share how Unsloth powers your projects.
+- Star our repository to show your appreciation.
+
+Finally, please be mindful of our [Code of Conduct](https://github.com/unslothai/unsloth/tree/main/unsloth/CODE_OF_CONDUCT.md) to ensure a welcoming and inclusive environment for everyone.
+
+Thank you so much for reading and we hope you have lots of fun using Unsloth! 🦥
diff --git a/unsloth-main/LICENSE b/unsloth-main/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..8894f17a3bc6c0b55fa3d18846f1f24dfdd5598b
--- /dev/null
+++ b/unsloth-main/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [2024-] [Unsloth AI, Daniel Han-Chen & Michael Han-Chen]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/unsloth-main/README.md b/unsloth-main/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6bff98cbdaaadc3500fbab6a633208fa849f4793
--- /dev/null
+++ b/unsloth-main/README.md
@@ -0,0 +1,492 @@
+
+### Finetune Llama 3.2, Mistral, Phi-3.5, Qwen 2.5 & Gemma, 2-5x faster with 80% less memory!
+
+![](https://i.ibb.co/sJ7RhGG/image-41.png)
+
+
+
+## ✨ Finetune for Free
+
+All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model that can be exported to GGUF, Ollama or vLLM, or uploaded to Hugging Face.
+
+| Unsloth supports | Free Notebooks | Performance | Memory use |
+|-----------|---------|--------|----------|
+| **Llama 3.2 (3B)** | [▶️ Start for free](https://colab.research.google.com/drive/1T5-zKWM_5OD21QHwXHiV9ixTRR7k3iB9?usp=sharing) | 2x faster | 60% less |
+| **Llama 3.2 Vision (11B)** | [▶️ Start for free](https://colab.research.google.com/drive/1j0N4XTY1zXXy7mPAhOC1_gMYZ2F2EBlk?usp=sharing) | 2x faster | 40% less |
+| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) | 2x faster | 60% less |
+| **Phi-3.5 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less |
+| **Gemma 2 (9B)** | [▶️ Start for free](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) | 2x faster | 63% less |
+| **Qwen 2.5 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Kose-ucXO1IBaZq5BvbwWieuubP7hxvQ?usp=sharing) | 2x faster | 63% less |
+| **Mistral v0.3 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) | 2.2x faster | 73% less |
+| **Ollama** | [▶️ Start for free](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing) | 1.9x faster | 43% less |
+| **ORPO** | [▶️ Start for free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less |
+| **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less |
+
+- See [all our notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) and [all our models](https://docs.unsloth.ai/get-started/all-our-models)
+- **Kaggle Notebooks**: [Llama 3.2 (1B & 3B)](https://www.kaggle.com/danielhanchen/kaggle-llama-3-2-1b-3b-unsloth-notebook), [Llama 3.1 (8B)](https://www.kaggle.com/danielhanchen/kaggle-llama-3-1-8b-unsloth-notebook), [Gemma 2 (9B)](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral (7B)](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
+- Run notebooks for [Llama 3.2 conversational](https://colab.research.google.com/drive/1T5-zKWM_5OD21QHwXHiV9ixTRR7k3iB9?usp=sharing), [Llama 3.1 conversational](https://colab.research.google.com/drive/15OyFkGoCImV9dSsewU1wa2JuKB4-mDE_?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing)
+- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text
+- This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language
+- Click [here](https://docs.unsloth.ai/) for detailed documentation for Unsloth.
+
+## 🦥 Unsloth.ai News
+- 📣 NEW! [Llama 3.3 (70B)](https://huggingface.co/collections/unsloth/llama-33-all-versions-67535d7d994794b9d7cf5e9f), Meta's latest model, is now supported.
+- 📣 NEW! We worked with Apple to add [Cut Cross Entropy](https://arxiv.org/abs/2411.09009). Unsloth now supports 89K context for Meta's Llama 3.3 (70B) on an 80GB GPU - 13x longer than HF+FA2. For Llama 3.1 (8B), Unsloth enables 342K context, surpassing its native 128K support.
+- 📣 NEW! Introducing Unsloth [Dynamic 4-bit Quantization](https://unsloth.ai/blog/dynamic-4bit)! We dynamically opt not to quantize certain parameters and this greatly increases accuracy while only using <10% more VRAM than BnB 4-bit. See our collection on [Hugging Face here.](https://huggingface.co/collections/unsloth/unsloth-4-bit-dynamic-quants-67503bb873f89e15276c44e7)
+- 📣 NEW! [Vision models](https://unsloth.ai/blog/vision) now supported! [Llama 3.2 Vision (11B)](https://colab.research.google.com/drive/1j0N4XTY1zXXy7mPAhOC1_gMYZ2F2EBlk?usp=sharing), [Qwen 2.5 VL (7B)](https://colab.research.google.com/drive/1whHb54GNZMrNxIsi2wm2EY_-Pvo2QyKh?usp=sharing) and [Pixtral (12B) 2409](https://colab.research.google.com/drive/1K9ZrdwvZRE96qGkCq_e88FgV3MLnymQq?usp=sharing)
+- 📣 NEW! Qwen 2.5 models, including [Coder](https://colab.research.google.com/drive/18sN803sU23XuJV9Q8On2xgqHSer6-UZF?usp=sharing) models, are now supported with bug fixes. The 14B model fits in a Colab GPU! [Qwen 2.5 conversational notebook](https://colab.research.google.com/drive/1qN1CEalC70EO1wGKhNxs1go1W9So61R5?usp=sharing)
+- 📣 NEW! We found and helped fix a [gradient accumulation bug](https://unsloth.ai/blog/gradient)! Please update Unsloth and transformers.
+
+**More news:**
+
+- 📣 Try out [Chat interface](https://colab.research.google.com/drive/1i-8ESvtLRGNkkUQQr_-z_rcSAIo9c3lM?usp=sharing)!
+- 📣 NEW! [Mistral Small 22b notebook](https://colab.research.google.com/drive/1oCEHcED15DzL8xXGU1VTx5ZfOJM8WY01?usp=sharing) finetuning fits in under 16GB of VRAM!
+- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) & [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct are now supported
+- 📣 NEW! `pip install unsloth` now works! Head over to [pypi](https://pypi.org/project/unsloth/) to check it out! This allows installs without a git pull. Use `pip install unsloth[colab-new]` to install without dependencies.
+- 📣 NEW! Continued Pretraining [notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) for other languages like Korean!
+- 📣 [2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models
+- 📣 We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support [4x longer context windows](https://unsloth.ai/blog/long-context)!
+
+
+## 🔗 Links and Resources
+| Type | Links |
+| ------------------------------- | --------------------------------------- |
+| 📚 **Documentation & Wiki** | [Read Our Docs](https://docs.unsloth.ai) |
+| **Twitter (aka X)** | [Follow us on X](https://twitter.com/unslothai)|
+| 💾 **Installation** | [unsloth/README.md](https://github.com/unslothai/unsloth/tree/main#-installation-instructions)|
+| 🥇 **Benchmarking** | [Performance Tables](https://github.com/unslothai/unsloth/tree/main#-performance-benchmarking)
+| 🌐 **Released Models** | [Unsloth Releases](https://docs.unsloth.ai/get-started/all-our-models)|
+| ✍️ **Blog** | [Read our Blogs](https://unsloth.ai/blog)|
+| **Reddit** | [Join our Reddit page](https://reddit.com/r/unsloth)|
+
+## ⭐ Key Features
+- All kernels written in [OpenAI's Triton](https://openai.com/research/triton) language. **Manual backprop engine**.
+- **0% loss in accuracy** - no approximation methods - all exact.
+- No change of hardware needed. Supports NVIDIA GPUs from 2018 onwards, with a minimum CUDA Capability of 7.0 (V100, T4, Titan V, RTX 20, 30, 40x, A100, H100, L40 etc). [Check your GPU!](https://developer.nvidia.com/cuda-gpus) GTX 1070 and 1080 work, but are slow; see the quick check after this list.
+- Works on **Linux** and **Windows** via WSL.
+- Supports 4bit and 16bit QLoRA / LoRA finetuning via [bitsandbytes](https://github.com/TimDettmers/bitsandbytes).
+- The open source version trains 5x faster - see [Unsloth Pro](https://unsloth.ai/) for up to **30x faster training**!
+- If you trained a model with 🦥Unsloth, you can use this cool sticker!
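+
+A quick way to verify the CUDA Capability requirement above is the minimal sketch below, which uses only standard PyTorch calls (it assumes a CUDA build of torch is installed):
+
+```python
+import torch
+
+assert torch.cuda.is_available(), "No CUDA GPU visible to PyTorch"
+
+# Compute capability as (major, minor), e.g. (7, 5) for a T4 or (8, 0) for an A100
+major, minor = torch.cuda.get_device_capability()
+print(f"GPU: {torch.cuda.get_device_name()} (capability {major}.{minor})")
+print("Meets the 7.0 minimum:", (major, minor) >= (7, 0))
+print("bfloat16 supported:", torch.cuda.is_bf16_supported())
+```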
+
+
+## 🥇 Performance Benchmarking
+- For the full list of **reproducible** benchmarking tables, [go to our website](https://unsloth.ai/blog/mistral-benchmark#Benchmark%20tables)
+
+| 1 A100 40GB | 🤗Hugging Face | Flash Attention | 🦥Unsloth Open Source | 🦥[Unsloth Pro](https://unsloth.ai/pricing) |
+|--------------|--------------|-----------------|---------------------|-----------------|
+| Alpaca | 1x | 1.04x | 1.98x | **15.64x** |
+| LAION Chip2 | 1x | 0.92x | 1.61x | **20.73x** |
+| OASST | 1x | 1.19x | 2.17x | **14.83x** |
+| Slim Orca | 1x | 1.18x | 2.22x | **14.82x** |
+
+- The benchmarking table below was produced by [🤗Hugging Face](https://huggingface.co/blog/unsloth-trl).
+
+| Free Colab T4 | Dataset | 🤗Hugging Face | Pytorch 2.1.1 | 🦥Unsloth | 🦥 VRAM reduction |
+| --- | --- | --- | --- | --- | --- |
+| Llama-2 7b | OASST | 1x | 1.19x | 1.95x | -43.3% |
+| Mistral 7b | Alpaca | 1x | 1.07x | 1.56x | -13.7% |
+| Tiny Llama 1.1b | Alpaca | 1x | 2.06x | 3.87x | -73.8% |
+| DPO with Zephyr | Ultra Chat | 1x | 1.09x | 1.55x | -18.6% |
+
+![](https://i.ibb.co/sJ7RhGG/image-41.png)
+
+## 💾 Installation Instructions
+
+For stable releases, use `pip install unsloth`. For most installations, however, we recommend `pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"`.
+
+### Conda Installation
+`⚠️Only use Conda if you have it. If not, use Pip.` Select `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. We support `python=3.10,3.11,3.12`.
+```bash
+conda create --name unsloth_env \
+ python=3.11 \
+ pytorch-cuda=12.1 \
+ pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \
+ -y
+conda activate unsloth_env
+
+pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
+pip install --no-deps trl peft accelerate bitsandbytes
+```
+
+
+If you're looking to install Conda in a Linux environment, run the below 🔽
+
+ ```bash
+ mkdir -p ~/miniconda3
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
+ bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+ rm -rf ~/miniconda3/miniconda.sh
+ ~/miniconda3/bin/conda init bash
+ ~/miniconda3/bin/conda init zsh
+ ```
+
+
+### Pip Installation
+`⚠️Do **NOT** use this if you have Conda.` Pip installation is a bit more involved because of dependency issues. The pip command differs across `torch 2.2,2.3,2.4,2.5` and CUDA versions.
+
+For torch versions, we support `torch211`, `torch212`, `torch220`, `torch230`, `torch240` and `torch250`; for CUDA versions, we support `cu118`, `cu121` and `cu124`. For Ampere devices (A100, H100, RTX 3090) and above, use `cu118-ampere`, `cu121-ampere` or `cu124-ampere`.
+
+For example, if you have `torch 2.4` and `CUDA 12.1`, use:
+```bash
+pip install --upgrade pip
+pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+Another example, if you have `torch 2.5` and `CUDA 12.4`, use:
+```bash
+pip install --upgrade pip
+pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+And other examples:
+```bash
+pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu118-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu118-torch240] @ git+https://github.com/unslothai/unsloth.git"
+
+pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
+
+pip install "unsloth[cu121-torch250] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu124-ampere-torch250] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+Or, run the below in a terminal to get the **optimal** pip installation command:
+```bash
+wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python -
+```
+
+Or, run the below manually in a Python REPL:
+```python
+# Detect the installed torch + CUDA combination and print the matching install command
+try: import torch
+except ImportError: raise ImportError('Install torch via `pip install torch`')
+from packaging.version import Version as V
+v = V(torch.__version__)
+cuda = str(torch.version.cuda)
+is_ampere = torch.cuda.get_device_capability()[0] >= 8
+if cuda != "12.1" and cuda != "11.8" and cuda != "12.4": raise RuntimeError(f"CUDA = {cuda} not supported!")
+if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!")
+elif v <= V('2.1.1'): x = 'cu{}{}-torch211'
+elif v <= V('2.1.2'): x = 'cu{}{}-torch212'
+elif v < V('2.3.0'): x = 'cu{}{}-torch220'
+elif v < V('2.4.0'): x = 'cu{}{}-torch230'
+elif v < V('2.5.0'): x = 'cu{}{}-torch240'
+elif v < V('2.6.0'): x = 'cu{}{}-torch250'
+else: raise RuntimeError(f"Torch = {v} too new!")
+x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
+print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')
+```
+
+### Windows Installation
+
+To run Unsloth directly on Windows:
+- Install Triton from this Windows fork and follow the instructions: https://github.com/woct0rdho/triton-windows
+- In the SFTTrainer, set `dataset_num_proc=1` to avoid a crashing issue:
+```python
+trainer = SFTTrainer(
+ dataset_num_proc=1,
+ ...
+)
+```
+
+For **advanced installation instructions** or if you see weird errors during installation:
+
+1. Install `torch` and `triton`. Go to https://pytorch.org to install them. For example, `pip install torch torchvision torchaudio triton`
+2. Confirm that CUDA is installed correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or the CUDA drivers.
+3. Install `xformers` manually. You can try installing `vllm` and seeing if `vllm` succeeds. Check whether `xformers` succeeded with `python -m xformers.info`. Go to https://github.com/facebookresearch/xformers. Another option is to install `flash-attn` for Ampere GPUs.
+4. Finally, install `bitsandbytes` and check it with `python -m bitsandbytes`. The checks from steps 2-4 are collected in the sketch below.
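+
+As a convenience, the verification commands named in steps 2-4 can be run back to back (a minimal sketch; adapt it to your environment):
+
+```bash
+# Step 2: confirm the CUDA toolkit / driver is visible
+nvcc --version
+nvidia-smi
+
+# Step 3: confirm xformers built and imports correctly
+python -m xformers.info
+
+# Step 4: confirm bitsandbytes can find the CUDA libraries
+python -m bitsandbytes
+```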
+
+## 📜 [Documentation](https://docs.unsloth.ai)
+- Go to our official [Documentation](https://docs.unsloth.ai) for saving to GGUF, checkpointing, evaluation and more!
+- We support Hugging Face's TRL, Trainer, Seq2SeqTrainer and even plain PyTorch code!
+- We're in 🤗Hugging Face's official docs! Check out the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth)!
+
+```python
+from unsloth import FastLanguageModel
+from unsloth import is_bfloat16_supported
+import torch
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
+# Get LAION dataset
+url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
+dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
+
+# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
+fourbit_models = [
+ "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
+ "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
+ "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
+ "unsloth/llama-3-8b-Instruct-bnb-4bit",
+ "unsloth/llama-3-70b-bnb-4bit",
+ "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster!
+ "unsloth/Phi-3-medium-4k-instruct",
+ "unsloth/mistral-7b-bnb-4bit",
+ "unsloth/gemma-7b-bnb-4bit", # Gemma 2.2x faster!
+] # More models at https://huggingface.co/unsloth
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+ model_name = "unsloth/llama-3-8b-bnb-4bit",
+ max_seq_length = max_seq_length,
+ dtype = None,
+ load_in_4bit = True,
+)
+
+# Do model patching and add fast LoRA weights
+model = FastLanguageModel.get_peft_model(
+ model,
+ r = 16,
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+ "gate_proj", "up_proj", "down_proj",],
+ lora_alpha = 16,
+ lora_dropout = 0, # Supports any, but = 0 is optimized
+ bias = "none", # Supports any, but = "none" is optimized
+ # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+ use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+ random_state = 3407,
+ max_seq_length = max_seq_length,
+ use_rslora = False, # We support rank stabilized LoRA
+ loftq_config = None, # And LoftQ
+)
+
+trainer = SFTTrainer(
+ model = model,
+ train_dataset = dataset,
+ dataset_text_field = "text",
+ max_seq_length = max_seq_length,
+ tokenizer = tokenizer,
+ args = TrainingArguments(
+ per_device_train_batch_size = 2,
+ gradient_accumulation_steps = 4,
+ warmup_steps = 10,
+ max_steps = 60,
+ fp16 = not is_bfloat16_supported(),
+ bf16 = is_bfloat16_supported(),
+ logging_steps = 1,
+ output_dir = "outputs",
+ optim = "adamw_8bit",
+ seed = 3407,
+ ),
+)
+trainer.train()
+
+# Go to https://github.com/unslothai/unsloth/wiki for advanced tips like
+# (1) Saving to GGUF / merging to 16bit for vLLM
+# (2) Continued training from a saved LoRA adapter
+# (3) Adding an evaluation loop / OOMs
+# (4) Customized chat templates
+```
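+
+As a follow-up to tip (1) in the comments above, here is a minimal sketch of exporting the finetuned model, assuming the `save_pretrained_merged` and `save_pretrained_gguf` helpers described in the Unsloth wiki (check the wiki for the exact arguments your version expects):
+
+```python
+# Merge the LoRA adapter into 16-bit weights, e.g. for serving with vLLM
+model.save_pretrained_merged("outputs_merged_16bit", tokenizer, save_method = "merged_16bit")
+
+# Export a quantized GGUF file, e.g. for llama.cpp or Ollama
+model.save_pretrained_gguf("outputs_gguf", tokenizer, quantization_method = "q4_k_m")
+```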
+
+
+## DPO Support
+DPO (Direct Preference Optimization), PPO and reward modelling all appear to work, as per independent third-party testing from [Llama-Factory](https://github.com/hiyouga/LLaMA-Factory). We have a preliminary Google Colab notebook for reproducing Zephyr on a Tesla T4 here: [notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing).
+
+We're in 🤗Hugging Face's official docs! We're on the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and the [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth)!
+
+```python
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Optional: set the GPU device ID
+
+from unsloth import FastLanguageModel, PatchDPOTrainer
+from unsloth import is_bfloat16_supported
+PatchDPOTrainer()
+import torch
+from transformers import TrainingArguments
+from trl import DPOTrainer
+
+max_seq_length = 2048 # Needed below (illustrative value; RoPE scaling is handled internally, so choose any)
+model, tokenizer = FastLanguageModel.from_pretrained(
+ model_name = "unsloth/zephyr-sft-bnb-4bit",
+ max_seq_length = max_seq_length,
+ dtype = None,
+ load_in_4bit = True,
+)
+
+# Do model patching and add fast LoRA weights
+model = FastLanguageModel.get_peft_model(
+ model,
+ r = 64,
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+ "gate_proj", "up_proj", "down_proj",],
+ lora_alpha = 64,
+ lora_dropout = 0, # Supports any, but = 0 is optimized
+ bias = "none", # Supports any, but = "none" is optimized
+ # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+ use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+ random_state = 3407,
+ max_seq_length = max_seq_length,
+)
+
+dpo_trainer = DPOTrainer(
+ model = model,
+ ref_model = None,
+ args = TrainingArguments(
+ per_device_train_batch_size = 4,
+ gradient_accumulation_steps = 8,
+ warmup_ratio = 0.1,
+ num_train_epochs = 3,
+ fp16 = not is_bfloat16_supported(),
+ bf16 = is_bfloat16_supported(),
+ logging_steps = 1,
+ optim = "adamw_8bit",
+ seed = 42,
+ output_dir = "outputs",
+ ),
+ beta = 0.1,
+ train_dataset = YOUR_DATASET_HERE,
+ # eval_dataset = YOUR_DATASET_HERE,
+ tokenizer = tokenizer,
+ max_length = 1024,
+ max_prompt_length = 512,
+)
+dpo_trainer.train()
+```
+
+## 🥇 Detailed Benchmarking Tables
+- Click "Code" for fully reproducible examples
+- "Unsloth Equal" is a preview of our PRO version, with code stripped out. All settings and the loss curve remain identical.
+- For the full list of benchmarking tables, [go to our website](https://unsloth.ai/blog/mistral-benchmark#Benchmark%20tables)
+
+| 1 A100 40GB | 🤗Hugging Face | Flash Attention 2 | 🦥Unsloth Open | Unsloth Equal | Unsloth Pro | Unsloth Max |
+|--------------|-------------|-------------|-----------------|--------------|---------------|-------------|
+| Alpaca | 1x | 1.04x | 1.98x | 2.48x | 5.32x | **15.64x** |
+| code | [Code](https://colab.research.google.com/drive/1u4dBeM-0vGNVmmO6X7cScAut-Hyt4KDF?usp=sharing) | [Code](https://colab.research.google.com/drive/1fgTOxpMbVjloQBvZyz4lF4BacKSZOB2A?usp=sharing) | [Code](https://colab.research.google.com/drive/1YIPY_18xm-K0iJDgvNkRoJsgkPMPAO3G?usp=sharing) | [Code](https://colab.research.google.com/drive/1ANW8EFL3LVyTD7Gq4TkheC1Z7Rxw-rHp?usp=sharing) | | |
+| seconds| 1040 | 1001 | 525 | 419 | 196 | 67 |
+| memory MB| 18235 | 15365 | 9631 | 8525 | | |
+| % saved | | 15.74 | 47.18 | 53.25 | | |
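+
+("% saved" is the peak memory saved relative to the 🤗Hugging Face baseline in the rows above, e.g. 1 - 9631/18235 ≈ 47.18% for Unsloth Open.)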
+
+### Llama-Factory 3rd party benchmarking
+- [Link to performance table.](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-Comparison) TGS: tokens per GPU per second. Model: LLaMA2-7B. GPU: NVIDIA A100 * 1. Batch size: 4. Gradient accumulation: 2. LoRA rank: 8. Max length: 1024.
+
+| Method | Bits | TGS | GRAM | Speed |
+| --- | --- | --- | --- | --- |
+| HF | 16 | 2392 | 18GB | 100% |
+| HF+FA2 | 16 | 2954 | 17GB | 123% |
+| Unsloth+FA2 | 16 | 4007 | 16GB | **168%** |
+| HF | 4 | 2415 | 9GB | 101% |
+| Unsloth+FA2 | 4 | 3726 | 7GB | **160%** |
+
+### Performance comparisons between popular models
+
+Benchmarking tables for specific models (Mistral 7b, CodeLlama 34b etc.) follow below.
+
+### Mistral 7b
+| 1 A100 40GB | Hugging Face | Flash Attention 2 | Unsloth Open | Unsloth Equal | Unsloth Pro | Unsloth Max |
+|--------------|-------------|-------------|-----------------|--------------|---------------|-------------|
+| Mistral 7B Slim Orca | 1x | 1.15x | 2.15x | 2.53x | 4.61x | **13.69x** |
+| code | [Code](https://colab.research.google.com/drive/1mePk3KzwTD81hr5mcNcs_AX3Kbg_Ha0x?usp=sharing) | [Code](https://colab.research.google.com/drive/1dgHxjvTmX6hb0bPcLp26RXSE6_n9DKj7?usp=sharing) | [Code](https://colab.research.google.com/drive/1SKrKGV-BZoU4kv5q3g0jtE_OhRgPtrrQ?usp=sharing) | [Code](https://colab.research.google.com/drive/18yOiyX0T81mTwZqOALFSCX_tSAqju6aD?usp=sharing) | |
+| seconds | 1813 | 1571 | 842 | 718 | 393 | 132 |
+| memory MB | 32853 | 19385 | 12465 | 10271 | | |
+| % saved| | 40.99 | 62.06 | 68.74 | | |
+
+### CodeLlama 34b
+| 1 A100 40GB | Hugging Face | Flash Attention 2 | Unsloth Open | Unsloth Equal | Unsloth Pro | Unsloth Max |
+|--------------|-------------|-------------|-----------------|--------------|---------------|-------------|
+| Code Llama 34B | OOM ❌ | 0.99x | 1.87x | 2.61x | 4.27x | 12.82x |
+| code | [▶️ Code](https://colab.research.google.com/drive/1ykfz3BqrtC_AUFegCzUQjjfUNlxp6Otc?usp=sharing) | [Code](https://colab.research.google.com/drive/12ZypxQh7OC6kBXvWZI-5d05I4m-B_hoR?usp=sharing) | [Code](https://colab.research.google.com/drive/1gdHyAx8XJsz2yNV-DHvbHjR1iCef5Qmh?usp=sharing) | [Code](https://colab.research.google.com/drive/1fm7wqx9MJ0kRrwKOfmLkK1Rmw-pySahB?usp=sharing) | |
+| seconds | 1953 | 1982 | 1043 | 748 | 458 | 152 |
+| memory MB | 40000 | 33217 | 27413 | 22161 | | |
+| % saved | | 16.96 | 31.47 | 44.60 | | |
+
+### 1 Tesla T4
+
+| 1 T4 16GB | Hugging Face | Flash Attention | Unsloth Open | Unsloth Pro Equal | Unsloth Pro | Unsloth Max |
+|--------------|-------------|-----------------|-----------------|---------------|---------------|-------------|
+| Alpaca | 1x | 1.09x | 1.69x | 1.79x | 2.93x | **8.3x** |
+| code | [▶️ Code](https://colab.research.google.com/drive/1XpLIV4s8Bj5uryB-X2gqM88oRGHEGdaB?usp=sharing) | [Code](https://colab.research.google.com/drive/1LyXu6CjuymQg6ddHX8g1dpUvrMa1nn4L?usp=sharing) | [Code](https://colab.research.google.com/drive/1gsv4LpY7C32otl1rgRo5wXTk4HIitXoM?usp=sharing) | [Code](https://colab.research.google.com/drive/1VtULwRQwhEnVdNryjm27zXfdSM1tNfFK?usp=sharing) | | |
+| seconds | 1599 | 1468 | 942 | 894 | 545 | 193 |
+| memory MB | 7199 | 7059 | 6459 | 5443 | | |
+| % saved | | 1.94 | 10.28 | 24.39 | | |
+
+### 2 Tesla T4s via DDP
+
+ | 2 T4 DDP | Hugging Face | Flash Attention | Unsloth Open | Unsloth Equal | Unsloth Pro | Unsloth Max |
+|--------------|----------|-------------|-----------------|--------------|---------------|-------------|
+| Alpaca | 1x | 0.99x | 4.95x | 4.44x | 7.28x | **20.61x** |
+| code | [▶️ Code](https://www.kaggle.com/danielhanchen/hf-original-alpaca-t4-ddp) | [Code](https://www.kaggle.com/danielhanchen/hf-sdpa-alpaca-t4-ddp) | [Code](https://www.kaggle.com/danielhanchen/unsloth-alpaca-t4-ddp) | | |
+| seconds | 9882 | 9946 | 1996 | 2227 | 1357 | 480 |
+| memory MB| 9176 | 9128 | 6904 | 6782 | | |
+| % saved | | 0.52 | 24.76 | 26.09 | | |
+
+
+### Performance comparisons on 1 Tesla T4 GPU:
+
+**Time taken for 1 epoch**
+
+One Tesla T4 on Google Colab
+`bsz = 2, ga = 4, max_grad_norm = 0.3, num_train_epochs = 1, seed = 3047, lr = 2e-4, wd = 0.01, optim = "adamw_8bit", schedule = "linear", schedule_steps = 10`
+
+| System | GPU | Alpaca (52K) | LAION OIG (210K) | Open Assistant (10K) | SlimOrca (518K) |
+| --- | --- | --- | --- | --- | --- |
+| Huggingface | 1 T4 | 23h 15m | 56h 28m | 8h 38m | 391h 41m |
+| Unsloth Open | 1 T4 | 13h 7m (1.8x) | 31h 47m (1.8x) | 4h 27m (1.9x) | 240h 4m (1.6x) |
+| Unsloth Pro | 1 T4 | 3h 6m (7.5x) | 5h 17m (10.7x) | 1h 7m (7.7x) | 59h 53m (6.5x) |
+| Unsloth Max | 1 T4 | 2h 39m (8.8x) | 4h 31m (12.5x) | 0h 58m (8.9x) | 51h 30m (7.6x) |
+
+**Peak Memory Usage**
+
+| System | GPU | Alpaca (52K) | LAION OIG (210K) | Open Assistant (10K) | SlimOrca (518K) |
+| --- | --- | --- | --- | --- | --- |
+| Huggingface | 1 T4 | 7.3GB | 5.9GB | 14.0GB | 13.3GB |
+| Unsloth Open | 1 T4 | 6.8GB | 5.7GB | 7.8GB | 7.7GB |
+| Unsloth Pro | 1 T4 | 6.4GB | 6.4GB | 6.4GB | 6.4GB |
+| Unsloth Max | 1 T4 | 11.4GB | 12.4GB | 11.9GB | 14.4GB |
+
+
+
+### Performance comparisons on 2 Tesla T4 GPUs via DDP:
+**Time taken for 1 epoch**
+
+Two Tesla T4s on Kaggle
+`bsz = 2, ga = 4, max_grad_norm = 0.3, num_train_epochs = 1, seed = 3047, lr = 2e-4, wd = 0.01, optim = "adamw_8bit", schedule = "linear", schedule_steps = 10`
+
+| System | GPU | Alpaca (52K) | LAION OIG (210K) | Open Assistant (10K) | SlimOrca (518K) * |
+| --- | --- | --- | --- | --- | --- |
+| Huggingface | 2 T4 | 84h 47m | 163h 48m | 30h 51m | 1301h 24m * |
+| Unsloth Pro | 2 T4 | 3h 20m (25.4x) | 5h 43m (28.7x) | 1h 12m (25.7x) | 71h 40m (18.1x) * |
+| Unsloth Max | 2 T4 | 3h 4m (27.6x) | 5h 14m (31.3x) | 1h 6m (28.1x) | 54h 20m (23.9x) * |
+
+**Peak Memory Usage on a Multi GPU System (2 GPUs)**
+
+| System | GPU | Alpaca (52K) | LAION OIG (210K) | Open Assistant (10K) | SlimOrca (518K) * |
+| --- | --- | --- | --- | --- | --- |
+| Huggingface | 2 T4 | 8.4GB \| 6GB | 7.2GB \| 5.3GB | 14.3GB \| 6.6GB | 10.9GB \| 5.9GB * |
+| Unsloth Pro | 2 T4 | 7.7GB \| 4.9GB | 7.5GB \| 4.9GB | 8.5GB \| 4.9GB | 6.2GB \| 4.7GB * |
+| Unsloth Max | 2 T4 | 10.5GB \| 5GB | 10.6GB \| 5GB | 10.6GB \| 5GB | 10.5GB \| 5GB * |
+
+\* Slim Orca uses `bsz=1` for all benchmarks since `bsz=2` OOMs. We can handle `bsz=2`, but we benchmark with `bsz=1` for consistency.
+
+
+![](https://i.ibb.co/sJ7RhGG/image-41.png)
+
+
+### Citation
+
+You can cite the Unsloth repo as follows:
+```bibtex
+@software{unsloth,
+  author = {Daniel Han and Michael Han and Unsloth team},
+ title = {Unsloth},
+ url = {http://github.com/unslothai/unsloth},
+ year = {2023}
+}
+```
+
+### Thank You to
+- [Erik](https://github.com/erikwijmans) for his help adding [Apple's ML Cross Entropy](https://github.com/apple/ml-cross-entropy) in Unsloth
+- [HuyNguyen-hust](https://github.com/HuyNguyen-hust) for making [RoPE Embeddings 28% faster](https://github.com/unslothai/unsloth/pull/238)
+- [RandomInternetPreson](https://github.com/RandomInternetPreson) for confirming WSL support
+- [152334H](https://github.com/152334H) for experimental DPO support
+- [atgctg](https://github.com/atgctg) for syntax highlighting
diff --git a/unsloth-main/images/Assistant.png b/unsloth-main/images/Assistant.png
new file mode 100644
index 0000000000000000000000000000000000000000..120703475091e1ce74a38a05949ae51af0a36f72
Binary files /dev/null and b/unsloth-main/images/Assistant.png differ
diff --git a/unsloth-main/images/Colab.png b/unsloth-main/images/Colab.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a21b9429c5fe6113794fe0e9bbf812febfda2ee
Binary files /dev/null and b/unsloth-main/images/Colab.png differ
diff --git a/unsloth-main/images/Discord button.png b/unsloth-main/images/Discord button.png
new file mode 100644
index 0000000000000000000000000000000000000000..5e3b56d6dcb41d1969aaed75bb9efc76d462ab2d
Binary files /dev/null and b/unsloth-main/images/Discord button.png differ
diff --git a/unsloth-main/images/Discord.png b/unsloth-main/images/Discord.png
new file mode 100644
index 0000000000000000000000000000000000000000..5e3b56d6dcb41d1969aaed75bb9efc76d462ab2d
Binary files /dev/null and b/unsloth-main/images/Discord.png differ
diff --git a/unsloth-main/images/Documentation Button.png b/unsloth-main/images/Documentation Button.png
new file mode 100644
index 0000000000000000000000000000000000000000..1ac31726b554ff2d36be173dc30719258b6bcea1
Binary files /dev/null and b/unsloth-main/images/Documentation Button.png differ
diff --git a/unsloth-main/images/Free version button.png b/unsloth-main/images/Free version button.png
new file mode 100644
index 0000000000000000000000000000000000000000..9cba7f0ddb505e1f2a290a560f5a13c4308244e7
Binary files /dev/null and b/unsloth-main/images/Free version button.png differ
diff --git a/unsloth-main/images/Kaggle.png b/unsloth-main/images/Kaggle.png
new file mode 100644
index 0000000000000000000000000000000000000000..287e50b848df62f7e743ab09155aee0e1ebce3a2
Binary files /dev/null and b/unsloth-main/images/Kaggle.png differ
diff --git a/unsloth-main/images/Kofi button.png b/unsloth-main/images/Kofi button.png
new file mode 100644
index 0000000000000000000000000000000000000000..a118a44cac8978d39b2d43d6b4049bd1243ac1c0
Binary files /dev/null and b/unsloth-main/images/Kofi button.png differ
diff --git a/unsloth-main/images/LAION 2GPU.png b/unsloth-main/images/LAION 2GPU.png
new file mode 100644
index 0000000000000000000000000000000000000000..d154a526dba66592c26dbb9ce4f1c60876150034
Binary files /dev/null and b/unsloth-main/images/LAION 2GPU.png differ
diff --git a/unsloth-main/images/Merge.png b/unsloth-main/images/Merge.png
new file mode 100644
index 0000000000000000000000000000000000000000..a2df04874bfc879182cb66c789341d49700227ea
Binary files /dev/null and b/unsloth-main/images/Merge.png differ
diff --git a/unsloth-main/images/Run.png b/unsloth-main/images/Run.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd737aa4d6e3684a0ae3405565f95d52e521b785
Binary files /dev/null and b/unsloth-main/images/Run.png differ
diff --git a/unsloth-main/images/Slim Orca 2GPUs.png b/unsloth-main/images/Slim Orca 2GPUs.png
new file mode 100644
index 0000000000000000000000000000000000000000..2a438697011369c2ae7d018146c9fe69c54b60bb
Binary files /dev/null and b/unsloth-main/images/Slim Orca 2GPUs.png differ
diff --git a/unsloth-main/images/Terminal_Type.png b/unsloth-main/images/Terminal_Type.png
new file mode 100644
index 0000000000000000000000000000000000000000..e83ac484e4257eacad1c7d033811d2ece59a444c
Binary files /dev/null and b/unsloth-main/images/Terminal_Type.png differ
diff --git a/unsloth-main/images/Where_Terminal.png b/unsloth-main/images/Where_Terminal.png
new file mode 100644
index 0000000000000000000000000000000000000000..2239315eff2820bf9f224975f0b184d51bd89cb7
Binary files /dev/null and b/unsloth-main/images/Where_Terminal.png differ
diff --git a/unsloth-main/images/buy me a coffee button.png b/unsloth-main/images/buy me a coffee button.png
new file mode 100644
index 0000000000000000000000000000000000000000..5eccb8e94be8c5b89e06347e463ab51fc9865109
Binary files /dev/null and b/unsloth-main/images/buy me a coffee button.png differ
diff --git a/unsloth-main/images/documentation github button.png b/unsloth-main/images/documentation github button.png
new file mode 100644
index 0000000000000000000000000000000000000000..3155a30e2a714237bed047c39abf87938166b135
Binary files /dev/null and b/unsloth-main/images/documentation github button.png differ
diff --git a/unsloth-main/images/documentation green button.png b/unsloth-main/images/documentation green button.png
new file mode 100644
index 0000000000000000000000000000000000000000..0deccd386dfa3b1d918dfa93e351d735b15eb743
Binary files /dev/null and b/unsloth-main/images/documentation green button.png differ
diff --git a/unsloth-main/images/documentation lighter.png b/unsloth-main/images/documentation lighter.png
new file mode 100644
index 0000000000000000000000000000000000000000..0545ee331d0794a3dcfdc52f3aac7b15ea82da8a
Binary files /dev/null and b/unsloth-main/images/documentation lighter.png differ
diff --git a/unsloth-main/images/documentation white button.png b/unsloth-main/images/documentation white button.png
new file mode 100644
index 0000000000000000000000000000000000000000..59cbd9131c531053cec7ca1913d4fb96c64c404b
Binary files /dev/null and b/unsloth-main/images/documentation white button.png differ
diff --git a/unsloth-main/images/made with unsloth.png b/unsloth-main/images/made with unsloth.png
new file mode 100644
index 0000000000000000000000000000000000000000..6c5e90dbb06c81a68a8734966cf0a11fd27a2509
Binary files /dev/null and b/unsloth-main/images/made with unsloth.png differ
diff --git a/unsloth-main/images/ollama.png b/unsloth-main/images/ollama.png
new file mode 100644
index 0000000000000000000000000000000000000000..fa83bb42a8ce02ca1daace9f73f7e89fc619f8ba
Binary files /dev/null and b/unsloth-main/images/ollama.png differ
diff --git a/unsloth-main/images/peft x trl button.png b/unsloth-main/images/peft x trl button.png
new file mode 100644
index 0000000000000000000000000000000000000000..55be99b7f9d1ea7b391ac76f39283e950a0d537e
Binary files /dev/null and b/unsloth-main/images/peft x trl button.png differ
diff --git a/unsloth-main/images/start free finetune button.png b/unsloth-main/images/start free finetune button.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f6c879fd4a389d22943a73291a4182ff02812c9
Binary files /dev/null and b/unsloth-main/images/start free finetune button.png differ
diff --git a/unsloth-main/images/unsloth end.png b/unsloth-main/images/unsloth end.png
new file mode 100644
index 0000000000000000000000000000000000000000..f8a774a4bcd9bc6e6e1a6f665b93e34e2a93808a
Binary files /dev/null and b/unsloth-main/images/unsloth end.png differ
diff --git a/unsloth-main/images/unsloth loading page render.png b/unsloth-main/images/unsloth loading page render.png
new file mode 100644
index 0000000000000000000000000000000000000000..0d0fb6281f977698ba53f0ac4cd86deda7012f84
Binary files /dev/null and b/unsloth-main/images/unsloth loading page render.png differ
diff --git a/unsloth-main/images/unsloth logo black text.png b/unsloth-main/images/unsloth logo black text.png
new file mode 100644
index 0000000000000000000000000000000000000000..4eb45557a418d6c4889a8025ee5e8350be236cb4
Binary files /dev/null and b/unsloth-main/images/unsloth logo black text.png differ
diff --git a/unsloth-main/images/unsloth logo only.png b/unsloth-main/images/unsloth logo only.png
new file mode 100644
index 0000000000000000000000000000000000000000..92340eef059b9b687cf92aea9fe109106906eff3
Binary files /dev/null and b/unsloth-main/images/unsloth logo only.png differ
diff --git a/unsloth-main/images/unsloth logo white text.png b/unsloth-main/images/unsloth logo white text.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e37c7b19d01b31ffc9a2c40c0395d25d18b5b9c
Binary files /dev/null and b/unsloth-main/images/unsloth logo white text.png differ
diff --git a/unsloth-main/images/unsloth made with love.png b/unsloth-main/images/unsloth made with love.png
new file mode 100644
index 0000000000000000000000000000000000000000..9bf7ec93680f889d7602e5f56a8d677d6a58ae6a
Binary files /dev/null and b/unsloth-main/images/unsloth made with love.png differ
diff --git a/unsloth-main/images/unsloth new logo.png b/unsloth-main/images/unsloth new logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..20dac04f388d8eec89066d81362359994e855786
Binary files /dev/null and b/unsloth-main/images/unsloth new logo.png differ
diff --git a/unsloth-main/pyproject.toml b/unsloth-main/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..ce3301547bcc76c813e258d6163c99ae9736dbf9
--- /dev/null
+++ b/unsloth-main/pyproject.toml
@@ -0,0 +1,418 @@
+[build-system]
+requires = ["setuptools", "setuptools-scm"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "unsloth"
+dynamic = ["version"]
+description = "2-5X faster LLM finetuning"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {file = "LICENSE"}
+keywords = ["ai", "llm",]
+authors = [
+ {email = "info@unsloth.ai"},
+ {name = "Unsloth AI team"},
+]
+maintainers = [
+ {name = "Daniel Han", email = "danielhanchen@gmail.com"},
+ {name = "Michael Han", email = "info@unsloth.ai"},
+]
+classifiers = [
+ "Programming Language :: Python",
+]
+
+[tool.setuptools.dynamic]
+version = {attr = "unsloth.models._utils.__version__"}
+
+[tool.setuptools]
+include-package-data = false
+
+[tool.setuptools.packages.find]
+exclude = ["images*"]
+
+[project.optional-dependencies]
+triton = [
+ "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post5/triton-3.1.0-cp39-cp39-win_amd64.whl ; python_version=='3.9' and platform_system == 'Windows'",
+ "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post5/triton-3.1.0-cp310-cp310-win_amd64.whl ; python_version=='3.10' and platform_system == 'Windows'",
+ "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post5/triton-3.1.0-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'",
+ "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post5/triton-3.1.0-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'",
+]
+huggingface = [
+ "unsloth_zoo>=2024.12.7",
+ "packaging",
+ "tyro",
+ "transformers>=4.46.1,!=4.47.0",
+ "datasets>=2.16.0",
+ "sentencepiece>=0.2.0",
+ "tqdm",
+ "psutil",
+ "wheel>=0.42.0",
+ "numpy",
+ "accelerate>=0.34.1",
+ "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3",
+ "peft>=0.7.1,!=0.11.0",
+ "protobuf<4.0.0",
+ "huggingface_hub",
+ "hf_transfer",
+ "unsloth[triton]",
+]
+cu118only = [
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu121only = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu118onlytorch211 = [
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu121onlytorch211 = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu118onlytorch212 = [
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu121onlytorch212 = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu118onlytorch220 = [
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu121onlytorch220 = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+]
+cu118onlytorch230 = [
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+]
+cu121onlytorch230 = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+]
+cu118onlytorch240 = [
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+]
+cu121onlytorch240 = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+]
+cu124onlytorch240 = [
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-win_amd64.whl ; python_version=='3.9' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-win_amd64.whl ; python_version=='3.10' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'",
+]
+cu121onlytorch250 = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+]
+cu124onlytorch250 = [
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-win_amd64.whl ; python_version=='3.9' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-win_amd64.whl ; python_version=='3.10' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'",
+]
+cu121onlytorch251 = [
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+]
+cu124onlytorch251 = [
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp39-cp39-win_amd64.whl ; python_version=='3.9' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp310-cp310-win_amd64.whl ; python_version=='3.10' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'",
+ "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post3-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'",
+]
+cu118 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118only]",
+]
+cu121 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121only]",
+]
+cu118-torch211 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch211]",
+]
+cu121-torch211 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch211]",
+]
+cu118-torch212 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch212]",
+]
+cu121-torch212 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch212]",
+]
+cu118-torch220 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch220]",
+]
+cu121-torch220 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch220]",
+]
+cu118-torch230 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch230]",
+]
+cu121-torch230 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch230]",
+]
+cu118-torch240 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch240]",
+]
+cu121-torch240 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch240]",
+]
+cu121-torch250 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch250]",
+]
+cu124-torch240 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu124onlytorch240]",
+]
+cu124-torch250 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu124onlytorch250]",
+]
+cu121-torch251 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch251]",
+]
+cu124-torch251 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu124onlytorch251]",
+]
+kaggle = [
+ "unsloth[huggingface]",
+]
+kaggle-new = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+]
+conda = [
+ "unsloth[huggingface]",
+]
+colab-torch211 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch211]",
+]
+colab-ampere-torch211 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch211]",
+ "packaging",
+ "ninja",
+ "flash-attn>=2.6.3",
+]
+colab-torch220 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch220]",
+]
+colab-ampere-torch220 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch220]",
+ "packaging",
+ "ninja",
+ "flash-attn>=2.6.3",
+]
+colab-new = [
+ "unsloth_zoo>=2024.12.7",
+ "packaging",
+ "tyro",
+ "transformers>=4.46.1,!=4.47.0",
+ "datasets>=2.16.0",
+ "sentencepiece>=0.2.0",
+ "tqdm",
+ "psutil",
+ "wheel>=0.42.0",
+ "numpy",
+ "protobuf<4.0.0",
+ "huggingface_hub",
+ "hf_transfer",
+ "bitsandbytes>=0.43.3",
+ "unsloth[triton]",
+]
+colab-no-deps = [
+ "accelerate>=0.34.1",
+ "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3",
+ "peft>=0.7.1",
+ "xformers",
+ "bitsandbytes>=0.46.1",
+ "protobuf<4.0.0",
+]
+colab = [
+ "unsloth[cu121]",
+]
+flashattention = [
+ "packaging ; platform_system == 'Linux'",
+ "ninja ; platform_system == 'Linux'",
+ "flash-attn>=2.6.3 ; platform_system == 'Linux'",
+]
+colab-ampere = [
+ "unsloth[colab-ampere-torch220]",
+ "unsloth[flashattention]",
+]
+cu118-ampere = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118only]",
+ "unsloth[flashattention]",
+]
+cu121-ampere = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121only]",
+ "unsloth[flashattention]",
+]
+cu118-ampere-torch211 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch211]",
+ "unsloth[flashattention]",
+]
+cu121-ampere-torch211 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch211]",
+ "unsloth[flashattention]",
+]
+cu118-ampere-torch220 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch220]",
+ "unsloth[flashattention]",
+]
+cu121-ampere-torch220 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch220]",
+ "unsloth[flashattention]",
+]
+cu118-ampere-torch230 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch230]",
+ "unsloth[flashattention]",
+]
+cu121-ampere-torch230 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch230]",
+ "unsloth[flashattention]",
+]
+cu118-ampere-torch240 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu118onlytorch240]",
+ "unsloth[flashattention]",
+]
+cu121-ampere-torch240 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch240]",
+ "unsloth[flashattention]",
+]
+cu121-ampere-torch250 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch250]",
+ "unsloth[flashattention]",
+]
+cu124-ampere-torch240 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu124onlytorch240]",
+ "unsloth[flashattention]",
+]
+cu124-ampere-torch250 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu124onlytorch250]",
+ "unsloth[flashattention]",
+]
+cu121-ampere-torch251 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu121onlytorch251]",
+ "unsloth[flashattention]",
+]
+cu124-ampere-torch251 = [
+ "unsloth[huggingface]",
+ "bitsandbytes>=0.43.3",
+ "unsloth[cu124onlytorch251]",
+ "unsloth[flashattention]",
+]
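+# Illustrative note (not part of the original file): with the extras above, a machine with
+# CUDA 12.1, torch 2.4.x and an Ampere GPU would typically use
+#   pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
+# (unsloth/_auto_install.py later in this patch prints exactly this kind of command).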
+
+[project.urls]
+homepage = "http://www.unsloth.ai"
+documentation = "https://github.com/unslothai/unsloth"
+repository = "https://github.com/unslothai/unsloth"
diff --git a/unsloth-main/unsloth-cli.py b/unsloth-main/unsloth-cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddb0ac8b7b6573c7e096052b2b0be61d147b31c5
--- /dev/null
+++ b/unsloth-main/unsloth-cli.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+
+"""
+🦥 Starter Script for Fine-Tuning FastLanguageModel with Unsloth
+
+This script is designed as a starting point for fine-tuning your models using unsloth.
+It includes configurable options for model loading, PEFT parameters, training arguments,
+and model saving/pushing functionalities.
+
+You will likely want to customize this script to suit your specific use case
+and requirements.
+
+Here are a few suggestions for customization:
+ - Modify the dataset loading and preprocessing steps to match your data.
+ - Customize the model saving and pushing configurations.
+
+Usage: (most of the options have valid default values; this is an extended example for demonstration purposes)
+ python unsloth-cli.py --model_name "unsloth/llama-3-8b" --max_seq_length 8192 --dtype None --load_in_4bit \
+ --r 64 --lora_alpha 32 --lora_dropout 0.1 --bias "none" --use_gradient_checkpointing "unsloth" \
+ --random_state 3407 --use_rslora --per_device_train_batch_size 4 --gradient_accumulation_steps 8 \
+ --warmup_steps 5 --max_steps 400 --learning_rate 2e-6 --logging_steps 1 --optim "adamw_8bit" \
+ --weight_decay 0.005 --lr_scheduler_type "linear" --seed 3407 --output_dir "outputs" \
+ --report_to "tensorboard" --save_model --save_path "model" --quantization_method "f16" \
+ --push_model --hub_path "hf/model" --hub_token "your_hf_token"
+
+To see a full list of configurable options, use:
+ python unsloth-cli.py --help
+
+Happy fine-tuning!
+"""
+
+import argparse
+
+def run(args):
+ import torch
+ from unsloth import FastLanguageModel
+ from datasets import load_dataset
+ from trl import SFTTrainer
+ from transformers import TrainingArguments
+ from unsloth import is_bfloat16_supported
+ import logging
+ logging.getLogger('hf-to-gguf').setLevel(logging.WARNING)
+
+ # Load model and tokenizer
+ model, tokenizer = FastLanguageModel.from_pretrained(
+ model_name=args.model_name,
+ max_seq_length=args.max_seq_length,
+ dtype=args.dtype,
+ load_in_4bit=args.load_in_4bit,
+ )
+
+ # Configure PEFT model
+ model = FastLanguageModel.get_peft_model(
+ model,
+ r=args.r,
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+ "gate_proj", "up_proj", "down_proj"],
+ lora_alpha=args.lora_alpha,
+ lora_dropout=args.lora_dropout,
+ bias=args.bias,
+ use_gradient_checkpointing=args.use_gradient_checkpointing,
+ random_state=args.random_state,
+ use_rslora=args.use_rslora,
+ loftq_config=args.loftq_config,
+ )
+
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {}
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}"""
+
+ EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
+ def formatting_prompts_func(examples):
+ instructions = examples["instruction"]
+ inputs = examples["input"]
+ outputs = examples["output"]
+ texts = []
+ for instruction, input, output in zip(instructions, inputs, outputs):
+ text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
+ texts.append(text)
+ return {"text": texts}
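+ # Illustrative note (not in the original script): each formatted record is the Alpaca
+ # header above, followed by "### Instruction:", "### Input:" and "### Response:" sections
+ # filled from one dataset row, with the tokenizer's EOS token appended at the very end.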
+
+ # Load and format dataset
+ dataset = load_dataset(args.dataset, split="train")
+ dataset = dataset.map(formatting_prompts_func, batched=True)
+ print("Data is formatted and ready!")
+
+ # Configure training arguments
+ training_args = TrainingArguments(
+ per_device_train_batch_size=args.per_device_train_batch_size,
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
+ warmup_steps=args.warmup_steps,
+ max_steps=args.max_steps,
+ learning_rate=args.learning_rate,
+ fp16=not is_bfloat16_supported(),
+ bf16=is_bfloat16_supported(),
+ logging_steps=args.logging_steps,
+ optim=args.optim,
+ weight_decay=args.weight_decay,
+ lr_scheduler_type=args.lr_scheduler_type,
+ seed=args.seed,
+ output_dir=args.output_dir,
+ report_to=args.report_to,
+ )
+
+ # Initialize trainer
+ trainer = SFTTrainer(
+ model=model,
+ tokenizer=tokenizer,
+ train_dataset=dataset,
+ dataset_text_field="text",
+ max_seq_length=args.max_seq_length,
+ dataset_num_proc=2,
+ packing=False,
+ args=training_args,
+ )
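+ # Note (comment not in the original script): the effective batch size is
+ # per_device_train_batch_size * gradient_accumulation_steps, e.g. 2 * 4 = 8 with the argparse defaults below.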
+
+ # Train model
+ trainer_stats = trainer.train()
+
+ # Save model
+ if args.save_model:
+ # if args.quantization is a list, we save the model once for each quantization method
+ if args.save_gguf:
+ if isinstance(args.quantization, list):
+ for quantization_method in args.quantization:
+ print(f"Saving model with quantization method: {quantization_method}")
+ model.save_pretrained_gguf(
+ args.save_path,
+ tokenizer,
+ quantization_method=quantization_method,
+ )
+ if args.push_model:
+ model.push_to_hub_gguf(
+ hub_path=args.hub_path,
+ hub_token=args.hub_token,
+ quantization_method=quantization_method,
+ )
+ else:
+ print(f"Saving model with quantization method: {args.quantization}")
+ model.save_pretrained_gguf(args.save_path, tokenizer, quantization_method=args.quantization)
+ if args.push_model:
+ model.push_to_hub_gguf(
+ hub_path=args.hub_path,
+ hub_token=args.hub_token,
+ quantization_method=args.quantization,
+ )
+ else:
+ model.save_pretrained_merged(args.save_path, tokenizer, args.save_method)
+ if args.push_model:
+ model.push_to_hub_merged(args.save_path, tokenizer, args.hub_token)
+ else:
+ print("Warning: The model is not saved!")
+
+
+if __name__ == "__main__":
+
+ # Define argument parser
+ parser = argparse.ArgumentParser(description="🦥 Fine-tune your llm faster using unsloth!")
+
+ model_group = parser.add_argument_group("🤖 Model Options")
+ model_group.add_argument('--model_name', type=str, default="unsloth/llama-3-8b", help="Model name to load")
+ model_group.add_argument('--max_seq_length', type=int, default=2048, help="Maximum sequence length, default is 2048. We auto support RoPE Scaling internally!")
+ model_group.add_argument('--dtype', type=str, default=None, help="Data type for model (None for auto detection)")
+ model_group.add_argument('--load_in_4bit', action='store_true', help="Use 4bit quantization to reduce memory usage")
+ model_group.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned", help="Huggingface dataset to use for training")
+
+ lora_group = parser.add_argument_group("🧠 LoRA Options", "These options are used to configure the LoRA model.")
+ lora_group.add_argument('--r', type=int, default=16, help="Rank for Lora model, default is 16. (common values: 8, 16, 32, 64, 128)")
+ lora_group.add_argument('--lora_alpha', type=int, default=16, help="LoRA alpha parameter, default is 16. (common values: 8, 16, 32, 64, 128)")
+ lora_group.add_argument('--lora_dropout', type=float, default=0, help="LoRA dropout rate, default is 0.0 which is optimized.")
+ lora_group.add_argument('--bias', type=str, default="none", help="Bias setting for LoRA")
+ lora_group.add_argument('--use_gradient_checkpointing', type=str, default="unsloth", help="Use gradient checkpointing")
+ lora_group.add_argument('--random_state', type=int, default=3407, help="Random state for reproducibility, default is 3407.")
+ lora_group.add_argument('--use_rslora', action='store_true', help="Use rank stabilized LoRA")
+ lora_group.add_argument('--loftq_config', type=str, default=None, help="Configuration for LoftQ")
+
+
+ training_group = parser.add_argument_group("🎓 Training Options")
+ training_group.add_argument('--per_device_train_batch_size', type=int, default=2, help="Batch size per device during training, default is 2.")
+ training_group.add_argument('--gradient_accumulation_steps', type=int, default=4, help="Number of gradient accumulation steps, default is 4.")
+ training_group.add_argument('--warmup_steps', type=int, default=5, help="Number of warmup steps, default is 5.")
+ training_group.add_argument('--max_steps', type=int, default=400, help="Maximum number of training steps.")
+ training_group.add_argument('--learning_rate', type=float, default=2e-4, help="Learning rate, default is 2e-4.")
+ training_group.add_argument('--optim', type=str, default="adamw_8bit", help="Optimizer type.")
+ training_group.add_argument('--weight_decay', type=float, default=0.01, help="Weight decay, default is 0.01.")
+ training_group.add_argument('--lr_scheduler_type', type=str, default="linear", help="Learning rate scheduler type, default is 'linear'.")
+ training_group.add_argument('--seed', type=int, default=3407, help="Seed for reproducibility, default is 3407.")
+
+
+ # Report/Logging arguments
+ report_group = parser.add_argument_group("📊 Report Options")
+ report_group.add_argument('--report_to', type=str, default="tensorboard",
+ choices=["azure_ml", "clearml", "codecarbon", "comet_ml", "dagshub", "dvclive", "flyte", "mlflow", "neptune", "tensorboard", "wandb", "all", "none"],
+ help="The list of integrations to report the results and logs to. Supported platforms are: \n\t\t 'azure_ml', 'clearml', 'codecarbon', 'comet_ml', 'dagshub', 'dvclive', 'flyte', 'mlflow', 'neptune', 'tensorboard', and 'wandb'. Use 'all' to report to all integrations installed, 'none' for no integrations.")
+ report_group.add_argument('--logging_steps', type=int, default=1, help="Logging steps, default is 1")
+
+ # Saving and pushing arguments
+ save_group = parser.add_argument_group('💾 Save Model Options')
+ save_group.add_argument('--output_dir', type=str, default="outputs", help="Output directory")
+ save_group.add_argument('--save_model', action='store_true', help="Save the model after training")
+ save_group.add_argument('--save_method', type=str, default="merged_16bit", choices=["merged_16bit", "merged_4bit", "lora"], help="Save method for the model, default is 'merged_16bit'")
+ save_group.add_argument('--save_gguf', action='store_true', help="Convert the model to GGUF after training")
+ save_group.add_argument('--save_path', type=str, default="model", help="Path to save the model")
+ save_group.add_argument('--quantization', type=str, default="q8_0", nargs="+",
+ help="Quantization method for saving the model. Common values: 'f16', 'q4_k_m', 'q8_0'. Check our wiki for all quantization methods: https://github.com/unslothai/unsloth/wiki#saving-to-gguf")
+
+ push_group = parser.add_argument_group('🚀 Push Model Options')
+ push_group.add_argument('--push_model', action='store_true', help="Push the model to Hugging Face hub after training")
+ push_group.add_argument('--push_gguf', action='store_true', help="Push the model as GGUF to Hugging Face hub after training")
+ push_group.add_argument('--hub_path', type=str, default="hf/model", help="Path on Hugging Face hub to push the model")
+ push_group.add_argument('--hub_token', type=str, help="Token for pushing the model to Hugging Face hub")
+
+ args = parser.parse_args()
+ run(args)
diff --git a/unsloth-main/unsloth/__init__.py b/unsloth-main/unsloth/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..980425e1f13cba6caf22ef78c873540905d198ef
--- /dev/null
+++ b/unsloth-main/unsloth/__init__.py
@@ -0,0 +1,181 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings, importlib, sys
+from packaging.version import Version
+import os, re, subprocess, inspect
+import numpy as np
+
+# # Define a list of modules to check
+# MODULES_TO_CHECK = ["bitsandbytes"]
+
+# # Check if any of the modules in the list have been imported
+# for module in MODULES_TO_CHECK:
+# if module in sys.modules:
+# raise ImportError(f"Unsloth: Please import Unsloth before {module}.")
+# pass
+# pass
+
+# Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team, so
+# enabling it will require much more work and we have to prioritize. Please understand!
+# We do have a beta version, which you can contact us about!
+# Thank you for your understanding and we appreciate it immensely!
+
+# Fixes https://github.com/unslothai/unsloth/issues/1266
+os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+
+if "CUDA_VISIBLE_DEVICES" in os.environ:
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+ devices = os.environ["CUDA_VISIBLE_DEVICES"]
+ # Check if there are multiple cuda devices set in env
+ if not devices.isdigit():
+ first_id = devices.split(",")[0]
+ warnings.warn(
+ f"Unsloth: 'CUDA_VISIBLE_DEVICES' is currently {devices} \n"\
+ "Unsloth currently does not support multi GPU setups - but we are working on it!\n"\
+ "Multiple CUDA devices detected but we require a single device.\n"\
+ f"We will override CUDA_VISIBLE_DEVICES to first device: {first_id}."
+ )
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(first_id)
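+ # e.g. CUDA_VISIBLE_DEVICES="2,3" is narrowed to "2" (illustrative comment, not in the original source)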
+else:
+ # warnings.warn("Unsloth: 'CUDA_VISIBLE_DEVICES' is not set. We shall set it ourselves.")
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+pass
+
+# Reduce VRAM usage by reducing fragmentation
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,roundup_power2_divisions:[64:128,256:64,>:32]"
+
+# Hugging Face Hub faster downloads
+if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+pass
+
+# Log Unsloth is being used
+os.environ["UNSLOTH_IS_PRESENT"] = "1"
+
+try:
+ import torch
+except ModuleNotFoundError:
+ raise ImportError(
+ "Unsloth: Pytorch is not installed. Go to https://pytorch.org/.\n"\
+ "We have some installation instructions on our Github page."
+ )
+except Exception as exception:
+ raise exception
+pass
+
+# We support Pytorch 2
+# Fixes https://github.com/unslothai/unsloth/issues/38
+torch_version = torch.__version__.split(".")
+major_torch, minor_torch = torch_version[0], torch_version[1]
+major_torch, minor_torch = int(major_torch), int(minor_torch)
+if (major_torch < 2):
+ raise ImportError("Unsloth only supports Pytorch 2 for now. Please update your Pytorch to 2.1.\n"\
+ "We have some installation instructions on our Github page.")
+elif (major_torch == 2) and (minor_torch < 2):
+ # Disable expandable_segments
+ del os.environ["PYTORCH_CUDA_ALLOC_CONF"]
+pass
+
+# Torch 2.4 has including_emulation
+major_version, minor_version = torch.cuda.get_device_capability()
+SUPPORTS_BFLOAT16 = (major_version >= 8)
+
+old_is_bf16_supported = torch.cuda.is_bf16_supported
+if "including_emulation" in str(inspect.signature(old_is_bf16_supported)):
+ def is_bf16_supported(including_emulation = False):
+ return old_is_bf16_supported(including_emulation)
+ torch.cuda.is_bf16_supported = is_bf16_supported
+else:
+ def is_bf16_supported(): return SUPPORTS_BFLOAT16
+ torch.cuda.is_bf16_supported = is_bf16_supported
+pass
+
+# Try loading bitsandbytes and triton
+import bitsandbytes as bnb
+
+if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
+
+ import triton
+ libcuda_dirs = lambda: None
+ if Version(triton.__version__) >= Version("3.0.0"):
+ try: from triton.backends.nvidia.driver import libcuda_dirs
+ except: pass
+ else: from triton.common.build import libcuda_dirs
+
+ try:
+ cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+ libcuda_dirs()
+ except:
+ warnings.warn(
+ "Unsloth: Running `ldconfig /usr/lib64-nvidia` to link CUDA."\
+ )
+
+ if os.path.exists("/usr/lib64-nvidia"):
+ os.system("ldconfig /usr/lib64-nvidia")
+ elif os.path.exists("/usr/local"):
+ # Sometimes bitsandbytes cannot be linked properly in Runpod for example
+ possible_cudas = subprocess.check_output(["ls", "-al", "/usr/local"]).decode("utf-8").split("\n")
+ find_cuda = re.compile(r"[\s](cuda\-[\d\.]{2,})$")
+ possible_cudas = [find_cuda.search(x) for x in possible_cudas]
+ possible_cudas = [x.group(1) for x in possible_cudas if x is not None]
+
+ # Try linking cuda folder, or everything in local
+ if len(possible_cudas) == 0:
+ os.system("ldconfig /usr/local/")
+ else:
+ find_number = re.compile(r"([\d\.]{2,})")
+ latest_cuda = np.argsort([float(find_number.search(x).group(1)) for x in possible_cudas])[::-1][0]
+ latest_cuda = possible_cudas[latest_cuda]
+ os.system(f"ldconfig /usr/local/{latest_cuda}")
+ pass
+
+ importlib.reload(bnb)
+ importlib.reload(triton)
+ try:
+ libcuda_dirs = lambda: None
+ if Version(triton.__version__) >= Version("3.0.0"):
+ try: from triton.backends.nvidia.driver import libcuda_dirs
+ except: pass
+ else: from triton.common.build import libcuda_dirs
+ cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+ libcuda_dirs()
+ except:
+ warnings.warn(
+ "Unsloth: CUDA is not linked properly.\n"\
+ "Try running `python -m bitsandbytes` then `python -m xformers.info`\n"\
+ "We tried running `ldconfig /usr/lib64-nvidia` ourselves, but it didn't work.\n"\
+ "You need to run in your terminal `sudo ldconfig /usr/lib64-nvidia` yourself, then import Unsloth.\n"\
+ "Also try `sudo ldconfig /usr/local/cuda-xx.x` - find the latest cuda version.\n"\
+ "Unsloth will still run for now, but maybe it might crash - let's hope it works!"
+ )
+ pass
+pass
+
+# Check for unsloth_zoo
+try:
+ import unsloth_zoo
+except:
+ raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth-zoo`")
+pass
+
+from .models import *
+from .save import *
+from .chat_templates import *
+from .tokenizer_utils import *
+from .trainer import *
+
+# Patch TRL trainers for backwards compatibility
+_patch_trl_trainer()
diff --git a/unsloth-main/unsloth/_auto_install.py b/unsloth-main/unsloth/_auto_install.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3b94c67066880286026aec3302001ec46da3175
--- /dev/null
+++ b/unsloth-main/unsloth/_auto_install.py
@@ -0,0 +1,31 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+try: import torch
+except: raise ImportError('Install torch via `pip install torch`')
+from packaging.version import Version as V
+v = V(torch.__version__)
+cuda = str(torch.version.cuda)
+is_ampere = torch.cuda.get_device_capability()[0] >= 8
+if cuda != "12.1" and cuda != "11.8" and cuda != "12.4": raise RuntimeError(f"CUDA = {cuda} not supported!")
+if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!")
+elif v <= V('2.1.1'): x = 'cu{}{}-torch211'
+elif v <= V('2.1.2'): x = 'cu{}{}-torch212'
+elif v < V('2.3.0'): x = 'cu{}{}-torch220'
+elif v < V('2.4.0'): x = 'cu{}{}-torch230'
+elif v < V('2.5.0'): x = 'cu{}{}-torch240'
+elif v < V('2.6.0'): x = 'cu{}{}-torch250'
+else: raise RuntimeError(f"Torch = {v} too new!")
+x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
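+# Illustrative example (comment not in the original file): torch 2.4.x + CUDA 12.1 on an
+# Ampere GPU resolves to the "cu121-ampere-torch240" extra defined in pyproject.toml.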
+print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')
\ No newline at end of file
diff --git a/unsloth-main/unsloth/chat_templates.py b/unsloth-main/unsloth/chat_templates.py
new file mode 100644
index 0000000000000000000000000000000000000000..da10f7e0036b4304a3f928cec18f6c0f2c5b4197
--- /dev/null
+++ b/unsloth-main/unsloth/chat_templates.py
@@ -0,0 +1,2105 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+ "get_chat_template",
+ "test_chat_templates",
+ "test_hf_gguf_equivalence",
+ "remove_special_tokens",
+
+ "to_sharegpt",
+ "standardize_sharegpt",
+ "apply_chat_template",
+ "train_on_responses_only",
+
+ "test_construct_chat_template",
+]
+
+from transformers import StoppingCriteria, StoppingCriteriaList
+from torch import LongTensor, FloatTensor
+from transformers.models.llama.modeling_llama import logger
+from .save import patch_saving_functions
+import os
+import shutil
+from .tokenizer_utils import *
+from .models._utils import patch_tokenizer
+import re
+from unsloth_zoo.dataset_utils import (
+ train_on_responses_only,
+)
+CHAT_TEMPLATES = {}
+DEFAULT_SYSTEM_MESSAGE = {}
+
+# =========================================== Unsloth
+# The Unsloth efficient template borrows from Zephyr
+unsloth_template = \
+ "{{ bos_token }}"\
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{{ messages[0]['content'] + '\n' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% else %}"\
+ "{{ '{system_message}' + '\n' }}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '>>> User: ' + message['content'] + '\n' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '>>> Assistant: ' }}"\
+ "{% endif %}"
+pass
+
+unsloth_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}{{ .System }}
+{{ end }}{{ if .Prompt }}>>> User: {{ .Prompt }}
+{{ end }}>>> Assistant: {{ .Response }}{__EOS_TOKEN__}
+"""
+PARAMETER stop "{__EOS_TOKEN__}"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+SYSTEM """You are a helpful assistant to the user"""
+'''
+
+unsloth_eos_token = "eos_token"
+CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False, unsloth_ollama,)
+DEFAULT_SYSTEM_MESSAGE["unsloth"] = "You are a helpful assistant to the user"
+pass
+
+# =========================================== Zephyr
+# Zephyr has no BOS!
+zephyr_template = \
+ "{% for message in messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '<|user|>\n' + message['content'] + eos_token + '\n' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}"\
+ "{% else %}"\
+ "{{ '<|system|>\n' + message['content'] + eos_token + '\n' }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '<|assistant|>\n' }}"\
+ "{% endif %}"
+pass
+
+zephyr_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}<|system|>
+{{ .System }}{__EOS_TOKEN__}
+{{ end }}{{ if .Prompt }}<|user|>
+{{ .Prompt }}{__EOS_TOKEN__}
+{{ end }}<|assistant|>
+{{ .Response }}{__EOS_TOKEN__}
+"""
+PARAMETER stop "{__EOS_TOKEN__}"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+zephyr_eos_token = "eos_token"
+CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False, zephyr_ollama,)
+DEFAULT_SYSTEM_MESSAGE["zephyr"] = None # No system message in Zephyr
+pass
+
+# =========================================== ChatML
+# ChatML has no BOS and no EOS! Rather, <|im_start|> and <|im_end|> act as BOS / EOS.
+chatml_template = \
+ "{% for message in messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n'}}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' }}"\
+ "{% else %}"\
+ "{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '<|im_start|>assistant\n' }}"\
+ "{% endif %}"
+pass
+
+chatml_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ .Response }}<|im_end|>
+"""
+PARAMETER stop "<|im_start|>"
+PARAMETER stop "<|im_end|>"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+chatml_eos_token = "<|im_end|>"
+CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True, chatml_ollama,)
+DEFAULT_SYSTEM_MESSAGE["chatml"] = None # No system message in ChatML
+pass
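+# Illustrative render of the ChatML template above (comment not in the original file):
+#   <|im_start|>user
+#   Hello!<|im_end|>
+#   <|im_start|>assistant
+#   Hi!<|im_end|>
+# with a trailing '<|im_start|>assistant\n' appended when add_generation_prompt is True.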
+
+# =========================================== Mistral-1
+# Mistral Instruct doesn't allow system prompts, so we merge the system prompt into the first user message.
+mistral_template = \
+ "{{ bos_token }}"\
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{% if messages[1]['role'] == 'user' %}"\
+ "{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[2:] %}"\
+ "{% else %}"\
+ "{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% endif %}"\
+ "{% else %}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '[INST] ' + message['content'] + ' [/INST]' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ message['content'] + eos_token }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"
+pass
+
+# Ollama from https://www.ollama.com/library/mistral
+mistral_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]"""
+PARAMETER stop "{__EOS_TOKEN__}"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+mistral_eos_token = "eos_token"
+CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False, mistral_ollama,)
+DEFAULT_SYSTEM_MESSAGE["mistral"] = None # No system message in Mistral
+pass
+
+# =========================================== Llama-2
+# Adds BOS to every convo! And weird <<SYS>> / <</SYS>> system messages.
+llama_template = \
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{% if messages[1]['role'] == 'user' %}"\
+ "{{ bos_token + '[INST] <<SYS>>\n' + messages[0]['content'] + '\n<</SYS>>\n\n' + messages[1]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[2:] %}"\
+ "{% else %}"\
+ "{{ bos_token + '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% endif %}"\
+ "{% else %}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ ' ' + message['content'].strip() + ' ' + eos_token }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"
+pass
+
+# Ollama from https://www.ollama.com/library/llama3
+llama_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """[INST] <>{{ .System }}< >
+
+{{ .Prompt }} [/INST]"""
+PARAMETER stop "{__EOS_TOKEN__}"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+llama_eos_token = "eos_token"
+CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False, llama_ollama,)
+DEFAULT_SYSTEM_MESSAGE["llama"] = None # No system message in Llama
+pass
+
+# =========================================== Vicuna
+# https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
+vicuna_template = \
+ "{{ bos_token }}"\
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{{ messages[0]['content'] + ' ' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% else %}"\
+ "{{ '{system_message}' + ' ' }}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ 'USER: ' + message['content'] + ' ' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ 'ASSISTANT: ' + message['content'] + eos_token }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ 'ASSISTANT:' }}"\
+ "{% endif %}"
+pass
+
+# Ollama from https://www.ollama.com/library/vicuna
+vicuna_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }} {__EOS_TOKEN__}"""
+PARAMETER stop "{__EOS_TOKEN__}"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+vicuna_eos_token = "eos_token"
+CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False, vicuna_ollama,)
+DEFAULT_SYSTEM_MESSAGE["vicuna"] = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+pass
+
+# =========================================== Vicuna Old
+# https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
+vicuna_old_template = \
+ "{{ bos_token }}"\
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{{ messages[0]['content'] + '\n' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% else %}"\
+ "{{ '{system_message}' + '\n' }}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '### Human: ' + message['content'] + '\n' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ '### Assistant: ' + message['content'] + eos_token + '\n' }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '### Assistant:' }}"\
+ "{% endif %}"
+pass
+
+vicuna_old_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}{{ .System }}
+{{ end }}{{ if .Prompt }}### Human: {{ .Prompt }}
+{{ end }}### Assistant: {{ .Response }}{__EOS_TOKEN__}
+"""
+PARAMETER stop "{__EOS_TOKEN__}"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+SYSTEM """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."""
+'''
+
+vicuna_old_eos_token = "eos_token"
+CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False, vicuna_old_ollama,)
+DEFAULT_SYSTEM_MESSAGE["vicuna_old"] = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\\'s questions."
+
+CHAT_TEMPLATES["vicuna old"] = CHAT_TEMPLATES["vicuna_old"]
+DEFAULT_SYSTEM_MESSAGE["vicuna old"] = DEFAULT_SYSTEM_MESSAGE["vicuna_old"]
+pass
+
+# =========================================== Alpaca multi turn
+# https://github.com/tatsu-lab/stanford_alpaca Changed for multi-turn convos
+alpaca_template = \
+ "{{ bos_token }}"\
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{{ messages[0]['content'] + '\n\n' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% else %}"\
+ "{{ '{system_message}' + '\n\n' }}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '### Instruction:\n' + message['content'] + '\n\n' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ '### Response:\n' + message['content'] + eos_token + '\n\n' }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '### Response:\n' }}"\
+ "{% endif %}"
+pass
+
+alpaca_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}{{ .System }}
+
+{{ end }}{{ if .Prompt }}### Instruction:
+{{ .Prompt }}{{ end }}
+
+### Response:
+{{ .Response }}{__EOS_TOKEN__}
+
+"""
+PARAMETER stop "{__EOS_TOKEN__}"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+SYSTEM """Below are some instructions that describe some tasks. Write responses that appropriately complete each request."""
+'''
+
+alpaca_eos_token = "eos_token"
+CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False, alpaca_ollama,)
+DEFAULT_SYSTEM_MESSAGE["alpaca"] = "Below are some instructions that describe some tasks. Write responses that appropriately complete each request."
+pass
+
+# =========================================== Gemma
+# https://huggingface.co/google/gemma-7b-it
+# Notice we must use |trim for lstrip and rstrip. <start_of_turn> maps to 106.
+# <end_of_turn> maps to 107. user and model are normal 1 word tokens.
+gemma_template = \
+ "{{ bos_token }}"\
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{{'<start_of_turn>user\n' + messages[0]['content'] | trim + ' ' + messages[1]['content'] | trim + '<end_of_turn>\n'}}"\
+ "{% set messages = messages[2:] %}"\
+ "{% endif %}"\
+ "{% for message in messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{'<start_of_turn>user\n' + message['content'] | trim + '<end_of_turn>\n'}}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{'<start_of_turn>model\n' + message['content'] | trim + '<end_of_turn>\n' }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '<start_of_turn>model\n' }}"\
+ "{% endif %}"
+pass
+
+# Ollama from https://www.ollama.com/library/gemma
+gemma_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """user
+{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}
+model
+{{ .Response }}
+"""
+PARAMETER repeat_penalty 1
+PARAMETER stop "<start_of_turn>"
+PARAMETER stop "<end_of_turn>"
+PARAMETER penalize_newline false
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+gemma_eos_token = "<end_of_turn>"
+CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True, gemma_ollama,)
+DEFAULT_SYSTEM_MESSAGE["gemma"] = None # No system message in Gemma
+pass
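+# Illustrative render of the Gemma template above, assuming the standard <start_of_turn> /
+# <end_of_turn> turn tokens and Gemma's <bos> as bos_token (comment not in the original file):
+#   <bos><start_of_turn>user
+#   Hello!<end_of_turn>
+#   <start_of_turn>model
+#   Hi!<end_of_turn>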
+
+# =========================================== Gemma with ChatML instead
+# We find using <eos> is still more appropriate!
+gemma_chatml_template = "{{ bos_token }}" + chatml_template
+pass
+
+gemma_chatml_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ .Response }}<|im_end|>
+"""
+PARAMETER repeat_penalty 1
+PARAMETER stop "<|im_start|>"
+PARAMETER stop "<|im_end|>"
+PARAMETER penalize_newline false
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+gemma_chatml_eos_token = (
+ {"<start_of_turn>" : "<|im_start|>", "<end_of_turn>" : "<|im_end|>"},
+ "<|im_end|>",
+)
+CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True, gemma_chatml_ollama,)
+DEFAULT_SYSTEM_MESSAGE["gemma_chatml"] = None # No system message in Gemma
+pass
+
+# =========================================== Gemma 2
+# Same as Gemma 1, but with sliding window attention!
+# https://ollama.com/library/gemma2/blobs/6522ca797f47
+gemma2_template = gemma_template
+gemma2_ollama = gemma_ollama + "PARAMETER num_ctx 4096\n"
+gemma2_eos_token = "<end_of_turn>"
+CHAT_TEMPLATES["gemma2"] = (gemma2_template, gemma2_eos_token, True, gemma2_ollama,)
+DEFAULT_SYSTEM_MESSAGE["gemma2"] = None # No system message in Gemma 2
+
+# =========================================== Gemma 2 with ChatML instead
+gemma2_chatml_template = gemma_chatml_template
+gemma2_chatml_ollama = gemma_chatml_ollama + "PARAMETER num_ctx 4096\n"
+gemma2_chatml_eos_token = gemma_chatml_eos_token
+CHAT_TEMPLATES["gemma2_chatml"] = (gemma2_chatml_template, gemma2_chatml_eos_token, True, gemma2_chatml_ollama,)
+DEFAULT_SYSTEM_MESSAGE["gemma2_chatml"] = None # No system message in Gemma 2
+pass
+
+# =========================================== Llama-3
+# Weirdly \n\n is needed?
+llama3_template = \
+ "{{ bos_token }}"\
+ "{% for message in messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\
+ "{% else %}"\
+ "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\
+ "{% endif %}"
+pass
+
+# Ollama from https://www.ollama.com/library/llama3
+llama3_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
+
+{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ .Response }}<|eot_id|>"""
+PARAMETER stop "<|start_header_id|>"
+PARAMETER stop "<|end_header_id|>"
+PARAMETER stop "<|eot_id|>"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+llama3_template_eos_token = "eos_token"
+
+CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False, llama3_ollama,)
+DEFAULT_SYSTEM_MESSAGE["llama-3"] = None # No system message in Llama-3
+
+CHAT_TEMPLATES["llama3"] = (llama3_template, llama3_template_eos_token, False, llama3_ollama,)
+DEFAULT_SYSTEM_MESSAGE["llama3"] = None # No system message in Llama-3
+pass
+
+
+# =========================================== Phi-3
+# "{{ bos_token }}"\ # Phi-3.5 removes BOS?
+phi3_template = \
+ "{% for message in messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}"\
+ "{% else %}"\
+ "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '<|assistant|>\n' }}"\
+ "{% endif %}"
+pass
+
+# Ollama from https://www.ollama.com/library/phi3
+phi3_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .System }}<|system|>
+{{ .System }}<|end|>
+{{ end }}{{ if .Prompt }}<|user|>
+{{ .Prompt }}<|end|>
+{{ end }}<|assistant|>
+{{ .Response }}<|end|>
+"""
+PARAMETER stop "<|end|>"
+PARAMETER stop "<|user|>"
+PARAMETER stop "<|assistant|>"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+phi3_template_eos_token = "<|end|>"
+CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,)
+DEFAULT_SYSTEM_MESSAGE["phi-3"] = None # No system message in Phi-3
+
+CHAT_TEMPLATES["phi-35"] = CHAT_TEMPLATES["phi-3"]
+DEFAULT_SYSTEM_MESSAGE["phi-35"] = None # No system message in Phi-3.5
+
+CHAT_TEMPLATES["phi-3.5"] = CHAT_TEMPLATES["phi-3"]
+DEFAULT_SYSTEM_MESSAGE["phi-3.5"] = None # No system message in Phi-3.5
+pass
+
+# =========================================== Llama-3.1
+"""
+No trimming in Llama 3.1 Instruct!
+Also an extra newline for Cutting Knowledge Date
+See https://colab.research.google.com/drive/1Xpqq5xpIgO-B00MQ-UccYMwN2J8QFgBM?usp=sharing
+
+The date_string should also be passed when applying the template:
+
+import datetime
+tokenizer.apply_chat_template(
+ messages,
+ add_generation_prompt = True,
+ tokenize = False,
+ date_string = datetime.date.today().strftime("%d %B %Y"),
+)
+"""
+
+llama31_template = \
+"""{{- bos_token }}
+{%- if custom_tools is defined %}
+ {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+ {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+ {%- set date_string = "26 July 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+ {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+ {%- set system_message = messages[0]['content'] %}
+ {%- set messages = messages[1:] %}
+{%- else %}
+ {%- set system_message = "{system_message}" %}
+{%- endif %}
+
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+ {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+ {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+ {#- Extract the first user message so we can plug it in here #}
+ {%- if messages | length != 0 %}
+ {%- set first_user_message = messages[0]['content'] %}
+ {%- set messages = messages[1:] %}
+ {%- else %}
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+ {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
+ {%- elif 'tool_calls' in message %}
+ {%- if not message.tool_calls|length == 1 %}
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
+ {%- endif %}
+ {%- set tool_call = message.tool_calls[0].function %}
+ {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+ {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+ {%- for arg_name, arg_val in tool_call.arguments | items %}
+ {{- arg_name + '="' + arg_val + '"' }}
+ {%- if not loop.last %}
+ {{- ", " }}
+ {%- endif %}
+ {%- endfor %}
+ {{- ")" }}
+ {%- else %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+ {{- '{"name": "' + tool_call.name + '", ' }}
+ {{- '"parameters": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- "}" }}
+ {%- endif %}
+ {%- if builtin_tools is defined %}
+ {#- This means we're in ipython mode #}
+ {{- "<|eom_id|>" }}
+ {%- else %}
+ {{- "<|eot_id|>" }}
+ {%- endif %}
+ {%- elif message.role == "tool" or message.role == "ipython" %}
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+ {%- if message.content is mapping or message.content is iterable %}
+ {{- message.content | tojson }}
+ {%- else %}
+ {{- message.content }}
+ {%- endif %}
+ {{- "<|eot_id|>" }}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
+"""
+pass
+
+# Ollama from https://ollama.com/library/llama3.1 (needs updating!)
+llama31_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{ if .Messages }}
+{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
+{{- if .System }}
+
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+You are a helpful assistant with tool calling capabilities. When you receive a tool call response, use the output to format an answer to the original user question.
+{{- end }}
+{{- end }}<|eot_id|>
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 }}
+{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
+{{- if and $.Tools $last }}
+
+Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
+
+Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
+
+{{ $.Tools }}
+{{- end }}
+
+{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}
+{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|>
+{{- if .ToolCalls }}
+
+{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }}
+{{- else }}
+
+{{ .Content }}{{ if not $last }}<|eot_id|>{{ end }}
+{{- end }}
+{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|>
+
+{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}
+{{- end }}
+{{- end }}
+{{- else }}
+{{- if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
+
+{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}{{ .Response }}{{ if .Response }}<|eot_id|>{{ end }}"""
+PARAMETER stop "<|start_header_id|>"
+PARAMETER stop "<|end_header_id|>"
+PARAMETER stop "<|eot_id|>"
+PARAMETER stop "<|eom_id|>"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+llama31_template_eos_token = "eos_token"
+CHAT_TEMPLATES["llama-3.1"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,)
+DEFAULT_SYSTEM_MESSAGE["llama-3.1"] = "" # Llama3.1 default system message is empty + the dates
+
+CHAT_TEMPLATES["llama-31"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,)
+DEFAULT_SYSTEM_MESSAGE["llama-31"] = "" # Llama3.1 default system message is empty + the dates
+pass
+
+
+# =========================================== Qwen 2.5
+qwen25_template = \
+"""{%- if tools %}
+ {{- \'<|im_start|>system\\n\' }}
+ {%- if messages[0][\'role\'] == \'system\' %}
+ {{- messages[0][\'content\'] }}
+ {%- else %}
+ {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}
+ {%- endif %}
+ {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}
+ {%- if messages[0][\'role\'] == \'system\' %}
+ {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}
+ {%- else %}
+ {{- \'<|im_start|>system\\n{system_message}<|im_end|>\\n\' }}
+ {%- endif %}\n{%- endif %}\n{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+ {{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}
+ {%- elif message.role == "assistant" %}
+ {{- \'<|im_start|>\' + message.role }}
+ {%- if message.content %}
+ {{- \'\\n\' + message.content }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- \'\\n<tool_call>\\n{"name": "\' }}
+ {{- tool_call.name }}
+ {{- \'", "arguments": \' }}
+ {{- tool_call.arguments | tojson }}
+ {{- \'}\\n</tool_call>\' }}
+ {%- endfor %}
+ {{- \'<|im_end|>\\n\' }}
+ {%- elif message.role == "tool" %}
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+ {{- \'<|im_start|>user\' }}
+ {%- endif %}
+ {{- \'\\n<tool_response>\\n\' }}
+ {{- message.content }}
+ {{- \'\\n</tool_response>\' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- \'<|im_end|>\\n\' }}
+ {%- endif %}
+ {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}
+ {{- \'<|im_start|>assistant\\n\' }}
+{%- endif %}
+"""
+
+
+# Ollama from https://ollama.com/library/qwen2.5/blobs/eb4402837c78
+qwen25_ollama = \
+'''
+FROM {__FILE_LOCATION__}
+TEMPLATE """{{- if .Messages }}
+{{- if or .System .Tools }}<|im_start|>system
+{{- if .System }}
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{{- range .Tools }}
+{"type": "function", "function": {{ .Function }}}
+{{- end }}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call>
+{{- end }}<|im_end|>
+{{ end }}
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+{{- if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+{{ else if eq .Role "assistant" }}<|im_start|>assistant
+{{ if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}
+{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{ end }}
+{{- end }}{{ if not $last }}<|im_end|>
+{{ end }}
+{{- else if eq .Role "tool" }}<|im_start|>user
+<tool_response>
+{{ .Content }}
+</tool_response><|im_end|>
+{{ end }}
+{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
+{{ end }}
+{{- end }}
+{{- else }}
+{{- if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}"""
+PARAMETER stop "<|im_end|>"
+PARAMETER stop "<|endoftext|>"
+PARAMETER temperature 1.5
+PARAMETER min_p 0.1
+'''
+
+qwen25_template_eos_token = "eos_token"
+qwen25_default_system_message = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+CHAT_TEMPLATES["qwen-2.5"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
+DEFAULT_SYSTEM_MESSAGE["qwen-2.5"] = qwen25_default_system_message # Default system message for Qwen 2.5
+
+CHAT_TEMPLATES["qwen-25"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
+DEFAULT_SYSTEM_MESSAGE["qwen-25"] = qwen25_default_system_message # Default system message for Qwen 2.5
+
+CHAT_TEMPLATES["qwen25"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
+DEFAULT_SYSTEM_MESSAGE["qwen25"] = qwen25_default_system_message # Default system message for Qwen 2.5
+
+CHAT_TEMPLATES["qwen2.5"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
+DEFAULT_SYSTEM_MESSAGE["qwen2.5"] = qwen25_default_system_message # Default system message for Qwen 2.5
+pass
+
+def _change_system_message(template: str, type_chat_template: str, system_message: str = None):
+ system_message_pattern = r"\{system_message\}"
+
+ # For predefined templates, check if default system message exists
+ default_system_message = DEFAULT_SYSTEM_MESSAGE.get(f"{type_chat_template}", None)
+ if default_system_message is None:
+ if system_message is not None:
+ logger.warning_once(
+ f"Unsloth: You tried to change the system message for {type_chat_template}, "
+ "but it doesn't have a default system message. "
+ "You need to manually add the system message in your data."
+ )
+ return template, system_message
+ pass
+
+ # For custom templates
+ if type_chat_template is None:
+ has_placeholder = re.search(system_message_pattern, template) is not None
+
+ if has_placeholder:
+ if system_message is None:
+ raise ValueError("Unsloth: You need to provide a system message for custom templates.")
+ new_template = re.sub(system_message_pattern, system_message, template)
+ return new_template, system_message
+
+ return template, system_message
+ pass
+
+ # For predefined templates with default system message
+ message_to_use = system_message if system_message is not None else default_system_message
+ new_template = re.sub(system_message_pattern, message_to_use, template)
+
+ return new_template, message_to_use
+pass
+
+
+def get_chat_template(
+ tokenizer,
+ chat_template = "chatml",
+ mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"},
+ map_eos_token = True,
+ system_message = None,
+):
+ assert(type(map_eos_token) is bool)
+ old_tokenizer = tokenizer
+
+ IS_GEMMA = False
+ if tokenizer.__class__.__name__.startswith("Gemma"):
+ if chat_template == "chatml": chat_template = "gemma_chatml"
+ IS_GEMMA = True
+ pass
+
+ # We add a check for Llama-3
+ # if chat_template == "llama-3":
+ # tokenizer._using_llama3_template = True
+ # else:
+ # llama3_tokens = set(["<|end_header_id|>", "<|eot_id|>", "<|start_header_id|>"])
+ # check_llama3_tokens = llama3_tokens & set(str(x) for x in tokenizer.added_tokens_decoder.values())
+ # if len(check_llama3_tokens) == len(llama3_tokens):
+ # tokenizer._using_llama3_template = True
+ # pass
+ # pass
+
+ # We first check if the tokenizer is a fast one. If not, we cannot convert this!
+ is_fast_tokenizer = getattr(tokenizer, "is_fast", False)
+ old_padding_side = tokenizer.padding_side
+
+ same_padding_token = False
+ type_chat_template = None
+
+ if type(chat_template) in (list, tuple,):
+ # For changing system message later
+ # Since it's not supported yet, we will raise an error first!
+ type_chat_template = chat_template[0].lower()
+ chat_template, stop_word = chat_template
+ assert(type(chat_template) is str)
+ assert(type(stop_word) is str)
+ ollama_modelfile = None
+
+ elif type(chat_template) is str:
+ # For changing system message later
+ type_chat_template = chat_template.lower()
+
+ chat_template, stop_word, yes_map_eos_token, ollama_modelfile = CHAT_TEMPLATES[chat_template]
+
+ # Check mapping to eos_token
+ if not map_eos_token and yes_map_eos_token: map_eos_token = True
+ if not yes_map_eos_token and map_eos_token: map_eos_token = False
+
+ if type(stop_word) in (list, tuple,):
+ token_mapping, stop_word = stop_word
+ assert(type(token_mapping) is dict)
+ else:
+ token_mapping = None
+
+ assert(type(stop_word) is str)
+
+ # Check fast tokenizer
+ if not is_fast_tokenizer:
+ print(
+ "Unsloth: Not a fast tokenizer, so can't process it as of yet :(\n"\
+ "Please log a Github issue if you want this as a new feature!\n"\
+ "Your chat template will still work, but it won't add or edit tokens."
+ )
+
+ elif token_mapping is not None:
+ # token_mapping = {"<start_of_turn>" : "<|im_start|>", "<end_of_turn>" : "<|im_end|>"}
+ # For Gemma :)
+
+ string_vocab = tokenizer._tokenizer.to_str()
+
+ skipped = 0
+ for old_token, new_token in token_mapping.items():
+ old_count = string_vocab.count(f'"{old_token}"')
+ new_count = string_vocab.count(f'"{new_token}"')
+ if new_count != 0:
+ print(f"{new_token} is already a token. Skipping.")
+ skipped += 1
+ elif old_count == 0:
+ raise RuntimeError(f"{old_token} was not part of the tokenizer!")
+ else:
+ string_vocab = string_vocab.replace(f'"{old_token}"', f'"{new_token}"')
+ pass
+ pass
+
+ if map_eos_token and (not stop_word in token_mapping.values()):
+ # Do not map 107 = <|im_end|> and 1 = <|im_end|>. This will reduce the vocab size by 1
+ logger.warning_once(f"Unsloth: Will map {stop_word} to EOS = {tokenizer.eos_token}.")
+ string_vocab = string_vocab.replace(tokenizer.eos_token, stop_word)
+ pass
+
+ if skipped != len(token_mapping):
+ new_tokenizer = tokenizer._tokenizer.from_str(string_vocab)
+
+ # Careful on pad_token
+ old_pad_token = tokenizer.pad_token
+ if old_pad_token == tokenizer.eos_token:
+ old_pad_token = stop_word
+ same_padding_token = True
+ pass
+
+ if map_eos_token:
+ new_tokenizer = tokenizer.__class__(
+ tokenizer_object = new_tokenizer,
+ eos_token = stop_word,
+ pad_token = old_pad_token,
+ )
+ else:
+ new_tokenizer = tokenizer.__class__(
+ tokenizer_object = new_tokenizer,
+ pad_token = old_pad_token,
+ )
+ pass
+
+ # Must fix the sentence piece tokenizer since there's no tokenizer.model file!
+ tokenizer = fix_sentencepiece_tokenizer(tokenizer, new_tokenizer, token_mapping,)
+ else:
+ pass
+
+ elif map_eos_token and (stop_word != "eos_token"):
+ logger.warning_once(f"Unsloth: Will map {stop_word} to EOS = {tokenizer.eos_token}.")
+
+ # Replaces the old EOS token with a new one.
+ # Useful for ChatML <|im_end|> for example.
+ # Usually we train 2 more tokens <|im_start|> and <|im_end|>
+ # But training the lm_head and embeddings are slow!
+ # This is a HACK!
+ # Idea from https://huggingface.co/cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser
+
+ old_bos_token = getattr(tokenizer, "bos_token", None)
+ old_eos_token = getattr(tokenizer, "eos_token", None)
+ old_pad_token = getattr(tokenizer, "pad_token", None)
+ old_unk_token = getattr(tokenizer, "unk_token", None)
+
+ string_vocab = tokenizer._tokenizer.to_str()
+ # First check if new stop_word is in the tokenizer
+ if stop_word in string_vocab:
+ # We shall swap them around
+ temporary_stop_token = "<|:__TEMP//STOP//TOKEN__:|>"
+ string_vocab = string_vocab.replace(old_eos_token, temporary_stop_token)
+ string_vocab = string_vocab.replace(stop_word, old_eos_token)
+ string_vocab = string_vocab.replace(temporary_stop_token, stop_word)
+ else:
+ string_vocab = string_vocab.replace(old_eos_token, stop_word)
+ pass
+ new_tokenizer = tokenizer._tokenizer.from_str(string_vocab)
+
+ # Careful on pad_token
+ if old_pad_token == old_eos_token:
+ old_pad_token = stop_word
+ same_padding_token = True
+ pass
+
+ new_tokenizer = tokenizer.__class__(
+ tokenizer_object = new_tokenizer,
+ bos_token = old_bos_token,
+ eos_token = stop_word,
+ unk_token = old_unk_token,
+ pad_token = old_pad_token,
+ )
+
+ # Must fix the sentence piece tokenizer since there's no tokenizer.model file!
+ token_mapping = { old_eos_token : stop_word, }
+ tokenizer = fix_sentencepiece_tokenizer(tokenizer, new_tokenizer, token_mapping,)
+ pass
+
+ else:
+ raise TypeError(
+ f"Unsloth: `chat_template` must be a tuple of (your_template, eos_token,) or one of\n"\
+ f"{CHAT_TEMPLATES.keys()}"
+ )
+ pass
+
+ # Careful on Gemma
+ # bos_token is a must or else losses become too high
+ if IS_GEMMA and not chat_template.startswith(("{{ bos_token }}", "{{- bos_token }}")):
+ chat_template = "{{ bos_token }}" + chat_template
+ pass
+
+ # For ShareGPT role -> from and content -> value
+ new_chat_template = chat_template\
+ .replace("'role'", "'" + mapping["role"] + "'")\
+ .replace("'content'", "'" + mapping["content"] + "'")\
+ .replace("'user'", "'" + mapping["user"] + "'")\
+ .replace("'assistant'", "'" + mapping["assistant"] + "'")
+
+ _, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer)
+ tokenizer.padding_side = old_padding_side
+
+ # If not normal HF, we add a check to make old templates work
+ if mapping != {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}:
+ chat_template = \
+ "{% if 'role' in messages[0] %}" + \
+ chat_template + \
+ "{% else %}" + \
+ new_chat_template + \
+ "{% endif %}"
+ else:
+ chat_template = new_chat_template
+ pass
+
+ chat_template, system_message = _change_system_message(chat_template, type_chat_template, system_message)
+
+ tokenizer.chat_template = chat_template
+
+ # Also fix up other tokens
+ old_pad_token = getattr(old_tokenizer, "pad_token", None)
+ old_bos_token = getattr(old_tokenizer, "bos_token", None)
+ old_unk_token = getattr(old_tokenizer, "unk_token", None)
+ new_pad_token = getattr(tokenizer, "pad_token", None)
+ new_bos_token = getattr(tokenizer, "bos_token", None)
+ new_unk_token = getattr(tokenizer, "unk_token", None)
+ if old_bos_token != new_bos_token: tokenizer.bos_token = old_bos_token
+ if old_unk_token != new_unk_token: tokenizer.unk_token = old_unk_token
+ if not same_padding_token:
+ if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token
+ pass
+
+ # stopping_criteria = create_stopping_criteria(tokenizer, stop_word)
+
+ # Patch saving functions
+ tokenizer = patch_saving_functions(tokenizer)
+
+ # Add Ollama
+ tokenizer._ollama_modelfile = ollama_modelfile
+ tokenizer._system_message = system_message
+ return tokenizer#, stopping_criteria
+pass
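+
+# Hedged usage sketch (the model name and ShareGPT-style mapping below are
+# illustrative assumptions, not requirements): `get_chat_template` returns the
+# same tokenizer with `tokenizer.chat_template` swapped out and, optionally,
+# the stop word mapped onto the EOS token.
+#
+#   from transformers import AutoTokenizer
+#   tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct")  # placeholder
+#   tokenizer = get_chat_template(
+#       tokenizer,
+#       chat_template = "chatml",
+#       mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
+#       map_eos_token = True,
+#   )
+#   text = tokenizer.apply_chat_template(
+#       [{"from": "human", "value": "Hello!"}],
+#       tokenize = False, add_generation_prompt = True,
+#   )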
+
+
+def remove_special_tokens(tokenizer, prompt):
+ # Removes double BOS token
+ if prompt.startswith(tokenizer.bos_token):
+ prompt = prompt[len(tokenizer.bos_token):]
+ pass
+ return prompt
+pass
+
+
+def _parse_combined_prompt(combined_prompt, dataset):
+ # Find {...}
+ possible_columns = re.findall(r"\{(.+?)\}", combined_prompt)
+ dataset_columns = set(dataset.column_names)
+ for column in possible_columns:
+ if column not in dataset_columns:
+ raise KeyError(
+ f"Unsloth: Your prompt includes '{column}' but this does not exist in the dataset. "\
+ f"Only allowed columns are {list(dataset_columns)}"
+ )
+ pass
+ pass
+
+ # Find [[...]]
+ optional_prompts = list(re.finditer(r"\[\[.+?\]\]", combined_prompt, flags = re.DOTALL | re.MULTILINE))
+ optional_prompts = [(x.span(), x.group(0)) for x in optional_prompts]
+
+ final_optional_prompts = []
+ if len(optional_prompts) != 0:
+ # Add left
+ left = optional_prompts[0]
+ l = left[0][0]
+ if l != 0: final_optional_prompts.append(combined_prompt[:l])
+
+ # Add in between
+ for left, right in zip(optional_prompts[:-1], optional_prompts[1:]):
+ l, r = left[0][-1], right[0][0]
+ final_optional_prompts.append(left)
+ if l != r: final_optional_prompts.append(combined_prompt[l : r])
+ pass
+ final_optional_prompts.append(optional_prompts[-1])
+
+ # Add right
+ right = optional_prompts[-1]
+ r = right[0][1]
+ if r != len(combined_prompt): final_optional_prompts.append(combined_prompt[r:])
+ else:
+ # Just add in the entire string
+ final_optional_prompts.append(combined_prompt)
+ pass
+
+ check_combined = "".join(x if type(x) is str else x[1] for x in final_optional_prompts)
+ assert(combined_prompt == check_combined)
+
+ return possible_columns, final_optional_prompts
+pass
+
+
+def _create_formatter(possible_columns, final_optional_prompts, user_column_name):
+ # Start final prompt!
+ function = ["def __combined_prompt_processor__(examples):"]
+ columns = list(set(possible_columns))
+ for column in columns:
+ function.append(f"{' '*4}{column}__ = examples['{column}']")
+ function.append(f"{' '*4}texts = []")
+ function.append(f"{' '*4}for ({', '.join(columns)}) in zip({', '.join(f'{x}__' for x in columns)}):")
+
+ # Add optional tags as well!
+ final_prompt = ""
+ formatter = []
+
+ for j, optional_prompt in enumerate(final_optional_prompts):
+ if type(optional_prompt) is str:
+ columns = re.findall(r"\{(.+?)\}", optional_prompt)
+ formatter += columns
+ # Must escape \n \r
+ final_prompt += optional_prompt.encode("unicode-escape").decode("utf-8").replace("'", "\\'").replace('"', '\\"')
+ else:
+ where, prompt = optional_prompt
+ # Strip [[...]]
+ # Must escape \n \r
+ prompt = prompt[2:-2].encode("unicode-escape").decode("utf-8").replace("'", "\\'").replace('"', '\\"')
+ columns = re.findall(r"\{(.+?)\}", prompt)
+ x = f"__optional_{j}__"
+ prompt = f"{' '*8}{x} = '{prompt}'.format({', '.join(f'{x} = {x}' for x in columns)}) if {columns[0]} else ''"
+ function.append(prompt)
+ formatter.append(x)
+ final_prompt += "{" + x + "}"
+ pass
+ pass
+
+ function.insert(1, f"{' '*4}__combined_prompt__ = '{final_prompt}'")
+ function.append(f"{' '*8}texts.append("\
+ f"__combined_prompt__.format({', '.join(f'{x} = {x}' for x in formatter)}))")
+ function.append(f"{' '*4}return " + "{ " + f"'{user_column_name}' : texts" + " }")
+ return "\n".join(function)
+pass
+
+
+def to_sharegpt(
+ dataset,
+ merged_prompt = "",
+ merged_column_name = "instruction",
+ output_column_name = "output",
+ remove_unused_columns = True,
+ conversation_extension = 1,
+ random_state = 3407,
+):
+ """
+ Converts a dataset to ShareGPT style.
+ ShareGPT requires only 1 input and 1 output field.
+ This means multiple columns have to be merged into one input field.
+ Use `conversation_extension` to increase the length of each conversation by randomly
+ selecting a few rows and packing them into one.
+
+ merged_prompt = "", Prompt to merge columns into 1 input
+ merged_column_name = "instruction", Final column name for the input field
+ output_column_name = "output", Final column name for the output field
+ remove_unused_columns = True,
+ conversation_extension = 1, Automatically combines `conversation_extension` convos into 1
+ random_state = 3407,
+ """
+ if "conversations" in dataset.column_names:
+ convo = dataset[0]["conversations"]
+ if type(convo) is list:
+ raise TypeError("Unsloth: Your dataset is probably already in ShareGPT format!")
+ pass
+ pass
+
+ possible_columns, final_optional_prompts = _parse_combined_prompt(merged_prompt, dataset)
+ function = _create_formatter(possible_columns, final_optional_prompts, merged_column_name)
+ exec(function, globals())
+ dataset = dataset.map(__combined_prompt_processor__, batched = True, desc = "Merging columns")
+
+ def __convert_to_sharegpt__(examples):
+ users = examples[merged_column_name]
+ assistants = examples[output_column_name]
+ texts = [
+ [
+ {"from" : "human", "value" : str(user) },
+ {"from" : "gpt", "value" : str(assistant)},
+ ] \
+ for user, assistant in zip(users, assistants)
+ ]
+ return { "conversations" : texts, }
+ pass
+
+ dataset = dataset.map(
+ __convert_to_sharegpt__,
+ batched = True,
+ desc = "Converting to ShareGPT",
+ # Remove unused columns!
+ remove_columns = dataset.column_names if remove_unused_columns else None,
+ )
+
+ # Randomly concat conversations to create a long stream!
+ from datasets import concatenate_datasets
+ n_extensions = max(conversation_extension-1, 0)
+ if n_extensions == 0: return dataset
+
+ dataset = dataset.rename_columns({"conversations" : "conversations0"})
+ all_shuffled = [dataset]
+ for j in range(1, n_extensions+1):
+ shuffled = dataset.shuffle(seed = random_state+j).rename_columns({"conversations0" : f"conversations{j}"})
+ all_shuffled.append(shuffled)
+ pass
+ dataset = concatenate_datasets(all_shuffled, axis = 1)
+
+ # Combine them into 1
+ function = "def __combine_conversations__(examples):\n"
+ n_extensions += 1
+ for j in range(n_extensions):
+ function += f"{' '*4}conversations{j}__ = examples['conversations{j}']\n"
+ function += f"{' '*4}convos = []\n"
+ function += f"{' '*4}for ({', '.join(f'conversations{j}' for j in range(n_extensions))}) "\
+ f"in zip({', '.join(f'conversations{j}__' for j in range(n_extensions))}):\n"
+ function += f"{' '*8}convos.append("\
+ f"{'+'.join(f'conversations{j}' for j in range(n_extensions))})\n"
+ function += f"{' '*4}return " + "{ " + "'conversations' : convos" + " }"
+
+ # Map function
+ exec(function, globals())
+ dataset = dataset.map(
+ __combine_conversations__,
+ batched = True,
+ desc = "Extending conversations",
+ # Remove unused columns!
+ remove_columns = dataset.column_names if remove_unused_columns else None,
+ )
+ return dataset
+pass
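+
+# Hedged usage sketch for `to_sharegpt`. The column names "instruction", "input"
+# and "output" are assumptions about the dataset, not requirements; a "[[...]]"
+# section is skipped whenever its first {column} is empty.
+#
+#   dataset = to_sharegpt(
+#       dataset,
+#       merged_prompt = "{instruction}[[\nYour input is:\n{input}]]",
+#       output_column_name = "output",
+#       conversation_extension = 3,   # randomly packs 3 rows into one conversation
+#   )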
+
+
+def standardize_sharegpt(
+ dataset,
+ aliases_for_system = ["system",],
+ aliases_for_user = ["user", "human", "input",],
+ aliases_for_assistant = ["gpt", "assistant", "output",],
+):
+ """
+ Standardizes ShareGPT and other formats to user/assistant Hugging Face format.
+
+ Get aliases for the system, user and assistant roles.
+ These shall map to "system", "user" and "assistant" respectively.
+
+ aliases_for_system = ["system",],
+ aliases_for_user = ["user", "human", "input",],
+ aliases_for_assistant = ["gpt", "assistant", "output",],
+ """
+ import collections
+ import itertools
+
+ convos = dataset[:10]["conversations"]
+ uniques = collections.defaultdict(list)
+ for convo in convos:
+ for message in convo:
+ for key, value in message.items():
+ uniques[key].append(value)
+ pass
+
+ # Must be only 2 entries
+ assert(len(uniques.keys()) == 2)
+
+ keys = list(uniques.keys())
+ length_first = len(set(uniques[keys[0]]))
+ length_second = len(set(uniques[keys[1]]))
+
+ if length_first < length_second:
+ # Role is assigned to the first element
+ role_key = keys[0]
+ content_key = keys[1]
+ else:
+ role_key = keys[1]
+ content_key = keys[0]
+ pass
+
+ # Check roles are in aliases
+ all_aliases = set(aliases_for_system + aliases_for_user + aliases_for_assistant)
+ roles = set(uniques[role_key])
+ leftover_aliases = (all_aliases | roles) - all_aliases
+ if len(leftover_aliases) != 0:
+ raise TypeError(
+ f"Unsloth: {list(leftover_aliases)} are not in aliases. Please update aliases."
+ )
+ pass
+
+ # Mapping for aliases
+ aliases_mapping = {}
+ for x in aliases_for_system: aliases_mapping[x] = "system"
+ for x in aliases_for_user: aliases_mapping[x] = "user"
+ for x in aliases_for_assistant: aliases_mapping[x] = "assistant"
+
+ def _standardize_dataset(examples):
+ convos = examples["conversations"]
+ all_convos = []
+ for convo in convos:
+ new_convo = [
+ { "role" : aliases_mapping[message[role_key]], "content" : message[content_key], }
+ for message in convo
+ ]
+ all_convos.append(new_convo)
+ pass
+ return { "conversations" : all_convos, }
+ pass
+
+ return dataset.map(_standardize_dataset, batched = True, desc = "Standardizing format")
+pass
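+
+# Minimal sketch of the typical call (the alias lists shown are just the defaults;
+# extend them if your dataset uses other role names):
+#
+#   dataset = standardize_sharegpt(
+#       dataset,
+#       aliases_for_user      = ["user", "human", "input"],
+#       aliases_for_assistant = ["gpt", "assistant", "output"],
+#   )
+#   # each message is now {"role": "system"/"user"/"assistant", "content": ...}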
+
+
+def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []):
+ added_tokens_decoder = tokenizer.added_tokens_decoder.values()
+ added_tokens_decoder = [str(x) for x in added_tokens_decoder]
+
+ # Remove added_tokens_decoder duplicates
+ added_tokens_decoder = list(set(added_tokens_decoder) - set(extra_eos_tokens))
+
+ # Remove BOS
+ if getattr(tokenizer, "bos_token", None) is not None:
+ added_tokens_decoder = [x for x in added_tokens_decoder if x != tokenizer.bos_token]
+ pass
+
+ repeatted_tokens = []
+ # Join all vocab
+ joined_text = "\x01\x00".join(added_tokens_decoder)
+ for token in added_tokens_decoder:
+ n = len(token)
+ repeatted_counts = joined_text.count(token[:n//2])
+ # Try finding longer than 1/2 of the token in the rest
+ # For eg <|reserved_special_token_0|>, <|reserved_special_token_1|>
+ if repeatted_counts > 2:
+ for j in range(n//2+1, n):
+ if joined_text.count(token[:j]) < repeatted_counts:
+ j -= 1
+ # Remove repeated tokens to reduce search space
+ joined_text = joined_text.replace(token[:j], "")
+ repeatted_tokens.append(token[:j])
+ break
+ pass
+ pass
+ pass
+
+ # Remove duplicates
+ splitted = joined_text.split("\x01\x00")
+ final_eos_tokens = []
+ for old, new in zip(added_tokens_decoder, splitted):
+ if old == new: final_eos_tokens.append(old)
+ pass
+ final_eos_tokens += extra_eos_tokens
+ final_eos_tokens += repeatted_tokens
+
+ # Remove new lines, spaces and HTML tags
+ filtered_eos_tokens = []
+ for token in final_eos_tokens:
+ if token.count("\n") == len(token): continue
+ elif token.count("▁") == len(token): continue
+ elif token.startswith("<") and len(token) <= 2: continue
+ elif token.startswith("") and len(token) == 3: continue
+ filtered_eos_tokens.append(token)
+ pass
+ return filtered_eos_tokens
+pass
+
+
+def construct_chat_template( \
+
+tokenizer = None,
+
+chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+{OUTPUT}<|eot_id|>""",
+
+default_system_message = \
+ "Below are some instructions that describe some tasks. Write responses that appropriately complete each request.",
+
+extra_eos_tokens = None,
+):
+ """
+ Creates an Ollama modelfile and a HF Jinja template from a custom
+ template. You must provide 2 examples of an input & output.
+ There is an optional system message as well.
+
+ You must use {INPUT}, {OUTPUT} twice, and {SYSTEM} is optional.
+ """
+ # Strip only the left
+ chat_template = chat_template.lstrip()
+
+ assert(tokenizer is not None)
+
+ if extra_eos_tokens is None: extra_eos_tokens = []
+ elif type(extra_eos_tokens) is str: extra_eos_tokens = [extra_eos_tokens,]
+
+ vocab = tokenizer.get_vocab()
+ for extra_eos in extra_eos_tokens:
+ assert(type(extra_eos) is str)
+ if extra_eos not in vocab:
+ raise ValueError(f"Unsloth: `{extra_eos}` is not a singular token in the tokenizer.")
+ pass
+ pass
+
+ error_msg = \
+ "Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\
+ "and the assistant output {OUTPUT}\n\n"\
+ "For example what is not allowed is just:\n"\
+ "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\
+ "What is required is 2x of this:\n"\
+ "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\
+ "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"
+
+ # Check for EOS after {OUTPUT}
+ if tokenizer.eos_token is not None:
+ extra_eos_tokens.insert(0, tokenizer.eos_token)
+ if len(extra_eos_tokens) == 0:
+ raise RuntimeError(
+ "Unsloth: Your tokenizer does not have an EOS token? Please provide one via extra_eos_tokens!"
+ )
+ pass
+
+ # Check tokenizer types
+ tokenizer_name = tokenizer.name_or_path.lower()
+ if tokenizer_name.startswith(("unsloth/llama-3-8b-instruct", "unsloth/llama-3-70b-instruct")):
+ # Add <|eot_id|>
+ extra_eos_tokens.append("<|eot_id|>")
+ elif ("<|eot_id|>" in extra_eos_tokens or "<|eot_id|>" in chat_template) and \
+ tokenizer_name.startswith(("unsloth/llama-3-8b", "unsloth/llama-3-70b")):
+ # Warn
+ logger.warning(
+ "Unsloth: Base llama-3 models did not train <|eot_id|>.\n"\
+ "Please use the instruct version or use <|end_of_text|>"
+ )
+ pass
+ extra_eos_tokens = list(set(extra_eos_tokens))
+
+ count_eos = 0
+ for eos in extra_eos_tokens:
+ count_eos += len(re.findall(r"{OUTPUT}" + re.escape(eos), chat_template))
+ pass
+
+ # This forces you to provide 2 input and outputs
+ final_combined_check = False
+
+ try:
+ # O(N^2) search finding 2 repeated pieces of text
+ j = len(chat_template)-1
+ at_least_one = False
+ while j > 0:
+ found = chat_template.rfind(chat_template[j:], 0, j)
+ if found == -1: break
+ j -= 1
+ at_least_one = True
+ pass
+ if j > 0: j += 1
+ else: raise RuntimeError(error_msg)
+
+ if not at_least_one: raise RuntimeError(error_msg)
+
+ # Must be equivalent to left
+ final_combined_check = True
+
+ # Repeated text
+ instruction_response = chat_template[j:]
+ if instruction_response.count("{INPUT}") != 1 or instruction_response.count("{OUTPUT}") != 1:
+ raise RuntimeError(error_msg)
+ pass
+
+ # 1st System, Instruction, Output pair
+ left = chat_template[:j]
+ # 2nd Instruction, Output pair
+ right = chat_template[j:]
+
+ final_combined_check = left if final_combined_check else chat_template
+
+ # Isolate input
+ extra_eos_tokens_regex = "|".join(f"(?:{re.escape(x)})" for x in extra_eos_tokens)
+ if len(extra_eos_tokens_regex) != 0:
+ find_end = f"(?:{extra_eos_tokens_regex})?"
+ else:
+ find_end = ""
+ find_end = r"\{INPUT\}[\s\n]{0,}" + find_end
+ input_end = list(re.finditer(find_end, right))
+ assert(len(input_end) == 1)
+ input_end = input_end[0]
+ input_end = input_end.span(0)[1]
+ input_part = right[:input_end]
+
+ # Isolate output
+ output_part = right[input_end:]
+
+ # Isolate system
+ where_system = left.find(input_part)
+ system_part = left[:where_system if where_system != -1 else len(left)]
+
+ # Check if the user provided a correct prompt
+ combined = system_part + input_part + output_part
+ if combined != final_combined_check:
+ combined_changed = combined .replace('\n', '\\n')
+ left_changed = final_combined_check.replace('\n', '\\n')
+ raise RuntimeError(
+ "Unsloth: The prompt template you provided isn't correct. You gave:\n"\
+ f"{combined_changed}\n\n"\
+ "But we require the following:\n"\
+ f"{left_changed}"
+ )
+ pass
+ except:
+ ending = chat_template[chat_template.find("{OUTPUT}") + len("{OUTPUT}"):]
+
+ ending = re.escape(ending)
+ find_text = "{INPUT}" + ending + "(.+?{OUTPUT}" + ending + ")"
+ response_part = re.findall(find_text, chat_template, flags = re.DOTALL | re.MULTILINE)
+ response_part = response_part[0]
+
+ for j in range(1, len(response_part)):
+ try_find = re.escape(response_part[:j])
+ try: found = next(re.finditer("(" + try_find + ").+?\{INPUT\}", chat_template, flags = re.DOTALL | re.MULTILINE))
+ except: break
+ pass
+ separator = found.group(1)
+
+ response_start = chat_template.find(response_part)
+ start_instruction = chat_template[:response_start].rfind(separator)
+ if start_instruction == -1: start_instruction = 0
+ instruction_part = chat_template[start_instruction:response_start]
+
+ combined = instruction_part + response_part
+ where = chat_template.find(combined)
+ system_part = chat_template[:where]
+
+ system_part, input_part, output_part = system_part, instruction_part, response_part
+ pass
+
+ if count_eos == 0:
+ logger.warning("Unsloth: We automatically added an EOS token to stop endless generations.")
+ eos = extra_eos_tokens[0]
+ output_part = output_part + eos
+ pass
+
+ # Ollama modelfile parts
+
+ # Check bos_token is in system prompt
+ ollama_system = system_part
+ has_bos_token = False
+ always_bos_token = False
+ if tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None):
+ always_bos_token = True
+ if ollama_system.startswith(tokenizer.bos_token):
+ has_bos_token = True
+ ollama_system = ollama_system[len(tokenizer.bos_token):]
+ pass
+ pass
+ # Check system
+ if "{SYSTEM}" in ollama_system:
+ system_modelfile = "{{ if .System }}" + ollama_system.replace("{SYSTEM}", "{{ .System }}") + "{{ end }}"
+ else:
+ system_modelfile = ollama_system
+ pass
+ input_modelfile = "{{ if .Prompt }}" + input_part .replace("{INPUT}", "{{ .Prompt }}") + "{{ end }}"
+ output_modelfile = output_part.replace("{OUTPUT}", "{{ .Response }}")
+
+ # Ollama EOS
+ ollama_eos = get_ollama_eos_tokens(tokenizer, extra_eos_tokens)
+ ollama_eos = '\n'.join(f'PARAMETER stop "{eos}"' for eos in ollama_eos)
+
+ # Add temperature and min_p to counteract gibberish
+ ollama_eos += "\nPARAMETER temperature 1.5\nPARAMETER min_p 0.1"
+
+ # Ollama modelfile
+ part = '"""'
+ modelfile = 'FROM {__FILE_LOCATION__}\n\n'\
+ 'TEMPLATE ' + part + system_modelfile + input_modelfile + output_modelfile + \
+ part + '\n\n' + ollama_eos
+
+ # HF Jinja Chat template
+ def process(part, which, content = "message['content']"):
+ if part.endswith(which):
+ part = "'" + part[:part.find(which)] + f"' + {content}"
+ elif part.startswith(which):
+ part = f"{content} + '" + part[part.find(which):] + "'"
+ else:
+ part = "'" + part.replace(which, f"' + {content} + '") + "'"
+ if part.startswith("'' + "): part = part[5:]
+ return part
+ pass
+ input_jinja = process(input_part, "{INPUT}")
+ output_jinja = process(output_part, "{OUTPUT}")
+ pass
+
+ jinja_template = \
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ " + input_jinja + " }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ " + output_jinja + " }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"\
+ "{% if add_generation_prompt %}"\
+ "{{ '" + output_part[:output_part.find("{OUTPUT}")] + "' }}"\
+ "{% endif %}"
+ pass
+
+ # Now add system prompt to jinja
+ if len(system_part) != 0:
+ partial_system = process(system_part, "{SYSTEM}", "messages[0]['content']")
+ partial_system = partial_system.replace("{SYSTEM}", "")
+
+ if "{SYSTEM}" in partial_system:
+ if default_system_message is None:
+ raise RuntimeError("Unsloth: Please specify a default system message!")
+ pass
+
+ # Separate the BOS
+ if has_bos_token:
+ partial_system = partial_system.replace(tokenizer.bos_token, "", 1)
+ system_part = system_part .replace(tokenizer.bos_token, "", 1)
+ pass
+
+ partial_system = \
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{{ " + partial_system + " }}"\
+ "{% set loop_messages = messages[1:] %}"
+ if default_system_message is not None:
+ full_system = system_part.replace("{SYSTEM}", default_system_message)
+ if "{SYSTEM}" in system_part:
+ modelfile += '\nSYSTEM "' + default_system_message + '"'
+ pass
+ partial_system += "{% else %}"\
+ "{{ '" + full_system + "' }}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"
+ else:
+ partial_system += "{% endif %}"
+ pass
+
+ jinja_template = partial_system + jinja_template
+
+ if has_bos_token:
+ jinja_template = "{{ bos_token }}" + jinja_template
+ pass
+
+ # Fix missing loop_messages
+ if "{% set loop_messages = messages %}" not in jinja_template:
+ jinja_template = jinja_template.replace(
+ "{% for message in loop_messages %}",
+ "{% for message in messages %}",
+ 1, # Only replace the first one
+ )
+ pass
+
+ # Check if system part is the same!
+ jinja_template = re.sub(
+ r"\{\% if messages\[0\]\['role'\] \=\= 'system' \%\}\{\{ '(.+?)' \}\}"\
+ r"\{\% set loop\_messages \= messages\[1\:\] \%\}"\
+ r"\{\% else \%\}\{\{ '\1' \}\}\{\% set loop\_messages \= messages \%\}\{\% endif \%\}"\
+ r"\{\% for message in loop\_messages \%\}",
+ r"{{ '\1' }}{% for message in messages %}",
+ jinja_template, flags = re.MULTILINE | re.DOTALL,
+ )
+
+ # Check jinja template for bos
+ if always_bos_token:
+ if not jinja_template.startswith(("{{ bos_token }}", "{{- bos_token }}")):
+ jinja_template = "{{ bos_token }}" + jinja_template
+ pass
+
+ # Get instruction and output parts for train_on_inputs = False
+ input_part = input_part [:input_part .find("{INPUT}")]
+ output_part = output_part[:output_part.find("{OUTPUT}")]
+ return modelfile, jinja_template, input_part, output_part
+pass
+
+
+def test_construct_chat_template():
+ token = "hf_"
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token = token)
+
+ chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+{OUTPUT}<|eot_id|>"""
+
+ default_system_message = \
+ "Below are some instructions that describe some tasks. Write responses that appropriately complete each request."
+
+ extra_eos_tokens = None
+
+ modelfile, jinja_template, _, _ = construct_chat_template(
+ tokenizer = tokenizer,
+ chat_template = chat_template,
+ extra_eos_tokens = extra_eos_tokens,
+ )
+
+ messages = [
+ {"role": "system", "content": "You are an assistant"},
+ {"role": "user", "content": "What is 2+2?"},
+ {"role": "assistant", "content": "It's 4."},
+ {"role": "user", "content": "Ok!"},
+ {"role": "assistant", "content": "Anything else?"},
+ {"role": "user", "content": "What's 2x2?"},
+ ]
+ correct_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+
+ tokenizer.chat_template = jinja_template
+ new_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ assert(correct_output == new_output)
+ pass
+pass
+
+
+def apply_chat_template( \
+
+dataset,
+tokenizer = None,
+
+chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+{OUTPUT}<|eot_id|>""",
+
+default_system_message = \
+ "Below are some instructions that describe some tasks. Write responses that appropriately complete each request.",
+
+extra_eos_tokens = None,
+
+):
+ """
+ Creates an Ollama modelfile and a HF Jinja template from a custom
+ template, then applies the chat template to the "conversations" column
+ of the dataset, returning a new "text" column. You must provide 2
+ examples of an input & output. There is an optional system message as well.
+
+ You must use {INPUT}, {OUTPUT} twice, and {SYSTEM} is optional.
+ """
+ modelfile, jinja_template, input_part, output_part = construct_chat_template(
+ tokenizer = tokenizer,
+ chat_template = chat_template,
+ default_system_message = default_system_message,
+ extra_eos_tokens = extra_eos_tokens,
+ )
+ def formatting_prompts_func(examples):
+ convos = examples["conversations"]
+ texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+ return { "text" : texts, }
+ pass
+
+ tokenizer.chat_template = jinja_template
+ tokenizer._ollama_modelfile = modelfile
+ tokenizer._unsloth_input_part = input_part
+ tokenizer._unsloth_output_part = output_part
+
+ return dataset.map(formatting_prompts_func, batched = True,)
+pass
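+
+# Illustrative only (assumes a ShareGPT-style dataset that already has a
+# "conversations" column, e.g. produced by standardize_sharegpt above):
+#
+#   dataset = apply_chat_template(
+#       dataset,
+#       tokenizer = tokenizer,
+#       chat_template = chat_template,              # must contain {INPUT}/{OUTPUT} twice
+#       default_system_message = "You are a helpful assistant.",
+#   )
+#   print(dataset[0]["text"])                       # fully formatted training text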
+
+
+def create_stopping_criteria(tokenizer, stop_word = "eos_token"):
+ class StoppingCriteriaSub(StoppingCriteria):
+ __slots__ = "stop_token", "single_match", "length",
+
+ def __init__(self, stops = "eos_token", device = "cuda", encounters = 1):
+ super().__init__()
+ if stops == "eos_token":
+ self.stop_token = torch.tensor(tokenizer.eos_token_id, device = "cuda")
+ self.length = 1
+ else:
+ self.stop_token = tokenizer(["\n" + stops], add_special_tokens = False, return_tensors = "pt")
+ self.stop_token = self.stop_token.input_ids.ravel()[1:].to("cuda")
+ self.length = self.stop_token.shape[0]
+ pass
+ self.single_match = self.length == 1
+ pass
+
+ def __call__(self, input_ids: LongTensor, scores: FloatTensor) -> bool:
+ input_ids = input_ids.ravel()
+ last_token = input_ids[-1]
+ if self.single_match and (last_token == self.stop_token): return True
+
+ if input_ids.shape[0] >= self.length and \
+ (input_ids[-self.length:] == self.stop_token).all(): return True
+ return False
+ pass
+ pass
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops = stop_word)])
+ return stopping_criteria
+pass
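+
+# Hedged example of wiring the criteria into generation (model and prompt are
+# placeholders):
+#
+#   stopping_criteria = create_stopping_criteria(tokenizer, stop_word = "<|im_end|>")
+#   outputs = model.generate(
+#       **tokenizer("Hello!", return_tensors = "pt").to("cuda"),
+#       max_new_tokens = 64,
+#       stopping_criteria = stopping_criteria,
+#   )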
+
+
+def test_chat_templates():
+ messages = [
+ {"role": "system","content": " You are a friendly chatbot.",},
+ {"role": "user", "content": "What is 2+2?"},
+ {"role": "assistant", "content": "It's 4."},
+ {"role": "user", "content": " But 2+2 is equal to 5. "},
+ {"role": "assistant", "content": "No I'm sure its 4."},
+ {"role": "user", "content": " No it's 100% 5! "},
+ ]
+
+ # Zephyr
+ from transformers import AutoTokenizer
+ template = zephyr_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+ correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ assert(correct_prompt == our_prompt)
+
+ # Chatml
+ template = chatml_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")
+ correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ assert(correct_prompt == our_prompt)
+
+ # Mistral
+ template = mistral_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
+ correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ assert(correct_prompt == our_prompt)
+
+ # Llama
+ template = llama_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-2-7b-chat")
+ correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ assert(correct_prompt == our_prompt)
+
+ # Vicuna
+ try:
+ from fastchat.conversation import get_conv_template
+ except:
+ os.system("pip -qqq install git+https://github.com/lm-sys/FastChat.git")
+ from fastchat.conversation import get_conv_template
+ correct_prompt = get_conv_template("vicuna_v1.1")
+ for j in range(len(messages)-1):
+ correct_prompt.append_message(correct_prompt.roles[j%2==1], messages[j+1]["content"])
+ correct_prompt.append_message(correct_prompt.roles[1], "")
+ correct_prompt = tokenizer.bos_token + correct_prompt.get_prompt()
+
+ template = vicuna_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ assert(correct_prompt == our_prompt)
+
+ try:
+ from fastchat.conversation import get_conv_template
+ except:
+ os.system("pip -qqq install git+https://github.com/lm-sys/FastChat.git")
+ from fastchat.conversation import get_conv_template
+ correct_prompt = get_conv_template("zero_shot")
+ for j in range(len(messages)-1):
+ correct_prompt.append_message(correct_prompt.roles[j%2==1], messages[j+1]["content"])
+ correct_prompt.append_message(correct_prompt.roles[1], "")
+ correct_prompt = tokenizer.bos_token + correct_prompt.get_prompt()
+
+ template = vicuna_old_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ # We add </s> ourselves
+ assert(correct_prompt == our_prompt.replace("</s>", ""))
+
+ # Gemma
+ correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-7b-it")
+ correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ correct_tokenizer.chat_template = gemma_template
+ our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ assert(our_prompt == correct_prompt)
+
+ # Llama-3
+ template = llama3_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct")
+ correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ assert(correct_prompt == our_prompt)
+
+ # Phi-3
+ template = phi3_template
+ correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+ correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ correct_tokenizer.chat_template = template
+ our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
+ assert(correct_prompt == our_prompt)
+pass
+
+
+def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf"):
+ """
+ Carefully compares GGUF's tokenization against HF's tokenization.
+ Useful for catching tokenization bugs.
+ """
+ import subprocess
+ import re
+ messages = [
+ {"role": "user", "content": "What is 2+2?"},
+ {"role": "assistant", "content": "It's 4."},
+ {"role": "user", "content": " But 2+2 is equal to 5. "},
+ {"role": "assistant", "content": "No I'm sure its 4."},
+ {"role": "user", "content": " No it's 100% 5! "},
+ ]
+
+ prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {}
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}""".format(
+ "Describe the city given eloquently.", # instruction
+ "The lost city of Atlantis.", # input
+ "", # output - leave this blank for generation!
+ )
+ prompts = [ prompt, ]
+
+ if tokenizer.chat_template is not None:
+ prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
+ prompt = prompt.replace("'", "") # Subprocess does not like ''
+ prompt = remove_special_tokens(tokenizer, prompt)
+ prompts.append(prompt)
+ pass
+
+ for prompt in prompts:
+ command = f"./llama.cpp/llama-cli -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\
+ f"--check-tensors -p '{prompt}'"
+
+ datas = []
+ with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
+ for line in sp.stdout:
+ datas.append(line.decode("utf-8", errors = "replace"))
+ pass
+ gguf_tokens = "".join(datas)
+
+ # Now extract GGUF tokenization attempt
+ gguf_tokenized = re.findall("([\d]{1,}) \-\> \'([^\']{1,})\'", gguf_tokens, flags = re.MULTILINE)
+ gguf_tokenized = [(int(x[0]), x[1],) for x in gguf_tokenized]
+ input_ids = tokenizer(prompt).input_ids
+
+ tokens = tokenizer.batch_decode(input_ids)
+ hf_tokenized = list(zip(input_ids, tokens))
+
+ # Compare to Huggingface
+ for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)):
+ if (hf_token[0] != gguf_token[0]):
+ print("Failed GGUF != HF at", j)
+ print("HF =", hf_token)
+ print("GGUF =", gguf_token)
+ print(hf_tokenized)
+ print()
+ print(gguf_tokenized)
+ print()
+ raise RuntimeError("Failed comparing GGUF to HF.")
+ pass
+ pass
+ return True
+pass
diff --git a/unsloth-main/unsloth/kernels/__init__.py b/unsloth-main/unsloth/kernels/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef5fa5da70e299268bcba1310dd90bd5765e3383
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/__init__.py
@@ -0,0 +1,65 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .cross_entropy_loss import (
+ fast_cross_entropy_loss,
+ post_patch_loss_function,
+ patch_loss_functions,
+)
+from .rms_layernorm import (
+ fast_rms_layernorm,
+ patch_rms_layernorm,
+ unpatch_rms_layernorm,
+)
+from .layernorm import (
+ fast_layernorm,
+ patch_layernorm,
+)
+from .rope_embedding import fast_rope_embedding, inplace_rope_embedding
+from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
+from .geglu import (
+ geglu_exact_forward_kernel,
+ geglu_exact_backward_kernel,
+ geglu_approx_forward_kernel,
+ geglu_approx_backward_kernel,
+)
+from .fast_lora import (
+ get_lora_parameters,
+ get_lora_parameters_bias,
+ apply_lora_mlp_swiglu,
+ apply_lora_mlp_geglu_exact,
+ apply_lora_mlp_geglu_approx,
+ apply_lora_qkv,
+ apply_lora_o,
+ fast_lora_forward,
+)
+from .utils import fast_dequantize, fast_gemv, QUANT_STATE, fast_linear_forward, matmul_lora
+
+from .flex_attention import (
+ HAS_FLEX_ATTENTION,
+ slow_attention_softcapping,
+ slow_inference_attention_softcapping,
+ create_flex_attention_causal_mask,
+ create_flex_attention_sliding_window_mask,
+)
+
+import os
+if "UNSLOTH_ZOO_IS_PRESENT" not in os.environ:
+ try:
+ print("🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.")
+ except:
+ print("Unsloth: Will patch your computer to enable 2x faster free finetuning.")
+ pass
+pass
+del os
diff --git a/unsloth-main/unsloth/kernels/cross_entropy_loss.py b/unsloth-main/unsloth/kernels/cross_entropy_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..d347cd1878dbeffe15e8c7b276eaf99c23a28486
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/cross_entropy_loss.py
@@ -0,0 +1,405 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton
+import triton.language as tl
+import torch
+from .utils import calculate_settings, MAX_FUSED_SIZE, triton_tanh, triton_cast
+from transformers.models.llama.modeling_llama import logger
+from packaging.version import Version
+
+from unsloth_zoo.loss_utils import (
+ patch_loss_functions as _patch_loss_functions,
+ post_patch_loss_function,
+)
+
+
+@triton.heuristics({
+ "DO_SOFTCAPPING": lambda args: bool(args["DO_SOFTCAPPING" ]),
+ "DO_LOGIT_SCALING": lambda args: bool(args["DO_LOGIT_SCALING"]),
+})
+@triton.jit
+def _cross_entropy_forward(
+ logits_ptr ,
+ logits_row_stride ,
+ loss_ptr ,
+ logsumexp_ptr ,
+ labels_ptr ,
+ VOCAB_SIZE ,
+ BLOCK_SIZE : tl.constexpr,
+ DO_SOFTCAPPING ,
+ SOFTCAP ,
+ DO_LOGIT_SCALING ,
+ LOGIT_SCALE ,
+):
+ """
+ Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
+ Pi = exp(xi) / sum(exp(xi))
+ CE_i = -y log(p) = -y log[ exp(x) / sum(exp(x)) ]
+ = -y [ x - log[sum(exp(x))] ]
+ = y * (log[sum(exp(x))] - x)
+ If y == 0: CE_i = 0
+ If y == 1: CE_i = logsumexp - x
+
+ logsumexp is also stable
+ Take y = log[sum(exp(x))]
+ exp(y) = sum(exp(x))
+ exp(y) = sum(exp(x - c)*exp(c)) Since e^(x-c)*e^c = e^x
+ exp(y) = exp(c)*sum(exp(x - c))
+ y = log(exp(c)*sum(exp(x - c)))
+ y = c + log[sum(exp(x - c))]
+ This means we can set c = max(x) to make sure
+ exp(x - c) always is exp(x - max(x)).
+ This ensures exp(x - max(x))'s maximum is 1 as exp(0) = 1.
+ """
+ row_idx = tl.program_id(0)
+ logits_ptr += row_idx * triton_cast(logits_row_stride, tl.int64)
+ loss_ptr += row_idx
+ logsumexp_ptr += row_idx
+ labels_ptr += row_idx
+
+ col_offsets = tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < VOCAB_SIZE
+
+ label_idx = tl.load(labels_ptr).to(tl.int32)
+ logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
+
+ # Do logit scaling for Cohere: t * x
+ if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
+ if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
+
+ c = tl.max(logits, 0)
+ logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
+
+ if label_idx != -100:
+ x = tl.load(logits_ptr + label_idx).to(tl.float32)
+ # Do logit scaling for Cohere: t * x
+ if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
+ if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
+ loss = logsumexp - x
+ else:
+ loss = 0.0
+ tl.store(logsumexp_ptr, logsumexp)
+ tl.store(loss_ptr, loss)
+pass
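+
+# Quick numeric check of the stable logsumexp identity from the docstring above
+# (a plain-PyTorch sketch, not part of the kernel):
+#
+#   import torch
+#   x = torch.randn(1000) * 50   # large magnitudes would overflow a naive exp-sum
+#   c = x.max()
+#   assert torch.allclose(c + torch.log(torch.exp(x - c).sum()), torch.logsumexp(x, 0))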
+
+
+@triton.heuristics({
+ "DO_SOFTCAPPING": lambda args: bool(args["DO_SOFTCAPPING" ]),
+ "DO_LOGIT_SCALING": lambda args: bool(args["DO_LOGIT_SCALING"]),
+})
+@triton.jit
+def _chunked_cross_entropy_forward(
+ logits_ptr ,
+ logits_row_stride ,
+ loss_ptr ,
+ logsumexp_ptr ,
+ labels_ptr ,
+ VOCAB_SIZE ,
+ N_CHUNKS ,
+ BLOCK_SIZE : tl.constexpr,
+ DO_SOFTCAPPING ,
+ SOFTCAP ,
+ DO_LOGIT_SCALING ,
+ LOGIT_SCALE ,
+):
+ """
+ 256K vocab divided into 4 chunks
+
+ |-65536-| |-65536-| |-65536-| |-65536-|
+ |-------| |-------| |-------| |-------|
+ |-------| |-------| |-------| |-------|
+
+ If y == 0: CE_i = 0
+ If y == 1: CE_i = logsumexp - x
+
+ Notice we can do logsumexp for each chunk and then
+ logsumexp[chunk_sum(logsumexp)] == logsumexp
+
+ chunk_sum = log[chunk_sum(logsumexp)]
+ = log[exp(logsumexp(a)) + ... + exp(logsumexp(z))]
+ = log[exp(log[sum(exp(a))]) + ... + exp(log[sum(exp(z))])]
+ = log[sum(exp(a)) + ... + sum(exp(z))]
+ = logsumexp(x)
+
+ This means we can perform a logsumexp for each chunk, then do a
+ final logsumexp reduction!
+
+ i.e. do: logsumexp(chunked_logsumexp) - x
+ """
+ row_idx = tl.program_id(0)
+ chunk_idx = tl.program_id(1)
+ logits_ptr += row_idx * triton_cast(logits_row_stride, tl.int64)
+ loss_ptr += row_idx
+ logsumexp_ptr += row_idx * N_CHUNKS + chunk_idx
+ labels_ptr += row_idx
+
+ col_offsets = chunk_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < VOCAB_SIZE
+
+ label_idx = tl.load(labels_ptr).to(tl.int32)
+ logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
+
+ # Do logit scaling for Cohere: t * x
+ if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
+ if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
+
+ c = tl.max(logits, 0)
+ logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
+
+ if chunk_idx == 0:
+ # logsumexp(chunked_logsumexp) - x
+ # Do the -x separately
+ if label_idx != -100:
+ x = tl.load(logits_ptr + label_idx).to(tl.float32)
+ # Do logit scaling for Cohere: t * x
+ if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
+ if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
+ loss = -1.0 * x
+ else:
+ loss = 0.0
+ tl.store(loss_ptr, loss)
+ pass
+ tl.store(logsumexp_ptr, logsumexp)
+pass
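+
+# Plain-PyTorch sanity check of the chunking identity used above (a sketch, not
+# part of the kernel): the logsumexp of chunk-wise logsumexps equals the full one.
+#
+#   import torch
+#   x = torch.randn(256_000)
+#   chunk_lse = torch.stack([torch.logsumexp(c, 0) for c in x.split(65536)])
+#   assert torch.allclose(torch.logsumexp(chunk_lse, 0), torch.logsumexp(x, 0))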
+
+
+@triton.heuristics({
+ "DO_SOFTCAPPING": lambda args: bool(args["DO_SOFTCAPPING" ]),
+ "DO_LOGIT_SCALING": lambda args: bool(args["DO_LOGIT_SCALING"]),
+})
+@triton.jit
+def _cross_entropy_backward(
+ logits_ptr ,
+ logits_row_stride ,
+ dloss_ptr ,
+ dloss_row_stride ,
+ logsumexp_ptr ,
+ labels_ptr ,
+ VOCAB_SIZE ,
+ BLOCK_SIZE : tl.constexpr,
+ DO_SOFTCAPPING ,
+ SOFTCAP ,
+ DO_LOGIT_SCALING ,
+ LOGIT_SCALE ,
+):
+ """
+ CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
+ dC/dx = d/dx (y * log[sum(exp(x))] - x * y)
+
+ From https://en.wikipedia.org/wiki/LogSumExp
+ d/dx logsumexp = exp(x) / sum(exp(x)) = softmax(x)
+
+ dC/dx = y * exp(x) / sum(exp(x)) - d/dx (x * y)
+ dC/dx = y * exp[ log[exp(x) / sum(exp(x))] ] using x = exp(log(x)) trick
+ dC/dx = y * exp[x - logsumexp] - d/dx (x * y)
+
+ If y == 0: dC/dx = 0
+ If y == 1 and x == label: dC/dlabel = exp[x - logsumexp] - 1
+ If y == 1 and x != label: dC/dx = exp[x - logsumexp]
+ """
+ row_idx = tl.program_id(0)
+ block_idx = tl.program_id(1)
+
+ logits_ptr += row_idx * triton_cast(logits_row_stride, tl.int64)
+ dloss_ptr += row_idx * dloss_row_stride
+ col_offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < VOCAB_SIZE
+ label_idx = tl.load(labels_ptr + row_idx).to(tl.int32)
+
+ if label_idx != -100:
+ dloss = tl.load(dloss_ptr)
+ else:
+ dloss = 0.0
+
+ x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
+
+ # Do logit scaling for Cohere
+ if DO_LOGIT_SCALING:
+ # d/dx [s * x] = s
+ x = x * LOGIT_SCALE
+ pass
+
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
+ partial = x
+ if DO_SOFTCAPPING:
+ # d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
+ partial = triton_tanh(x / SOFTCAP)
+ x = SOFTCAP * partial
+ pass
+
+ logsumexp = tl.load(logsumexp_ptr + row_idx)
+ y = tl.exp(x - logsumexp)
+ y = tl.where(
+ col_offsets == label_idx,
+ y - 1.0, # exp(x - logsumexp) - 1
+ y, # exp(x - logsumexp)
+ )
+
+ if DO_LOGIT_SCALING:
+ # d/dx [s * x] = s
+ y = y * LOGIT_SCALE
+ pass
+
+ if DO_SOFTCAPPING:
+ # d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
+ y = y * (1.0 - partial*partial)
+ pass
+
+ # If y == 0: dC/dx = 0 ==> we already masked it to be = 0, so dloss = 0.
+ tl.store(logits_ptr + col_offsets, dloss * y, mask = mask)
+pass
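+
+# The gradient identity used above, checked in plain PyTorch (sketch only):
+# d/dx CE(x, y) = softmax(x) - onehot(y).
+#
+#   import torch
+#   x = torch.randn(8, requires_grad = True); y = torch.tensor(3)
+#   torch.nn.functional.cross_entropy(x[None], y[None]).backward()
+#   ref = torch.softmax(x.detach(), 0); ref[y] -= 1.0
+#   assert torch.allclose(x.grad, ref)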
+
+
+MAX_FUSED_SIZE = 65536 # 2**16
+
+class Fast_CrossEntropyLoss(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, logits, labels, logit_softcapping : float = 0, logit_scaling : float = 0):
+ n_rows : int
+ vocab_size : int
+ n_rows, vocab_size = logits.shape
+
+ div, mod = divmod(vocab_size, MAX_FUSED_SIZE)
+ n_chunks : int = div + (mod != 0)
+ losses = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
+
+ DO_SOFTCAPPING : bool = bool(logit_softcapping != 0)
+ DO_LOGIT_SCALING : bool = bool(logit_scaling != 0)
+
+ BLOCK_SIZE : int
+ num_warps : int
+ if n_chunks == 1:
+ # For small vocabs <= 65536 like Llama, Mistral
+ BLOCK_SIZE, num_warps = calculate_settings(vocab_size)
+ logsumexp = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
+
+ _cross_entropy_forward[(n_rows,)](
+ logits, logits.stride(0),
+ losses,
+ logsumexp,
+ labels,
+ VOCAB_SIZE = vocab_size,
+ BLOCK_SIZE = BLOCK_SIZE,
+ DO_SOFTCAPPING = DO_SOFTCAPPING,
+ SOFTCAP = logit_softcapping,
+ DO_LOGIT_SCALING = DO_LOGIT_SCALING,
+ LOGIT_SCALE = logit_scaling,
+ num_warps = num_warps,
+ )
+ else:
+ # For large vocabs > 65536 like Gemma 256K
+ logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = "cuda:0")
+
+ _chunked_cross_entropy_forward[(n_rows, n_chunks,)](
+ logits, logits.stride(0),
+ losses,
+ logsumexp,
+ labels,
+ VOCAB_SIZE = vocab_size,
+ N_CHUNKS = n_chunks,
+ BLOCK_SIZE = MAX_FUSED_SIZE,
+ DO_SOFTCAPPING = DO_SOFTCAPPING,
+ SOFTCAP = logit_softcapping,
+ DO_LOGIT_SCALING = DO_LOGIT_SCALING,
+ LOGIT_SCALE = logit_scaling,
+ num_warps = 32,
+ )
+ # logsumexp(chunked_logsumexp) - x
+ # Do the -x separately
+ logsumexp = torch.logsumexp(logsumexp, dim = 1) # Row sum
+ losses += logsumexp
+ losses.masked_fill_(labels == -100, 0) # Don't forget to mask padding out!
+ pass
+
+ ctx.save_for_backward(logits, logsumexp, labels)
+ ctx.DO_SOFTCAPPING = DO_SOFTCAPPING
+ ctx.logit_softcapping = logit_softcapping
+ ctx.DO_LOGIT_SCALING = DO_LOGIT_SCALING
+ ctx.logit_scaling = logit_scaling
+ return losses
+ pass
+
+
+ @staticmethod
+ def backward(ctx, dlosses):
+ logits, logsumexp, labels = ctx.saved_tensors
+ n_rows : int
+ vocab_size : int
+ n_rows, vocab_size = logits.shape
+
+ BLOCK_SIZE : int = 4096
+ div : int
+ mod : int
+ div, mod = divmod(vocab_size, BLOCK_SIZE)
+ n_blocks : int = div + (mod != 0)
+
+ _cross_entropy_backward[(n_rows, n_blocks,)](
+ logits, logits.stride(0),
+ dlosses, dlosses.stride(0),
+ logsumexp,
+ labels,
+ VOCAB_SIZE = vocab_size,
+ BLOCK_SIZE = BLOCK_SIZE,
+ DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
+ SOFTCAP = ctx.logit_softcapping,
+ DO_LOGIT_SCALING = ctx.DO_LOGIT_SCALING,
+ LOGIT_SCALE = ctx.logit_scaling,
+ num_warps = 8,
+ )
+ return logits, None, None, None,
+ pass
+pass
+
+
+def fast_cross_entropy_loss(
+ logits,
+ labels,
+ logit_softcapping = 0,
+ logit_scaling = 0,
+ n_items = None,
+):
+ """
+ Arguments:
+ logits: (batch, seq_len, vocab_size)
+ labels: (batch, seq_len,)
+ Returns:
+ losses: float
+ """
+ batch, seq_len, d = logits.shape
+ assert(labels.shape == (batch, seq_len))
+
+ loss = Fast_CrossEntropyLoss.apply(
+ logits.view(batch*seq_len, d),
+ labels.view(-1),
+ logit_softcapping,
+ logit_scaling,
+ )
+ if n_items is None:
+ n_items = torch.count_nonzero(labels != -100)
+ return loss.sum() / n_items
+pass
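+
+# Reference sketch (not executed here): with no softcapping/scaling, the fused
+# loss should match PyTorch's cross entropy with sum reduction divided by the
+# number of non-masked labels. Shapes and vocab size are arbitrary placeholders.
+#
+#   import torch.nn.functional as F
+#   logits = torch.randn(2, 128, 32000, device = "cuda")
+#   labels = torch.randint(0, 32000, (2, 128), device = "cuda")
+#   ref = F.cross_entropy(logits.view(-1, 32000), labels.view(-1),
+#                         ignore_index = -100, reduction = "sum") / (labels != -100).sum()
+#   # fast_cross_entropy_loss(logits, labels) is expected to be close to `ref`
+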
+if (Version(torch.__version__) < Version("2.4.0")) and \
+ not hasattr(fast_cross_entropy_loss, "__wrapped__"):
+ fast_cross_entropy_loss = torch._disable_dynamo(fast_cross_entropy_loss)
+pass
+
+# Patch CE Losses in transformers
+def patch_loss_functions(torch_compile = True):
+ _patch_loss_functions(fast_cross_entropy_loss, torch_compile = torch_compile)
+pass
diff --git a/unsloth-main/unsloth/kernels/fast_lora.py b/unsloth-main/unsloth/kernels/fast_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2b7929a29a00916524d3b6a79c3d98d807269a8
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/fast_lora.py
@@ -0,0 +1,490 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from .utils import (
+ fast_dequantize,
+ QUANT_STATE,
+ get_lora_parameters,
+ get_lora_parameters_bias,
+ matmul_lora,
+ torch_amp_custom_fwd,
+ torch_amp_custom_bwd,
+)
+
+
+class LoRA_MLP(torch.autograd.Function):
+ """
+ ### LoRA weights
+ G = G + Ag @ Bg
+ U = U + Au @ Bu
+ W = W + Aw @ Bw
+
+ ### SwiGLU(X)
+ e = X @ G
+ f = e * sigmoid(e)
+ g = X @ U
+ h = f * g
+ i = h @ W
+
+ ### Backpropagation chain rule
+ See our blog post for more details
+
+ df = sigmoid(e) * (1 - f) + f
+ dC/dW = h.T @ dY
+ dC/dU = X.T @ (D @ W.T * f)
+ dC/dG = X.T @ (D @ W.T * df * g)
+
+ ### Down projection LoRA weights
+ dC/dAw = dC/dW @ B.T
+ dC/dBw = A.T @ dC/dW
+ dC/dAw = h.T @ dY @ B.T
+ dC/dBw = A.T @ h.T @ dY
+
+ ### Up projection LoRA weights
+ dC/dAu = X.T @ (D @ W.T * f) @ B.T
+ dC/dBu = A.T @ X.T @ (D @ W.T * f)
+
+ ### Gate projection LoRA weights
+ dC/dAg = X.T @ (D @ W.T * df * g) @ B.T
+ dC/dBg = A.T @ X.T @ (D @ W.T * df * g)
+
+ Don't forget to see our blog post for more details!
+ """
+ @staticmethod
+ @torch_amp_custom_fwd
+ def forward(ctx, X : torch.Tensor,
+ gateW, gateW_quant, gateA, gateB, gateS,
+ upW, upW_quant, upA, upB, upS,
+ downW, downW_quant, downA, downB, downS,
+ _forward_function, _backward_function,
+ inplace = True,):
+ dtype = X.dtype
+
+ e = matmul_lora(X, gateW, gateW_quant, gateA, gateB, gateS)
+ g = matmul_lora(X, upW, upW_quant, upA, upB, upS)
+ h = _forward_function(e, g)
+ i = matmul_lora(h, downW, downW_quant, downA, downB, downS)
+
+ ctx.custom_saved_tensors = (
+ gateW, gateW_quant, gateS,
+ upW, upW_quant, upS,
+ downW, downW_quant, downS,
+ _backward_function,
+ )
+ ctx.save_for_backward(gateA, gateB, upA, upB, downA, downB,
+ X, e, g)
+ ctx.inplace = inplace
+ return i
+ pass
+
+
+ @staticmethod
+ @torch_amp_custom_bwd
+ def backward(ctx, dY : torch.Tensor):
+ gateW, gateW_quant, gateS, upW, upW_quant, upS, downW, downW_quant, downS, \
+ _backward_function = ctx.custom_saved_tensors
+ gateA, gateB, upA, upB, downA, downB, \
+ X, e, g = ctx.saved_tensors
+
+ gateA, gateB, upA, upB, downA, downB = \
+ gateA.t(), gateB.t(), upA.t(), upB.t(), downA.t(), downB.t()
+
+ batch, seq_len, hd = X.shape
+ dY = dY.view(-1, dY.shape[-1])
+ X = X .view(-1, X .shape[-1])
+ e = e .view(-1, e .shape[-1])
+ g = g .view(-1, g .shape[-1])
+ dtype = X.dtype
+
+ DW = matmul_lora(dY, downW.t(), downW_quant, downB, downA, downS)
+ DW, e, g = _backward_function(DW, e, g)
+ h, df, de = DW, e, g
+
+ # Down projection LoRA weights
+ d_downA = h.t() @ (dY @ downB.t())
+ d_downB = (downA.t() @ h.t()) @ dY
+ d_downA *= downS
+ d_downB *= downS
+
+ # Up projection LoRA weights
+ d_upA = X.t() @ (df @ upB.t())
+ d_upB = (upA.t() @ X.t()) @ df
+ d_upA *= upS
+ d_upB *= upS
+
+ # Gate projection LoRA weights
+ d_gateA = X.t() @ (de @ gateB.t())
+ d_gateB = (gateA.t() @ X.t()) @ de
+ d_gateA *= gateS
+ d_gateB *= gateS
+
+ # dX = matmul_lora(df, upW.t(), upW_quant, upB, upA, upS)
+ # dX += matmul_lora(de, gateW.t(), gateW_quant, gateB, gateA, gateS)
+ upW = fast_dequantize(upW.t(), upW_quant)
+ dX = torch.matmul(df, upW.t(), out = X if ctx.inplace else None)
+ del upW
+ dX += df @ upB.to(dtype).t() @ (upS * upA.to(dtype).t())
+
+ gateW = fast_dequantize(gateW.t(), gateW_quant)
+ dX += de @ gateW.t()
+ del gateW
+ dX += de @ gateB.to(dtype).t() @ (gateS * gateA.to(dtype).t())
+
+ # gateW, gateW_quant, gateA, gateB, gateS,
+ # upW, upW_quant, upA, upB, upS,
+ # downW, downW_quant, downA, downB, downS,
+ return dX.view(batch, seq_len, hd), \
+ None, None, d_gateA.t(), d_gateB.t(), None, \
+ None, None, d_upA.t(), d_upB.t(), None, \
+ None, None, d_downA.t(), d_downB.t(), None, \
+ None, None, None, # _backward and _forward and inplace
+ pass
+pass
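+
+
+# A minimal eager sketch of the MLP that LoRA_MLP fuses (illustrative only, assuming
+# plain unquantized matrices G, U, W shaped as in the docstring above; the autograd
+# Function additionally handles NF4 dequantization, the LoRA adapters and scaling).
+def _reference_swiglu_mlp(X, G, U, W):
+    e = X @ G                 # gate projection
+    f = e * torch.sigmoid(e)  # SwiGLU: f = e * sigmoid(e)
+    g = X @ U                 # up projection
+    h = f * g
+    return h @ W              # down projection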
+
+
+from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
+def apply_lora_mlp_swiglu(self, X, inplace = True):
+ gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
+ upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
+ downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
+ out = LoRA_MLP.apply(X,
+ gateW, gateW_quant, gateA, gateB, gateS,
+ upW, upW_quant, upA, upB, upS,
+ downW, downW_quant, downA, downB, downS,
+ swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel,
+ inplace,)
+ return out
+pass
+
+
+from .geglu import geglu_exact_forward_kernel, geglu_exact_backward_kernel
+def apply_lora_mlp_geglu_exact(self, X, inplace = True):
+ gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
+ upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
+ downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
+ out = LoRA_MLP.apply(X,
+ gateW, gateW_quant, gateA, gateB, gateS,
+ upW, upW_quant, upA, upB, upS,
+ downW, downW_quant, downA, downB, downS,
+ geglu_exact_forward_kernel, geglu_exact_backward_kernel,
+ inplace,)
+ return out
+pass
+
+
+from .geglu import geglu_approx_forward_kernel, geglu_approx_backward_kernel
+def apply_lora_mlp_geglu_approx(self, X):
+ gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
+ upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
+ downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
+ out = LoRA_MLP.apply(X,
+ gateW, gateW_quant, gateA, gateB, gateS,
+ upW, upW_quant, upA, upB, upS,
+ downW, downW_quant, downA, downB, downS,
+ geglu_approx_forward_kernel, geglu_approx_backward_kernel,)
+ return out
+pass
+
+
+class LoRA_QKV(torch.autograd.Function):
+ """
+ ### LoRA weights
+ Wq = Wq + Aq @ Bq
+ Wk = Wk + Ak @ Bk
+ Wv = Wv + Av @ Bv
+ Q = X @ Wq = X @ Wq + X @ Aq @ Bq
+ K = X @ Wk = X @ Wk + X @ Ak @ Bk
+ V = X @ Wv = X @ Wv + X @ Av @ Bv
+
+ ### Backpropagation chain rule
+ See our blogpost for more details.
+
+ dC/dWq = X.T @ D(Wq)
+ dC/dWk = X.T @ D(Wk)
+ dC/dWv = X.T @ D(Wv)
+    We then sum them all to find dC/dX
+
+ ### Q projection LoRA weights
+ dC/dAq = X.T @ D(Wq) @ B.T
+ dC/dBq = A.T @ X.T @ D(Wq)
+
+ ### K projection LoRA weights
+ dC/dAk = X.T @ D(Wk) @ B.T
+ dC/dBk = A.T @ X.T @ D(Wk)
+
+ ### V projection LoRA weights
+ dC/dAv = X.T @ D(Wv) @ B.T
+ dC/dBv = A.T @ X.T @ D(Wv)
+ """
+ @staticmethod
+ @torch_amp_custom_fwd
+ def forward(ctx, X : torch.Tensor,
+ QW, QW_quant, QA, QB, QS,
+ KW, KW_quant, KA, KB, KS,
+ VW, VW_quant, VA, VB, VS,
+ inplace = True):
+ dtype = X.dtype
+
+ Q = matmul_lora(X, QW, QW_quant, QA, QB, QS)
+ K = matmul_lora(X, KW, KW_quant, KA, KB, KS)
+ V = matmul_lora(X, VW, VW_quant, VA, VB, VS)
+
+ ctx.custom_saved_tensors = (
+ QW, QW_quant, QS,
+ KW, KW_quant, KS,
+ VW, VW_quant, VS,
+ )
+ ctx.save_for_backward(X, QA, QB, KA, KB, VA, VB,)
+ ctx.inplace = inplace
+ return Q, K, V
+ pass
+
+ @staticmethod
+ @torch_amp_custom_bwd
+ def backward(ctx, dQ, dK, dV):
+ QW, QW_quant, QS, KW, KW_quant, KS, VW, VW_quant, VS = \
+ ctx.custom_saved_tensors
+ X, QA, QB, KA, KB, VA, VB, = ctx.saved_tensors
+
+ QA, QB, KA, KB, VA, VB = \
+ QA.t(), QB.t(), KA.t(), KB.t(), VA.t(), VB.t()
+
+ batch, seq_len, hd = X.shape
+ dQ = dQ.view(-1, dQ.shape[-1])
+ dK = dK.reshape(-1, dK.shape[-1]) # view doesn't work on K.T
+ dV = dV.view(-1, dV.shape[-1])
+ X = X .view(-1, X .shape[-1])
+ dtype = X.dtype
+
+ ### Weight projection LoRA weights
+ # See our blogpost for more details.
+
+ # Q Projection
+ d_QA = X.t() @ (dQ @ QB.t())
+ d_QB = (QA.t() @ X.t()) @ dQ
+ d_QA *= QS
+ d_QB *= QS
+
+ # K Projection
+ d_KA = X.t() @ (dK @ KB.t())
+ d_KB = (KA.t() @ X.t()) @ dK
+ d_KA *= KS
+ d_KB *= KS
+
+ # V Projection
+ d_VA = X.t() @ (dV @ VB.t())
+ d_VB = (VA.t() @ X.t()) @ dV
+ d_VA *= VS
+ d_VB *= VS
+
+ # Combine derivatives to find dX
+ # dQ
+ QW = fast_dequantize(QW.t(), QW_quant)
+ dX = torch.matmul(dQ, QW.t(), out = X if ctx.inplace else None)
+ del QW
+ dX += (dQ @ QB.to(dtype).t() @ (QS * QA.to(dtype).t()))
+
+ # dK
+ KW = fast_dequantize(KW.t(), KW_quant)
+ dX += dK @ KW.t()
+ del KW
+ dX += dK @ KB.to(dtype).t() @ (KS * KA.to(dtype).t())
+
+ # dV
+ VW = fast_dequantize(VW.t(), VW_quant)
+ dX += dV @ VW.t()
+ del VW
+ dX += dV @ VB.to(dtype).t() @ (VS * VA.to(dtype).t())
+
+ # QW, QW_quant, QA, QB, QS,
+ # KW, KW_quant, KA, KB, KS,
+ # VW, VW_quant, VA, VB, VS,
+ return dX.view(batch, seq_len, hd), \
+ None, None, d_QA.t(), d_QB.t(), None, \
+ None, None, d_KA.t(), d_KB.t(), None, \
+ None, None, d_VA.t(), d_VB.t(), None, \
+ None,
+ pass
+pass
+
+
+def apply_lora_qkv(self, X, inplace = True):
+ QW, QW_quant, QA, QB, QS = get_lora_parameters(self.q_proj)
+ KW, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj)
+ VW, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj)
+ Q, K, V = LoRA_QKV.apply(X,
+ QW, QW_quant, QA, QB, QS,
+ KW, KW_quant, KA, KB, KS,
+ VW, VW_quant, VA, VB, VS,
+ inplace,
+ )
+ return Q, K, V
+pass
+
+
+class LoRA_W(torch.autograd.Function):
+ """
+    ### LoRA weights
+    W = W + A @ B
+    Y = X @ W = X @ W + X @ A @ B
+
+    ### Backpropagation chain rule
+    dC/dW = X.T @ dY
+
+    ### Adapter LoRA weights
+    dC/dA = X.T @ dY @ B.T
+    dC/dB = A.T @ X.T @ dY
+
+    An eager reference sketch follows apply_lora_o below.
+ """
+ @staticmethod
+ @torch_amp_custom_fwd
+ def forward(ctx, X : torch.Tensor,
+ W, W_quant, A, B, S):
+ dtype = X.dtype
+ XW = matmul_lora(X, W, W_quant, A, B, S)
+ ctx.custom_saved_tensors = (W, W_quant, S,)
+ ctx.save_for_backward(A, B, X)
+ return XW
+ pass
+
+ @staticmethod
+ @torch_amp_custom_bwd
+ def backward(ctx, dY : torch.Tensor):
+ W, W_quant, S = ctx.custom_saved_tensors
+ A, B, X = ctx.saved_tensors
+
+ A, B = A.t(), B.t()
+
+ batch, seq_len, hd = X.shape
+ dY = dY.reshape(-1, dY.shape[-1]) # Must be reshape
+ X = X .reshape(-1, X .shape[-1]) # Must be reshape
+ dtype = X.dtype
+
+ ### Weight projection LoRA weights
+ # Weight projection
+ d_A = X.t() @ (dY @ B.t())
+ d_B = (A.t() @ X.t()) @ dY
+ d_A *= S
+ d_B *= S
+
+ # Get derivative for dX
+ W = fast_dequantize(W.t(), W_quant)
+ dX = dY @ W.t()
+ del W
+ dX += dY @ B.to(dtype).t() @ (S * A.to(dtype).t())
+
+ # W, W_quant, A, B, S
+ return dX.view(batch, seq_len, hd), \
+ None, None, d_A.t(), d_B.t(), None
+ pass
+pass
+
+
+def apply_lora_o(self, X):
+ OW, OW_quant, OA, OB, OS = get_lora_parameters(self.o_proj)
+ O = LoRA_W.apply(X, OW, OW_quant, OA, OB, OS)
+ return O
+pass
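+
+
+# Illustrative eager sketch of what LoRA_W computes (not used by the kernels), written
+# in the docstring's notation: Y = X @ W + (X @ A @ B) * S. Here W, A, B are assumed
+# to be dense matrices already laid out as (in, out), (in, r) and (r, out); the real
+# path instead goes through matmul_lora, which also handles the 4-bit quantized case.
+def _reference_lora_linear(X, W, A, B, S):
+    return X @ W + (X @ A @ B) * S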
+
+
+IDENTITY_DROPOUT = torch.nn.Identity
+@torch._disable_dynamo
+def fast_lora_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+ raise NotImplementedError(
+ "Unsloth: Currently not supported yet - reshaping done incorrectly"
+ )
+ self._check_forward_args(x, *args, **kwargs)
+ adapter_names = kwargs.pop("adapter_names", None)
+
+ if self.disable_adapters:
+ if self.merged:
+ self.unmerge()
+ result = self.base_layer(x, *args, **kwargs)
+ elif adapter_names is not None:
+ result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
+ elif self.merged:
+ result = self.base_layer(x, *args, **kwargs)
+ else:
+ # Fastpath
+ if len(self.active_adapters) == 1:
+ active_adapter = self.active_adapters[0]
+ if active_adapter not in self.lora_A.keys(): return self.base_layer(x, *args, **kwargs)
+
+ dropout = self.lora_dropout[active_adapter]
+ if isinstance(dropout, IDENTITY_DROPOUT) and not self.use_dora[active_adapter]:
+ lora_A = self.lora_A[active_adapter].weight
+ lora_B = self.lora_B[active_adapter].weight
+ scaling = self.scaling[active_adapter]
+ W = self.base_layer.weight
+ return LoRA_W.apply(x, W, QUANT_STATE(W), lora_A, lora_B, scaling)
+ pass
+ pass
+
+ result = self.base_layer(x, *args, **kwargs)
+ # As per Tim Dettmers, for 4bit, we need to defensively clone here.
+ # The reason is that in some cases, an error can occur that backprop
+ # does not work on a manipulated view. This issue may be solved with
+ # newer PyTorch versions but this would need extensive testing to be
+ # sure.
+ result = result.clone()
+
+ for active_adapter in self.active_adapters:
+ if active_adapter not in self.lora_A.keys():
+ continue
+ lora_A = self.lora_A[active_adapter]
+ lora_B = self.lora_B[active_adapter]
+ dropout = self.lora_dropout[active_adapter]
+ scaling = self.scaling[active_adapter]
+
+ requires_conversion = not torch.is_autocast_enabled()
+ if requires_conversion:
+ expected_dtype = result.dtype
+ x = x.to(lora_A.weight.dtype)
+
+ if not self.use_dora[active_adapter]:
+ result = result + lora_B(lora_A(dropout(x))) * scaling
+ else:
+ if isinstance(dropout, torch.nn.Identity) or not self.training:
+ base_result = result
+ else:
+ x = dropout(x)
+ base_result = None
+
+ result = result + self.lora_magnitude_vector[active_adapter](
+ x,
+ lora_A=lora_A,
+ lora_B=lora_B,
+ scaling=scaling,
+ base_layer=self.get_base_layer(),
+ base_result=base_result,
+ )
+ if requires_conversion:
+ result = result.to(expected_dtype)
+
+ return result
+pass
diff --git a/unsloth-main/unsloth/kernels/flex_attention.py b/unsloth-main/unsloth/kernels/flex_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..887ffca1b71b2f5646101fff6200d088f9fe1d9e
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/flex_attention.py
@@ -0,0 +1,181 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from functools import lru_cache
+from transformers.models.llama.modeling_llama import logger
+import os
+
+torch_compile_options = {
+ "epilogue_fusion" : True,
+ "max_autotune" : True,
+ "shape_padding" : True,
+ "trace.enabled" : os.environ.get("UNSLOTH_COMPILE_DEBUG", "0") == "1",
+ "triton.cudagraphs" : False,
+}
+
+# Flex Attention supported from torch 2.5 onwards only
+try:
+ from torch.nn.attention.flex_attention import (
+ flex_attention as _flex_attention,
+ create_block_mask as _create_block_mask,
+ )
+ _flex_attention = torch.compile(_flex_attention, dynamic = True, options = torch_compile_options)
+    # Note: left as False even when the torch 2.5+ import above succeeds, so the
+    # flex attention path below is currently never taken
+    HAS_FLEX_ATTENTION = False
+except:
+ HAS_FLEX_ATTENTION = False
+pass
+
+
+if not HAS_FLEX_ATTENTION:
+
+ # Logit softcapping
+ @torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
+ def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
+ n_heads = self.num_heads
+ head_dim = self.head_dim
+ n_kv_heads = self.num_key_value_heads
+ n_groups = self.num_key_value_groups
+
+ # Grouped query attention
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
+ K = K.reshape(bsz, n_heads, q_len, head_dim)
+ V = V.reshape(bsz, n_heads, q_len, head_dim)
+
+ # See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
+        # Gemma 9b should use 256 and not 224 (hidden_size / num_attention_heads). 27b uses the below
+ # We default to using the config file itself
+ # s = self.config.hidden_size // self.config.num_attention_heads
+ s = self.config.query_pre_attn_scalar
+ t = self.config.attn_logit_softcapping
+
+ Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
+ A = torch.matmul(Q, K.transpose(2, 3))
+ A = t * torch.tanh(A / t) # Logit softcapping
+ A += causal_mask[:q_len, :q_len]
+ # Much slower in torch compile!
+ # A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
+ A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
+ A = torch.matmul(A, V)
+ A = A.transpose(1, 2).contiguous()
+ A = A.reshape(bsz, q_len, n_heads*head_dim)
+ return A
+ pass
+
+ create_flex_attention_causal_mask = None
+ create_flex_attention_sliding_window_mask = None
+else:
+ # See https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
+ # for more examples
+ # BSD 3-Clause License Copyright (c) 2023, Driss Guessous, Horace He et al
+ import functools, math
+
+ def generate_tanh_softcap(t):
+ def tanh_softcap(x, b, h, q_idx, kv_idx):
+ return t * torch.tanh(x / t)
+ return tanh_softcap
+ pass
+ def causal_masker(b, h, q_idx, kv_idx):
+ return q_idx >= kv_idx
+ pass
+
+ @functools.lru_cache
+ def sliding_window_masker(size = 4096):
+ def sliding_window(b, h, q_idx, kv_idx):
+ causal_mask = q_idx >= kv_idx
+ window_mask = q_idx - kv_idx <= size
+ return causal_mask & window_mask
+ return sliding_window
+ pass
+
+ @functools.lru_cache
+ def create_block_mask(mask, n = 128):
+ return _create_block_mask(
+ mask, 1, 1, n, n,
+ BLOCK_SIZE = 128,
+ _compile = True,
+ )
+ pass
+
+ def create_flex_attention_causal_mask(max_seq_length = 8192):
+ causal_mask = create_block_mask(causal_masker, max_seq_length)
+ return causal_mask
+ pass
+
+ def create_flex_attention_sliding_window_mask(max_seq_length = 8192, sliding_window = 4096):
+ sliding_masker = sliding_window_masker(sliding_window)
+ causal_mask = create_block_mask(sliding_masker, max_seq_length)
+ return causal_mask
+ pass
+
+ @functools.lru_cache
+ def flex_attention(s, t):
+ scale = 1.0 / math.sqrt(s)
+ score_mod = generate_tanh_softcap(t)
+ return functools.partial(
+ _flex_attention, score_mod = score_mod, scale = scale, enable_gqa = True,
+ )
+ pass
+
+ def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
+ n_heads = self.num_heads
+ head_dim = self.head_dim
+ s = self.config.query_pre_attn_scalar
+ t = self.config.attn_logit_softcapping
+ fx = flex_attention(s, t)
+ A = fx(query = Q, key = K, value = V, block_mask = causal_mask)
+ A = A.transpose(1, 2).contiguous()
+ A = A.reshape(bsz, q_len, n_heads*head_dim)
+ return A
+ pass
+pass
+
+
+torch_matmul = torch.matmul
+torch_tanh = torch.tanh
+torch_nn_functional_softmax = torch.nn.functional.softmax
+def slow_inference_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
+ n_heads = self.num_heads
+ head_dim = self.head_dim
+ n_kv_heads = self.num_key_value_heads
+ n_groups = self.num_key_value_groups
+
+ # Grouped query attention
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
+ K = K.reshape(bsz, n_heads, q_len, head_dim)
+ V = V.reshape(bsz, n_heads, q_len, head_dim)
+
+ # See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
+    # Gemma 9b should use 256 and not 224 (hidden_size / num_attention_heads). 27b uses the below
+ # We default to using the config file itself
+ # s = self.config.hidden_size // self.config.num_attention_heads
+ s = self.config.query_pre_attn_scalar
+ t = self.config.attn_logit_softcapping
+
+ Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
+ A = torch_matmul(Q, K.transpose(2, 3))
+
+ # Logit softcapping
+ A /= t; torch_tanh(A, out = A); A *= t;
+ A += causal_mask[:q_len, :q_len]
+ # Much slower in torch compile!
+ # A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
+ A = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
+ A = torch_matmul(A, V)
+ A = A.transpose(1, 2).contiguous()
+ A = A.reshape(bsz, q_len, n_heads*head_dim)
+ return A
+pass
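+
+
+# Illustrative sketch (not part of the original file): logit softcapping as used above,
+# A = t * tanh(A / t), squashes attention scores smoothly into (-t, t) while staying
+# close to the identity for |A| << t.
+def _example_logit_softcapping(t = 50.0):
+    scores = torch.tensor([-500.0, -10.0, 0.0, 10.0, 500.0])
+    capped = t * torch.tanh(scores / t)
+    return capped  # roughly [-50, -9.87, 0, 9.87, 50] for t = 50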
diff --git a/unsloth-main/unsloth/kernels/geglu.py b/unsloth-main/unsloth/kernels/geglu.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fedae769e517eb8e5f1006c5be1dbfff85d183c
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/geglu.py
@@ -0,0 +1,203 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton
+import triton.language as tl
+import torch
+from .utils import calculate_settings, triton_tanh
+
+
+@triton.jit
+def _exact_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
+ block_idx = tl.program_id(0)
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = offsets < n_elements
+
+ # f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
+ # h = f * up
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
+
+ f_row = 0.5 * e_row * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
+ f_row = f_row.to(g_row.dtype) # Exact copy from HF
+ h_row = f_row * g_row
+
+ # Store h
+ tl.store(h + offsets, h_row, mask = mask)
+pass
+
+
+def geglu_exact_forward_kernel(gate, up):
+ batch, seq_len, hd = gate.shape
+ n_elements = gate.numel()
+ out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+ _exact_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
+ return out
+pass
+
+
+@triton.jit
+def _exact_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
+ """
+ f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
+ h = f * up
+
+ df/de (with help of Wolfram :)
+ df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
+
+ Reuse via
+ f = 1/2 * (1 + erf(1/sqrt(2) * e)) * e
+ """
+ block_idx = tl.program_id(0)
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = offsets < n_elements
+
+ DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
+
+ # Break e_row away for re-use
+ # f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
+ f_partial_row = 0.5 * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
+ f_row = f_partial_row * e_row
+
+ f_row = f_row.to(DW_row.dtype)
+ # h = f * g
+ h_row = f_row * g_row
+ # df = DW * f
+ df_row = DW_row * f_row
+ # dg = DW * g
+ dg_row = DW_row * g_row
+
+ # df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
+ t = 0.3989422804014327 # 1/sqrt(2*pi)
+ df_de = f_partial_row + t * e_row * tl.exp(-0.5 * e_row * e_row)
+
+ de_row = dg_row.to(tl.float32) * df_de
+ de_row = de_row.to(DW_row.dtype)
+
+ # Store derivatives in buffers
+ tl.store(DW + offsets, h_row, mask = mask) # h = f * g
+ tl.store(e + offsets, df_row, mask = mask) # df = DW * f
+ tl.store(g + offsets, de_row, mask = mask) # de
+pass
+
+
+def geglu_exact_backward_kernel(DW, e, g):
+ batch_seq_len, hd = e.shape
+ n_elements = e.numel()
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+ _exact_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
+ return DW, e, g
+pass
+
+
+@triton.jit
+def _approx_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
+ block_idx = tl.program_id(0)
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = offsets < n_elements
+
+ # f = 1/2 * e * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3 ) ))
+ # f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
+ # h = f * up
+ s = 0.7978845608028654 # math.sqrt(2 / math.pi)
+
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
+
+ f_row = 0.5 * e_row * (
+ triton_tanh(s * e_row * (1.0 + 0.044715 * e_row * e_row)) \
+ + 1.0
+ )
+ f_row = f_row.to(g_row.dtype) # Exact copy from HF
+ h_row = f_row * g_row
+
+ # Store h
+ tl.store(h + offsets, h_row, mask = mask)
+pass
+
+
+def geglu_approx_forward_kernel(gate, up):
+ batch, seq_len, hd = gate.shape
+ n_elements = gate.numel()
+ out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+ _approx_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
+ return out
+pass
+
+
+@triton.jit
+def _approx_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
+ """
+ f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
+ h = f * up
+
+ df/de (with help from https://arxiv.org/pdf/2305.12073.pdf :))
+ df/de = 1/2 * [1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )] +
+ 1/2 * sech^2 [ sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ] * \
+ ( sqrt(2/pi) * x * (1 + 0.044715 * x^2 * 3 ) )
+
+ Notice sech^2(x) = 1 - tanh^2(x)
+ So reuse tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )
+
+ See https://www.desmos.com/calculator/nqprfoni6x
+ """
+ block_idx = tl.program_id(0)
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = offsets < n_elements
+
+ DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
+
+ # See https://www.desmos.com/calculator/nqprfoni6x
+ s = 0.7978845608028654 # math.sqrt(2 / math.pi)
+ a = s * e_row # a = sqrt(2 / pi) * x
+ b = a * 0.044715 * e_row * e_row # b = a * 0.044715 * x^2
+ T = 1.0 + triton_tanh(a + b)
+ T2 = 0.5 * T
+ # Q = 0.5 * -T * (T - 2.0) * (a + 3.0 * b)
+ Q2 = -T2 * (T - 2.0) * (a + 3.0 * b)
+ df_de = T2 + Q2 # 1/2 * (T + Q)
+
+ # f = 1/2 * e * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3 ) ))
+ f_row = T2 * e_row
+ f_row = f_row.to(DW_row.dtype)
+ # h = f * g
+ h_row = f_row * g_row
+ # df = DW * f
+ df_row = DW_row * f_row
+ # dg = DW * g
+ dg_row = DW_row * g_row
+
+ de_row = dg_row.to(tl.float32) * df_de
+ de_row = de_row.to(DW_row.dtype)
+
+ # Store derivatives in buffers
+ tl.store(DW + offsets, h_row, mask = mask) # h = f * g
+ tl.store(e + offsets, df_row, mask = mask) # df = DW * f
+ tl.store(g + offsets, de_row, mask = mask) # de
+pass
+
+
+def geglu_approx_backward_kernel(DW, e, g):
+ batch_seq_len, hd = e.shape
+ n_elements = e.numel()
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+ _approx_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
+ return DW, e, g
+pass
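+
+
+# Illustrative eager references (not used by the kernels): the exact and approximate
+# GEGLU forwards above should match PyTorch's own GELU variants, up to the
+# dtype-casting order the kernels apply before the elementwise multiply.
+def _reference_geglu_exact(gate, up):
+    return torch.nn.functional.gelu(gate, approximate = "none") * up
+
+def _reference_geglu_approx(gate, up):
+    return torch.nn.functional.gelu(gate, approximate = "tanh") * up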
diff --git a/unsloth-main/unsloth/kernels/layernorm.py b/unsloth-main/unsloth/kernels/layernorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f7926e2e5632db25d4cc65b65e2fe7aa83a4d9
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/layernorm.py
@@ -0,0 +1,213 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+# Copyright 2024-present Andrej Karpathy & the llm.c team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton
+import triton.language as tl
+import torch
+from .utils import calculate_settings
+from unsloth_zoo.patching_utils import (
+ patch_layernorm,
+)
+
+
+@triton.jit
+def layernorm_forward(
+ Y, Y_row_stride,
+ X, X_row_stride,
+ W,
+ b,
+ r,
+ mu,
+ n_cols, eps,
+ BLOCK_SIZE : tl.constexpr
+):
+ row_idx = tl.program_id(0)
+ col_offsets = tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < n_cols
+
+ Y += row_idx * Y_row_stride
+ X += row_idx * X_row_stride
+ r += row_idx
+ mu += row_idx
+
+ # According to https://pytorch.org/torchtune/stable/_modules/torchtune/modules/layer_norm.html#Fp32LayerNorm, all modules
+ # are in float32!
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
+ b_row = tl.load(b + col_offsets, mask = mask, other = 0).to(tl.float32)
+
+ mean_X = tl.sum(X_row, axis = 0) / n_cols
+ XX = X_row - mean_X
+ row_var = tl.sum(XX * XX, axis = 0) / n_cols
+ inv_var = tl.math.rsqrt(row_var + eps)
+ tl.store (r, inv_var)
+ tl.store (mu, mean_X)
+ output = (XX * inv_var) * W_row + b_row
+ tl.store(Y + col_offsets, output, mask = mask)
+pass
+
+
+@triton.jit
+def layernorm_backward(
+ dY, dY_row_stride,
+ X, X_row_stride,
+ W,
+ b,
+ r,
+ mu,
+ n_cols, eps,
+ BLOCK_SIZE : tl.constexpr
+):
+ # Approximately follows https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
+ row_idx = tl.program_id(0)
+ col_offsets = tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < n_cols
+
+ dY += row_idx * dY_row_stride
+ X += row_idx * X_row_stride
+ r += row_idx
+ mu += row_idx
+
+ # According to https://pytorch.org/torchtune/stable/_modules/torchtune/modules/layer_norm.html#Fp32LayerNorm, all modules
+ # are in float32!
+ dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32)
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
+ b_row = tl.load(b + col_offsets, mask = mask, other = 0).to(tl.float32)
+
+ inv_var = tl.load(r) .to(tl.float32)
+ mean = tl.load(mu).to(tl.float32)
+ normed = (X_row - mean) * inv_var
+ dY_W = dY_row * W_row
+ dX_row = dY_W - tl.sum(dY_W, axis = 0) / n_cols - normed * tl.sum(dY_W * normed, axis = 0) / n_cols
+ dX_row = dX_row * inv_var
+ tl.store(dY + col_offsets, dX_row, mask = mask)
+pass
+
+
+class Fast_Layernorm(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, X, W, b, eps):
+ shape = X.shape
+ dim = shape[-1]
+ X = X.view(-1, dim)
+ n_rows, n_cols = X.shape
+ BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+ Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = "cuda:0")
+ r = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
+ mu = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
+
+ layernorm_forward[(n_rows,)](
+ Y, Y.stride(0),
+ X, X.stride(0),
+ W,
+ b,
+ r,
+ mu,
+ n_cols, eps,
+ BLOCK_SIZE = BLOCK_SIZE,
+ num_warps = num_warps,
+ )
+ ctx.eps = eps
+ ctx.BLOCK_SIZE = BLOCK_SIZE
+ ctx.num_warps = num_warps
+ ctx.save_for_backward(X, W, b, r, mu)
+ return Y.view(*shape)
+ pass
+
+ @staticmethod
+ def backward(ctx, dY):
+ shape = dY.shape
+ dim = shape[-1]
+ dY = dY.view(-1, dim)
+ X, W, b, r, mu = ctx.saved_tensors
+ n_rows, n_cols = dY.shape
+
+ layernorm_backward[(n_rows,)](
+ dY, dY.stride(0),
+ X, X .stride(0),
+ W,
+ b,
+ r,
+ mu,
+ n_cols, ctx.eps,
+ BLOCK_SIZE = ctx.BLOCK_SIZE,
+ num_warps = ctx.num_warps,
+ )
+ dX = dY.view(*shape)
+ return dX, None, None, None, None
+ pass
+pass
+
+
+def fast_layernorm(layernorm, X):
+ assert(layernorm.elementwise_affine is True)
+ W = layernorm.weight
+ bias = layernorm.bias
+ eps = layernorm.variance_epsilon if \
+ hasattr(layernorm, "variance_epsilon") \
+ else layernorm.eps
+ out = Fast_Layernorm.apply(X, W, bias, eps)
+ return out
+pass
+
+
+
+def test_layernorm(
+ dim = 1024, eps = 1e-5, dtype = torch.float16,
+ bsz = 21, random_state = 3407, seqlen = 3341,
+):
+ from torch.nn import LayerNorm
+ layernorm = LayerNorm((dim,), eps = eps, device = "cuda", dtype = dtype)
+ torch.cuda.manual_seed(random_state)
+ torch.manual_seed(random_state)
+ torch.nn.init.uniform_(layernorm.weight)
+ torch.nn.init.uniform_(layernorm.bias)
+ X = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda")
+ XX = X.clone()
+ X .requires_grad_(True)
+ XX.requires_grad_(True)
+ Y = layernorm(X)
+ YY = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda", requires_grad = True)
+ Y.backward(YY)
+ correct_grad = X.grad.clone()
+ # from unsloth.kernels import fast_layernorm
+ Y = fast_layernorm(layernorm, XX)
+ Y.backward(YY)
+ assert(torch.dist(correct_grad, XX.grad).item() <= 0.1)
+pass
+
+
+def testing_suite_layernorm():
+ for dim in [512, 1024, 2048]:
+ for dtype in [torch.float16, torch.bfloat16]:
+ with torch.autocast(device_type = "cuda", dtype = dtype):
+ for seqlen in [3341, 2048, 349]:
+ for random_state in [3407, 42]:
+ test_layernorm(
+ dim = dim,
+ eps = 1e-5,
+ dtype = dtype,
+ bsz = 21,
+ random_state = random_state,
+ seqlen = seqlen,
+ )
+ pass
+ pass
+ pass
+ pass
+ pass
+pass
diff --git a/unsloth-main/unsloth/kernels/rms_layernorm.py b/unsloth-main/unsloth/kernels/rms_layernorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74d636c63aa10eb01413a33bfd8c5b0eae902f0
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/rms_layernorm.py
@@ -0,0 +1,297 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton
+import triton.language as tl
+import torch
+from .utils import calculate_settings
+
+
+@triton.jit
+def _rms_layernorm_forward(
+ Y, Y_row_stride,
+ X, X_row_stride,
+ W, W_row_stride,
+ r, r_row_stride,
+ n_cols, eps,
+ BLOCK_SIZE : tl.constexpr
+):
+ """
+ Fast RMS Layernorm kernel
+ Inspiration from a Triton tutorial:
+ https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+ """
+ row_idx = tl.program_id(0)
+ col_offsets = tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < n_cols
+
+ Y += row_idx * Y_row_stride
+ X += row_idx * X_row_stride
+ r += row_idx * r_row_stride
+
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0)#.to(tl.float32)
+
+ row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
+ inv_var = tl.math.rsqrt(row_var + eps)
+ tl.store(r, inv_var)
+ normed = X_row * inv_var
+ normed = normed.to(W_row.dtype) # Exact copy from HF
+ output = normed * W_row
+ tl.store(Y + col_offsets, output, mask = mask)
+pass
+
+
+@triton.heuristics({"GEMMA": lambda args: bool(args["GEMMA"]),})
+@triton.jit
+def _rms_layernorm_backward(
+ dY, dY_row_stride,
+ dX, dX_row_stride,
+ X, X_row_stride,
+ W, W_row_stride,
+ r, r_row_stride,
+ # dW, dW_row_stride,
+ n_cols, eps,
+ GEMMA : tl.constexpr,
+ BLOCK_SIZE : tl.constexpr,
+):
+ """
+ Fast RMS Layernorm kernel for the backward pass
+ Inspiration from a Triton tutorial:
+ https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+ """
+ row_idx = tl.program_id(0)
+ col_offsets = tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < n_cols
+
+ dY += row_idx * dY_row_stride
+ X += row_idx * X_row_stride
+ r += row_idx * r_row_stride
+
+    if GEMMA: dX += row_idx * dX_row_stride
+ else: dX = dY
+
+ dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32)
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
+
+ # Get saved row variance
+ inv_var = tl.load(r).to(tl.float32)
+ normed = X_row * inv_var
+
+ if GEMMA: dY_W = dY_row * (W_row + 1.0)
+ else: dY_W = dY_row * W_row
+
+ rowsum_dY_normed = tl.sum(dY_W * normed, axis = 0)
+ output = inv_var/n_cols * (n_cols*dY_W - normed*rowsum_dY_normed)
+ tl.store(dX + col_offsets, output, mask = mask)
+pass
+
+
+@triton.jit
+def _gemma_rms_layernorm_forward(
+ Y, Y_row_stride,
+ X, X_row_stride,
+ W, W_row_stride,
+ r, r_row_stride,
+ n_cols, eps,
+ BLOCK_SIZE : tl.constexpr,
+):
+ # Copies https://github.com/google-deepmind/gemma/blob/main/gemma/layers.py#L31
+ # and https://github.com/keras-team/keras-nlp/blob/v0.8.2/keras_nlp/models/gemma/rms_normalization.py#L33
+ # exactly. Essentially all in float32!
+ row_idx = tl.program_id(0)
+ col_offsets = tl.arange(0, BLOCK_SIZE)
+ mask = col_offsets < n_cols
+
+ Y += row_idx * Y_row_stride
+ X += row_idx * X_row_stride
+ r += row_idx * r_row_stride
+
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
+
+ row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
+ inv_var = tl.math.rsqrt(row_var + eps)
+ tl.store(r, inv_var)
+ normed = X_row * inv_var
+ output = normed * (W_row + 1.0)
+
+ tl.store(Y + col_offsets, output, mask = mask)
+pass
+
+
+class Fast_RMS_Layernorm(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, X : torch.Tensor, W : torch.Tensor, eps : float, gemma : bool = False):
+ shape = X.shape
+ dim : int = shape[-1]
+ X = X.view(-1, dim)
+ n_rows : int
+ n_cols : int
+ n_rows, n_cols = X.shape
+ BLOCK_SIZE : int
+ num_warps : int
+ BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+ Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = "cuda:0")
+ r = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
+
+ fx = _gemma_rms_layernorm_forward if gemma else _rms_layernorm_forward
+ fx[(n_rows,)](
+ Y, Y.stride(0),
+ X, X.stride(0),
+ W, W.stride(0),
+ r, r.stride(0),
+ n_cols, eps,
+ BLOCK_SIZE = BLOCK_SIZE,
+ num_warps = num_warps,
+ )
+ ctx.eps = eps
+ ctx.BLOCK_SIZE = BLOCK_SIZE
+ ctx.num_warps = num_warps
+ ctx.GEMMA = gemma
+ ctx.save_for_backward(X, W, r)
+ return Y.view(*shape)
+ pass
+
+ @staticmethod
+ def backward(ctx, dY : torch.Tensor):
+ shape = dY.shape
+ dim : int = shape[-1]
+ dY = dY.view(-1, dim)
+ X, W, r = ctx.saved_tensors
+ n_rows : int
+ n_cols : int
+ n_rows, n_cols = dY.shape
+ # dW = X
+ dX = torch.empty_like(dY, device = "cuda:0") if ctx.GEMMA else dY
+
+ _rms_layernorm_backward[(n_rows,)](
+ dY, dY.stride(0),
+ dX, dX.stride(0),
+ X, X .stride(0),
+ W, W .stride(0),
+ r, r .stride(0),
+ # dW, dW.stride(0),
+ n_cols, ctx.eps,
+ GEMMA = ctx.GEMMA,
+ BLOCK_SIZE = ctx.BLOCK_SIZE,
+ num_warps = ctx.num_warps,
+ )
+ dX = dX.view(*shape)
+ return dX, None, None, None
+ pass
+pass
+
+
+# [TODO] Unsure why RMS Layernorm is not torch.compiling properly
+@torch.compiler.disable
+def fast_rms_layernorm(layernorm, X : torch.Tensor, gemma : bool = False):
+ W : torch.Tensor = layernorm.weight
+ eps : float = layernorm.variance_epsilon if \
+ hasattr(layernorm, "variance_epsilon") \
+ else layernorm.eps
+ out = Fast_RMS_Layernorm.apply(X, W, eps, gemma)
+ return out
+pass
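+
+
+# Illustrative eager sketch of what the kernels above compute (not used by them):
+# standard RMS layernorm, with the Gemma variant staying in float32 and multiplying
+# by (weight + 1) as in _gemma_rms_layernorm_forward.
+def _reference_rms_layernorm(X, weight, eps, gemma = False):
+    X32 = X.float()
+    inv_var = torch.rsqrt(X32.square().mean(dim = -1, keepdim = True) + eps)
+    normed = X32 * inv_var
+    if gemma:
+        return (normed * (weight.float() + 1.0)).to(X.dtype)
+    # The non-Gemma kernel casts the normalized value to the weight dtype first
+    return (normed.to(weight.dtype) * weight).to(X.dtype)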
+
+
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
+class Unsloth_LlamaRMSNorm(LlamaRMSNorm):
+ def forward(self, X):
+ return fast_rms_layernorm(self, X, gemma = False)
+ pass
+pass
+
+try:
+ from transformers.models.mllama.modeling_mllama import MllamaTextRMSNorm
+ class Unsloth_MllamaTextRMSNorm(MllamaTextRMSNorm):
+ def forward(self, X):
+ return fast_rms_layernorm(self, X, gemma = False)
+ pass
+ pass
+except:
+ pass
+pass
+
+def patch_rms_layernorm():
+ import transformers.models.llama.modeling_llama
+ transformers.models.llama.modeling_llama.LlamaRMSNorm = Unsloth_LlamaRMSNorm
+ try:
+ import transformers.models.mllama.modeling_mllama
+ transformers.models.mllama.modeling_mllama.MllamaTextRMSNorm = Unsloth_MllamaTextRMSNorm
+ except:
+ pass
+ return
+pass
+
+
+def unpatch_rms_layernorm():
+ import transformers.models.llama.modeling_llama
+ transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
+ try:
+ import transformers.models.mllama.modeling_mllama
+ transformers.models.mllama.modeling_mllama.MllamaTextRMSNorm = MllamaTextRMSNorm
+ except:
+ pass
+    return
+pass
+
+
+def test_rms_layernorm(
+ dim = 1024, eps = 1e-5, dtype = torch.float16,
+ bsz = 21, random_state = 3407, seqlen = 3341,
+):
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm
+ layernorm = LlamaRMSNorm((dim,), eps = eps).to("cuda")
+ torch.cuda.manual_seed(random_state)
+ torch.manual_seed(random_state)
+ torch.nn.init.uniform_(layernorm.weight)
+ X = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda")
+ XX = X.clone()
+ X .requires_grad_(True)
+ XX.requires_grad_(True)
+ Y = layernorm(X)
+ YY = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda", requires_grad = True)
+ Y.backward(YY)
+ correct_grad = X.grad.clone()
+ # from unsloth.kernels import fast_rms_layernorm
+ Y = fast_rms_layernorm(layernorm, XX)
+ Y.backward(YY)
+ assert(torch.amax(correct_grad - XX.grad).item() <= 0.05)
+pass
+
+
+def testing_suite_layernorm():
+ for dim in [512, 1024, 2048]:
+ for dtype in [torch.float16, torch.bfloat16]:
+ with torch.autocast(device_type = "cuda", dtype = dtype):
+ for seqlen in [3341, 2048, 349]:
+ for random_state in [3407, 42]:
+ test_rms_layernorm(
+ dim = dim,
+ eps = 1e-5,
+ dtype = dtype,
+ bsz = 21,
+ random_state = random_state,
+ seqlen = seqlen,
+ )
+ pass
+ pass
+ pass
+ pass
+ pass
+pass
diff --git a/unsloth-main/unsloth/kernels/rope_embedding.py b/unsloth-main/unsloth/kernels/rope_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe15d0e3b15a6fdb6b94986f45b67eb4c7b2a6f
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/rope_embedding.py
@@ -0,0 +1,196 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton
+import triton.language as tl
+import torch
+from .utils import calculate_settings
+ROPE_GROUP_SIZE : int = 4
+
+@triton.heuristics({"BACKWARD_PASS": lambda args: bool(args["BACKWARD_PASS"]),})
+@triton.jit
+def _rope_embedding(
+ Q, Q_row_stride,
+ cos, cos_row_stride,
+ sin, sin_row_stride,
+ seqlen,
+ head_dim : tl.constexpr,
+ n_heads : tl.constexpr,
+ BACKWARD_PASS : tl.constexpr,
+ BLOCK_SIZE : tl.constexpr,
+):
+ """
+ Calculates the RoPE Embedding quickly
+ RoPE is Q * cos + rotate_half(Q) * sin
+ See our blog post for more info
+ """
+ ROPE_GROUP_SIZE = 4
+ row_position = tl.program_id(0)
+ group_head_position = tl.program_id(1)
+ col_offsets = tl.arange(0, BLOCK_SIZE)
+ half_head_dim = head_dim // 2
+ mask = col_offsets < half_head_dim
+
+ sin1 = tl.load(sin + (row_position % seqlen)*sin_row_stride + \
+ half_head_dim*0 + col_offsets, mask = mask, other = 0)
+ cos1 = tl.load(cos + (row_position % seqlen)*cos_row_stride + \
+ half_head_dim*0 + col_offsets, mask = mask, other = 0)
+
+ if BACKWARD_PASS:
+ # See our blog post for more info.
+ sin1 = -sin1
+ pass
+
+ # [TODO] Autotune ROPE_GROUP_SIZE to be 1, 2, 4, 8
+ head_start = group_head_position * ROPE_GROUP_SIZE
+ head_end = min((head_start + ROPE_GROUP_SIZE), n_heads)
+
+ # 10% Faster kernel from [HuyNguyen-hust](https://github.com/unslothai/unsloth/pull/238)
+ for k in range(head_start, head_end):
+ offs_q1 = row_position * Q_row_stride + k * head_dim + col_offsets
+ offs_q2 = row_position * Q_row_stride + k * head_dim + col_offsets + half_head_dim
+
+ # For Gemma - sometimes RoPE must be done in float32 and not bfloat16
+ Q1 = tl.load(Q + offs_q1, mask = mask, other = 0).to(sin1.dtype)
+ Q2 = tl.load(Q + offs_q2, mask = mask, other = 0).to(sin1.dtype)
+
+ tl.store(Q + offs_q1, Q1*cos1 - Q2*sin1, mask = mask)
+ tl.store(Q + offs_q2, Q2*cos1 + Q1*sin1, mask = mask)
+ pass
+pass
+
+
+class Fast_RoPE_Embedding(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, Q, cos, sin):
+ cos, sin = cos.squeeze(), sin.squeeze()
+ batch : int
+ seq_len : int
+ n_heads : int
+ head_dim : int
+ batch, seq_len, n_heads, head_dim = Q.shape
+ Q = Q.view(batch*seq_len, n_heads*head_dim)
+ n_rows : int
+ n_cols : int
+ n_rows, n_cols = Q.shape
+ assert(seq_len <= cos.shape[0])
+
+ # [TODO] Changing blocksize to head_dim//2 seems to have
+ # some concurrency / un-deterministic issues.
+ BLOCK_SIZE, num_warps = calculate_settings(head_dim//2) # (head_dim//2)
+
+ # group_size = 4 # 4 or 8, too large group_size can hurt performance.
+ div : int
+ mod : int
+ div, mod = divmod(n_heads, ROPE_GROUP_SIZE)
+ n_groups : int = div + (mod != 0)
+
+ _rope_embedding[(n_rows, n_groups, )](
+ Q, Q.stride(0),
+ cos, cos.stride(0),
+ sin, sin.stride(0),
+ seq_len,
+ head_dim, n_heads,
+ BACKWARD_PASS = False,
+ BLOCK_SIZE = BLOCK_SIZE,
+ num_warps = num_warps,
+ )
+ ctx.BLOCK_SIZE = BLOCK_SIZE
+ ctx.num_warps = num_warps
+ ctx.n_groups = n_groups
+ ctx.cos = cos
+ ctx.sin = sin
+ return Q.view(batch, seq_len, n_heads, head_dim)
+ pass
+
+ @staticmethod
+ def backward(ctx, dY):
+ batch : int
+ seq_len : int
+ n_heads : int
+ head_dim : int
+ batch, seq_len, n_heads, head_dim = dY.shape
+ dY = dY.reshape(batch*seq_len, n_heads*head_dim)
+ # Must be reshape not view
+ n_rows : int
+ n_cols : int
+ n_rows, n_cols = dY.shape
+
+ cos = ctx.cos
+ sin = ctx.sin
+
+ _rope_embedding[(n_rows, ctx.n_groups, )](
+ dY, dY .stride(0),
+ cos, cos.stride(0),
+ sin, sin.stride(0),
+ seq_len, head_dim, n_heads,
+ BACKWARD_PASS = True,
+ BLOCK_SIZE = ctx.BLOCK_SIZE,
+ num_warps = ctx.num_warps,
+ )
+ dY = dY.view(batch, seq_len, n_heads, head_dim)
+ return dY, None, None,
+ pass
+pass
+
+# [TODO] Unsure why RoPE Embedding is not torch.compiling properly
+@torch.compiler.disable
+def fast_rope_embedding(Q, K, cos, sin):
+ Q = Fast_RoPE_Embedding.apply(Q.transpose(1, 2), cos, sin).transpose(1, 2)
+ K = Fast_RoPE_Embedding.apply(K.transpose(1, 2), cos, sin).transpose(1, 2)
+ return Q, K
+pass
+
+
+class Slow_RoPE_Embedding(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, Q, cos, sin, position_ids):
+ if position_ids is not None:
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+
+ # Q * cos + rotate_half(Q) * sin
+ half = Q.shape[-1]//2
+ RH_Q = torch.cat((-Q[..., half:], Q[..., :half]), dim = -1)
+ Q *= cos
+ Q.addcmul_(RH_Q, sin)
+ # RH_Q *= sin
+ # Q += RH_Q
+ ctx.save_for_backward(cos, sin)
+ return Q
+ pass
+
+ @staticmethod
+ def backward(ctx, dY):
+ cos, sin = ctx.saved_tensors
+ # Q * cos + rotate_half.T(Q) * sin
+ half = dY.shape[-1]//2
+ RH_dY = torch.cat((dY[..., half:], -dY[..., :half]), dim = -1)
+ dY *= cos
+ dY.addcmul_(RH_dY, sin)
+ # RH_dY *= sin
+ # dY += RH_dY
+ return dY, None, None, None
+ pass
+pass
+
+
+def inplace_rope_embedding(Q, K, cos, sin, position_ids):
+ Q = Slow_RoPE_Embedding.apply(Q, cos, sin, position_ids)
+ K = Slow_RoPE_Embedding.apply(K, cos, sin, position_ids)
+ return Q, K
+pass
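+
+
+# Illustrative eager sketch (not used by the kernels): RoPE as in the docstrings above,
+# Q * cos + rotate_half(Q) * sin, assuming Q is (..., seq_len, head_dim) and cos/sin
+# broadcast over the leading dimensions.
+def _reference_rope(Q, cos, sin):
+    half = Q.shape[-1] // 2
+    RH_Q = torch.cat((-Q[..., half:], Q[..., :half]), dim = -1)
+    return Q * cos + RH_Q * sin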
diff --git a/unsloth-main/unsloth/kernels/swiglu.py b/unsloth-main/unsloth/kernels/swiglu.py
new file mode 100644
index 0000000000000000000000000000000000000000..f81b7aae9bf40f943ebdf3f99aa88ec4a986ab01
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/swiglu.py
@@ -0,0 +1,99 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton
+import triton.language as tl
+import torch
+from .utils import calculate_settings
+
+
+@triton.jit
+def _fg_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
+ block_idx = tl.program_id(0)
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = offsets < n_elements
+
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
+
+ # f = e * sigmoid(e)
+ f_row = e_row * tl.sigmoid(e_row) # e_row / (1 + tl.exp(-e_row))
+ f_row = f_row.to(g_row.dtype) # Exact copy from HF
+ # h = f * g
+ h_row = f_row * g_row
+
+ # Store h
+ tl.store(h + offsets, h_row, mask = mask)
+pass
+
+
+def swiglu_fg_kernel(e, g):
+ batch, seq_len, hd = e.shape
+ n_elements = e.numel()
+ h = torch.empty((batch, seq_len, hd), dtype = e.dtype, device = "cuda:0")
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+ _fg_kernel[grid](e, g, h, n_elements, BLOCK_SIZE = 1024,)
+ return h
+pass
+
+
+@triton.jit
+def _DWf_DW_dfg_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
+ """
+ e = e.float()
+ se = 1.0 / (1.0 + torch.exp(-e))
+ f = (se * e).to(dtype)
+ h = f * g
+ df = DW * f
+ dg = DW * g
+ de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype)
+ """
+ block_idx = tl.program_id(0)
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+ mask = offsets < n_elements
+
+ DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
+
+ # e = e.float()
+ # se = 1.0 / (1.0 + torch.exp(-e))
+ se_row = tl.sigmoid(e_row) # 1.0 / (1.0 + tl.exp(-e_row))
+ # f = (se * e).to(dtype)
+ f_row = se_row * e_row
+ f_row = f_row.to(DW_row.dtype)
+ # h = f * g
+ h_row = f_row * g_row
+ # df = DW * f
+ df_row = DW_row * f_row
+ # dg = DW * g
+ dg_row = DW_row * g_row
+ # de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype)
+ de_row = dg_row.to(tl.float32) * se_row * (1.0 + e_row * (1.0 - se_row))
+ de_row = de_row.to(DW_row.dtype)
+
+ # Store derivatives in buffers
+ tl.store(DW + offsets, h_row, mask = mask) # h = f * g
+ tl.store(e + offsets, df_row, mask = mask) # df = DW * f
+ tl.store(g + offsets, de_row, mask = mask) # de
+pass
+
+
+def swiglu_DWf_DW_dfg_kernel(DW, e, g):
+ batch_seq_len, hd = e.shape
+ n_elements = e.numel()
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+ _DWf_DW_dfg_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
+ return DW, e, g
+pass
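+
+
+# Illustrative eager reference (not used by the kernels): the forward above is plain
+# SwiGLU, i.e. silu(e) * g, up to the dtype-casting order the kernel applies.
+def _reference_swiglu(e, g):
+    return torch.nn.functional.silu(e) * g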
diff --git a/unsloth-main/unsloth/kernels/utils.py b/unsloth-main/unsloth/kernels/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..de543962ef9c9bd0b50591894e81b4ff3382fee9
--- /dev/null
+++ b/unsloth-main/unsloth/kernels/utils.py
@@ -0,0 +1,422 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton
+MAX_FUSED_SIZE : int = 65536
+next_power_of_2 = triton.next_power_of_2
+
+# torch.cuda.amp.custom_fwd is deprecated >= 2.4
+import torch
+from packaging.version import Version
+if Version(torch.__version__) < Version("2.4.0"):
+ torch_amp_custom_fwd = torch.cuda.amp.custom_fwd
+ torch_amp_custom_bwd = torch.cuda.amp.custom_bwd
+else:
+ torch_amp_custom_fwd = torch.amp.custom_fwd(device_type = "cuda")
+ torch_amp_custom_bwd = torch.amp.custom_bwd(device_type = "cuda")
+pass
+
+
+# tl.math.tanh now is libdevice.tanh
+from packaging.version import Version
+import triton
+import triton.language as tl
+if Version(triton.__version__) >= Version("3.0.0"):
+ from triton.language.extra import libdevice
+ triton_tanh = libdevice.tanh
+ triton_cast = tl.cast
+else:
+ triton_tanh = tl.math.tanh
+ # No casting in old Triton versions
+ @triton.jit
+ def triton_cast(x, dtype):
+ return x.to(dtype)
+ pass
+pass
+
+
+def calculate_settings(n : int) -> (int, int,):
+ BLOCK_SIZE : int = next_power_of_2(n)
+ if BLOCK_SIZE > MAX_FUSED_SIZE:
+ raise RuntimeError(f"Cannot launch Triton kernel since n = {n} exceeds "\
+ f"the maximum CUDA blocksize = {MAX_FUSED_SIZE}.")
+ num_warps : int = 4
+ if BLOCK_SIZE >= 32768: num_warps = 32
+ elif BLOCK_SIZE >= 8192: num_warps = 16
+ elif BLOCK_SIZE >= 2048: num_warps = 8
+ return BLOCK_SIZE, num_warps
+pass
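+
+# Illustrative values (not part of the original file), following the rules above:
+#   calculate_settings(1000)  -> (1024, 4)
+#   calculate_settings(4096)  -> (4096, 8)
+#   calculate_settings(50000) -> (65536, 32)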
+
+
+import bitsandbytes as bnb
+# https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
+HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
+global CUDA_STREAM
+CUDA_STREAM = None
+get_ptr = bnb.functional.get_ptr
+import ctypes
+cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
+cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
+cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
+cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
+
+
+def QUANT_STATE(W):
+ return getattr(W, "quant_state", None)
+pass
+
+
+def get_lora_parameters(proj):
+ # For DPO or disabled adapters
+ base_layer = (proj.base_layer if hasattr(proj, "base_layer") else proj)
+ W = base_layer.weight
+
+ if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged:
+ return W, QUANT_STATE(W), None, None, None
+ pass
+
+ active_adapter = proj.active_adapters[0] if \
+ hasattr(proj, "active_adapters") else proj.active_adapter
+ A = proj.lora_A [active_adapter].weight
+ B = proj.lora_B [active_adapter].weight
+ s = proj.scaling[active_adapter]
+ return W, QUANT_STATE(W), A, B, s
+pass
+
+
+def get_lora_parameters_bias(proj):
+ # For DPO or disabled adapters
+ base_layer = (proj.base_layer if hasattr(proj, "base_layer") else proj)
+ W = base_layer.weight
+ bias = base_layer.bias
+
+ if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged:
+ return W, QUANT_STATE(W), None, None, None, bias
+ pass
+
+ active_adapter = proj.active_adapters[0] if \
+ hasattr(proj, "active_adapters") else proj.active_adapter
+ A = proj.lora_A [active_adapter].weight
+ B = proj.lora_B [active_adapter].weight
+ s = proj.scaling[active_adapter]
+ return W, QUANT_STATE(W), A, B, s, bias
+pass
+
+
+if HAS_CUDA_STREAM:
+ def fast_dequantize(W, quant_state = None, out = None):
+ if quant_state is None: return W
+ if type(quant_state) is not list:
+ # New quant_state as a class
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
+ absmax = quant_state.absmax
+ shape = quant_state.shape
+ dtype = quant_state.dtype
+ blocksize = quant_state.blocksize
+ offset = quant_state.offset
+ state2 = quant_state.state2
+ absmax2 = state2.absmax
+ code2 = state2.code
+ blocksize2 = state2.blocksize
+ else:
+ # Old quant_state as a list of lists
+ absmax, shape, dtype, blocksize, compressed_stats, _, _ = quant_state
+ offset, state2 = compressed_stats
+ absmax2, code2, blocksize2, _, _, _, _ = state2
+ pass
+ global CUDA_STREAM
+ if CUDA_STREAM is None: CUDA_STREAM = torch.cuda.current_stream("cuda:0")
+
+ # Create weight matrix
+ if out is None:
+ out = torch.empty(shape, dtype = dtype, device = "cuda:0")
+ else:
+ assert(out.shape == shape)
+ assert(out.dtype == dtype)
+
+ # NF4 dequantization of statistics
+ n_elements_absmax = absmax.numel()
+ out_absmax = torch.empty(n_elements_absmax, dtype = torch.float32, device = "cuda:0")
+
+ # Do dequantization
+ ptr_out_absmax = get_ptr(out_absmax)
+ cdequantize_blockwise_fp32(
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), ptr_out_absmax,
+ ctypes.c_int(blocksize2), ctypes.c_int(n_elements_absmax), CUDA_STREAM,
+ )
+ out_absmax += offset
+
+ fx = cdequantize_blockwise_fp16_nf4 if dtype == torch.float16 else \
+ cdequantize_blockwise_bf16_nf4
+ fx(get_ptr(None), get_ptr(W), ptr_out_absmax, get_ptr(out),
+ ctypes.c_int(blocksize), ctypes.c_int(out.numel()), CUDA_STREAM,)
+
+ # Careful returning transposed data
+        is_transposed = (W.shape[0] == 1)
+ return out.t() if is_transposed else out
+ pass
+else:
+ def fast_dequantize(W, quant_state = None, out = None):
+ if quant_state is None: return W
+ if type(quant_state) is not list:
+ # New quant_state as a class
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
+ absmax = quant_state.absmax
+ shape = quant_state.shape
+ dtype = quant_state.dtype
+ blocksize = quant_state.blocksize
+ offset = quant_state.offset
+ state2 = quant_state.state2
+ absmax2 = state2.absmax
+ code2 = state2.code
+ blocksize2 = state2.blocksize
+ else:
+ # Old quant_state as a list of lists
+ absmax, shape, dtype, blocksize, compressed_stats, _, _ = quant_state
+ offset, state2 = compressed_stats
+ absmax2, code2, blocksize2, _, _, _, _ = state2
+ pass
+
+ # Create weight matrix
+ if out is None:
+ out = torch.empty(shape, dtype = dtype, device = "cuda:0")
+ else:
+ assert(out.shape == shape)
+ assert(out.dtype == dtype)
+
+ # NF4 dequantization of statistics
+ n_elements_absmax = absmax.numel()
+ out_absmax = torch.empty(n_elements_absmax, dtype = torch.float32, device = "cuda:0")
+
+ # Do dequantization
+ ptr_out_absmax = get_ptr(out_absmax)
+ cdequantize_blockwise_fp32(
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), ptr_out_absmax,
+ ctypes.c_int(blocksize2), ctypes.c_int(n_elements_absmax),
+ )
+ out_absmax += offset
+
+ fx = cdequantize_blockwise_fp16_nf4 if dtype == torch.float16 else \
+ cdequantize_blockwise_bf16_nf4
+ fx(get_ptr(None), get_ptr(W), ptr_out_absmax, get_ptr(out),
+ ctypes.c_int(blocksize), ctypes.c_int(out.numel()),)
+
+ # Careful returning transposed data
+        is_transposed = (W.shape[0] == 1)
+ return out.t() if is_transposed else out
+ pass
+pass
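+# Usage sketch (illustrative): for a bitsandbytes 4-bit weight `W`,
+#   W_dequant = fast_dequantize(W, QUANT_STATE(W))
+# returns a dense fp16 / bf16 matrix on "cuda:0". Note the double dequantization: the absmax
+# statistics are themselves blockwise-quantized, so cdequantize_blockwise_fp32 runs first,
+# then the NF4 kernel reconstructs the full weight.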
+
+
+if HAS_CUDA_STREAM:
+ def fast_gemv(X, W, quant_state, out = None):
+ if quant_state is None: return torch.matmul(X, W, out = out)
+ # For fast X @ W where seq_len == 1
+ # From https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L1469
+ _, q_len, hd = X.shape
+ # assert(q_len == 1)
+
+ if type(quant_state) is not list:
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
+ absmax = quant_state.absmax
+ shape = quant_state.shape
+ dtype = quant_state.dtype
+ blocksize = quant_state.blocksize
+ stats = quant_state.code
+ offset = quant_state.offset
+ state2 = quant_state.state2
+ absmax2 = state2.absmax
+ code2 = state2.code
+ blocksize2 = state2.blocksize
+ else:
+ absmax, shape, dtype, blocksize, compressed_stats, quant_type, stats = quant_state
+ offset, state2 = compressed_stats
+ absmax2, code2, blocksize2, _, _, _, _ = state2
+ pass
+ global CUDA_STREAM
+ if CUDA_STREAM is None: CUDA_STREAM = torch.cuda.current_stream("cuda:0")
+
+ # assert(dtype == X.dtype)
+ bout = shape[0]
+
+ if out is None:
+ out = torch.empty((1, 1, bout,), dtype = dtype, device = "cuda:0")
+ # else:
+ # assert(out.shape == (1, 1, bout,))
+ # pass
+
+ n = 1
+ m = shape[0]
+ k = shape[1]
+ lda = shape[0]
+ ldc = shape[0]
+ ldb = (hd+1)//2
+ m = ctypes.c_int32(m)
+ n = ctypes.c_int32(n)
+ k = ctypes.c_int32(k)
+ lda = ctypes.c_int32(lda)
+ ldb = ctypes.c_int32(ldb)
+ ldc = ctypes.c_int32(ldc)
+
+ df = torch.empty(absmax.shape, dtype = torch.float32, device = "cuda:0")
+ cdequantize_blockwise_fp32(
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), get_ptr(df),
+ ctypes.c_int(blocksize2), ctypes.c_int(df.numel()), CUDA_STREAM,
+ )
+ df += offset
+ absmax = df
+
+ fx = cgemm_4bit_inference_naive_fp16 if dtype == torch.float16 else \
+ cgemm_4bit_inference_naive_bf16
+
+ blocksize = ctypes.c_int32(blocksize)
+ fx(m, n, k, get_ptr(X), get_ptr(W), get_ptr(absmax), get_ptr(stats), get_ptr(out),
+ lda, ldb, ldc, blocksize, CUDA_STREAM,)
+
+ return out
+ pass
+else:
+ def fast_gemv(X, W, quant_state, out = None):
+ if quant_state is None: return torch.matmul(X, W, out = out)
+ # For fast X @ W where seq_len == 1
+ # From https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L1469
+ _, q_len, hd = X.shape
+ # assert(q_len == 1)
+
+ if type(quant_state) is not list:
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
+ absmax = quant_state.absmax
+ shape = quant_state.shape
+ dtype = quant_state.dtype
+ blocksize = quant_state.blocksize
+ stats = quant_state.code
+ offset = quant_state.offset
+ state2 = quant_state.state2
+ absmax2 = state2.absmax
+ code2 = state2.code
+ blocksize2 = state2.blocksize
+ else:
+ absmax, shape, dtype, blocksize, compressed_stats, quant_type, stats = quant_state
+ offset, state2 = compressed_stats
+ absmax2, code2, blocksize2, _, _, _, _ = state2
+ pass
+ # assert(dtype == X.dtype)
+ bout = shape[0]
+
+ if out is None:
+ out = torch.empty((1, 1, bout,), dtype = dtype, device = "cuda:0")
+ # else:
+ # assert(out.shape == (1, 1, bout,))
+ # pass
+
+ n = 1
+ m = shape[0]
+ k = shape[1]
+ lda = shape[0]
+ ldc = shape[0]
+ ldb = (hd+1)//2
+ m = ctypes.c_int32(m)
+ n = ctypes.c_int32(n)
+ k = ctypes.c_int32(k)
+ lda = ctypes.c_int32(lda)
+ ldb = ctypes.c_int32(ldb)
+ ldc = ctypes.c_int32(ldc)
+
+ df = torch.empty(absmax.shape, dtype = torch.float32, device = "cuda:0")
+ cdequantize_blockwise_fp32(
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), get_ptr(df),
+ ctypes.c_int(blocksize2), ctypes.c_int(df.numel()),
+ )
+ df += offset
+ absmax = df
+
+ fx = cgemm_4bit_inference_naive_fp16 if dtype == torch.float16 else \
+ cgemm_4bit_inference_naive_bf16
+
+ blocksize = ctypes.c_int32(blocksize)
+ fx(m, n, k, get_ptr(X), get_ptr(W), get_ptr(absmax), get_ptr(stats), get_ptr(out),
+ lda, ldb, ldc, blocksize,)
+
+ return out
+ pass
+pass
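+# Note: fast_gemv covers the single-token decode path (q_len == 1). Instead of dequantizing the
+# whole weight, it dequantizes only the absmax statistics and then calls bitsandbytes'
+# cgemm_4bit_inference_naive_* kernel directly on the packed NF4 weight.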
+
+
+def fast_linear_forward(proj, X, temp_lora = None, out = None):
+
+ W, W_quant, lora_A, lora_B, lora_S, bias = get_lora_parameters_bias(proj)
+ bsz, q_len, in_dim = X.shape
+ if q_len != 1: return matmul_lora(X, W, W_quant, lora_A, lora_B, lora_S)
+
+ if W_quant is None:
+ out = torch.matmul(X, W.t(), out = out)
+ elif bsz == 1 and q_len == 1:
+ out = fast_gemv(X, W, W_quant, out = out)
+ else:
+ W = fast_dequantize(W.t(), W_quant)
+ out = torch.matmul(X, W, out = out)
+ pass
+
+ # Add in LoRA weights
+ if lora_A is not None:
+ out_dim = out.shape[2]
+ dtype = X.dtype
+
+ if not hasattr(lora_A, "_fast_lora"):
+ lora_A._fast_lora = lora_A.to(dtype)
+ lora_B._fast_lora = lora_B.to(dtype)
+ pass
+
+ if bsz == 1:
+ out = out.view(out_dim)
+ temp_lora = torch.mv(lora_A._fast_lora, X.ravel(), out = temp_lora)
+ out.addmv_(lora_B._fast_lora, temp_lora, alpha = lora_S)
+ else:
+ out = out.view(bsz, out_dim)
+ temp_lora = torch.mm(X.view(bsz, in_dim), lora_A._fast_lora.t(), out = temp_lora)
+ out.addmm_(temp_lora, lora_B._fast_lora.t(), alpha = lora_S)
+ pass
+ out = out.view(bsz, 1, out_dim)
+ pass
+
+ if bias is not None: out += bias
+
+ return out
+pass
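+# Dispatch summary: fast_linear_forward falls back to matmul_lora for q_len > 1, uses fast_gemv
+# for bsz == 1 single-token decoding on quantized weights, and otherwise dequantizes then
+# matmuls. The LoRA update is then added in-place via addmv_ / addmm_ with alpha = lora_S.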
+
+
+def matmul_lora(X, W, W_quant, A, B, s, out = None):
+ dtype = X.dtype
+ W = fast_dequantize(W.t(), W_quant)
+
+ if X.dim() == 3:
+ batch, seq_len, d = X.shape
+ X = X.view(-1, X.shape[-1])
+ reshape = True
+ else:
+ reshape = False
+ pass
+
+ out = torch.matmul(X, W, out = out)
+ if W_quant is not None: del W
+
+ if A is not None:
+ # LoRA is enabled
+ A, B = A.t(), B.t()
+ out += (X @ A.to(dtype)) @ (s * B.to(dtype))
+ pass
+
+ return out.view(batch, seq_len, -1) if reshape else out
+pass
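+# Math sketch (illustrative): with base weight W and LoRA factors A (r x in), B (out x r),
+# matmul_lora computes
+#   out = X @ W.T + s * (X @ A.T) @ B.T
+# i.e. the usual LoRA forward X @ (W + s * B @ A).T, with W dequantized on the fly if needed.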
diff --git a/unsloth-main/unsloth/models/__init__.py b/unsloth-main/unsloth/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c52d14f40281c2b1d82102c03035d1f1fbb52237
--- /dev/null
+++ b/unsloth-main/unsloth/models/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .granite import FastGraniteModel
+from .loader import FastLanguageModel, FastVisionModel
+from .llama import FastLlamaModel
+from .mistral import FastMistralModel
+from .qwen2 import FastQwen2Model
+from .dpo import PatchDPOTrainer, PatchKTOTrainer
+from ._utils import is_bfloat16_supported
diff --git a/unsloth-main/unsloth/models/_utils.py b/unsloth-main/unsloth/models/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cb6ffb8f35c9c7c53a6e36ae4cb49d71961cac4
--- /dev/null
+++ b/unsloth-main/unsloth/models/_utils.py
@@ -0,0 +1,1273 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "2024.12.12"
+
+__all__ = [
+ "prepare_model_for_kbit_training",
+ "xformers",
+ "xformers_attention",
+ "xformers_version",
+ "__version__",
+ "HAS_FLASH_ATTENTION",
+ "HAS_FLASH_ATTENTION_SOFTCAPPING",
+ "PRE_CHECK",
+ "platform_system",
+ "patch_tokenizer",
+ "get_statistics",
+ "Unsloth_Offloaded_Gradient_Checkpointer",
+ "offload_to_disk",
+ "offload_input_embeddings",
+ "offload_output_embeddings",
+ "is_bfloat16_supported",
+ "unsloth_offloaded_gradient_checkpoint",
+ "torch_compile_options",
+ "patch_linear_scaling",
+ "patch_llama_rope_scaling",
+ "check_nvidia",
+ "create_boolean_mask",
+ "torch_amp_custom_fwd",
+ "torch_amp_custom_bwd",
+ "accelerate_old_send_to_device",
+ "accelerate_new_send_to_device",
+ "patch_gradient_accumulation_fix",
+ "patch_compiling_bitsandbytes",
+ "patch_regional_compilation",
+ "patch_layernorm",
+ "patch_torch_compile",
+ "patch_model_and_tokenizer",
+
+ "patch_unsloth_gradient_checkpointing",
+ "unpatch_unsloth_gradient_checkpointing",
+ "patch_gradient_checkpointing",
+ "unpatch_gradient_checkpointing",
+
+ "HAS_CUT_CROSS_ENTROPY",
+ "EMPTY_LOGITS",
+ "fused_linear_cross_entropy",
+ "patch_unsloth_smart_gradient_checkpointing",
+ "unpatch_unsloth_smart_gradient_checkpointing",
+ "create_gradient_checkpointing_buffer",
+
+ "patch_compiled_autograd",
+ "process_vision_info",
+ "unsloth_compile_transformers",
+ "patch_fast_lora",
+]
+
+import torch
+from typing import Union, Optional, List, Any, Callable, Tuple
+from platform import system as platform_system
+platform_system = platform_system()
+import numpy as np
+import warnings, subprocess, re, inspect, psutil, os, math
+from unsloth_zoo.utils import Version
+
+from unsloth_zoo.tokenizer_utils import (
+ patch_tokenizer as _patch_tokenizer,
+)
+from unsloth_zoo.patching_utils import (
+ patch_compiling_bitsandbytes,
+ patch_layernorm,
+ patch_torch_compile,
+ patch_model_and_tokenizer,
+ patch_compiled_autograd,
+)
+from unsloth_zoo.gradient_checkpointing import (
+ Unsloth_Offloaded_Gradient_Checkpointer,
+ unsloth_offloaded_gradient_checkpoint,
+ patch_unsloth_gradient_checkpointing,
+ unpatch_unsloth_gradient_checkpointing,
+
+ Unsloth_Gradient_Checkpointer,
+ unsloth_gradient_checkpoint,
+ patch_gradient_checkpointing,
+ unpatch_gradient_checkpointing,
+
+ patch_unsloth_smart_gradient_checkpointing,
+ unpatch_unsloth_smart_gradient_checkpointing,
+ create_gradient_checkpointing_buffer,
+)
+from unsloth_zoo.loss_utils import (
+ HAS_CUT_CROSS_ENTROPY,
+ fused_linear_cross_entropy,
+)
+from unsloth_zoo.vision_utils import (
+ process_vision_info,
+)
+from unsloth_zoo.compiler import (
+ get_transformers_model_type,
+ unsloth_compile_transformers as _unsloth_compile_transformers,
+)
+
+# =============================================
+# Disable some warnings which can get annoying
+warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch")
+warnings.filterwarnings(action = "ignore", category = UserWarning, module = "huggingface_hub")
+warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "huggingface_hub")
+warnings.filterwarnings(action = "ignore", category = UserWarning, module = "trl")
+warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "trl")
+warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "xformers")
+warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "subprocess")
+warnings.filterwarnings(action = "ignore", category = UserWarning, module = "transformers")
+warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "accelerate")
+warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocessing")
+warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocess")
+
+# Stop "Special tokens have been added in the vocabulary, ..."
+import logging
+logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.CRITICAL+1)
+
+# Ignore logging messages
+class HideLoggingMessage(logging.Filter):
+ def __init__(self, text): self.text = text
+ def filter(self, x): return not (self.text in x.getMessage())
+pass
+
+# Hides transformers' "The speedups for torchdynamo mostly come with GPU Ampere or higher" warning.
+from transformers.training_args import logger as transformers_training_args_logger
+transformers_training_args_logger.addFilter(HideLoggingMessage("The speedups"))
+del transformers_training_args_logger
+
+# Using the default loss: `ForCausalLMLoss`.
+try:
+ from transformers.modeling_utils import logger as transformers_modeling_utils_logger
+ transformers_modeling_utils_logger.addFilter(HideLoggingMessage("ForCausalLMLoss"))
+ del transformers_modeling_utils_logger
+except:
+ pass
+
+# The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
+try:
+ from accelerate.utils.modeling import logger as accelerate_utils_modeling_logger
+ accelerate_utils_modeling_logger.addFilter(HideLoggingMessage("The model weights are not tied"))
+ del accelerate_utils_modeling_logger
+except:
+ pass
+
+# Setting `pad_token_id` to `eos_token_id`
+try:
+ from transformers.generation.utils import logger as transformers_generation_utils_logger
+ transformers_generation_utils_logger.addFilter(HideLoggingMessage("Setting `pad_token_id` to `eos_token_id`"))
+ del transformers_generation_utils_logger
+except:
+ pass
+
+# =============================================
+
+# =============================================
+# Edits all Config files to enable RoPE Scaling for all models
+
+# Transformers had to update for Mistral Nemo 12b since Attention is (5120, 4096) now.
+def patch_mistral_nemo_config(config):
+ if "head_dim (" not in config:
+ add_head_dim = "If it is not specified, will default to `8`.\n"\
+ " head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):\n"\
+ " The attention head dimension."
+ config = config.replace("If it is not specified, will default to `8`.", add_head_dim)
+
+ add_head_dim = "num_key_value_heads=8,\n head_dim=None,"
+ config = config.replace("num_key_value_heads=8,", add_head_dim)
+
+ add_head_dim = "self.sliding_window = sliding_window\n self.head_dim = head_dim or hidden_size // num_attention_heads\n"
+ config = config.replace("self.sliding_window = sliding_window", add_head_dim)
+ pass
+ return config
+pass
+
+from transformers import __version__ as transformers_version
+from transformers import PretrainedConfig
+model_architectures = ["llama", "mistral", "gemma", "gemma2", "qwen2", "granite"]
+
+for model_name in model_architectures:
+ config_filepath = f"transformers.models.{model_name}.configuration_{model_name}"
+ model_filepath = f"transformers.models.{model_name}.modeling_{model_name}"
+ config_filename = f"{model_name.title()}Config"
+ exec(f"from {config_filepath} import {config_filename}", globals())
+
+ try:
+ config = inspect.getsource(eval(config_filename))
+ except:
+ continue
+ if "rope_scaling" in config: continue
+ config = re.sub(
+ r"(\*\*kwargs)[\s]{0,}\,[\s]{0,}\)[\s]{0,}\:",
+ r"rope_scaling=None,"\
+ r"\n **kwargs):\n"\
+ r"\n self.rope_scaling = rope_scaling\n",
+ config,
+ )
+
+ # Just for Mistral Nemo
+ if model_name == "mistral":
+ if Version(transformers_version) <= Version("4.42.4"):
+ config = patch_mistral_nemo_config(config)
+ pass
+
+ exec(config, globals())
+ exec(f"import {config_filepath}", globals())
+ exec(f"{config_filepath}.{config_filename} = {config_filename}", globals())
+pass
+# =============================================
+
+# =============================================
+# torch.cuda.amp.custom_fwd is deprecated >= 2.4
+torch_version = torch.__version__
+if Version(torch_version) < Version("2.4.0"):
+ torch_amp_custom_fwd = torch.cuda.amp.custom_fwd
+ torch_amp_custom_bwd = torch.cuda.amp.custom_bwd
+else:
+ torch_amp_custom_fwd = torch.amp.custom_fwd(device_type = "cuda")
+ torch_amp_custom_bwd = torch.amp.custom_bwd(device_type = "cuda")
+pass
+# =============================================
+
+# =============================================
+# Fix KeyError: 'Cache only has 0 layers, attempted to access layer with index 0'
+import transformers.cache_utils
+if hasattr(transformers.cache_utils, "DynamicCache") and \
+ transformers.cache_utils.DynamicCache.__getitem__.__name__ != "__cache_utils_getitem__":
+
+ source = inspect.getsource(transformers.cache_utils.DynamicCache.__getitem__)
+ start = source.find("def")
+ spaces = start*" "
+ source = source.split("\n")
+ source = "\n".join(x[start:] for x in source)
+ where = source.find("raise KeyError")
+ source = source[:where] + \
+ f"if len(self) == 0:\n{spaces}{spaces}"\
+ " raise RuntimeError('Unsloth: You must call `FastLanguageModel.for_inference(model)` before doing inference for Unsloth models.')\n" + \
+ f"{spaces}{spaces}else:\n{spaces}{spaces}{spaces}" + source[where:]
+ source = source.replace("__getitem__", "__cache_utils_getitem__", 1)
+ exec(source)
+ transformers.cache_utils.DynamicCache.__getitem__ = __cache_utils_getitem__
+pass
+# =============================================
+
+# =============================================
+# Weird Databricks errors
+from transformers.utils import is_openai_available
+if is_openai_available():
+ try:
+ from openai import OpenAI
+ except:
+ print("Unsloth: OpenAI failed to import - ignoring for now.")
+ import transformers.utils
+ def _is_openai_available(): return False
+ transformers.utils.is_openai_available = _is_openai_available
+ pass
+pass
+
+# =============================================
+# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
+import bitsandbytes as bnb
+from transformers import AutoTokenizer
+from transformers.utils.import_utils import _is_package_available
+
+major_version, minor_version = torch.cuda.get_device_capability()
+SUPPORTS_BFLOAT16 = False
+HAS_FLASH_ATTENTION = False
+HAS_FLASH_ATTENTION_SOFTCAPPING = False
+
+if major_version >= 8:
+ SUPPORTS_BFLOAT16 = True
+ if _is_package_available("flash_attn"):
+ # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl"
+ try:
+ from flash_attn.flash_attn_interface import flash_attn_cuda
+ HAS_FLASH_ATTENTION = True
+
+ # Also check for softcapping
+ from flash_attn import __version__ as flash_attn_version
+ HAS_FLASH_ATTENTION_SOFTCAPPING = Version(flash_attn_version) >= Version("2.6.3")
+ if not HAS_FLASH_ATTENTION_SOFTCAPPING:
+ print(
+ "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\
+                    "Newer versions provide faster, more memory-efficient kernels for Gemma 2's attention softcapping!\n"\
+ "To update flash-attn, do the below:\n"\
+ '\npip install --no-deps --upgrade "flash-attn>=2.6.3"'
+ )
+ except:
+ print(
+ "Unsloth: Your Flash Attention 2 installation seems to be broken?\n"\
+                "A possible explanation is that you have a new CUDA version which isn't\n"\
+                "compatible with FA2 yet - please file a ticket with Unsloth or FA2.\n"\
+                "We will now use Xformers instead, which has no performance hit!\n"\
+                "We verified the negligible impact by benchmarking on 1x A100."
+ )
+
+ # Stop Flash Attention from importing!
+ import transformers.utils.import_utils
+ transformers.utils.import_utils.is_flash_attn_2_available = lambda *args, **kwargs: False
+ import transformers.utils
+ transformers.utils.is_flash_attn_2_available = lambda *args, **kwargs: False
+
+ HAS_FLASH_ATTENTION = False
+ pass
+ else:
+ HAS_FLASH_ATTENTION = False
+else:
+ # Tri Dao's benchmark shows xformers is faster for now.
+ HAS_FLASH_ATTENTION = False
+pass
+
+from transformers.models.llama.modeling_llama import logger
+
+# =============================================
+# Get Xformers
+try:
+ from xformers import __version__ as xformers_version
+ # Temporarily disable 0.0.27 and higher - inference issues
+ if False: #Version(xformers_version) >= Version("0.0.27"):
+ raise ImportError(
+ "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\
+ "then press Disconnect Runtime and then Restart it.\n"\
+ "\n"\
+ "%%capture\n"
+ "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n"
+ '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n'
+ '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\
+ '\n'\
+ f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\
+            'Please downgrade xformers via `pip install --force-reinstall "xformers<=0.0.27"`'
+ )
+ pass
+
+ if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"):
+ raise ImportError(
+ f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\
+ f"Please install xformers < 0.0.24 for torch = {torch_version}."
+ )
+ elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"):
+ raise ImportError(
+ f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\
+ f"Please install xformers < 0.0.26 for torch = {torch_version}."
+ )
+ elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) > Version("0.0.27"):
+ raise ImportError(
+ f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\
+ f"Please install xformers <= 0.0.27 for torch = {torch_version}."
+ )
+ pass
+
+ from xformers._cpp_lib import _register_extensions
+ try:
+ _register_extensions() # Check if C++ modules are loaded correctly
+ except Exception as error:
+ raise ImportError(
+ "Unsloth: Xformers was not installed correctly.\n"\
+ "Please install xformers separately first.\n"\
+ "Then confirm if it's correctly installed by running:\n"\
+ "python -m xformers.info\n\n"
+ "Longer error message:\n" + str(error)
+ )
+ pass
+ import xformers.ops.fmha as xformers
+ xformers_attention = xformers.memory_efficient_attention
+except:
+ xformers = None
+ xformers_attention = None
+ xformers_version = None
+pass
+
+# Check TRL version
+from trl import __version__ as trl_version
+# Unsloth now supports all TRL versions!
+if False:#Version(trl_version) >= Version("0.9.0"):
+ raise ImportError(
+ "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\
+ "then press Disconnect Runtime and then Restart it.\n"\
+ "\n"\
+ "%%capture\n"
+ "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n"
+ '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n'
+ '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\
+ '\n'\
+ f"Otherwise in local machines, your TRL version of {trl_version} is too new.\n"\
+        'Please downgrade TRL via `pip install --force-reinstall trl`'
+ )
+pass
+
+# =============================================
+# Fix new Xformers versions TypeError: Multiple dispatch failed for 'torch._ops.aten.to.dtype_layout'
+accelerate_old_send_to_device = None
+accelerate_new_send_to_device = None
+if xformers_version is not None and Version(xformers_version) >= Version("0.0.27"):
+ import accelerate.utils.operations
+ if hasattr(accelerate.utils.operations, "send_to_device") and \
+ accelerate.utils.operations.send_to_device.__name__ != "_fixed_send_to_device":
+ accelerate_old_send_to_device = accelerate.utils.operations.send_to_device
+ from accelerate.utils.operations import *
+ send_to_device = inspect.getsource(accelerate.utils.operations.send_to_device)
+ send_to_device = re.sub(
+ r"([ ]{4,})return tensor\.to\(device\)",
+ r"\1try: return tensor.to(device)\n\1except: return tensor",
+ send_to_device,
+ ).replace("def send_to_device", "def _fixed_send_to_device")
+ exec(send_to_device)
+ # accelerate.utils.operations.send_to_device = _fixed_send_to_device
+ accelerate_new_send_to_device = _fixed_send_to_device
+ pass
+pass
+
+# Transformers 4.46 breaks dynamic caching. This is a hack
+import transformers.generation.configuration_utils
+if hasattr(transformers.generation.configuration_utils, "ALL_CACHE_IMPLEMENTATIONS"):
+ if type(transformers.generation.configuration_utils.ALL_CACHE_IMPLEMENTATIONS) is list:
+ transformers.generation.configuration_utils.ALL_CACHE_IMPLEMENTATIONS.append("dynamic")
+ pass
+pass
+# =============================================
+
+# =============================================
+# Torch compile settings
+UNSLOTH_COMPILE_DEBUG = os.environ.get("UNSLOTH_COMPILE_DEBUG", "0") == "1"
+UNSLOTH_COMPILE_MAXIMUM = os.environ.get("UNSLOTH_COMPILE_MAXIMUM", "0") == "1"
+UNSLOTH_COMPILE_IGNORE_ERRORS = os.environ.get("UNSLOTH_COMPILE_IGNORE_ERRORS", "1") == "1"
+# Just remove max_autotune_gemm warning
+import functools
+@functools.lru_cache(None)
+def is_big_gpu(index):
+ sms = torch.cuda.get_device_properties(index).multi_processor_count
+ if sms < 80: # V100
+ # log.warning("not enough SMs to use max_autotune_gemm mode")
+ return False
+ return True
+import torch._inductor.utils
+torch._inductor.utils.is_big_gpu = is_big_gpu
+patch_torch_compile(
+ debug = UNSLOTH_COMPILE_DEBUG,
+ O3 = UNSLOTH_COMPILE_MAXIMUM,
+ ignore_errors = UNSLOTH_COMPILE_IGNORE_ERRORS,
+)
+
+torch_compile_options = {
+ "epilogue_fusion" : True,
+ "max_autotune" : True,
+ "shape_padding" : True,
+ "trace.enabled" : UNSLOTH_COMPILE_DEBUG,
+ "triton.cudagraphs" : False,
+}
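+# Usage sketch (illustrative): these options are forwarded verbatim to torch.compile, e.g.
+#   module = torch.compile(module, dynamic = True, fullgraph = False, options = torch_compile_options)
+# which is exactly what torch_compile_kwargs and patch_regional_compilation below do.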
+
+import accelerate
+def torch_compile_kwargs(*args, **kwargs):
+ print("Unsloth: Enabled auto compiling")
+ return {"dynamic" : True, "fullgraph" : False, "options" : torch_compile_options,}
+pass
+
+accelerate.utils.dataclasses.TorchDynamoPlugin.to_kwargs = torch_compile_kwargs
+accelerate.utils.TorchDynamoPlugin.to_kwargs = torch_compile_kwargs
+accelerate.accelerator.TorchDynamoPlugin.to_kwargs = torch_compile_kwargs
+del accelerate
+
+def patch_regional_compilation():
+ # Regional torch 2.5 Recompilation - weirdly very slow??
+ if torch.nn.ModuleList.__name__ == "UnslothModuleList": return
+ # Only works for torch 2.5
+ if Version(torch.__version__) < Version("2.5.0"): return
+
+ old_module_list = torch.nn.ModuleList
+ os.environ["UNSLOTH_PATCHED"] = "1"
+
+ def UnslothModuleList(*args, **kwargs):
+ if len(args) == 1 and len(kwargs) == 0 and type(args[0]) is list:
+ args = [old_module_list([torch.compile(x, dynamic = True, options = torch_compile_options, fullgraph = False) for x in args[0]])]
+ return old_module_list(*args, **kwargs)
+ pass
+ UnslothModuleList.__doc__ = old_module_list.__doc__
+
+ torch.nn.ModuleList = UnslothModuleList
+ return
+pass
+
+# =============================================
+
+def prepare_model_for_kbit_training(
+ model : Any,
+ use_gradient_checkpointing : Optional = True,
+ use_reentrant : Optional[bool] = True,
+) -> Any:
+ """
+ Calculates where to place the gradient checkpoints given n_layers.
+    We also freeze the gradients of all other layers.
+
+ Args:
+ model: Any LlamaModel with layers.
+ use_gradient_checkpointing (`bool`, *optional*):
+ Default enabled. Provides memory savings by not saving all activations,
+ but only some.
+ use_reentrant (`bool`, *optional*):
+ https://github.com/pytorch/pytorch/blob/main/torch/utils/checkpoint.py#L354
+ Optimal gradient checkpointing algorithm which will be the default in
+ future Pytorch versions.
+ """
+
+ # Freeze all parameters except LoRA
+ with torch.no_grad():
+ for name, param in model.named_parameters():
+ if ".lora_A." in name or ".lora_B." in name or ".lora_magnitude_vector" in name:
+ param.requires_grad_(True)
+ # Also must be in float32!
+ if param.dtype != torch.float32:
+ name = name.replace("base_model", "model", 1)
+ layer_number = re.search(r"\.[\d]{1,}\.", name).group(0)
+ name = name.replace(layer_number, f"[{layer_number[1:-1]}].")
+ name = name.replace(".weight", "", 1)
+ exec(f"{name}.to(torch.float32)")
+ pass
+ else:
+ param.requires_grad_(False)
+ pass
+ pass
+
+ # Gradient checkpointing!
+ if use_gradient_checkpointing == "unsloth":
+
+ # Saves VRAM!
+ original_model = model
+ while hasattr(original_model, "model"):
+ original_model._offloaded_gradient_checkpointing = True
+ original_model = original_model.model
+ pass
+ original_model._offloaded_gradient_checkpointing = True
+
+ model.gradient_checkpointing_enable()
+
+ elif use_gradient_checkpointing == True:
+ model.gradient_checkpointing_enable()
+ pass
+
+    # If use_reentrant = True (the PyTorch default), we just make the inputs require gradients.
+ if use_reentrant:
+ if hasattr(model, "enable_input_require_grads"):
+ model.enable_input_require_grads()
+ else:
+ def make_inputs_require_grad(module, input, output):
+ output.requires_grad_(True)
+ model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+ return model
+pass
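+# Usage sketch (illustrative, assuming a PEFT LoRA model is already attached):
+#   model = prepare_model_for_kbit_training(model, use_gradient_checkpointing = "unsloth")
+# Passing "unsloth" selects the offloaded gradient checkpointing path; passing True simply calls
+# the standard HF gradient_checkpointing_enable().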
+
+# =============================================
+# Weirdly LoraLayer.update_layer downcasts PEFT layers to float16??
+# For mixed precision, we need it to be in float32 not float16.
+from peft import __version__ as peft_version
+if Version(peft_version) < Version("0.12.0"):
+ from peft.tuners.lora.layer import LoraLayer
+ try:
+ source = inspect.getsource(LoraLayer.update_layer)
+ text = "if weight is not None:\n"
+ start = source.find(text) + len(text)
+ end = source.find("self.to(weight.device)", start)
+ spaces = re.findall(r"^([ ]{1,})break", source, flags = re.MULTILINE)[0]
+ source = source.replace(source[start : end], spaces)
+ spaces = len(re.match(r"[\s]{1,}", source).group(0))
+ lines = source.split("\n")
+ source = "\n".join(x[spaces:] for x in lines)
+        source = re.sub(r"([^\.])nn\.", r"\1torch.nn.", source)
+ source = source.replace("def update_layer", "def LoraLayer_update_layer")
+ exec(source, globals())
+
+ # Fix up incorrect downcasting of LoRA weights
+ from peft.tuners.lora.layer import LoraLayer
+ LoraLayer.update_layer = LoraLayer_update_layer
+ from peft.tuners.lora import LoraLayer
+ LoraLayer.update_layer = LoraLayer_update_layer
+ except:
+ logger.warning_once(
+            "Unsloth failed to patch LoraLayer.update_layer. Please file a bug report.\n"\
+ "Luckily, your training run will still work in the meantime!"
+ )
+ pass
+pass
+
+# =============================================
+
+import psutil
+def _get_statistics(statistics = None, force_download = True):
+ # We log some basic stats about which environment is being used.
+ # We simply download a README.md file from HF - all data is made public.
+ # This is simply so we can check if some envs are broken or not.
+ # You can disable this by commenting the below out
+ try:
+ n_cpus = psutil.cpu_count(logical = False)
+ keynames = "\n" + "\n".join(os.environ.keys())
+ if statistics is not None: pass
+ elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab"
+ elif "\nCOLAB_" in keynames: statistics = "colabpro"
+ elif "\nKAGGLE_" in keynames: statistics = "kaggle"
+ elif "\nRUNPOD_" in keynames: statistics = "runpod"
+ elif "\nAWS_" in keynames: statistics = "aws"
+ elif "\nAZURE_" in keynames: statistics = "azure"
+ # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp"
+ elif "\nINVOCATION_ID" in keynames: statistics = "lambda"
+ # else: statistics = "other"
+ else:
+ def try_vllm_check():
+ vendor_files = (
+ "/sys/class/dmi/id/product_version",
+ "/sys/class/dmi/id/bios_vendor",
+ "/sys/class/dmi/id/product_name",
+ "/sys/class/dmi/id/chassis_asset_tag",
+ "/sys/class/dmi/id/sys_vendor",
+ )
+ from pathlib import Path
+ for vendor_file in vendor_files:
+ path = Path(vendor_file)
+ if path.is_file():
+ file_content = path.read_text().lower()
+ if "amazon" in file_content: return "aws"
+ elif "microsoft corporation" in file_content: return "azure"
+ elif "google" in file_content: return "gcp"
+ return "other"
+ pass
+ try: statistics = try_vllm_check()
+ except: statistics = "other"
+ pass
+ if statistics is not None:
+ from transformers import AutoModelForCausalLM
+ stats_model = AutoModelForCausalLM.from_pretrained(
+ f"unslothai/{statistics}",
+ force_download = force_download,
+ )
+ del stats_model
+ pass
+ except:
+ pass
+pass
+
+
+def get_statistics():
+ # We log some basic stats about which environment is being used.
+ # We simply download a README.md file from HF - all data is made public.
+ # This is simply so we can check if some envs are broken or not.
+ # You can disable this by setting UNSLOTH_DISABLE_STATISTICS
+ import os
+ if "UNSLOTH_DISABLE_STATISTICS" in os.environ: return
+ from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled
+ disabled = False
+ if not are_progress_bars_disabled():
+ disable_progress_bars()
+ disabled = True
+ pass
+ _get_statistics(None)
+ _get_statistics("repeat", force_download = False)
+ try:
+ vram = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024
+ if vram <= 8 : vram = 8
+ elif vram <= 16: vram = 16
+ elif vram <= 20: vram = 20
+ elif vram <= 24: vram = 24
+ elif vram <= 40: vram = 40
+ elif vram <= 48: vram = 48
+ elif vram <= 80: vram = 80
+ else: vram = 96
+ _get_statistics(f"vram-{vram}")
+ except:
+ pass
+ pass
+ try:
+ devices = torch.cuda.device_count()
+ _get_statistics(f"{devices if devices <= 8 else 9}")
+ except:
+ pass
+ if disabled: enable_progress_bars()
+pass
+
+
+# =============================================
+# Fixes BitsAndBytesConfig to remove warnings about unused kwargs
+from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod
+from inspect import getsource
+from accelerate.utils.dataclasses import DistributedType
+BitsAndBytesConfig__init__ = getsource(BitsAndBytesConfig.__init__)
+BitsAndBytesConfig__init__ = re.sub(
+ r"if[\s]{1,}kwargs\:[\s]{1,}.+?\n",
+ "",
+ BitsAndBytesConfig__init__,
+ flags = re.MULTILINE,
+)
+BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.split("\n")
+length_spaces = len(re.match(r"[\s]{1,}", BitsAndBytesConfig__init__[0]).group(0))
+BitsAndBytesConfig__init__ = "\n".join(x[length_spaces:] for x in BitsAndBytesConfig__init__)
+BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.replace(
+ "__init__",
+ "_BitsAndBytesConfig__init__",
+)
+
+def _prepare_backend(
+ self, cpu = False, sagemaker_dp = False, backend: str = None,
+) -> tuple[str, DistributedType]:
+ return None, DistributedType.NO
+pass
+import accelerate.state
+accelerate.state.PartialState._prepare_backend = _prepare_backend
+
+import accelerate.accelerator
+prepare = inspect.getsource(accelerate.accelerator.Accelerator.prepare)
+prepare = prepare.split("\n")
+spaces = prepare[0].find("def")
+prepare = "\n".join(x[spaces:] for x in prepare)
+x = "for obj in args:"
+s = " "*spaces
+prepare = prepare.replace(x, f'self.state.distributed_type = DistributedType.NO\n{s}{x}', 1)
+exec(prepare, globals())
+accelerate.accelerator.Accelerator.prepare = prepare
+
+exec(BitsAndBytesConfig__init__, globals())
+
+import transformers.utils.quantization_config
+transformers.utils.quantization_config.BitsAndBytesConfig.__init__ = _BitsAndBytesConfig__init__
+# =============================================
+
+# Offloading to disk for modules (lm_head, embed_tokens)
+import pickle
+
+def offload_to_disk(W, model, name, temporary_location : str = "_unsloth_temporary_saved_buffers"):
+ file_location = os.path.join(temporary_location, model.config._name_or_path)
+ if not os.path.exists(file_location):
+ os.makedirs(file_location)
+ pass
+
+ filename = os.path.join(file_location, f"{name}.pt")
+ W = W.weight if hasattr(W, "weight") else W
+ torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,)
+ offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)
+ offloaded_W._offloaded_file_location = filename
+ return offloaded_W
+pass
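+# Note: torch.load(..., mmap = True) returns a tensor whose storage is memory-mapped from the
+# saved file, so the offloaded weight is paged in from disk on access rather than held in RAM.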
+
+
+def offload_input_embeddings(model, temporary_location : str = "_unsloth_temporary_saved_buffers"):
+ offloaded_W = offload_to_disk(model.get_input_embeddings(), model, "input_embeddings", temporary_location)
+ new_input_embeddings = torch.nn.Embedding.from_pretrained(offloaded_W)
+ new_input_embeddings._offloaded_file_location = offloaded_W._offloaded_file_location
+ model.set_input_embeddings(new_input_embeddings)
+ return
+pass
+
+
+def offload_output_embeddings(model, temporary_location : str = "_unsloth_temporary_saved_buffers"):
+ offloaded_W = offload_to_disk(model.get_output_embeddings(), model, "output_embeddings", temporary_location)
+
+ new_output_embeddings = torch.nn.Linear(1, 1, bias = None)
+ del new_output_embeddings.weight
+ new_output_embeddings.weight = offloaded_W
+ new_output_embeddings.in_features = offloaded_W.shape[1]
+ new_output_embeddings.out_features = offloaded_W.shape[0]
+
+ new_output_embeddings._offloaded_file_location = offloaded_W._offloaded_file_location
+ model.set_output_embeddings(new_output_embeddings)
+ return
+pass
+
+
+# Fixes a weird Torch 2.3 bug which says T4s have bfloat16
+def is_bfloat16_supported():
+ return SUPPORTS_BFLOAT16
+pass
+
+
+# Patches models to add RoPE Scaling
+def patch_linear_scaling(
+ model_name = "gemma2",
+ rope_module = None,
+ scaled_rope_module = None,
+ attention_module = None,
+):
+ assert(rope_module is not None and scaled_rope_module is not None)
+ assert(attention_module is not None)
+
+ rope_name = rope_module.__name__
+ scaled_rope_name = scaled_rope_module.__name__
+ model_filepath = f"transformers.models.{model_name}.modeling_{model_name}"
+ exec_code = \
+ f"import torch.nn as nn\n"\
+ f"from typing import Union, Optional, List, Any, Callable, Tuple\n"\
+ f"from {model_filepath} import logger, "\
+ f"{model_name.title()}Attention, {model_name.title()}Config"
+
+ try:
+ function = inspect.getsource(attention_module.__init__)
+ except:
+ # Most likely already patched!
+ return None, None
+ where = function.find("def")
+ function = function.split("\n")
+ function = "\n".join(x[where:] for x in function)
+ init_name = f"{model_name.title()}Attention__init__"
+ function = function.replace("def __init__", f"def {init_name}")
+ function = function.replace(
+ "super().__init__()",
+ f"super({model_name.title()}Attention, self).__init__()",
+ )
+ fix_rope_function = """
+ if getattr(self.config, "rope_scaling", None) is None:
+ self.rotary_emb = {rope_function}(
+ dim = self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = {scaled_rope_function}(
+ dim = self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}")
+ pass
+ """
+ fix_rope_function = fix_rope_function.format(
+ rope_function = rope_module.__name__,
+ scaled_rope_function = scaled_rope_module.__name__,
+ )
+ rotary_emb = re.findall(
+        r"self.rotary_emb = .+?\)", function,
+ flags = re.DOTALL | re.MULTILINE,
+ )
+ if len(rotary_emb) == 0: return None, function
+ rotary_emb = rotary_emb[0]
+ function = function.replace(rotary_emb, fix_rope_function, 1)
+ function = exec_code + "\n\n" + function
+ return init_name, function
+pass
+
+
+# Patches for Llama-3 LlamaExtendedRotaryEmbedding
+def patch_llama_rope_scaling(
+ model_name = "llama",
+ rope_module = None,
+ scaled_rope_module = None,
+ extended_rope_module = None,
+ attention_module = None,
+ longrope_module = None,
+):
+ assert(\
+ rope_module is not None and \
+ scaled_rope_module is not None and \
+ extended_rope_module is not None
+ )
+ assert(attention_module is not None)
+
+ rope_name = rope_module.__name__
+ scaled_rope_name = scaled_rope_module.__name__
+ model_filepath = f"transformers.models.{model_name}.modeling_{model_name}"
+ exec_code = \
+ f"import torch.nn as nn\n"\
+ f"from typing import Union, Optional, List, Any, Callable, Tuple\n"\
+ f"from {model_filepath} import logger, "\
+ f"{model_name.title()}Attention, {model_name.title()}Config"
+
+ try:
+ function = inspect.getsource(attention_module.__init__)
+ except:
+ # Most likely already patched!
+ return None, None
+ where = function.find("def")
+ function = function.split("\n")
+ function = "\n".join(x[where:] for x in function)
+ init_name = f"{model_name.title()}Attention__init__"
+ function = function.replace("def __init__", f"def {init_name}")
+ function = function.replace(
+ "super().__init__()",
+ f"super({model_name.title()}Attention, self).__init__()",
+ )
+ fix_rope_function = """
+ if getattr(self.config, "rope_scaling", None) is None:
+ self.rotary_emb = {rope_function}(
+ dim = self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type1 = self.config.rope_scaling.get("type", None)
+ scaling_type2 = self.config.rope_scaling.get("rope_type", None)
+ scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2
+ scaling_factor = self.config.rope_scaling.get("factor")
+
+ if scaling_type == "linear":
+ self.rotary_emb = {scaled_rope_function}(
+ dim = self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "llama3":
+ self.rotary_emb = {extended_rope_function}(
+ dim = self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "longrope":
+ self.rotary_emb = {longrope_rope_function}(
+ dim = self.head_dim,
+ max_position_embeddings = self.max_position_embeddings,
+ original_max_position_embeddings = self.config.original_max_position_embeddings,
+ base = self.rope_theta,
+ short_factor = self.config.rope_scaling['short_factor'],
+ long_factor = self.config.rope_scaling['long_factor' ],
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}")
+ pass
+ """
+
+ fix_rope_function = fix_rope_function.format(
+ rope_function = rope_module.__name__,
+ scaled_rope_function = scaled_rope_module.__name__,
+ extended_rope_function = extended_rope_module.__name__,
+ longrope_rope_function = \
+ (longrope_module if longrope_module is not None else rope_module).__name__
+ )
+ rotary_emb = re.findall(
+        r"self.rotary_emb = .+?\)", function,
+ flags = re.DOTALL | re.MULTILINE,
+ )
+ if len(rotary_emb) == 0: return None, function
+ rotary_emb = rotary_emb[0]
+ function = function.replace(rotary_emb, fix_rope_function, 1)
+ function = exec_code + "\n\n" + function
+ return init_name, function
+pass
+
+
+def check_nvidia():
+ # Unsloth doesn't work yet on AMD devices - we're working on it!
+ output = np.array([0,])
+ try:
+ output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
+ output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
+ output = np.array([int(x.decode('utf-8'))/1024 for x in output])
+ except:
+ if not torch.cuda.is_available():
+ raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
+ return output
+pass
+PRE_CHECK = check_nvidia()
+
+
+def create_boolean_mask(n = 4096, sliding_window = 2048):
+ # Creates a boolean mask for attention
+ mask = torch.ones(n, n, dtype = torch.bool)
+ if sliding_window == 0:
+ return torch.triu(mask, diagonal = 1, out = mask)
+ pass
+ torch.triu(mask, diagonal = 0, out = mask)
+ torch.triu(mask.T, diagonal = -sliding_window, out = mask.T)
+ mask = mask.T
+ torch.logical_not(mask, out = mask)
+ return mask
+pass
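+# Worked example (illustrative): create_boolean_mask(n = 4, sliding_window = 2) gives
+#   [[False,  True,  True,  True],
+#    [False, False,  True,  True],
+#    [False, False, False,  True],
+#    [ True, False, False, False]]
+# where True marks masked-out positions: future tokens, plus tokens more than `sliding_window`
+# steps in the past. sliding_window = 0 produces a plain causal mask.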
+
+
+def test_mask_creation():
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+ for n in range(2, 23):
+ for s in range(1, 23):
+ correct_mask = AttentionMaskConverter(
+ is_causal = True,
+ sliding_window = s,
+ ).to_causal_4d(1, n, n, dtype = torch.float16,).squeeze(0).squeeze(0)
+ correct_mask = (correct_mask == correct_mask.min())
+ our_mask = create_boolean_mask(n = n, sliding_window = s)
+ assert(torch.all(correct_mask == our_mask))
+ pass
+ correct_mask = AttentionMaskConverter(
+ is_causal = True,
+ sliding_window = None,
+ ).to_causal_4d(1, n, n, dtype = torch.float16,).squeeze(0).squeeze(0)
+ correct_mask = (correct_mask == correct_mask.min())
+ our_mask = create_boolean_mask(n = n, sliding_window = 0)
+ assert(torch.all(correct_mask == our_mask))
+ pass
+pass
+
+
+def _unsloth_get_batch_samples(self, epoch_iterator, num_batches):
+ batch_samples = []
+ num_items_in_batch = None
+
+ # Check if model allows **kwargs
+ model = self.model
+ f = model.base_model.model.forward if hasattr(model, "base_model") else model.forward
+ has_kwargs = tuple(inspect.signature(f).parameters.values())[-1].kind == inspect._VAR_KEYWORD
+
+ # Iterate to find all batches
+ for _ in range(num_batches):
+ try:
+ batch_samples += [next(epoch_iterator)]
+ except StopIteration:
+ break
+ pass
+
+ # Get num_items_in_batch
+ if has_kwargs and len(batch_samples) > 0 and "labels" in batch_samples[0]:
+ try:
+ num_items_in_batch = sum(
+ [(x["labels"][..., 1:] != -100).sum() for x in batch_samples]
+ )
+
+ if self.args.average_tokens_across_devices:
+ num_items_in_batch = self.accelerator.gather(num_items_in_batch).sum().item()
+
+ if torch.is_tensor(num_items_in_batch):
+ num_items_in_batch = num_items_in_batch.item()
+
+ except Exception as exception:
+ logger.warning_once(exception)
+ pass
+
+ return batch_samples, num_items_in_batch
+pass
+
+
+def _unsloth_pre_compute_loss(self, model, inputs, *args, **kwargs):
+ num_items_in_batch = None
+
+ if "num_items_in_batch" in kwargs:
+ num_items_in_batch = kwargs["num_items_in_batch"]
+ if num_items_in_batch is None:
+ # Remove it since the model does not support it!
+ kwargs.pop("num_items_in_batch")
+ elif "num_items_in_batch" not in inputs:
+ inputs["num_items_in_batch"] = num_items_in_batch
+ pass
+ pass
+
+ if num_items_in_batch is None:
+ name = (model.base_model.model if hasattr(model, "base_model") else model).__class__.__name__
+ logger.warning_once(
+ f"Unsloth: Not an error, but {name} does not accept `num_items_in_batch`.\n"\
+ "Using gradient accumulation will be very slightly less accurate.\n"\
+ "Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient"
+ )
+ pass
+ return self._old_compute_loss(model, inputs, *args, **kwargs)
+pass
+
+
+def patch_gradient_accumulation_fix(Trainer):
+ # Fixes gradient accumulation
+ import inspect
+ if hasattr(Trainer, "get_batch_samples"):
+ if Trainer.get_batch_samples.__name__ == "_unsloth_get_batch_samples": return
+ if \
+ not inspect.getsource(Trainer.get_batch_samples).strip()\
+ .endswith("return batch_samples, num_items_in_batch"):
+
+ raise NotImplementedError("Unsloth: Please make a Github issue immediately!!")
+ else:
+ if Trainer.get_batch_samples.__name__ != "_unsloth_get_batch_samples":
+ Trainer.get_batch_samples = _unsloth_get_batch_samples
+ pass
+
+ # Also fix passing in num_items_in_batch
+ if not hasattr(Trainer, "_old_compute_loss"):
+ Trainer._old_compute_loss = Trainer.compute_loss
+ Trainer.compute_loss = _unsloth_pre_compute_loss
+ pass
+ pass
+ else:
+ logger.warning_once(
+ "Unsloth: We fixed a gradient accumulation bug, "\
+ "but it seems like you don't have the latest transformers version!\n"\
+ "Please update transformers, TRL and unsloth via:\n"\
+ '`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`'
+ )
+ pass
+
+ # Also fix up loss scaling ie negate loss *= self.args.gradient_accumulation_steps
+ if Trainer.training_step.__name__ == "_unsloth_training_step": return
+ if "num_items_in_batch" not in inspect.signature(Trainer.training_step).parameters: return
+
+ function = inspect.getsource(Trainer.training_step)
+ where = function.find("def")
+ function = function.split("\n")
+ function = "\n".join(x[where:] for x in function)
+
+ # Import all variables that need importing
+ import transformers.trainer
+ items_in_trainer = dir(transformers.trainer)
+ good_items = []
+ for item in items_in_trainer:
+ # TODO: Support Deepspeed
+ if item.startswith(("deepspeed", "xm", "met", "smp")): continue
+ if item in function: good_items.append(item)
+ pass
+ exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals())
+
+    # Accelerate divides by self.args.gradient_accumulation_steps internally, so if we already
+    # summed the loss and divided beforehand, we have to undo that extra scaling here.
+ function = function.replace(
+ "loss *= self.args.gradient_accumulation_steps",
+ "if num_items_in_batch is not None: loss *= self.args.gradient_accumulation_steps",
+ )
+ function = function.replace("def training_step", "def _unsloth_training_step", 1)
+
+ # Fix 4.47.0 issue where num_items_in_batch was removed
+ # See https://github.com/huggingface/transformers/pull/35121
+ function = function.replace(
+ "if self.model_accepts_loss_kwargs:",
+ "if False:",
+ )
+
+ # Fix when num_items_in_batch is nothing
+ # https://github.com/huggingface/transformers/pull/35207
+ function = re.sub(
+ r"else:\n"\
+ r"([\s]{4,})self\.accelerator\.backward\(loss, \*\*kwargs\)\n"\
+ r"(.+?)if num_items_in_batch is None\:\n"\
+ r"(.+?)return loss\.detach\(\) \/ self\.args\.gradient_accumulation_steps",
+
+ "else:\n"\
+        "\\2if num_items_in_batch is None:\n"\
+        "\\3loss = loss / self.args.gradient_accumulation_steps\n"\
+        "\\1self.accelerator.backward(loss, **kwargs)",
+
+ function,
+ )
+
+ exec(function, globals())
+ Trainer.training_step = _unsloth_training_step
+pass
+
+
+def patch_tokenizer(model, tokenizer):
+ model, tokenizer = _patch_tokenizer(model, tokenizer)
+ if model is not None:
+ model.config.update({"unsloth_version" : __version__})
+ return model, tokenizer
+pass
+
+
+def patch_fast_lora():
+ import peft.tuners.lora.bnb
+ peft.tuners.lora.bnb.Linear4bit.forward = fast_lora_forward
+pass
+
+
+def unsloth_compile_transformers(
+ model_name,
+ token = None,
+ revision = None,
+ trust_remote_code = False,
+ sdpa_dynamic_mask = True,
+ sdpa_bool_masks = True,
+ sdpa_gqa_replace = True,
+ sdpa_dynamic_compile = True,
+ compile_attention = True,
+ disable_causal_masks = True,
+ compile_torch_modules = True,
+ compile_custom_modules = True,
+ compile_function_calls = True,
+ fuse_lm_head = True,
+ gradient_checkpointing = True,
+ manual_replacements = True,
+ fast_lora_forwards = True,
+ fast_residual_stream = True,
+ accurate_accumulation = True,
+ epilogue_fusion = True,
+ max_autotune = False,
+ shape_padding = True,
+ cudagraphs = False,
+ debug = False,
+ fullgraph = True,
+ import_from_cache = False,
+ disable = False,
+ return_logits = False,
+):
+ if Version(torch_version) < Version("2.4.0"):
+ print(
+            "="*30 + "\n" + \
+            "Unsloth: Unfortunately Unsloth vision and other newer optimized models need Torch 2.4 or later.\n"\
+            f"You have Torch version {torch_version}. Please upgrade your Torch version by visiting https://pytorch.org/\n"\
+            "Your models will not be optimized for now, but they will still work!"
+ )
+ return
+ pass
+
+ if disable: return
+
+ model_types = get_transformers_model_type(
+ model_name = model_name,
+ token = token,
+ revision = revision,
+ trust_remote_code = trust_remote_code,
+ )
+
+ for model_type in model_types:
+ _unsloth_compile_transformers(
+ model_type,
+ sdpa_dynamic_mask = sdpa_dynamic_mask,
+ sdpa_bool_masks = sdpa_bool_masks,
+ sdpa_gqa_replace = sdpa_gqa_replace,
+ sdpa_dynamic_compile = sdpa_dynamic_compile,
+ compile_attention = compile_attention,
+ disable_causal_masks = disable_causal_masks,
+ compile_torch_modules = compile_torch_modules,
+ compile_custom_modules = compile_custom_modules,
+ compile_function_calls = compile_function_calls,
+ fuse_lm_head = fuse_lm_head,
+ gradient_checkpointing = gradient_checkpointing,
+ manual_replacements = manual_replacements,
+ fast_lora_forwards = fast_lora_forwards,
+ fast_residual_stream = fast_residual_stream,
+ accurate_accumulation = accurate_accumulation,
+ epilogue_fusion = epilogue_fusion,
+ max_autotune = max_autotune,
+ shape_padding = shape_padding,
+ cudagraphs = cudagraphs,
+ debug = debug,
+ fullgraph = fullgraph,
+ import_from_cache = import_from_cache,
+ disable = disable,
+ return_logits = return_logits,
+ )
+ pass
+ return model_types
+pass
+
+# We need an empty logits flag to warn people logits will not be returned anymore unless asked ie
+# os.environ['UNSLOTH_RETURN_LOGITS'] = '1'
+LOGITS_ERROR_STRING = \
+ "Unsloth: Logits are empty from 2024.11 onwards. To get raw logits again, please "\
+    'set the environment variable `UNSLOTH_RETURN_LOGITS` to "1" BEFORE starting to train, i.e. before `trainer.train()`. For example:\n\n'\
+ "import os\n"\
+ "os.environ['UNSLOTH_RETURN_LOGITS'] = '1'\n"\
+ "... trainer.train() ..."
+
+def raise_logits_error(*args, **kwargs): raise NotImplementedError(LOGITS_ERROR_STRING)
+def return_none(*args, **kwargs): return None
+class EmptyLogits:
+ def __init__(self): return
+ def raise_getattr_error(self, attr): return return_none if attr == "to" else raise_logits_error
+ __getitem__ = raise_logits_error
+ __getattr__ = raise_getattr_error
+ def __repr__(self): return LOGITS_ERROR_STRING
+ def __str__ (self): return LOGITS_ERROR_STRING
+pass
+EMPTY_LOGITS = EmptyLogits()
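+# Behaviour sketch: EMPTY_LOGITS[...] raises NotImplementedError with the message above, and
+# attribute access returns a function that raises when called; `.to` is special-cased so that
+# `.to(...)` silently returns None instead of erroring.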
+functions = dir(torch.Tensor)
+for j, function in enumerate(functions):
+ if function.startswith("__") and function.endswith("__"):
+ exec(f"def raise_{j}(*args, **kwargs): print('{function}')", globals(), locals())
+ try: exec(f"EMPTY_LOGITS.{function} = raise_{j}", globals(), locals())
+ except: continue
+pass
diff --git a/unsloth-main/unsloth/models/cohere.py b/unsloth-main/unsloth/models/cohere.py
new file mode 100644
index 0000000000000000000000000000000000000000..1610949f64b1f7db40bac571caed23ffe98a36cf
--- /dev/null
+++ b/unsloth-main/unsloth/models/cohere.py
@@ -0,0 +1,474 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .llama import *
+from ._utils import __version__
+try:
+ from transformers.models.cohere.modeling_cohere import (
+ CohereAttention,
+ CohereDecoderLayer,
+ CohereModel,
+ CohereForCausalLM,
+ CohereRotaryEmbedding,
+ apply_rotary_pos_emb,
+ repeat_kv,
+ )
+except:
+ from packaging.version import Version
+ transformers_version = Version(transformers_version)
+ if not transformers_version >= Version("4.42"):
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support Cohere.\n"\
+ f"The minimum required version is 4.42.3.\n"\
+ f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ pass
+pass
+
+from transformers.modeling_attn_mask_utils import (
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+# For Pytorch 2.1.1
+try:
+ from transformers.models.cohere.modeling_cohere import (
+ CohereSdpaAttention,
+ CohereFlashAttention2,
+ )
+except:
+ CohereSdpaAttention = CohereAttention
+ CohereFlashAttention2 = CohereAttention
+pass
+
+
+def fast_layernorm_inference(self, X, out_weight = None):
+ XX = X.to(torch.float32, copy = True)
+ XX -= X.mean(-1, keepdim = True)
+ variance = XX.square().mean(-1, keepdim = True)
+ variance += self.variance_epsilon
+ XX *= variance.rsqrt_()
+ out_weight[:] = self.weight
+ XX *= out_weight
+ return XX.to(X.dtype)
+pass
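+
+
+# Hedged reference sketch (illustrative only): the inference path above should match an
+# eager float32 LayerNorm without bias. `_TinyLayerNorm` is a hypothetical stand-in for a
+# module carrying `weight` and `variance_epsilon`, as `fast_layernorm_inference` expects.
+def _check_fast_layernorm_inference(dim = 8, eps = 1e-5):
+    class _TinyLayerNorm(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.weight = torch.nn.Parameter(torch.ones(dim, dtype = torch.float16))
+            self.variance_epsilon = eps
+    layernorm  = _TinyLayerNorm().to("cuda:0")
+    X          = torch.randn(2, 3, dim, dtype = torch.float16, device = "cuda:0")
+    out_weight = torch.empty(dim, dtype = torch.float32, device = "cuda:0")
+    fast = fast_layernorm_inference(layernorm, X, out_weight)
+    slow = torch.nn.functional.layer_norm(
+        X.float(), (dim,), layernorm.weight.float(), None, eps,
+    ).to(X.dtype)
+    assert torch.allclose(fast, slow, atol = 1e-2)
+pass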
+
+
+# QK norm in Cohere
+def CohereAttention_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ *args, **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+
+ # Clear inference
+ if hasattr(self, "paged_attention"):
+ del self.paged_attention_K
+ del self.paged_attention_V
+ del self.paged_attention
+ del self.temp_QA
+ del self.temp_KV
+ del self.RH_Q
+ del self.attention
+ del self.q_norm_out_weight
+ del self.k_norm_out_weight
+ pass
+
+ bsz, q_len, _ = hidden_states.size()
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ assert(n_kv_heads * n_groups == n_heads)
+
+ Q, K, V = self.apply_qkv(self, hidden_states)
+ Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+ K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+ V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+ if self.use_qk_norm:
+ Q = fast_layernorm_compiled(self.q_norm, Q)
+ K = fast_layernorm_compiled(self.k_norm, K)
+ pass
+
+ kv_seq_len = K.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ cos, sin = position_embeddings
+ if position_ids is None:
+ Q, K = fast_rope_embedding(Q, K, cos, sin)
+ else:
+ cos, sin = cos[position_ids], sin[position_ids]
+ Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids)
+ pass
+
+ if past_key_value is not None:
+ K = torch.cat([past_key_value[0], K], dim = 2)
+ V = torch.cat([past_key_value[1], V], dim = 2)
+ pass
+ past_key_value = (K, V) if use_cache else None
+
+ # Attention module
+ if (not HAS_FLASH_ATTENTION and attention_mask is None):
+ # Xformers memory efficient attention
+ # Also has Flash Attention v2 dispatching
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+
+ # Group query attention
+ if n_groups != 1:
+ K = K .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ V = V .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ K = K.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
+ V = V.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
+ if hidden_states.requires_grad:
+ K = K.reshape(bsz, kv_seq_len, n_heads, head_dim)
+ V = V.reshape(bsz, kv_seq_len, n_heads, head_dim)
+ else:
+ Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
+ pass
+ A = xformers_attention(Q, K, V, attn_bias = causal_mask)
+ A = A.view(bsz, q_len, n_heads, head_dim)
+
+ elif HAS_FLASH_ATTENTION and attention_mask is None:
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+ A = flash_attn_func(Q, K, V, causal = True)
+ else:
+ # Grouped query attention
+ if n_groups != 1:
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ K = K.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ V = V.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ pass
+ # Must be contiguous or else the results are wrong!
+ # https://github.com/pytorch/pytorch/issues/112577
+ Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
+ # Needs (batch_size, n_heads, seq_len, head_dim)
+ # is_causal and attention_mask must not both be set!
+ A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, is_causal = False)
+ # Go back to (batch_size, seq_len, n_heads, head_dim)
+ A = A.transpose(1, 2).contiguous()
+ pass
+ attn_output = A.reshape(bsz, q_len, n_heads*head_dim)
+ attn_output = self.apply_o(self, attn_output)
+ attn_weights = None
+ return attn_output, attn_weights, past_key_value
+pass
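+
+
+# Hedged sketch (illustrative only): the grouped-query expand/reshape used above is meant
+# to be equivalent to transformers' `repeat_kv`, repeating each KV head `n_groups` times
+# along the head axis before attention.
+def _repeat_kv_reference(K, n_groups):
+    # K: (bsz, n_kv_heads, seq_len, head_dim) -> (bsz, n_kv_heads * n_groups, seq_len, head_dim)
+    bsz, n_kv_heads, seq_len, head_dim = K.shape
+    K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, seq_len, head_dim)
+    return K.reshape(bsz, n_kv_heads * n_groups, seq_len, head_dim)
+pass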
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
+def CohereDecoderLayer_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ *args, **kwargs,
+):
+ if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None:
+ out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = "cuda:0")
+
+ # Self Attention
+ residual = hidden_states
+ hidden_states = fast_layernorm_inference(self.input_layernorm, hidden_states, out_weight)
+ hidden_states_attention, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ )
+
+ # Fully Connected
+ hidden_states_mlp = fast_swiglu_inference(self.mlp, hidden_states)
+ residual += hidden_states_attention
+ residual += hidden_states_mlp
+ hidden_states = residual
+ else:
+ residual = hidden_states
+ hidden_states = fast_layernorm_compiled(self.input_layernorm, hidden_states)
+ hidden_states_attention, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ )
+
+ # Fully Connected
+ hidden_states_mlp = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states_attention + hidden_states_mlp
+ pass
+
+ outputs = (hidden_states,)
+ if output_attentions: outputs += (self_attn_weights,)
+ if use_cache: outputs += (present_key_value,)
+ return outputs
+pass
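+
+
+# Hedged sketch (illustrative only) of the parallel residual stream handled above: Cohere applies
+# a single input LayerNorm, feeds the result to both the attention and MLP branches, and adds both
+# branch outputs back onto the untouched residual. `attention_branch` and `mlp_branch` are generic
+# stand-ins for the attention and MLP calls made above.
+def _cohere_parallel_residual_reference(hidden_states, input_layernorm, attention_branch, mlp_branch):
+    residual = hidden_states
+    normed   = input_layernorm(hidden_states)
+    return residual + attention_branch(normed) + mlp_branch(normed)
+pass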
+
+
+from math import sqrt as math_sqrt
+KV_CACHE_INCREMENT = 256 # KV Cache update size
+torch_nn_functional_softmax = torch.nn.functional.softmax
+torch_matmul = torch.matmul
+
+def CohereAttention_fast_forward_inference(
+ self,
+ hidden_states: torch.Tensor,
+ past_key_value: Optional[Tuple[torch.Tensor]],
+ position_ids,
+ do_prefill = False,
+ attention_mask = None,
+):
+ Xn = hidden_states
+ bsz, _, hd = hidden_states.size()
+ K1, V1 = past_key_value
+ dtype = Xn.dtype
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ attention_size = n_heads*head_dim
+ # assert(n_kv_heads * n_groups == n_heads)
+ seq_len = K1.shape[-2]
+ kv_seq_len = seq_len + 1
+
+ # Prefill phase
+ # if not hasattr(self, "paged_attention"):
+ if do_prefill:
+ self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda:0")
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3)
+ self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3)
+ self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0")
+ self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0")
+ self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+
+ # Some models (e.g. Mistral Nemo 12b) have n_heads * head_dim != hidden_size
+ if attention_size != self.hidden_size:
+ self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
+ else:
+ self.temp_O = self.temp_QA[1][:,:,:self.hidden_size]
+ pass
+
+ self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
+ self.scalar = 1.0 / math_sqrt(self.head_dim)
+ self.half_head_dim = head_dim // 2
+ # Cohere has QK layernorms
+ if self.use_qk_norm:
+ self.q_norm_out_weight = torch.empty(self.q_norm.weight.shape, dtype = torch.float32, device = "cuda:0")
+ self.k_norm_out_weight = torch.empty(self.k_norm.weight.shape, dtype = torch.float32, device = "cuda:0")
+ else:
+ self.q_norm_out_weight = None
+ self.k_norm_out_weight = None
+ pass
+ elif kv_seq_len >= self.paged_attention.shape[0]:
+ self.paged_attention.resize_((self.paged_attention.shape[0]+KV_CACHE_INCREMENT, 2, bsz, n_kv_heads, head_dim))
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.attention.resize_((bsz, n_heads, 1, self.attention.shape[-1]+KV_CACHE_INCREMENT))
+ pass
+
+ Qn = fast_linear_forward(self.q_proj, Xn, out = self.temp_QA[0])
+ Kn = fast_linear_forward(self.k_proj, Xn, out = self.temp_KV[0])
+ Vn = fast_linear_forward(self.v_proj, Xn, out = self.temp_KV[1])
+ Qn = Qn.view(bsz, 1, n_heads, head_dim).transpose(1, 2)
+ Kn = Kn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+ Vn = Vn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+ if self.use_qk_norm:
+ Qn = fast_layernorm_inference(self.q_norm, Qn, self.q_norm_out_weight)
+ Kn = fast_layernorm_inference(self.k_norm, Kn, self.k_norm_out_weight)
+ pass
+
+ # cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
+ # Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
+ cos, sin = self.rotary_emb.get_cached(kv_seq_len)
+ cos = cos[position_ids].unsqueeze(1)
+ sin = sin[position_ids].unsqueeze(1)
+ h = self.half_head_dim
+
+ RH_Q = self.RH_Q
+ RH_Q[:,:,:,:h] = Qn[:,:,:,h:]
+ RH_Q[:,:,:,h:] = Qn[:,:,:,:h]
+ torch.neg(RH_Q[:,:,:,:h], out = RH_Q[:,:,:,:h])
+ Qn *= cos
+ Qn.addcmul_(RH_Q, sin)
+
+ RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+ RH_K[:,:,:,:h] = Kn[:,:,:,h:]
+ RH_K[:,:,:,h:] = Kn[:,:,:,:h]
+ torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h])
+ Kn *= cos
+ Kn.addcmul_(RH_K, sin)
+
+ # New KV cache
+ # Kn = torch.cat([K1, Kn], dim = 2)
+ # Vn = torch.cat([V1, Vn], dim = 2)
+ self.paged_attention_K[seq_len] = Kn.permute(2, 0, 1, 3)
+ self.paged_attention_V[seq_len] = Vn.permute(2, 0, 1, 3)
+ Kn = self.paged_attention_K[:kv_seq_len].permute(1, 2, 0, 3)
+ Vn = self.paged_attention_V[:kv_seq_len].permute(1, 2, 0, 3)
+
+ # Handle sliding windows
+ sliding_window = getattr(self.config, "sliding_window", None)
+ if sliding_window is not None and kv_seq_len > sliding_window:
+ # From https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py#L193
+ slicing_tokens = 1 - sliding_window
+ Knn = Kn[:, :, slicing_tokens:, :]#.contiguous()
+ Vnn = Vn[:, :, slicing_tokens:, :]#.contiguous()
+ else:
+ Knn, Vnn = Kn, Vn
+ pass
+
+ # Grouped query attention
+ _, _, cached_len, _ = Knn.shape
+ if n_groups != 1:
+ Knn = Knn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Vnn = Vnn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Knn = Knn.reshape(bsz, n_heads, cached_len, head_dim)
+ Vnn = Vnn.reshape(bsz, n_heads, cached_len, head_dim)
+ pass
+ # else:
+ # Knn, Vnn = Knn, Vnn
+ # pass
+
+ # Attention
+ if bsz == 1:
+ Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963
+ # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows
+ A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
+ # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
+ A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
+ A = torch_matmul(A, Vnn, out = Qn)
+ else:
+ A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False)
+ pass
+ A = A.transpose(1, 2)
+ A = A.reshape(bsz, 1, attention_size)
+ A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
+ return A, (Kn, Vn)
+pass
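+
+
+# Hedged sketch (illustrative only): the in-place RoPE above uses the rotate-half trick on
+# preallocated buffers; this is the plain allocating equivalent for a (bsz, n_heads, 1, head_dim)
+# query or key tensor with broadcastable `cos` / `sin`.
+def _rope_rotate_half_reference(X, cos, sin):
+    h = X.shape[-1] // 2
+    rotated = torch.cat((-X[..., h:], X[..., :h]), dim = -1)
+    return X * cos + rotated * sin
+pass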
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
+# @torch.inference_mode
+def CohereModel_fast_forward_inference(
+ self,
+ input_ids,
+ past_key_values,
+ position_ids,
+ attention_mask = None,
+):
+ out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = "cuda:0")
+ input_ids = input_ids[:,:self.max_seq_length]
+ hidden_states = self.model.embed_tokens(input_ids)
+ hidden_states = hidden_states.to(self.config.torch_dtype)
+ bsz, q_len, hd = hidden_states.shape
+ seq_len = past_key_values[0][0].shape[-2]
+ if bsz != 1:
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (bsz, q_len),
+ hidden_states,
+ seq_len,
+ sliding_window = getattr(self.config, "sliding_window", None),
+ )
+ else:
+ attention_mask = None
+ pass
+
+ next_decoder_cache = []
+ for idx, decoder_layer in enumerate(self.model.layers):
+ residual = hidden_states
+ hidden_states = fast_layernorm_inference(decoder_layer.input_layernorm, hidden_states, out_weight)
+ hidden_states_attention, present_key_value = CohereAttention_fast_forward_inference(
+ decoder_layer.self_attn,
+ hidden_states = hidden_states,
+ past_key_value = past_key_values[idx],
+ position_ids = position_ids,
+ attention_mask = attention_mask,
+ do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
+ )
+
+ hidden_states_mlp = fast_swiglu_inference(decoder_layer.mlp, hidden_states)
+ residual += hidden_states_attention
+ residual += hidden_states_mlp
+ hidden_states = residual
+
+ next_decoder_cache.append(present_key_value)
+ pass
+ hidden_states = fast_layernorm_inference(self.model.norm, hidden_states, out_weight)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state = hidden_states,
+ past_key_values = next_decoder_cache,
+ hidden_states = [],
+ attentions = [],
+ )
+pass
+
+
+class FastCohereModel(FastLlamaModel):
+
+ @staticmethod
+ def pre_patch():
+ init_name, function = patch_linear_scaling(
+ model_name = "cohere",
+ rope_module = LlamaRotaryEmbedding,
+ scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
+ attention_module = CohereAttention,
+ )
+ if init_name is not None:
+ exec(function, globals())
+ CohereAttention.__init__ = eval(init_name)
+ pass
+ CohereAttention .forward = CohereAttention_fast_forward
+ CohereSdpaAttention .forward = CohereAttention_fast_forward
+ CohereFlashAttention2.forward = CohereAttention_fast_forward
+ CohereDecoderLayer .forward = CohereDecoderLayer_fast_forward
+ CohereModel .forward = LlamaModel_fast_forward
+ CohereForCausalLM .forward = CausalLM_fast_forward(CohereModel_fast_forward_inference)
+ PeftModelForCausalLM .forward = PeftModelForCausalLM_fast_forward
+ fix_prepare_inputs_for_generation(CohereForCausalLM)
+
+ import transformers.models.cohere.modeling_cohere
+ transformers.models.cohere.modeling_cohere.CohereRotaryEmbedding = LlamaRotaryEmbedding
+ return
+ pass
+pass
diff --git a/unsloth-main/unsloth/models/dpo.py b/unsloth-main/unsloth/models/dpo.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc71f920aa0ce10fa56f0ebdbe5cdb1efdee9e2
--- /dev/null
+++ b/unsloth-main/unsloth/models/dpo.py
@@ -0,0 +1,131 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+ "PatchDPOTrainer",
+ "PatchKTOTrainer",
+]
+
+try:
+ from transformers.utils.notebook import (
+ IntervalStrategy,
+ NotebookTrainingTracker,
+ NotebookProgressCallback,
+ )
+ HAS_NOTEBOOK = True
+except:
+ HAS_NOTEBOOK = False
+pass
+import torch
+from ._utils import torch_compile_options
+import inspect
+import torch.nn as nn
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+
+
+DPOTrainer_metrics = [
+ "rewards/chosen",
+ "rewards/rejected",
+ "rewards/accuracies",
+ "rewards/margins",
+ "logps/rejected",
+ "logps/chosen",
+ "logits/rejected",
+ "logits/chosen",
+]
+set_DPOTrainer_metrics = frozenset(DPOTrainer_metrics)
+
+
+def NotebookProgressCallback_on_train_begin(self, args, state, control, **kwargs):
+ self.first_column = "Epoch" if args.eval_strategy == IntervalStrategy.EPOCH else "Step"
+ self.training_loss = 0
+ self.last_log = 0
+ column_names = [self.first_column] + ["Training Loss"]
+ if args.eval_strategy != IntervalStrategy.NO:
+ column_names.append("Validation Loss")
+ column_names += [x.replace("/", " / ") for x in DPOTrainer_metrics]
+ self.training_tracker = NotebookTrainingTracker(state.max_steps, column_names)
+pass
+
+
+def NotebookProgressCallback_on_log(self, args, state, control, logs=None, **kwargs):
+ # Only for when there is no evaluation
+ if args.eval_strategy == IntervalStrategy.NO and "loss" in logs:
+ values = {"Training Loss": logs["loss"]}
+ for metric in DPOTrainer_metrics:
+ values[metric.replace("/", " / ")] = logs[metric]
+ pass
+ # First column is necessarily Step since we're not in epoch eval strategy
+ values["Step"] = state.global_step
+ self.training_tracker.write_line(values)
+ pass
+pass
+
+
+def NotebookTrainingTracker_write_line(self, values):
+ """
+ Write the values in the inner table.
+
+ Args:
+ values (`Dict[str, float]`): The values to display.
+ """
+ if self.inner_table is None:
+ self.inner_table = [list(values.keys()), list(values.values())]
+ else:
+ columns = self.inner_table[0]
+ new_values = {}
+ for key, value in values.items():
+ lowered = key.lower()
+ if lowered in set_DPOTrainer_metrics:
+ new_values[lowered.replace("/", " / ")] = value
+ else:
+ new_values[key] = value
+ pass
+ values = new_values
+
+ self.inner_table[0] = columns
+ if len(self.inner_table) > 1:
+ last_values = self.inner_table[-1]
+ first_column = self.inner_table[0][0]
+ if last_values[0] != values[first_column]:
+ # write new line
+ self.inner_table.append([values[c] if c in values else "No Log" for c in columns])
+ else:
+ # update last line
+ new_values = values
+ for c in columns:
+ if c not in new_values.keys():
+ new_values[c] = last_values[columns.index(c)]
+ self.inner_table[-1] = [new_values[c] for c in columns]
+ else:
+ # Edit for evaluation purposes
+ self.inner_table.append([values[c] if c in values else 0 for c in columns])
+ pass
+ pass
+pass
+
+
+def PatchDPOTrainer():
+ if HAS_NOTEBOOK:
+ from transformers.trainer import is_in_notebook
+ if is_in_notebook():
+ # Patch DPO notebook printing
+ NotebookTrainingTracker.write_line = NotebookTrainingTracker_write_line
+ from transformers.trainer import DEFAULT_PROGRESS_CALLBACK
+ DEFAULT_PROGRESS_CALLBACK.on_train_begin = NotebookProgressCallback_on_train_begin
+ DEFAULT_PROGRESS_CALLBACK.on_log = NotebookProgressCallback_on_log
+ pass
+ pass
+pass
+PatchKTOTrainer = PatchDPOTrainer
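+
+# Hedged usage sketch (illustrative only): call the patch before constructing a TRL
+# DPO / KTO trainer so the notebook progress table gains the reward, logps and logits
+# columns; the TRL import below is an assumption about the surrounding training script.
+def _example_patch_dpo_usage():
+    PatchDPOTrainer()
+    # from trl import DPOTrainer, DPOConfig   # then build and run the trainer as usual
+    # trainer = DPOTrainer(...); trainer.train()
+pass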
diff --git a/unsloth-main/unsloth/models/gemma.py b/unsloth-main/unsloth/models/gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6543432821d53b5b7ef91faa063974d65fd5465
--- /dev/null
+++ b/unsloth-main/unsloth/models/gemma.py
@@ -0,0 +1,379 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .llama import *
+from ._utils import __version__
+import math
+
+try:
+ from transformers.models.gemma.modeling_gemma import (
+ GemmaAttention,
+ GemmaDecoderLayer,
+ GemmaModel,
+ GemmaForCausalLM,
+ GemmaRotaryEmbedding,
+ apply_rotary_pos_emb,
+ repeat_kv,
+ )
+except:
+ from packaging.version import Version
+ transformers_version = Version(transformers_version)
+ if not transformers_version >= Version("4.38"):
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
+ f"The minimum required version is 4.38.\n"\
+ f'Try `pip install --upgrade "transformers>=4.38"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ pass
+pass
+
+from transformers.modeling_attn_mask_utils import (
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+# For Pytorch 2.1.1
+try:
+ from transformers.models.gemma.modeling_gemma import (
+ GemmaSdpaAttention,
+ GemmaFlashAttention2,
+ )
+except:
+ GemmaSdpaAttention = GemmaAttention
+ GemmaFlashAttention2 = GemmaAttention
+pass
+
+
+torch_nn_functional_gelu = torch.nn.functional.gelu
+def fast_geglu_inference(self, X):
+ # gate = self.gate_proj(X)
+ # up = self.up_proj(X)
+ bsz, _, hd = X.shape
+ # mlp_size = self.config.intermediate_size
+ # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda:0")
+
+ gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0])
+ up = fast_linear_forward(self. up_proj, X)#, out = temp[1])
+ gate = torch_nn_functional_gelu(gate, approximate = "tanh")
+ gate *= up
+
+ # X = self.down_proj(gate)
+ down = fast_linear_forward(self.down_proj, gate, out = up[:,:,:hd])
+ return down
+pass
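+
+
+# Hedged reference sketch (illustrative only): the fused inference path above should match
+# Gemma's eager GeGLU MLP, i.e. down_proj(gelu_tanh(gate_proj(x)) * up_proj(x)).
+def _geglu_reference(mlp, X):
+    gate = torch_nn_functional_gelu(mlp.gate_proj(X), approximate = "tanh")
+    return mlp.down_proj(gate * mlp.up_proj(X))
+pass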
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
+def GemmaDecoderLayer_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ *args, **kwargs,
+):
+ if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None:
+ out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = "cuda:0")
+
+ # Self Attention
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(self.input_layernorm, hidden_states, out_weight)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ )
+ hidden_states += residual
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(self.post_attention_layernorm, hidden_states, out_weight)
+ hidden_states = fast_geglu_inference(self.mlp, hidden_states)
+ hidden_states += residual
+ else:
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states, gemma = True)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states, gemma = True)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+ pass
+
+ outputs = (hidden_states,)
+ if output_attentions: outputs += (self_attn_weights,)
+ if use_cache: outputs += (present_key_value,)
+ return outputs
+pass
+
+
+from math import sqrt as math_sqrt
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
+# @torch.inference_mode
+def GemmaModel_fast_forward_inference(
+ self,
+ input_ids,
+ past_key_values,
+ position_ids,
+ attention_mask = None,
+):
+ out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = "cuda:0")
+ input_ids = input_ids[:,:self.max_seq_length]
+ hidden_states = self.model.embed_tokens(input_ids)
+ hidden_states = hidden_states.to(self.config.torch_dtype)
+ # 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
+ # 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
+ hidden_states *= torch.tensor(math_sqrt(self.config.hidden_size), dtype = hidden_states.dtype)
+
+ bsz, q_len, hd = hidden_states.shape
+ seq_len = past_key_values[0][0].shape[-2]
+ if bsz != 1:
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (bsz, q_len),
+ hidden_states,
+ seq_len,
+ )
+ pass
+
+ next_decoder_cache = []
+ for idx, decoder_layer in enumerate(self.model.layers):
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.input_layernorm, hidden_states, out_weight)
+ hidden_states, present_key_value = LlamaAttention_fast_forward_inference(
+ decoder_layer.self_attn,
+ hidden_states = hidden_states,
+ past_key_value = past_key_values[idx],
+ position_ids = position_ids,
+ attention_mask = attention_mask,
+ do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
+ )
+ hidden_states += residual
+
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.post_attention_layernorm, hidden_states, out_weight)
+ hidden_states = fast_geglu_inference(decoder_layer.mlp, hidden_states)
+ hidden_states += residual
+
+ next_decoder_cache.append(present_key_value)
+ pass
+ hidden_states = fast_rms_layernorm_inference_gemma(self.model.norm, hidden_states, out_weight)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state = hidden_states,
+ past_key_values = next_decoder_cache,
+ hidden_states = [],
+ attentions = [],
+ )
+pass
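+
+
+# Hedged sketch (illustrative only): why the embedding normalizer above is materialised as a
+# tensor in the hidden dtype - bfloat16 cannot represent sqrt(hidden_size) exactly, so casting
+# the scalar to the hidden dtype (as done above) reproduces the rounding the comment describes.
+def _show_bf16_normalizer(hidden_size = 3072):
+    exact   = math_sqrt(hidden_size)                              # 55.4256... in float64
+    rounded = torch.tensor(exact, dtype = torch.bfloat16).item()  # 55.5 after bfloat16 rounding
+    return exact, rounded
+pass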
+
+
+# Follows line by line https://github.com/google-deepmind/gemma/blob/main/gemma/positional_embeddings.py#L45
+# Formulates cos and sin differently from Llama!
+class GemmaFixedRotaryEmbedding(torch.nn.Module):
+ # Fixes https://github.com/huggingface/transformers/pull/28837
+ # https://github.com/microsoft/DeepSpeed/issues/4932
+ # The precision of RoPE buffers is not correct, so we cast to int64.
+ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None,
+ config = None, # [TODO] Hack to pass in config - need to remove later
+ ):
+ super().__init__()
+ if config is not None: return # [TODO] Hack to pass in config - need to remove later
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this
+ self.current_rope_size = min(4 * 8192, self.max_position_embeddings)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
+ pass
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
+ # in FP32. They are applied (multiplied) in FP32 as well.
+ self.current_rope_size = seq_len
+
+ # The difference is we do the division explicitly instead of t * (1/x), i.e. we compute t/x.
+ freq_exponents = (2.0 / self.dim) * (
+ torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float()
+ )
+ timescale = self.base**freq_exponents
+ positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float()
+ radians_new = positions[..., None] / timescale[None, None, :]
+ radians_new = radians_new.squeeze(0)
+
+ emb = torch.cat((radians_new, radians_new), dim = -1)
+ # We must do RoPE in float32!
+ cos = emb.cos().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
+ sin = emb.sin().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
+ self.register_buffer("cos_cached", cos, persistent = False)
+ self.register_buffer("sin_cached", sin, persistent = False)
+ pass
+
+ def forward(self, x, position_ids=None, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.current_rope_size:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
+ )
+ pass
+
+ def get_cached(self, seq_len = None):
+ return self.cos_cached, self.sin_cached
+ pass
+
+ def extend_rope_embedding(self, x, seq_len):
+ if seq_len <= self.current_rope_size: return
+ # Iteratively grow by increments of 8192
+ self.current_rope_size = math.ceil(seq_len / 8192) * 8192
+ self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype)
+ pass
+pass
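+
+
+# Hedged sketch (illustrative only): the cache built above computes angles as
+# positions / base**(2i/dim) - division by a timescale rather than multiplication by an
+# inverse frequency. This reproduces the same quantities on CPU for a tiny `dim`.
+def _gemma_rope_timescales(dim = 8, base = 10000.0, n_positions = 4):
+    freq_exponents = (2.0 / dim) * torch.arange(dim // 2, dtype = torch.int64, device = "cpu").float()
+    timescale = base ** freq_exponents        # [1, base**(2/dim), base**(4/dim), ...]
+    positions = torch.arange(n_positions, dtype = torch.int64, device = "cpu").float()
+    radians   = positions[..., None] / timescale[None, :]   # radians[t, i] = t / base**(2i/dim)
+    return timescale, radians
+pass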
+
+
+class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding):
+ """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+ # Fixes https://github.com/huggingface/transformers/pull/28837
+ # https://github.com/microsoft/DeepSpeed/issues/4932
+ # The precision of RoPE buffers is not correct, so we cast to int64.
+ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0,
+ config = None, # [TODO] Hack to pass in config - need to remove later
+ ):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config)
+ pass
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
+ # in FP32. They are applied (multiplied) in FP32 as well.
+ self.current_rope_size = seq_len
+
+ # The difference is we do the division explicitly instead of t * (1/x), i.e. we compute t/x.
+ freq_exponents = (2.0 / self.dim) * (
+ torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float()
+ )
+ timescale = self.base**freq_exponents
+ positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float()
+ positions = positions / self.scaling_factor
+ radians_new = positions[..., None] / timescale[None, None, :]
+ radians_new = radians_new.squeeze(0)
+
+ emb = torch.cat((radians_new, radians_new), dim = -1)
+ # We must do RoPE in float32!
+ cos = emb.cos().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
+ sin = emb.sin().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
+ self.register_buffer("cos_cached", cos, persistent = False)
+ self.register_buffer("sin_cached", sin, persistent = False)
+ pass
+pass
+
+
+class FastGemmaModel(FastLlamaModel):
+
+ @staticmethod
+ def pre_patch():
+ init_name, function = patch_linear_scaling(
+ model_name = "gemma",
+ rope_module = GemmaFixedRotaryEmbedding,
+ scaled_rope_module = GemmaFixedLinearScalingRotaryEmbedding,
+ attention_module = GemmaAttention,
+ )
+ if init_name is not None:
+ exec(function, globals())
+ GemmaAttention.__init__ = eval(init_name)
+ pass
+ GemmaAttention .forward = LlamaAttention_fast_forward
+ GemmaSdpaAttention .forward = LlamaAttention_fast_forward
+ GemmaFlashAttention2.forward = LlamaAttention_fast_forward
+ GemmaDecoderLayer .forward = GemmaDecoderLayer_fast_forward
+ GemmaModel .forward = LlamaModel_fast_forward
+ GemmaForCausalLM .forward = CausalLM_fast_forward(GemmaModel_fast_forward_inference)
+ PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
+ fix_prepare_inputs_for_generation(GemmaForCausalLM)
+
+ # Solves https://github.com/unslothai/unsloth/issues/168
+ # Static KV Cache was introduced in 4.38.0, causing training to be much slower.
+ # Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
+ # https://github.com/huggingface/transformers/pull/27931
+ # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
+ import transformers.models.gemma.modeling_gemma
+ transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding = GemmaFixedRotaryEmbedding
+ return
+ pass
+
+
+ @staticmethod
+ def post_patch(model, tokenizer):
+ # Gemma does not downcast RoPE
+ model, tokenizer = patch_model_and_tokenizer(model, tokenizer, downcast_rope = False)
+
+ # Add 1 to weight
+ # return output * (1 + self.weight)
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma/modeling_gemma.py#L89
+ from transformers.models.gemma.modeling_gemma import GemmaRMSNorm
+
+ # Freeze all parameters except LoRA
+ # We do this first since += 1 seems to not be liked by requires_grad = True
+ for name, param in model.named_parameters():
+ if ".lora_A." in name or ".lora_B." in name:
+ param.requires_grad_(True)
+ else:
+ param.requires_grad_(False)
+ pass
+
+ # Patch RMS Layernorm
+ for name, module in model.named_modules():
+ if isinstance(module, GemmaRMSNorm):
+ # Must be in float32
+ # https://github.com/keras-team/keras-nlp/blob/v0.8.2/keras_nlp/models/gemma/rms_normalization.py#L36
+ # module = module.to(torch.float32)
+ # Leave + 1 to Triton kernel itself
+ # module.weight += 1.0 # return output * (1 + self.weight)
+ if not hasattr(module, "variance_epsilon"):
+ module.variance_epsilon = module.eps # Gemma doesn't use variance_epsilon
+ pass
+
+ # Clear deleted GPU items
+ import gc
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ return model, tokenizer
+ pass
+pass
diff --git a/unsloth-main/unsloth/models/gemma2.py b/unsloth-main/unsloth/models/gemma2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f0a020717688d083ed2f24fe28915e703ac2a04
--- /dev/null
+++ b/unsloth-main/unsloth/models/gemma2.py
@@ -0,0 +1,529 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .llama import *
+from ._utils import __version__
+from .gemma import (
+ GemmaFixedRotaryEmbedding,
+ GemmaFixedLinearScalingRotaryEmbedding,
+ fast_geglu_inference,
+)
+try:
+ from transformers.models.gemma2.modeling_gemma2 import (
+ Gemma2Attention,
+ Gemma2DecoderLayer,
+ Gemma2Model,
+ Gemma2ForCausalLM,
+ Gemma2RotaryEmbedding,
+ apply_rotary_pos_emb,
+ repeat_kv,
+ )
+except:
+ from packaging.version import Version
+ transformers_version = Version(transformers_version)
+ if not transformers_version >= Version("4.42"):
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
+ f"The minimum required version is 4.42.3.\n"\
+ f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ pass
+pass
+
+from transformers.modeling_attn_mask_utils import (
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+# For Pytorch 2.1.1
+try:
+ from transformers.models.gemma2.modeling_gemma2 import (
+ Gemma2SdpaAttention,
+ Gemma2FlashAttention2,
+ )
+except:
+ Gemma2SdpaAttention = Gemma2Attention
+ Gemma2FlashAttention2 = Gemma2Attention
+pass
+
+if HAS_FLASH_ATTENTION_SOFTCAPPING:
+ from flash_attn import flash_attn_func
+
+# [TODO] We must randomly use torch.compile?
+# Gemma 2 uses double RMS Layernorms, so the backward passes should not overwrite the gradients!
+@torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
+def fast_rms_layernorm_gemma2_compiled(layernorm, X, gemma = True):
+ old_dtype = X.dtype
+ X = X.float()
+ X = X * torch.rsqrt(X.square().mean(-1, keepdim = True) + layernorm.eps) * \
+ (1.0 + layernorm.weight.float())
+ return X.to(old_dtype)
+pass
+
+
+# Logit softcapping
+def Gemma2Attention_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ *args, **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+
+ # Clear inference
+ if hasattr(self, "paged_attention"):
+ del self.paged_attention_K
+ del self.paged_attention_V
+ del self.paged_attention
+ del self.temp_QA
+ del self.temp_KV
+ del self.RH_Q
+ del self.attention
+ pass
+
+ bsz, q_len, _ = hidden_states.size()
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ assert(n_kv_heads * n_groups == n_heads)
+
+ Q, K, V = self.apply_qkv(self, hidden_states)
+ Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+ K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+ V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+
+ kv_seq_len = K.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ if position_ids is None:
+ cos = self.rotary_emb.cos_cached
+ sin = self.rotary_emb.sin_cached
+ Q, K = fast_rope_embedding(Q, K, cos, sin)
+ else:
+ cos, sin = self.rotary_emb(V, seq_len = kv_seq_len)
+ Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids)
+ pass
+
+ if past_key_value is not None:
+ K = torch.cat([past_key_value[0], K], dim = 2)
+ V = torch.cat([past_key_value[1], V], dim = 2)
+ pass
+ past_key_value = (K, V) if use_cache else None
+
+ # Only enable if the attention_mask is True
+ has_sliding_window = type(causal_mask) is bool and causal_mask is True
+ if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None:
+ window = (-1, -1)
+ if has_sliding_window:
+ sw = getattr(self.config, "sliding_window", None)
+ sw = kv_seq_len if (sw is None or sw == "null") else sw
+ window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw)
+ pass
+
+ # FA uses 1 / sqrt for softmax_scale!
+ if not hasattr(self, "_flash_attention_softmax_scale"):
+ self._flash_attention_softmax_scale = 1.0 / (self.config.query_pre_attn_scalar**0.5)
+ pass
+
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+ A = flash_attn_func(
+ Q, K, V,
+ causal = True,
+ softcap = self.config.attn_logit_softcapping,
+ softmax_scale = self._flash_attention_softmax_scale,
+ window_size = window,
+ )
+ A = A.reshape(bsz, q_len, n_heads*head_dim)
+ else:
+ fx = slow_inference_attention_softcapping \
+ if "_flag_for_generation" in kwargs else \
+ slow_attention_softcapping
+ A = fx(Q, K, V, causal_mask, self, bsz, kv_seq_len)
+ pass
+ A = self.apply_o(self, A)
+ return A, None, past_key_value
+pass
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
+def Gemma2DecoderLayer_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ *args, **kwargs,
+):
+ if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None:
+ out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = "cuda:0")
+
+ # Self Attention
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(self.input_layernorm, hidden_states, out_weight)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ _flag_for_generation=self._flag_for_generation,
+ )
+ hidden_states = fast_rms_layernorm_inference_gemma(self.post_attention_layernorm, hidden_states, out_weight)
+ hidden_states += residual
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(self. pre_feedforward_layernorm, hidden_states, out_weight)
+ hidden_states = fast_geglu_inference(self.mlp, hidden_states)
+ hidden_states = fast_rms_layernorm_inference_gemma(self.post_feedforward_layernorm, hidden_states, out_weight)
+ hidden_states += residual
+ else:
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states, gemma = True)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ )
+ hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states, gemma = True)
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self. pre_feedforward_layernorm, hidden_states, gemma = True)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = fast_rms_layernorm(self.post_feedforward_layernorm, hidden_states, gemma = True)
+ hidden_states = residual + hidden_states
+ pass
+
+ outputs = (hidden_states,)
+ if output_attentions: outputs += (self_attn_weights,)
+ if use_cache: outputs += (present_key_value,)
+ return outputs
+pass
+
+
+from math import sqrt as math_sqrt
+KV_CACHE_INCREMENT = 256 # KV Cache update size
+torch_nn_functional_softmax = torch.nn.functional.softmax
+torch_matmul = torch.matmul
+torch_tanh = torch.tanh
+
+def Gemma2Attention_fast_forward_inference(
+ self,
+ hidden_states: torch.Tensor,
+ past_key_value: Optional[Tuple[torch.Tensor]],
+ position_ids,
+ do_prefill = False,
+ attention_mask = None,
+ use_sliding_window = False,
+):
+ Xn = hidden_states
+ bsz, _, hd = hidden_states.size()
+ K1, V1 = past_key_value
+ dtype = Xn.dtype
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ attention_size = n_heads*head_dim
+ # assert(n_kv_heads * n_groups == n_heads)
+ seq_len = K1.shape[-2]
+ kv_seq_len = seq_len + 1
+
+ # Prefill phase
+ # if not hasattr(self, "paged_attention"):
+ if do_prefill:
+ self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda:0")
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3)
+ self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3)
+ self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0")
+ self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0")
+ self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+ # Only for Gemma2
+ self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
+ self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
+
+ # See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
+ # Gemma 2 9b should use 256 (the head dim), not 224 (hidden_size / num_attention_heads); 27b does use hidden_size / num_attention_heads.
+ # We default to reading query_pre_attn_scalar from the config file itself.
+ # s = self.config.hidden_size // self.config.num_attention_heads
+ self.scalar = 1.0 / math_sqrt(self.config.query_pre_attn_scalar)
+ # self.scalar = 1.0 / math_sqrt(self.config.hidden_size // self.config.num_attention_heads)
+ self.half_head_dim = head_dim // 2
+ self. t = self.config.attn_logit_softcapping
+ self.reciprocal_t = 1.0 / self.config.attn_logit_softcapping
+ elif kv_seq_len >= self.paged_attention.shape[0]:
+ self.paged_attention.resize_((self.paged_attention.shape[0]+KV_CACHE_INCREMENT, 2, bsz, n_kv_heads, head_dim))
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.attention.resize_((bsz, n_heads, 1, self.attention.shape[-1]+KV_CACHE_INCREMENT))
+ pass
+
+ Qn = fast_linear_forward(self.q_proj, Xn, out = self.temp_QA[0])
+ Kn = fast_linear_forward(self.k_proj, Xn, out = self.temp_KV[0])
+ Vn = fast_linear_forward(self.v_proj, Xn, out = self.temp_KV[1])
+ Qn = Qn.view(bsz, 1, n_heads, head_dim).transpose(1, 2)
+ Kn = Kn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+ Vn = Vn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+
+ # cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
+ # Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
+ cos = self.rotary_emb.cos_cached[position_ids].unsqueeze(1)
+ sin = self.rotary_emb.sin_cached[position_ids].unsqueeze(1)
+ h = self.half_head_dim
+
+ RH_Q = self.RH_Q
+ RH_Q[:,:,:,:h] = Qn[:,:,:,h:]
+ RH_Q[:,:,:,h:] = Qn[:,:,:,:h]
+ torch.neg(RH_Q[:,:,:,:h], out = RH_Q[:,:,:,:h])
+ Qn *= cos
+ Qn.addcmul_(RH_Q, sin)
+
+ RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+ RH_K[:,:,:,:h] = Kn[:,:,:,h:]
+ RH_K[:,:,:,h:] = Kn[:,:,:,:h]
+ torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h])
+ Kn *= cos
+ Kn.addcmul_(RH_K, sin)
+
+ # New KV cache
+ # Kn = torch.cat([K1, Kn], dim = 2)
+ # Vn = torch.cat([V1, Vn], dim = 2)
+ self.paged_attention_K[seq_len] = Kn.permute(2, 0, 1, 3)
+ self.paged_attention_V[seq_len] = Vn.permute(2, 0, 1, 3)
+ Kn = self.paged_attention_K[:kv_seq_len].permute(1, 2, 0, 3)
+ Vn = self.paged_attention_V[:kv_seq_len].permute(1, 2, 0, 3)
+
+ # Handle sliding windows
+ sliding_window = self.config.sliding_window
+ if use_sliding_window and kv_seq_len > sliding_window:
+ # From https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py#L193
+ slicing_tokens = 1 - sliding_window
+ Knn = Kn[:, :, slicing_tokens:, :]#.contiguous()
+ Vnn = Vn[:, :, slicing_tokens:, :]#.contiguous()
+ else:
+ Knn, Vnn = Kn, Vn
+ pass
+
+ # Grouped query attention
+ _, _, cached_len, _ = Knn.shape
+ if n_groups != 1:
+ Knn = Knn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Vnn = Vnn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Knn = Knn.reshape(bsz, n_heads, cached_len, head_dim)
+ Vnn = Vnn.reshape(bsz, n_heads, cached_len, head_dim)
+ pass
+ # else:
+ # Knn, Vnn = Knn, Vnn
+ # pass
+
+ # Attention
+ # if bsz == 1:
+ Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963
+ # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows
+ A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
+ # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
+
+ A *= self.reciprocal_t; torch_tanh(A, out = A); A *= self.t; # Logit softcapping
+
+ A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
+ A = torch_matmul(A, Vnn, out = Qn)
+ # else:
+ # A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False)
+ # pass
+ A = A.transpose(1, 2)
+ A = A.reshape(bsz, 1, attention_size)
+ A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
+ return A, (Kn, Vn)
+pass
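+
+
+# Hedged sketch (illustrative only): the three in-place operations above implement Gemma 2's
+# attention logit softcapping, scores -> softcap * tanh(scores / softcap).
+def _logit_softcapping_reference(scores, softcap):
+    return softcap * torch.tanh(scores / softcap)
+pass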
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
+# @torch.inference_mode
+def Gemma2Model_fast_forward_inference(
+ self,
+ input_ids,
+ past_key_values,
+ position_ids,
+ attention_mask = None,
+):
+ out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = "cuda:0")
+ input_ids = input_ids[:,:self.max_seq_length]
+ hidden_states = self.model.embed_tokens(input_ids)
+ hidden_states = hidden_states.to(self.config.torch_dtype)
+ # 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
+ # 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
+ hidden_states *= torch.tensor(math_sqrt(self.config.hidden_size), dtype = hidden_states.dtype)
+
+ bsz, q_len, hd = hidden_states.shape
+ seq_len = past_key_values[0][0].shape[-2]
+ if bsz != 1:
+ if HAS_FLASH_ATTENTION_SOFTCAPPING:
+ SWA = True
+ GA = False
+ else:
+ SWA = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (bsz, q_len),
+ hidden_states,
+ seq_len,
+ sliding_window = self.config.sliding_window,
+ )
+ GA = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (bsz, q_len),
+ hidden_states,
+ seq_len,
+ )
+ pass
+ else:
+ SWA = attention_mask
+ GA = attention_mask
+ pass
+ next_decoder_cache = []
+ for idx, decoder_layer in enumerate(self.model.layers):
+
+ use_sliding_window = idx % 2 == 0
+
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.input_layernorm, hidden_states, out_weight)
+ hidden_states, present_key_value = Gemma2Attention_fast_forward_inference(
+ decoder_layer.self_attn,
+ hidden_states = hidden_states,
+ past_key_value = past_key_values[idx],
+ position_ids = position_ids,
+ attention_mask = SWA if use_sliding_window else GA,
+ do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
+ use_sliding_window = use_sliding_window,
+ )
+ hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.post_attention_layernorm, hidden_states, out_weight)
+ hidden_states += residual
+
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer. pre_feedforward_layernorm, hidden_states, out_weight)
+ hidden_states = fast_geglu_inference(decoder_layer.mlp, hidden_states)
+ hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.post_feedforward_layernorm, hidden_states, out_weight)
+ hidden_states += residual
+
+ next_decoder_cache.append(present_key_value)
+ pass
+ hidden_states = fast_rms_layernorm_inference_gemma(self.model.norm, hidden_states, out_weight)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state = hidden_states,
+ past_key_values = next_decoder_cache,
+ hidden_states = [],
+ attentions = [],
+ )
+pass
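+
+
+# Hedged sketch (illustrative only): Gemma 2 alternates local (sliding-window) and global
+# attention across layers; the loop above selects the mask with the same parity rule.
+def _gemma2_layer_uses_sliding_window(layer_idx):
+    return layer_idx % 2 == 0   # even layers use the sliding window, odd layers attend globally
+pass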
+
+
+class FastGemma2Model(FastLlamaModel):
+
+ @staticmethod
+ def pre_patch():
+ init_name, function = patch_linear_scaling(
+ model_name = "gemma2",
+ rope_module = GemmaFixedRotaryEmbedding,
+ scaled_rope_module = GemmaFixedLinearScalingRotaryEmbedding,
+ attention_module = Gemma2Attention,
+ )
+ if init_name is not None:
+ exec(function, globals())
+ Gemma2Attention.__init__ = eval(init_name)
+ pass
+ Gemma2Attention .forward = Gemma2Attention_fast_forward
+ Gemma2SdpaAttention .forward = Gemma2Attention_fast_forward
+ Gemma2FlashAttention2.forward = Gemma2Attention_fast_forward
+ Gemma2DecoderLayer .forward = Gemma2DecoderLayer_fast_forward
+ Gemma2Model .forward = LlamaModel_fast_forward
+ Gemma2ForCausalLM .forward = CausalLM_fast_forward(Gemma2Model_fast_forward_inference)
+ PeftModelForCausalLM .forward = PeftModelForCausalLM_fast_forward
+ fix_prepare_inputs_for_generation(Gemma2ForCausalLM)
+
+ # Solves https://github.com/unslothai/unsloth/issues/168
+ # Static KV Cache was introduced in 4.38.0, causing training to be much slower.
+ # Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
+ # https://github.com/huggingface/transformers/pull/27931
+ # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
+ import transformers.models.gemma2.modeling_gemma2
+ transformers.models.gemma2.modeling_gemma2.Gemma2RotaryEmbedding = GemmaFixedRotaryEmbedding
+ return
+ pass
+
+
+ @staticmethod
+ def post_patch(model, tokenizer):
+ # Gemma does not downcast RoPE
+ model, tokenizer = patch_model_and_tokenizer(model, tokenizer, downcast_rope = False)
+
+ # Add 1 to weight
+ # return output * (1 + self.weight)
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma/modeling_gemma.py#L89
+ from transformers.models.gemma2.modeling_gemma2 import Gemma2RMSNorm
+
+ # Freeze all parameters except LoRA
+ # We do this first since += 1 seems to not be liked by requires_grad = True
+ for name, param in model.named_parameters():
+ if ".lora_A." in name or ".lora_B." in name:
+ param.requires_grad_(True)
+ else:
+ param.requires_grad_(False)
+ pass
+
+ # Patch RMS Layernorm
+ for name, module in model.named_modules():
+ if isinstance(module, Gemma2RMSNorm):
+ # Must be in float32
+ # https://github.com/keras-team/keras-nlp/blob/v0.8.2/keras_nlp/models/gemma/rms_normalization.py#L36
+ # module = module.to(torch.float32)
+ # Leave + 1 to Triton kernel itself
+ # module.weight += 1.0 # return output * (1 + self.weight)
+ if not hasattr(module, "variance_epsilon"):
+ module.variance_epsilon = module.eps # Gemma doesn't use variance_epsilon
+ pass
+
+ # Clear deleted GPU items
+ import gc
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ return model, tokenizer
+ pass
+pass
diff --git a/unsloth-main/unsloth/models/granite.py b/unsloth-main/unsloth/models/granite.py
new file mode 100644
index 0000000000000000000000000000000000000000..9466a8d6c1d912999f5d2324f4289516f2deea92
--- /dev/null
+++ b/unsloth-main/unsloth/models/granite.py
@@ -0,0 +1,523 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .llama import *
+import os
+from ._utils import __version__
+from .llama import (
+ LlamaRotaryEmbedding,
+ LlamaLinearScalingRotaryEmbedding,
+)
+from .mistral import *
+
+try:
+ from transformers.models.granite.modeling_granite import (
+ GraniteAttention,
+ GraniteDecoderLayer,
+ GraniteModel,
+ GraniteForCausalLM,
+ )
+except:
+ from packaging.version import Version
+
+ transformers_version = Version(transformers_version)
+ if not transformers_version >= Version("4.45.0"):
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
+ f"The minimum required version is 4.42.3.\n"\
+ f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ pass
+pass
+
+from transformers.modeling_attn_mask_utils import (
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+
+# For Pytorch 2.1.1
+try:
+ from transformers.models.granite.modeling_granite import (
+ GraniteSdpaAttention,
+ GraniteFlashAttention2,
+ )
+except:
+ GraniteSdpaAttention = GraniteAttention
+ GraniteFlashAttention2 = GraniteAttention
+pass
+
+def GraniteAttention_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ *args, **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+
+ # Clear inference
+ if hasattr(self, "paged_attention"):
+ del self.paged_attention_K
+ del self.paged_attention_V
+ del self.paged_attention
+ del self.temp_QA
+ del self.temp_KV
+ del self.RH_Q
+ del self.attention
+ pass
+
+ bsz, q_len, _ = hidden_states.size()
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ assert(n_kv_heads * n_groups == n_heads)
+
+ Q, K, V = self.apply_qkv(self, hidden_states)
+ Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+ K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+ V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+
+ kv_seq_len = K.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ assert position_embeddings is not None
+ cos, sin = position_embeddings
+ if position_ids is None:
+ Q, K = fast_rope_embedding(Q, K, cos, sin)
+ else:
+ Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ K = torch.cat([past_key_value[0], K], dim = 2)
+ V = torch.cat([past_key_value[1], V], dim = 2)
+ pass
+ past_key_value = (K, V) if use_cache else None
+
+ # Attention module
+ if (not HAS_FLASH_ATTENTION and attention_mask is None):
+ # Xformers memory efficient attention
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+ K_M = V_M = bsz * kv_seq_len
+ Q_M = bsz * q_len
+
+ # Group query attention
+ K = K .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ V = V .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ K = K.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
+ V = V.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
+ if hidden_states.requires_grad:
+ K = K.reshape(bsz, kv_seq_len, n_heads, head_dim)
+ V = V.reshape(bsz, kv_seq_len, n_heads, head_dim)
+ else:
+ # Xformers handles the grouped 5-D layout natively for the forward (inference) pass, so only Q is reshaped
+ Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
+ pass
+
+ A = xformers_attention(Q, K, V, attn_bias = causal_mask, scale=self.scaling)
+ A = A.view(bsz, q_len, n_heads, head_dim)
+
+ elif HAS_FLASH_ATTENTION and attention_mask is None:
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+ window = (kv_seq_len, kv_seq_len)
+ A = flash_attn_func(Q, K, V, causal = True, window_size = window, softmax_scale=self.scaling)
+ else:
+ # Grouped query attention
+ # if n_groups != 1:
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ K = K.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ V = V.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ # pass
+ # Must be contiguous or else results are False!
+ # https://github.com/pytorch/pytorch/issues/112577
+ Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
+ # Needs (batch_size, n_heads, seq_len, head_dim)
+ # is_causal and attention_mask must not both be set!
+ A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, scale = self.scaling, is_causal = False)
+ # Go back to (batch_size, seq_len, n_heads, head_dim)
+ A = A.transpose(1, 2).contiguous()
+ pass
+
+ attn_output = A.reshape(bsz, q_len, n_heads*head_dim)
+ attn_output = self.apply_o(self, attn_output)
+ attn_weights = None
+ return attn_output, attn_weights, past_key_value
+pass
+
+
+def GraniteDecoderLayer_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ *args, **kwargs,
+):
+ if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None:
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(self.input_layernorm, hidden_states)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ position_embeddings = position_embeddings,
+ _flag_for_generation=self._flag_for_generation,
+ )
+ hidden_states = torch.add(residual, hidden_states, alpha = self.config.residual_multiplier)
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(self.post_attention_layernorm, hidden_states)
+ hidden_states = fast_swiglu_inference(self.mlp, hidden_states)
+ hidden_states = torch.add(residual, hidden_states, alpha = self.config.residual_multiplier)
+ else:
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ padding_mask=padding_mask,
+ position_embeddings = position_embeddings,
+ )
+ hidden_states = torch.add(residual, hidden_states, alpha = self.config.residual_multiplier)
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = torch.add(residual, hidden_states, alpha = self.config.residual_multiplier)
+ pass
+
+ outputs = (hidden_states,)
+ if output_attentions: outputs += (self_attn_weights,)
+ if use_cache: outputs += (present_key_value,)
+ return outputs
+pass
+
+
+from math import sqrt as math_sqrt
+KV_CACHE_INCREMENT = 256 # KV Cache update size
+torch_nn_functional_softmax = torch.nn.functional.softmax
+torch_matmul = torch.matmul
+torch_tanh = torch.tanh
+
+def GraniteAttention_fast_forward_inference(
+ self,
+ hidden_states: torch.Tensor,
+ past_key_value: Optional[Tuple[torch.Tensor]],
+ position_ids,
+ do_prefill = False,
+ attention_mask = None,
+ use_sliding_window = False,
+ position_embeddings : Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+):
+
+ assert position_embeddings is not None, f"Granite model requires position embeddings to be specified"
+
+ Xn = hidden_states
+ bsz, _, hd = hidden_states.size()
+ K1, V1 = past_key_value
+ dtype = Xn.dtype
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ attention_size = n_heads*head_dim
+ # assert(n_kv_heads * n_groups == n_heads)
+ seq_len = K1.shape[-2]
+ kv_seq_len = seq_len + 1
+
+ # Prefill phase
+ # if not hasattr(self, "paged_attention"):
+ if do_prefill:
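+ # Preallocate a paged KV buffer of shape (KV_CACHE_INCREMENT + seq_len + 1, 2, bsz, n_kv_heads, head_dim):
+ # dim 1 holds K (index 0) and V (index 1), with headroom so the cache is not reallocated every decode step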
+ self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda:0")
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3)
+ self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3)
+ self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0")
+ self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0")
+ self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+ # Output projection buffer (allocated unconditionally here, unlike the Llama path)
+ self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
+ self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
+
+ self.half_head_dim = head_dim // 2
+ elif kv_seq_len >= self.paged_attention.shape[0]:
+ self.paged_attention.resize_((self.paged_attention.shape[0]+KV_CACHE_INCREMENT, 2, bsz, n_kv_heads, head_dim))
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.attention.resize_((bsz, n_heads, 1, self.attention.shape[-1]+KV_CACHE_INCREMENT))
+ pass
+
+ Qn = fast_linear_forward(self.q_proj, Xn, out = self.temp_QA[0])
+ Kn = fast_linear_forward(self.k_proj, Xn, out = self.temp_KV[0])
+ Vn = fast_linear_forward(self.v_proj, Xn, out = self.temp_KV[1])
+ Qn = Qn.view(bsz, 1, n_heads, head_dim).transpose(1, 2)
+ Kn = Kn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+ Vn = Vn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+
+ # cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
+ # Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
+ cos, sin = position_embeddings
+ cos, sin = cos[position_ids], sin[position_ids]
+ h = self.half_head_dim
+
+ RH_Q = self.RH_Q
+ RH_Q[:,:,:,:h] = Qn[:,:,:,h:]
+ RH_Q[:,:,:,h:] = Qn[:,:,:,:h]
+ torch.neg(RH_Q[:,:,:,:h], out = RH_Q[:,:,:,:h])
+ Qn *= cos
+ Qn.addcmul_(RH_Q, sin)
+
+ RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+ RH_K[:,:,:,:h] = Kn[:,:,:,h:]
+ RH_K[:,:,:,h:] = Kn[:,:,:,:h]
+ torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h])
+ Kn *= cos
+ Kn.addcmul_(RH_K, sin)
+
+ # New KV cache
+ # Kn = torch.cat([K1, Kn], dim = 2)
+ # Vn = torch.cat([V1, Vn], dim = 2)
+ self.paged_attention_K[seq_len] = Kn.permute(2, 0, 1, 3)
+ self.paged_attention_V[seq_len] = Vn.permute(2, 0, 1, 3)
+ Kn = self.paged_attention_K[:kv_seq_len].permute(1, 2, 0, 3)
+ Vn = self.paged_attention_V[:kv_seq_len].permute(1, 2, 0, 3)
+
+ # Grouped query attention
+ _, _, cached_len, _ = Kn.shape
+ if n_groups != 1:
+ Kn = Kn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Vn = Vn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Kn = Kn.reshape(bsz, n_heads, cached_len, head_dim)
+ Vn = Vn.reshape(bsz, n_heads, cached_len, head_dim)
+ pass
+ # else:
+ # Kn, Vn = Kn, Vn
+ # pass
+
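+ # Granite applies the module's own attention scaling factor (self.scaling), not 1/sqrt(head_dim) as in Llama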
+ Qn *= self.scaling
+ A = torch_matmul(Qn, Kn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
+
+ # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
+
+ A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
+ A = torch_matmul(A, Vn, out = Qn)
+ # else:
+ # A = scaled_dot_product_attention(Qn, Kn, Vn, attn_mask = attention_mask, is_causal = False)
+ # pass
+ A = A.transpose(1, 2)
+ A = A.reshape(bsz, 1, attention_size)
+ A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
+ return A, (Kn, Vn)
+pass
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
+# @torch.inference_mode
+def GraniteModel_fast_forward_inference(
+ self,
+ input_ids,
+ past_key_values,
+ position_ids,
+ attention_mask = None,
+):
+ input_ids = input_ids[:,:self.max_seq_length]
+ hidden_states = self.model.embed_tokens(input_ids)
+ hidden_states = hidden_states.to(self.config.torch_dtype)
+ hidden_states *= self.model.embedding_multiplier
+
+ bsz, q_len, hd = hidden_states.shape
+ seq_len = past_key_values[0][0].shape[-2]
+ if bsz != 1:
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (bsz, q_len),
+ hidden_states,
+ seq_len,
+ )
+ else:
+ attention_mask = None
+ pass
+
+ position_embeddings = self.model.rotary_emb(hidden_states, position_ids, self.max_seq_length)
+
+ next_decoder_cache = []
+ for idx, decoder_layer in enumerate(self.model.layers):
+
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(decoder_layer.input_layernorm, hidden_states)
+ hidden_states, present_key_value = GraniteAttention_fast_forward_inference(
+ decoder_layer.self_attn,
+ hidden_states = hidden_states,
+ past_key_value = past_key_values[idx],
+ position_ids = position_ids,
+ attention_mask = attention_mask,
+ do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
+ position_embeddings = position_embeddings,
+ )
+
+ hidden_states = torch.add(residual, hidden_states, alpha = self.config.residual_multiplier)
+
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(decoder_layer.post_attention_layernorm, hidden_states)
+ hidden_states = fast_swiglu_inference(decoder_layer.mlp, hidden_states)
+ hidden_states = torch.add(residual, hidden_states, alpha = self.config.residual_multiplier)
+
+ next_decoder_cache.append(present_key_value)
+ pass
+ hidden_states = fast_rms_layernorm_inference(self.model.norm, hidden_states)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state = hidden_states,
+ past_key_values = next_decoder_cache,
+ hidden_states = [],
+ attentions = [],
+ )
+pass
+
+class GraniteRotaryEmbedding(LlamaRotaryEmbedding):
+ def __init__(self, config):
+ super().__init__(config = config)
+
+class FastGraniteModel(FastLlamaModel):
+
+ @staticmethod
+ def pre_patch():
+ init_name, function = patch_linear_scaling(
+ model_name = "granite",
+ rope_module = GraniteRotaryEmbedding,
+ scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
+ attention_module = GraniteAttention,
+ )
+ if init_name is not None:
+ exec(function, globals())
+ GraniteAttention.__init__ = eval(init_name)
+ pass
+ GraniteAttention .forward = GraniteAttention_fast_forward
+ GraniteSdpaAttention .forward = GraniteAttention_fast_forward
+ GraniteFlashAttention2.forward = GraniteAttention_fast_forward
+ GraniteDecoderLayer .forward = GraniteDecoderLayer_fast_forward
+ GraniteModel .forward = LlamaModel_fast_forward
+ GraniteForCausalLM .forward = CausalLM_fast_forward(GraniteModel_fast_forward_inference)
+ PeftModelForCausalLM .forward = PeftModelForCausalLM_fast_forward
+ fix_prepare_inputs_for_generation(GraniteForCausalLM)
+
+ import transformers.models.granite.modeling_granite
+ transformers.models.granite.modeling_granite.GraniteRotaryEmbedding = GraniteRotaryEmbedding
+
+ return
+ pass
+
+
+ @staticmethod
+ def post_patch(model):
+
+ # Torch.compile fails on embedding matrix??
+ # Workaround randomly fixes it for torch versions < 2.2
+ model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight)
+ model.config.update({"unsloth_version" : __version__})
+
+ # We also do this for the lm_head
+ lm_head = torch.nn.Linear(1, 1, bias = None)
+ del lm_head.weight
+ lm_head.weight = model.lm_head.weight
+ lm_head.in_features = lm_head.weight.shape[1]
+ lm_head.out_features = lm_head.weight.shape[0]
+ model.lm_head = lm_head
+
+ # Granite has tied weights! This means lm_head == embed_tokens
+ if model.model.embed_tokens.weight.data_ptr() != model.lm_head.weight.data_ptr():
+ lm_head = torch.nn.Linear(1, 1, bias = None)
+ del lm_head.weight
+ lm_head.weight = model.model.embed_tokens.weight
+ lm_head.in_features = lm_head.weight.shape[1]
+ lm_head.out_features = lm_head.weight.shape[0]
+ model.lm_head = lm_head
+ pass
+
+ # Also patch all dtypes - BnB seems to not allocate the correct type?
+ # BnB default dtype seems to be float16!
+ correct_dtype = lm_head.weight.dtype
+
+ for name, module in model.named_modules():
+ if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)):
+ weight = module.weight
+ quant_state = weight.quant_state
+
+ if type(quant_state) is list:
+ # BnB seems to have float16 as default!
+ module.weight.quant_state[2] = correct_dtype # Cast to correct dtype
+ else:
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
+ quant_state.dtype = correct_dtype
+ pass
+ pass
+ # Downcast RoPE embedding to correct data type
+ if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")):
+
+ if hasattr(module, "cos_cached") and \
+ (module.cos_cached.dtype != correct_dtype):
+
+ module.cos_cached = module.cos_cached.to(correct_dtype)
+ module.sin_cached = module.sin_cached.to(correct_dtype)
+
+ elif hasattr(module, "short_cos_cached") and \
+ (module.short_cos_cached.dtype != correct_dtype):
+
+ module.short_cos_cached = module.short_cos_cached.to(correct_dtype)
+ module.short_sin_cached = module.short_sin_cached.to(correct_dtype)
+ pass
+ pass
+ pass
+
+ # Clear deleted GPU items
+ import gc
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ return model
+ pass
+pass
+
diff --git a/unsloth-main/unsloth/models/llama.py b/unsloth-main/unsloth/models/llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..c94514966f9062c6b47a56fe4420082d3a5981d8
--- /dev/null
+++ b/unsloth-main/unsloth/models/llama.py
@@ -0,0 +1,2578 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import gc
+import math
+from typing import Optional, Tuple, List, Union
+from ._utils import *
+from ._utils import __version__
+from torch.nn.functional import scaled_dot_product_attention
+from transformers import __version__ as transformers_version
+from transformers.models.llama.modeling_llama import (
+ logger,
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+)
+from transformers.modeling_attn_mask_utils import (
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from ..kernels import *
+from ..tokenizer_utils import *
+if HAS_FLASH_ATTENTION:
+ from flash_attn import flash_attn_func
+
+# Final patching code
+from transformers.models.llama.modeling_llama import (
+ LlamaAttention,
+ LlamaDecoderLayer,
+ LlamaModel,
+ LlamaForCausalLM,
+)
+
+# For Pytorch 2.1.1
+try:
+ from transformers.models.llama.modeling_llama import (
+ LlamaSdpaAttention,
+ LlamaFlashAttention2,
+ )
+except:
+ LlamaSdpaAttention = LlamaAttention
+ LlamaFlashAttention2 = LlamaAttention
+pass
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
+from transformers import set_seed as transformers_set_seed
+from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model
+from peft import PeftModelForCausalLM
+from ..save import patch_saving_functions
+import re, os, inspect, math, sys
+try:
+ from huggingface_hub.utils import get_token
+except:
+ # Old HF Hub versions <= 0.0.25
+ from huggingface_hub.utils._token import get_token
+pass
+from triton import __version__ as triton_version
+BlockDiagonalCausalMask = xformers.attn_bias.BlockDiagonalCausalMask if xformers is not None else None
+
+
+def original_apply_qkv(self, X):
+ Q = self.q_proj(X)
+ K = self.k_proj(X)
+ V = self.v_proj(X)
+ return Q, K, V
+pass
+
+
+def original_apply_o(self, X):
+ O = self.o_proj(X)
+ return O
+pass
+
+from math import sqrt as math_sqrt
+KV_CACHE_INCREMENT = 256 # KV Cache update size
+torch_nn_functional_softmax = torch.nn.functional.softmax
+
+# Fix new HF's inference code
+def _fast_prepare_inputs_for_generation(self, input_ids, **kwargs,):
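+ # With a KV cache present, only the newest token (and its attention-mask column) needs to be passed in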
+ if "past_key_values" in kwargs:
+ input_ids = input_ids[:,[-1]]
+ kwargs["attention_mask"] = kwargs["attention_mask"][:,[-1]]
+ if "cache_position" in kwargs:
+ kwargs["position_ids"] = kwargs["cache_position"]
+ return { "input_ids" : input_ids, **kwargs, }
+pass
+
+
+def fix_prepare_inputs_for_generation(module):
+ # Fix prepare_inputs_for_generation
+ if hasattr(module, "prepare_inputs_for_generation"):
+ module.prepare_inputs_for_generation = _fast_prepare_inputs_for_generation
+ pass
+pass
+
+torch_matmul = torch.matmul
+def LlamaAttention_fast_forward_inference(
+ self,
+ hidden_states: torch.Tensor,
+ past_key_value: Optional[Tuple[torch.Tensor]],
+ position_ids,
+ do_prefill = False,
+ attention_mask = None,
+):
+ """
+ https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L406
+ Fast inference using KV cache.
+ QK^T can be computed in 4 chunks
+
+ [Q, q] @ [K, k].T where q, k are the new tokens.
+ [QK^T, Qk^T]
+ [qK^T, qk^T]
+
+ Since the attention mask wipes Qk^T, we just get
+ [QK^T, 0]
+ [qK^T, qk^T]
+
+ Since softmax is row-wise, we get
+ softmax([QK^T, 0])
+ softmax([qK^T, qk^T])
+
+ We then multiply by [V]
+ [v]
+ softmax([QK^T, 0]) [softmax(QK^T)V] *
+ softmax([qK^T, qk^T]) [softmax([qK^T, qk^T]) @ [V, v]]
+
+ But notice that the starred term [softmax(QK^T)V] is just the attention output
+ from the previous steps, so we only need to compute the final row.
+
+ This means we can pass in a row of Q, but we need to
+ remember K and V, which are called the KV cache.
+ """
+ Xn = hidden_states
+ bsz, _, hd = hidden_states.size()
+ K1, V1 = past_key_value
+ dtype = Xn.dtype
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ attention_size = n_heads*head_dim
+ # assert(n_kv_heads * n_groups == n_heads)
+ seq_len = K1.shape[-2]
+ kv_seq_len = seq_len + 1
+
+ # Prefill phase
+ # if not hasattr(self, "paged_attention"):
+ if do_prefill:
+ self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda:0")
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3)
+ self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3)
+ self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0")
+ self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0")
+ self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+
+ # Mistral Nemo 12b has weird dimensions
+ if attention_size != self.hidden_size:
+ self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
+ else:
+ self.temp_O = self.temp_QA[1][:,:,:self.hidden_size]
+ pass
+
+ self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
+ self.scalar = 1.0 / math_sqrt(self.head_dim)
+ self.half_head_dim = head_dim // 2
+ elif kv_seq_len >= self.paged_attention.shape[0]:
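+ # KV cache is full: grow the paged buffer and the score buffer by another KV_CACHE_INCREMENT tokens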
+ self.paged_attention.resize_((self.paged_attention.shape[0]+KV_CACHE_INCREMENT, 2, bsz, n_kv_heads, head_dim))
+ self.paged_attention_K = self.paged_attention[:,0]
+ self.paged_attention_V = self.paged_attention[:,1]
+ self.attention.resize_((bsz, n_heads, 1, self.attention.shape[-1]+KV_CACHE_INCREMENT))
+ pass
+
+ Qn = fast_linear_forward(self.q_proj, Xn, out = self.temp_QA[0])
+ Kn = fast_linear_forward(self.k_proj, Xn, out = self.temp_KV[0])
+ Vn = fast_linear_forward(self.v_proj, Xn, out = self.temp_KV[1])
+ Qn = Qn.view(bsz, 1, n_heads, head_dim).transpose(1, 2)
+ Kn = Kn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+ Vn = Vn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
+
+ # cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
+ # Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
+
+ # Extend the RoPE cache 2 steps before the short KV cache fills up,
+ # otherwise indexing errors occur
+ self.rotary_emb.extend_rope_embedding(Vn, seq_len + 2)
+ cos, sin = self.rotary_emb.get_cached(kv_seq_len)
+ cos = cos[position_ids].unsqueeze(1)
+ sin = sin[position_ids].unsqueeze(1)
+ h = self.half_head_dim
+
+ RH_Q = self.RH_Q
+ RH_Q[:,:,:,:h] = Qn[:,:,:,h:]
+ RH_Q[:,:,:,h:] = Qn[:,:,:,:h]
+ torch.neg(RH_Q[:,:,:,:h], out = RH_Q[:,:,:,:h])
+ Qn *= cos
+ Qn.addcmul_(RH_Q, sin)
+
+ RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+ RH_K[:,:,:,:h] = Kn[:,:,:,h:]
+ RH_K[:,:,:,h:] = Kn[:,:,:,:h]
+ torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h])
+ Kn *= cos
+ Kn.addcmul_(RH_K, sin)
+
+ # New KV cache
+ # Kn = torch.cat([K1, Kn], dim = 2)
+ # Vn = torch.cat([V1, Vn], dim = 2)
+ self.paged_attention_K[seq_len] = Kn.permute(2, 0, 1, 3)
+ self.paged_attention_V[seq_len] = Vn.permute(2, 0, 1, 3)
+ Kn = self.paged_attention_K[:kv_seq_len].permute(1, 2, 0, 3)
+ Vn = self.paged_attention_V[:kv_seq_len].permute(1, 2, 0, 3)
+
+ # Handle sliding windows
+ sliding_window = getattr(self.config, "sliding_window", None)
+ if sliding_window is not None and kv_seq_len > sliding_window:
+ # From https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py#L193
+ slicing_tokens = 1 - sliding_window
+ Knn = Kn[:, :, slicing_tokens:, :]#.contiguous()
+ Vnn = Vn[:, :, slicing_tokens:, :]#.contiguous()
+ else:
+ Knn, Vnn = Kn, Vn
+ pass
+
+ # Grouped query attention
+ _, _, cached_len, _ = Knn.shape
+ if n_groups != 1:
+ Knn = Knn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Vnn = Vnn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
+ Knn = Knn.reshape(bsz, n_heads, cached_len, head_dim)
+ Vnn = Vnn.reshape(bsz, n_heads, cached_len, head_dim)
+ pass
+ # else:
+ # Knn, Vnn = Knn, Vnn
+ # pass
+
+ # Attention
+ if bsz == 1:
+ Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963
+ # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows
+ A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
+ # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
+ A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
+ A = torch_matmul(A, Vnn, out = Qn)
+ else:
+ A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False)
+ pass
+ A = A.transpose(1, 2)
+ A = A.reshape(bsz, 1, attention_size)
+ A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
+ return A, (Kn, Vn)
+pass
+
+
+torch_nn_functional_silu = torch.nn.functional.silu
+def fast_swiglu_inference(self, X):
+ # gate = self.gate_proj(X)
+ # up = self.up_proj(X)
+ bsz, _, hd = X.shape
+ # mlp_size = self.config.intermediate_size
+ # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda:0")
+
+ gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0])
+ up = fast_linear_forward(self. up_proj, X)#, out = temp[1])
+ gate = torch_nn_functional_silu(gate, inplace = True)
+ gate *= up
+
+ # X = self.down_proj(gate)
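+ # Reuse the front of the up-projection buffer as the output to avoid an extra allocation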
+ down = fast_linear_forward(self.down_proj, gate, out = up[:,:,:hd])
+ return down
+pass
+
+
+def fast_rms_layernorm_inference(self, X):
+ old_dtype = X.dtype
+ XX = X.to(torch.float32)
+ variance = XX.square().mean(-1, keepdim = True)
+ variance += self.variance_epsilon
+ XX *= variance.rsqrt_()
+ X = XX.to(old_dtype) # Must preserve due to residual
+ X *= self.weight
+ return X
+pass
+
+
+def fast_rms_layernorm_inference_gemma(self, X, out_weight = None):
+ XX = X.to(torch.float32)
+ variance = XX.square().mean(-1, keepdim = True)
+ variance += self.variance_epsilon
+ XX *= variance.rsqrt_()
+
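+ # Gemma stores its RMSNorm weight as (w - 1), so add 1 back before scaling: output * (1 + weight)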
+ if out_weight is None:
+ out_weight = self.weight + 1.0
+ else:
+ out_weight[:] = self.weight
+ out_weight += 1.0
+ pass
+
+ XX *= out_weight
+ return XX.to(X.dtype)
+pass
+
+
+# Normal layernorm with mean removal
+@torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
+def fast_layernorm_compiled(layernorm, X):
+ old_dtype = X.dtype
+ X = X.float()
+ mean = X.mean(-1, keepdim = True)
+ Xbar = X - mean
+ X = Xbar * torch.rsqrt(Xbar.square().mean(-1, keepdim = True) + \
+ layernorm.variance_epsilon) * \
+ layernorm.weight.float()
+ return X.to(old_dtype)
+pass
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L320
+def LlamaAttention_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ *args, **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+
+ # Clear inference
+ if hasattr(self, "paged_attention"):
+ del self.paged_attention_K
+ del self.paged_attention_V
+ del self.paged_attention
+ del self.temp_QA
+ del self.temp_KV
+ del self.RH_Q
+ del self.attention
+ pass
+
+ bsz, q_len, _ = hidden_states.size()
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ assert(n_kv_heads * n_groups == n_heads)
+
+ Q, K, V = self.apply_qkv(self, hidden_states)
+ Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+ K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+ V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+
+ kv_seq_len = K.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ if position_embeddings:
+ cos, sin = position_embeddings
+ else:
+ # Extend RoPE dynamically to fit in VRAM
+ rotary_emb = self.rotary_emb
+ rotary_emb.extend_rope_embedding(V, seq_len=kv_seq_len)
+
+ if position_ids is None:
+ # Useful for LongRoPE
+ cos, sin = rotary_emb.get_cached(kv_seq_len)
+ else:
+ cos, sin = rotary_emb(V, seq_len=kv_seq_len)
+
+ Q, K = (
+ fast_rope_embedding(Q, K, cos, sin)
+ if position_ids is None
+ else inplace_rope_embedding(Q, K, cos, sin, position_ids)
+ )
+
+ if past_key_value is not None:
+ K = torch.cat([past_key_value[0], K], dim = 2)
+ V = torch.cat([past_key_value[1], V], dim = 2)
+ pass
+ past_key_value = (K, V) if use_cache else None
+
+ # Attention module
+ if (not HAS_FLASH_ATTENTION and attention_mask is None):
+ # Xformers memory efficient attention
+ # Also has Flash Attention v2 dispatching
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+
+ # Group query attention
+ if n_groups != 1:
+ K = K .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ V = V .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ K = K.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
+ V = V.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
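+ # Training needs the expanded KV materialized; for inference xformers accepts the grouped 5-D layout directly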
+ if hidden_states.requires_grad:
+ K = K.reshape(bsz, kv_seq_len, n_heads, head_dim)
+ V = V.reshape(bsz, kv_seq_len, n_heads, head_dim)
+ else:
+ Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
+ pass
+ A = xformers_attention(Q, K, V, attn_bias = causal_mask)
+ A = A.view(bsz, q_len, n_heads, head_dim)
+
+ elif HAS_FLASH_ATTENTION and attention_mask is None:
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+ A = flash_attn_func(Q, K, V, causal = True)
+ else:
+ # Grouped query attention
+ if n_groups != 1:
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ K = K.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ V = V.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ pass
+ # Must be contiguous or else results are False!
+ # https://github.com/pytorch/pytorch/issues/112577
+ Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
+ # Needs (batch_size, n_heads, seq_len, head_dim)
+ # is_causal and attention_mask must not both be set!
+ A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, is_causal = False)
+ # Go back to (batch_size, seq_len, n_heads, head_dim)
+ A = A.transpose(1, 2).contiguous()
+ pass
+ attn_output = A.reshape(bsz, q_len, n_heads*head_dim)
+ attn_output = self.apply_o(self, attn_output)
+ attn_weights = None
+ return attn_output, attn_weights, past_key_value
+pass
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
+def LlamaDecoderLayer_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ *args, **kwargs,
+) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+ if use_cache and hasattr(self, "_flag_for_generation"):
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(self.input_layernorm, hidden_states)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states = hidden_states,
+ causal_mask = causal_mask,
+ attention_mask = attention_mask,
+ position_ids = position_ids,
+ past_key_value = past_key_value,
+ output_attentions = output_attentions,
+ use_cache = use_cache,
+ padding_mask = padding_mask,
+ position_embeddings = position_embeddings,
+ )
+ hidden_states += residual
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(self.post_attention_layernorm, hidden_states)
+ hidden_states = fast_swiglu_inference(self.mlp, hidden_states)
+ hidden_states += residual
+ else:
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states = hidden_states,
+ causal_mask = causal_mask,
+ attention_mask = attention_mask,
+ position_ids = position_ids,
+ past_key_value = past_key_value,
+ output_attentions = output_attentions,
+ use_cache = use_cache,
+ padding_mask = padding_mask,
+ position_embeddings = position_embeddings,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+ pass
+
+ outputs = (hidden_states,)
+ if output_attentions: outputs += (self_attn_weights,)
+ if use_cache: outputs += (present_key_value,)
+ return outputs
+pass
+
+
+# https://github.com/unslothai/unsloth/issues/404#issuecomment-2323473452
+__DTYPE_MAP = {
+ "float32": torch.float32,
+ torch.float32: torch.float32,
+ "float16": torch.float16,
+ torch.float16: torch.float16,
+ "bfloat16": torch.bfloat16,
+ torch.bfloat16: torch.bfloat16,
+}
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
+def LlamaModel_fast_forward(
+ self,
+ input_ids: torch.LongTensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ *args, **kwargs,
+) -> Union[Tuple, BaseModelOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ assert(output_attentions is False)
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("Unsloth: You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("Unsloth: You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ seq_length_with_past = seq_length
+
+ # Fix out of bounds tokenization
+ if hasattr(self, "max_seq_length"):
+ if seq_length > self.max_seq_length:
+ logger.warning_once(
+ f"Unsloth: Input IDs of length {seq_length} > the model's max sequence length of {self.max_seq_length}.\n"\
+ "We shall truncate it ourselves. It's imperative if you correct this issue first."
+ )
+ if input_ids is not None:
+ input_ids = input_ids[:,:self.max_seq_length]
+ elif inputs_embeds is not None:
+ inputs_embeds = inputs_embeds[:,:self.max_seq_length,:]
+ pass
+ pass
+
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+ pass
+
+ # We already handle KV cache position_ids ourselves.
+ if False:#(past_key_values_length != 0):
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length,
+ dtype = torch.int32,
+ device = "cuda:0",
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ elif position_ids is not None:
+ position_ids = position_ids.view(-1, seq_length).to(torch.int32)#.long()
+ else:
+ position_ids = None
+ pass
+
+ if position_ids is not None:
+ if position_ids.shape[0] != batch_size:
+ position_ids = position_ids.repeat((batch_size, 1))
+ pass
+
+ # Embed positions
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ # inputs_embeds = inputs_embeds.to(self.config.torch_dtype)
+ torch_dtype = __DTYPE_MAP.get(self.config.torch_dtype, None)
+ if torch_dtype is not None:
+ inputs_embeds = inputs_embeds.to(torch_dtype)
+ else:
+ raise TypeError("Unsloth: torch_dtype for models is not bfloat16, float16 or float32!")
+ pass
+
+ # Normalized from Gemma
+ IS_GEMMA = self.config.model_type.startswith("gemma")
+ IS_GEMMA2 = self.config.model_type.startswith("gemma2")
+ IS_COHERE = self.config.model_type.startswith("cohere")
+ IS_GRANITE = self.config.model_type.startswith("granite")
+ train_embed_tokens = self.embed_tokens.weight.requires_grad
+
+ if IS_GEMMA:
+ # Match Gemma exactly by casting to bfloat16 / float16
+ # inputs_embeds *= math_sqrt(self.config.hidden_size)
+ # Ie 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
+ # & 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
+ normalizer = torch.tensor(math_sqrt(self.config.hidden_size), dtype = inputs_embeds.dtype)
+
+ if train_embed_tokens:
+ # Careful we must not do an inplace op!
+ inputs_embeds = inputs_embeds * normalizer
+ else:
+ inputs_requires_grad = inputs_embeds.requires_grad
+ if not inputs_embeds.is_leaf:
+ inputs_embeds = inputs_embeds.detach()
+ inputs_requires_grad = True
+ elif inputs_requires_grad:
+ inputs_embeds.requires_grad_(False)
+ pass
+ inputs_embeds *= normalizer
+ # inputs_embeds *= math_sqrt(self.config.hidden_size)
+ if inputs_requires_grad: inputs_embeds.requires_grad_(True)
+ pass
+ pass
+
+ # Fix up attention mask by setting elements to 0
+ # Specifically for DPO
+ if self._has_no_labels and (attention_mask is not None) and (past_key_values is None) and \
+ (not train_embed_tokens):
+ # Careful for inference the attention_mask is size (1, kv_seq_len)
+ # Whilst the input_embeds is size (1, 1, 4096)
+ inputs_requires_grad = inputs_embeds.requires_grad
+ if not inputs_embeds.is_leaf:
+ inputs_embeds = inputs_embeds.detach()
+ inputs_requires_grad = True
+ elif inputs_requires_grad:
+ inputs_embeds.requires_grad_(False)
+ pass
+ inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2)
+ if inputs_requires_grad: inputs_embeds.requires_grad_(True)
+ pass
+
+ # Ignore attention_mask
+ if attention_mask is None:
+ padding_mask = None
+ elif self.training:
+ attention_mask = None
+ padding_mask = None
+ else:
+ # if 0 in attention_mask:
+ # padding_mask = attention_mask
+ # else:
+ padding_mask = None
+
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_key_values_length,
+ sliding_window = getattr(self.config, "sliding_window", None),
+ )
+ pass
+
+ hidden_states = inputs_embeds
+ if IS_GRANITE: #granite has embedding multiplier
+ hidden_states = self.embedding_multiplier * hidden_states
+
+ if past_key_values is None and self.training:
+ use_cache = False
+ # if use_cache:
+ # logger.warning_once(
+ # "Unsloth: `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`"
+ # )
+ # use_cache = False
+ pass
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ # Gradient checkpointing methods (ie sqrt)
+ if hasattr(self, "_gradient_checkpointing_boundaries"):
+ boundaries = self._gradient_checkpointing_boundaries
+ else:
+ boundaries = None
+ pass
+
+ # Check checkpointing method
+ gradient_checkpointing = False
+ offloaded_gradient_checkpointing = False
+
+ if (self.gradient_checkpointing and self.training and not use_cache):
+
+ gradient_checkpointing = True
+
+ if output_attentions is False and hasattr(self, "_offloaded_gradient_checkpointing"):
+ offloaded_gradient_checkpointing = True
+ pass
+
+ # Gemma2 has alternating SWA and global attn
+ use_static_mask = True
+ dynamic_SWA_mask = None
+ dynamic_GA_mask = None
+ if IS_GEMMA2:
+ if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None:
+ self.SWA_mask = True
+ self.GA_mask = False
+ elif attention_mask is not None:
+
+ # Fixes https://github.com/unslothai/unsloth/issues/853
+ # Unsloth needs a 2D mask, not a [2, 1, n, n] mask!
+ dynamic_SWA_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_key_values_length,
+ sliding_window = self.config.sliding_window,
+ )[0][0]
+ dynamic_GA_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_key_values_length,
+ sliding_window = None,
+ )[0][0]
+ use_static_mask = False
+
+ elif not hasattr(self, "SWA_mask"):
+ if HAS_FLEX_ATTENTION:
+ # Use Flex Attention instead!
+ self.SWA_mask = create_flex_attention_sliding_window_mask(self.max_seq_length, self.config.sliding_window)
+ self.GA_mask = create_flex_attention_causal_mask(self.max_seq_length)
+ else:
+ n = self.max_seq_length # self.config.max_position_embeddings
+ # masked_fill is making stuff slower!
+ # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0)
+ # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window)
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+ self.SWA_mask = AttentionMaskConverter(
+ is_causal = True,
+ sliding_window = self.config.sliding_window,
+ )\
+ .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
+ .squeeze(0).squeeze(0)
+
+ self.GA_mask = AttentionMaskConverter(
+ is_causal = True,
+ )\
+ .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
+ .squeeze(0).squeeze(0)
+ pass
+ pass
+ pass
+
+ if transformers_version > "4.47.1" and hasattr(self, "rotary_emb"):
+ # Transformers main has made it mandatory to pass position_embeddings
+ # https://github.com/huggingface/transformers/pull/34858
+ position_embeddings = self.rotary_emb(hidden_states, position_ids, self.config.max_position_embeddings)
+ else:
+ position_embeddings = None
+
+ # Go through every layer!
+ for idx, decoder_layer in enumerate(self.layers):
+
+ if output_hidden_states: all_hidden_states += (hidden_states,)
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ mask = causal_mask
+ if IS_GEMMA2:
+ if (idx % 2 == 0):
+ mask = self.SWA_mask if use_static_mask else dynamic_SWA_mask
+ else:
+ mask = self. GA_mask if use_static_mask else dynamic_GA_mask
+ pass
+
+ if offloaded_gradient_checkpointing:
+ hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply(
+ decoder_layer,
+ hidden_states,
+ mask,
+ attention_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ use_cache,
+ None,
+ position_embeddings,
+ )[0]
+
+ elif gradient_checkpointing:
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, past_key_value, output_attentions, padding_mask = padding_mask, position_embeddings = position_embeddings)
+ return custom_forward
+ pass
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ mask,
+ attention_mask,
+ position_ids,
+ use_reentrant = True,
+ preserve_rng_state = False,
+ )
+ hidden_states = layer_outputs[0]
+
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ causal_mask=mask,
+ attention_mask = attention_mask,
+ position_ids = position_ids,
+ past_key_value = past_key_value,
+ output_attentions = output_attentions,
+ use_cache = use_cache,
+ padding_mask = padding_mask,
+ position_embeddings = position_embeddings,
+ )
+ hidden_states = layer_outputs[0]
+ pass
+
+ if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+ if output_attentions: all_self_attns += (layer_outputs[1],)
+ pass
+
+ # Final layernorm
+ if use_cache:
+ hidden_states = \
+ (fast_rms_layernorm_inference_gemma if IS_GEMMA else fast_rms_layernorm_inference)\
+ (self.norm, hidden_states)
+ elif IS_COHERE:
+ hidden_states = self.norm(hidden_states)
+ else:
+ hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA)
+ pass
+
+ if output_hidden_states: all_hidden_states += (hidden_states,)
+ next_cache = next_decoder_cache if use_cache else None
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+pass
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
+def LlamaModel_fast_forward_inference(
+ self,
+ input_ids,
+ past_key_values,
+ position_ids,
+ attention_mask = None,
+):
+ input_ids = input_ids[:,:self.max_seq_length]
+ hidden_states = self.model.embed_tokens(input_ids)
+ hidden_states = hidden_states.to(self.config.torch_dtype)
+ bsz, q_len, hd = hidden_states.shape
+ seq_len = past_key_values[0][0].shape[-2]
+ if bsz != 1:
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (bsz, q_len),
+ hidden_states,
+ seq_len,
+ sliding_window = getattr(self.config, "sliding_window", None),
+ )
+ else:
+ attention_mask = None
+ pass
+
+ next_decoder_cache = []
+ for idx, decoder_layer in enumerate(self.model.layers):
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(decoder_layer.input_layernorm, hidden_states)
+ hidden_states, present_key_value = LlamaAttention_fast_forward_inference(
+ decoder_layer.self_attn,
+ hidden_states = hidden_states,
+ past_key_value = past_key_values[idx],
+ position_ids = position_ids,
+ attention_mask = attention_mask,
+ do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
+ )
+ hidden_states += residual
+
+ residual = hidden_states
+ hidden_states = fast_rms_layernorm_inference(decoder_layer.post_attention_layernorm, hidden_states)
+ hidden_states = fast_swiglu_inference(decoder_layer.mlp, hidden_states)
+ hidden_states += residual
+
+ next_decoder_cache.append(present_key_value)
+ pass
+ hidden_states = fast_rms_layernorm_inference(self.model.norm, hidden_states)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state = hidden_states,
+ past_key_values = next_decoder_cache,
+ hidden_states = [],
+ attentions = [],
+ )
+pass
+
+
+def CausalLM_fast_forward(fast_forward_inference):
+ def _CausalLM_fast_forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ num_logits_to_keep: Optional[int] = 0,
+ *args, **kwargs,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ if past_key_values is not None:
+ outputs = fast_forward_inference(
+ self,
+ input_ids,
+ past_key_values,
+ position_ids = position_ids,
+ attention_mask = attention_mask,
+ )
+ else:
+ causal_mask = xformers.attn_bias.LowerTriangularMask()
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ self.model._has_no_labels = labels is None
+ outputs = self.model(
+ input_ids=input_ids,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ pass
+ hidden_states = outputs[0]
+
+ bsz, q_len, hd = hidden_states.shape
+ lm_head = self.lm_head.weight
+ logit_softcapping = getattr(self.config, "final_logit_softcapping", 0)
+ logit_scaling = getattr(self.config, "logit_scale", 0)
+
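+ # Single-token decode: compute logits with a matrix-vector product instead of a full lm_head call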
+ if bsz == 1 and q_len == 1:
+ logits = torch.mv(lm_head, hidden_states.ravel().to(lm_head.dtype))
+ logits = logits.unsqueeze(0).unsqueeze(0)
+ elif num_logits_to_keep != 0:
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(lm_head.dtype))
+ else:
+ RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1"
+ # For <= 1024 tokens the normal (logit-returning) Unsloth path uses less VRAM
+ if bsz*q_len <= 1024: RETURN_LOGITS = True
+
+ if not RETURN_LOGITS and HAS_CUT_CROSS_ENTROPY and labels is not None:
+ n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None)
+ loss = fused_linear_cross_entropy(
+ hidden_states = hidden_states,
+ lm_weight = lm_head,
+ labels = labels,
+ num_items_in_batch = n_items,
+ logit_softcapping = logit_softcapping,
+ )
+ if not return_dict:
+ output = (EMPTY_LOGITS,) + outputs[1:]  # logits are not materialized on this path
+ return (loss,) + output if loss is not None else output
+
+ output = CausalLMOutputWithPast(
+ loss=loss,
+ logits=EMPTY_LOGITS,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+ return output
+ pass
+ logits = self.lm_head(hidden_states.to(lm_head.dtype))
+ pass
+
+ torch_dtype = __DTYPE_MAP.get(self.config.torch_dtype, None)
+ if torch_dtype is not None:
+ logits = logits.to(torch_dtype)
+ else:
+ raise TypeError("Unsloth: torch_dtype for models is not bfloat16, float16 or float32!")
+ pass
+
+ loss = None
+ logit_softcapping = getattr(self.config, "final_logit_softcapping", 0)
+ logit_scaling = getattr(self.config, "logit_scale", 0)
+ if self.config.model_type == "granite":
+ # Granite uses "logits_scaling" as its config key and divides by the scale, unlike Cohere
+ # Note that Granite's logits_scaling is 16, whereas Cohere's logit_scale is 0.125 (i.e. 1/8) in their respective configs
+ # granite: https://github.com/huggingface/transformers/blob/4d1d0f29a493098e6bc6b904b82e29cb331827f5/src/transformers/models/granite/modeling_granite.py#L1103
+ # cohere: https://github.com/huggingface/transformers/blob/4d1d0f29a493098e6bc6b904b82e29cb331827f5/src/transformers/models/cohere/modeling_cohere.py#L1176
+ logit_scaling = 1 / getattr(self.config, "logits_scaling", 1)
+
+ if labels is not None:
+ shift_logits = logits
+ if not hasattr(self, "extra_ignored_labels"):
+ # Fixes https://github.com/unslothai/unsloth/issues/10
+ self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda:0")
+ pass
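+ # Shift labels left by one and pad with -100 so logits at position t are scored against token t+1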
+ shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]]))
+ loss = fast_cross_entropy_loss(
+ logits = shift_logits,
+ labels = shift_labels,
+ logit_softcapping = logit_softcapping,
+ logit_scaling = logit_scaling,
+ n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None),
+ )
+ else:
+ if logit_scaling != 0:
+ if logits.requires_grad:
+ logits = logit_scaling * logits
+ else:
+ logits *= logit_scaling
+ pass
+ pass
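+ # Soft-cap the logits: logits = softcap * tanh(logits / softcap); out-of-place only when gradients are needed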
+ if logit_softcapping != 0:
+ if logits.requires_grad:
+ logits = (1.0 / logit_softcapping) * logits
+ logits = torch.tanh(logits)
+ logits = logit_softcapping * logits
+ else:
+ logits *= (1.0 / logit_softcapping)
+ torch.tanh(logits, out = logits)
+ logits *= logit_softcapping
+ pass
+ pass
+ pass
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+ pass
+ return _CausalLM_fast_forward
+pass
+
+
+@torch._disable_dynamo
+def PeftModelForCausalLM_fast_forward(
+ self,
+ input_ids=None,
+ causal_mask=None,
+ attention_mask=None,
+ inputs_embeds=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ task_ids=None,
+ num_logits_to_keep=0,
+ **kwargs,
+):
+ return self.base_model(
+ input_ids=input_ids,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ labels=labels,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ num_logits_to_keep=num_logits_to_keep,
+ **kwargs,
+ )
+pass
+
+
+# Solves https://github.com/unslothai/unsloth/issues/168
+# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
+# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
+# https://github.com/huggingface/transformers/pull/27931
+# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
+class LlamaRotaryEmbedding(torch.nn.Module):
+ # Fixes https://github.com/huggingface/transformers/pull/28837
+ # https://github.com/microsoft/DeepSpeed/issues/4932
+ # The precision of RoPE buffers is not correct, so we cast to int64.
+ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None,
+ config = None, # [TODO] Hack to pass in config - need to remove later
+ ):
+ super().__init__()
+ if config is not None:
+ # [TODO] Hack to pass in config - need to remove later
+ base = config.rope_theta
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+ dim = int((config.hidden_size // config.num_attention_heads))
+ device = "cuda"
+ max_position_embeddings = config.max_position_embeddings
+ pass
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ # Dynamic RoPE we first set it to a max of 4 * 8192 tokens then we iteratively grow this
+ self.current_rope_size = min(4 * 8192, self.max_position_embeddings)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
+ pass
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
+ # in FP32. They are applied (multiplied) in FP32 as well.
+ self.current_rope_size = seq_len
+ inv_freq = 1.0 / (
+ self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim)
+ )
+ t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float()
+
+ freqs = torch.outer(t, inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
+ pass
+
+ def forward(self, x, position_ids=None, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.current_rope_size:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype = x.dtype),
+ self.sin_cached[:seq_len].to(dtype = x.dtype),
+ )
+ pass
+
+ def get_cached(self, seq_len = None):
+ return self.cos_cached, self.sin_cached
+ pass
+
+ def extend_rope_embedding(self, x, seq_len):
+ if seq_len <= self.current_rope_size: return
+ # Iteratively grow by increments of 8192
+ self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192
+ self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype)
+ pass
+pass
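+# Worked example (illustrative): `extend_rope_embedding` rounds the requested length up to the
+# next multiple of 8192 before rebuilding the cos / sin cache, e.g. seq_len = 20_000 gives
+# ((20000 // 8192) + 1) * 8192 = 24_576 cached positions, so later calls up to 24_576 tokens
+# reuse the cache without another rebuild.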
+
+
+class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
+ """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+ # Fixes https://github.com/huggingface/transformers/pull/28837
+ # https://github.com/microsoft/DeepSpeed/issues/4932
+ # The precision of RoPE buffers is not correct, so we cast to int64.
+ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0,
+ config = None, # [TODO] Hack to pass in config - need to remove later
+ ):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config)
+ pass
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.current_rope_size = seq_len
+ inv_freq = 1.0 / (
+ self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim)
+ )
+ t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float()
+ t = t / self.scaling_factor
+
+ freqs = torch.outer(t, inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
+ pass
+pass
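+# Illustrative note: linear scaling divides the position indices by `scaling_factor` before the
+# outer product with `inv_freq`, so a factor of 2.0 lets a model trained on 4096 positions
+# address 8192 tokens while keeping the rotary angles inside the range seen during pretraining.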
+
+
+# See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736
+# For Llama 3.1
+class LlamaExtendedRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None,
+ config = None, # [TODO] Hack to pass in config - need to remove later
+ ):
+ super().__init__()
+ if config is not None:
+ # [TODO] Hack to pass in config - need to remove later
+ base = config.rope_theta
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+ dim = int((config.hidden_size // config.num_attention_heads))
+ device = "cuda"
+ max_position_embeddings = config.max_position_embeddings
+ pass
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+        # For dynamic RoPE we first cap the cache at 4 * 8192 tokens, then grow it iteratively
+ self.current_rope_size = min(4 * 8192, self.max_position_embeddings)
+
+ # Normal Llama-3 RoPE
+ inv_freq = 1.0 / (
+ self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim)
+ )
+ inv_freq = self.apply_scaling(inv_freq)
+ self.register_buffer("inv_freq", inv_freq, persistent = False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
+ pass
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
+ # in FP32. They are applied (multiplied) in FP32 as well.
+ self.current_rope_size = seq_len
+
+ t = torch.arange(self.current_rope_size, device=self.inv_freq.device, dtype=torch.int64).float()
+
+ freqs = torch.outer(t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
+ pass
+
+ # From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41
+ def apply_scaling(self, freqs: torch.Tensor):
+ # Values obtained from grid search
+ scale_factor = 8
+ low_freq_factor = 1
+ high_freq_factor = 4
+ old_context_len = 8192 # original llama3 length
+
+ low_freq_wavelen = old_context_len / low_freq_factor
+ high_freq_wavelen = old_context_len / high_freq_factor
+ new_freqs = []
+ for freq in freqs:
+ wavelen = 2 * math.pi / freq
+ if wavelen < high_freq_wavelen:
+ new_freqs.append(freq)
+ elif wavelen > low_freq_wavelen:
+ new_freqs.append(freq / scale_factor)
+ else:
+ assert low_freq_wavelen != high_freq_wavelen
+ smooth = (old_context_len / wavelen - low_freq_factor) / (
+ high_freq_factor - low_freq_factor
+ )
+ new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
+ return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
+ pass
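+    # Illustrative note: with scale_factor = 8 and old_context_len = 8192, frequencies whose
+    # wavelength is under 8192 / 4 = 2048 tokens are kept as-is, frequencies whose wavelength
+    # exceeds 8192 / 1 = 8192 tokens are divided by 8, and the band in between is blended
+    # linearly via `smooth` above.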
+
+ def forward(self, x, position_ids=None, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.current_rope_size:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype = x.dtype),
+ self.sin_cached[:seq_len].to(dtype = x.dtype),
+ )
+ pass
+
+ def get_cached(self, seq_len = None):
+ return self.cos_cached, self.sin_cached
+ pass
+
+ def extend_rope_embedding(self, x, seq_len):
+ if seq_len <= self.current_rope_size: return
+ # Iteratively grow by increments of 8192
+ self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192
+ self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype)
+ pass
+pass
+
+
+class LongRopeRotaryEmbedding(torch.nn.Module):
+ # For Phi 3.5 128K https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/modeling_phi3.py
+ def __init__(self,
+ dim = None,
+ max_position_embeddings = 131072,
+ original_max_position_embeddings = 4096,
+ base = 10000,
+ short_factor = None,
+ long_factor = None,
+ device = None,
+ config = None, # [TODO] Hack to pass in config - need to remove later
+ ):
+ super().__init__()
+ assert(short_factor is not None)
+ assert(long_factor is not None)
+ assert(type(original_max_position_embeddings) is int)
+
+ if config is not None:
+ # [TODO] Hack to pass in config - need to remove later
+ base = config.rope_theta
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+ dim = int((config.hidden_size // config.num_attention_heads))
+ device = "cuda"
+ max_position_embeddings = config.max_position_embeddings
+ pass
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.original_max_position_embeddings = original_max_position_embeddings
+ self.base = base
+        # For dynamic RoPE we start the cache at the original max position embeddings, then grow it iteratively
+ self.current_rope_size = min(original_max_position_embeddings, self.max_position_embeddings)
+
+        # LongRoPE is similar to RoPE, except short sequences use one cos / sin cache
+        # and long sequences use another cos / sin cache
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim
+ short_factor = torch.tensor(short_factor, device = "cpu", dtype = torch.float32)
+ long_factor = torch.tensor(long_factor, device = "cpu", dtype = torch.float32)
+ short_inv_freq = 1.0 / (short_factor * self.base**inv_freq_shape)
+ long_inv_freq = 1.0 / (long_factor * self.base**inv_freq_shape)
+
+ # Phi-3 Scale factor
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
+ if scale <= 1.0:
+ scaling_factor = 1.0
+ else:
+ scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
+ pass
+ self.scaling_factor = scaling_factor
+
+ # Short and long inv_freq
+ self.register_buffer("short_inv_freq", short_inv_freq, persistent = False)
+ self.register_buffer("long_inv_freq", long_inv_freq, persistent = False)
+ # Build here to make `torch.jit.trace` work.
+ # self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
+
+ # Short sequences
+ dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
+ t = torch.arange(original_max_position_embeddings, device=self.short_inv_freq.device, dtype=torch.int64).float()
+ freqs = torch.outer(t, self.short_inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos_cached = (emb.cos() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True)
+ sin_cached = (emb.sin() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True)
+ self.register_buffer("short_cos_cached", cos_cached, persistent=False)
+ self.register_buffer("short_sin_cached", sin_cached, persistent=False)
+ pass
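+    # Worked example (illustrative): with the defaults above, max_position_embeddings = 131072
+    # and original_max_position_embeddings = 4096 give scale = 32, so
+    # scaling_factor = sqrt(1 + log(32) / log(4096)) ~= 1.19, which multiplies both the short
+    # and the long cos / sin caches.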
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
+ # in FP32. They are applied (multiplied) in FP32 as well.
+ self.current_rope_size = seq_len
+
+ t = torch.arange(self.current_rope_size, device=self.long_inv_freq.device, dtype=torch.int64).float()
+ # Long sequences
+ freqs = torch.outer(t, self.long_inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos_cached = (emb.cos() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True)
+ sin_cached = (emb.sin() * self.scaling_factor).to(dtype=dtype, device=device, non_blocking=True)
+ self.register_buffer("long_cos_cached", cos_cached, persistent=False)
+ self.register_buffer("long_sin_cached", sin_cached, persistent=False)
+ pass
+
+ def forward(self, x, position_ids=None, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.current_rope_size:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ if seq_len < self.original_max_position_embeddings:
+ return (
+ self.short_cos_cached[:seq_len].to(dtype = x.dtype),
+ self.short_sin_cached[:seq_len].to(dtype = x.dtype),
+ )
+ else:
+ return (
+ self.long_cos_cached[:seq_len].to(dtype = x.dtype),
+ self.long_sin_cached[:seq_len].to(dtype = x.dtype),
+ )
+ pass
+ pass
+
+ def get_cached(self, seq_len = None):
+ if seq_len < self.original_max_position_embeddings:
+ return self.short_cos_cached, self.short_sin_cached
+ return self.long_cos_cached, self.long_sin_cached
+ pass
+
+ def extend_rope_embedding(self, x, seq_len):
+ if seq_len <= self.current_rope_size: return
+ # Iteratively grow by increments of 8192
+ self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192
+ self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype)
+ pass
+pass
+
+
+def _wrap_fast_inference(generate, device_type, dtype, model):
+ # Wraps inference with bfloat16 / float16
+ @torch.inference_mode
+ def _fast_generate(*args, **kwargs):
+
+ if hasattr(model, "config") and hasattr(model.config, "max_position_embeddings"):
+ if "input_ids" in kwargs and kwargs["input_ids"] is not None and "max_new_tokens" in kwargs:
+ if kwargs["input_ids"].shape[-1] + kwargs["max_new_tokens"] > model.config.max_position_embeddings:
+ raise ValueError(
+ f'Unsloth: input length {kwargs["input_ids"].shape[-1]} + max_new_tokens {kwargs["max_new_tokens"]} exceeds the maximum sequence length of {model.config.max_position_embeddings}!\n'\
+ 'You will need to do long context extension by increasing the `max_seq_length` in `FastLanguageModel.from_pretrained`.'
+ )
+ pass
+
+ # Set a flag for generation!
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ internal_model._flag_for_generation = True
+ internal_model = internal_model.model
+ pass
+ internal_model._flag_for_generation = True
+
+ # Must patch accelerate for Xformers
+ if accelerate_new_send_to_device is not None:
+ import accelerate.utils.operations
+ accelerate.utils.operations.send_to_device = accelerate_new_send_to_device
+ pass
+
+ # For newer HF
+ kwargs["cache_implementation"] = "dynamic"
+ # For num_logits_to_keep
+ kwargs["num_logits_to_keep"] = 1
+
+ # Remove token_type_ids
+ kwargs.pop("token_type_ids", None)
+
+ # Check pad_token
+ model_eos_token_id = getattr(model.config, "eos_token_id", None)
+ if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
+ model_eos_token_id = model_eos_token_id[0]
+
+ kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)
+
+ # Set pad token
+ # old_pad_token_id = getattr(model.config, "pad_token_id", None)
+ # old_eos_token_id = getattr(model.config, "eos_token_id", None)
+ # model.config.pad_token_id = old_eos_token_id
+
+ # Autocasted
+ with torch.autocast(device_type = device_type, dtype = dtype):
+ output = generate(*args, **kwargs)
+ pass
+
+ # Revert
+ # model.config.pad_token_id = old_pad_token_id
+
+ # Unset a flag for generation!
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_flag_for_generation"): del internal_model._flag_for_generation
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_flag_for_generation"): del internal_model._flag_for_generation
+
+ # Return accelerate back
+ if accelerate_new_send_to_device is not None:
+ accelerate.utils.operations.send_to_device = accelerate_old_send_to_device
+ pass
+
+ return output
+ pass
+ return _fast_generate
+pass
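+# Illustrative usage (a sketch with a hypothetical prompt): once `model.generate` is wrapped,
+# a call such as
+#     model.generate(**tokenizer("Hello", return_tensors = "pt").to("cuda"), max_new_tokens = 64)
+# runs under torch.inference_mode and torch.autocast, with cache_implementation = "dynamic"
+# and num_logits_to_keep = 1 injected into the kwargs by `_fast_generate`.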
+
+
+class FastLlamaModel:
+
+ @staticmethod
+ def pre_patch():
+ init_name, function = patch_llama_rope_scaling(
+ model_name = "llama",
+ rope_module = LlamaRotaryEmbedding,
+ scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
+ extended_rope_module = LlamaExtendedRotaryEmbedding,
+ attention_module = LlamaAttention,
+ longrope_module = LongRopeRotaryEmbedding,
+ )
+ if init_name is not None:
+ exec(function, globals())
+ LlamaAttention.__init__ = eval(init_name)
+ pass
+ LlamaAttention .forward = LlamaAttention_fast_forward
+ LlamaSdpaAttention .forward = LlamaAttention_fast_forward
+ LlamaFlashAttention2.forward = LlamaAttention_fast_forward
+ LlamaDecoderLayer .forward = LlamaDecoderLayer_fast_forward
+ LlamaModel .forward = LlamaModel_fast_forward
+ LlamaForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference)
+ PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
+ fix_prepare_inputs_for_generation(LlamaForCausalLM)
+
+ # Solves https://github.com/unslothai/unsloth/issues/168
+ # Static KV Cache was introduced in 4.38.0, causing training to be much slower.
+        # Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
+ # https://github.com/huggingface/transformers/pull/27931
+ # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
+ import transformers.models.llama.modeling_llama
+ transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = LlamaRotaryEmbedding
+ transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding = LlamaLinearScalingRotaryEmbedding
+ return
+ pass
+
+
+ @staticmethod
+ def from_pretrained(
+ model_name = "unsloth/llama-3-8b-bnb-4bit",
+ max_seq_length = None,
+ dtype = None,
+ load_in_4bit = True,
+ token = None,
+ device_map = "sequential",
+ rope_scaling = None,
+ fix_tokenizer = True,
+ model_patcher = None,
+ tokenizer_name = None,
+ trust_remote_code = False,
+ **kwargs,
+ ):
+ if trust_remote_code:
+ print(
+ "Unsloth: WARNING `trust_remote_code` is True.\n"\
+ "Are you certain you want to do remote code execution?"
+ )
+ pass
+ if token is None: token = get_token()
+ if model_patcher is None: model_patcher = FastLlamaModel
+ SUPPORTS_BFLOAT16 = is_bfloat16_supported()
+ gpu_stats = torch.cuda.get_device_properties(0)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+
+ statistics = \
+ f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers: {transformers_version}.\n"\
+ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
+ f"O^O/ \_/ \\ Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
+ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
+ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth'
+ print(statistics)
+
+ # Warn about fast transfers
+ old_hf_transfer = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0")
+ if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1":
+ print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!")
+ pass
+ # Return old flag
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+ model_patcher.pre_patch()
+ get_statistics() # For debugging - we use a download counter to see if environments are not breaking
+
+ if dtype is None:
+ dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
+ elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16:
+ logger.warning_once("Device does not support bfloat16. Will change to float16.")
+ dtype = torch.float16
+
+ assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32)
+
+ # RoPE Scaling
+ model_config = AutoConfig.from_pretrained(model_name, token = token)
+ model_max_seq_length = model_config.max_position_embeddings
+
+ # Check if RoPE Scaling is even allowed
+ model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__]
+ has_rope_scaling = False
+ try:
+ with open(inspect.getfile(model_function), "r") as file:
+ has_rope_scaling = "self.config.rope_scaling" in file.read()
+ except: pass
+ has_rope_scaling = True
+
+        # If max_seq_length is not specified, use the maximum from the config
+ if max_seq_length is None:
+ max_seq_length = model_max_seq_length
+ pass
+
+ if (rope_scaling is None) and (max_seq_length > model_max_seq_length):
+
+ rope_scaling = max_seq_length / model_max_seq_length
+
+ logger.warning_once(
+ f"Unsloth: {model_name} can only handle sequence lengths of at most "\
+ f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\
+                f"{round(rope_scaling, 3)}, it can be magically extended to "\
+ f"{max_seq_length}!"
+ )
+
+ # Warn RoPE scaling isn't allowed
+ if not has_rope_scaling:
+ raise RuntimeError(
+                    f"However, {model_name} doesn't support RoPE Scaling!\n"\
+ "Please file a feature request at https://github.com/unslothai/unsloth."
+ )
+ pass
+
+ rope_scaling = {"type": "linear", "factor": rope_scaling,}
+
+ # Add to kwargs
+ kwargs["rope_scaling"] = rope_scaling
+ pass
+ # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress!
+ pre_check = check_nvidia()
+
+ bnb_config = None
+ if load_in_4bit:
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit = True,
+ bnb_4bit_use_double_quant = True,
+ bnb_4bit_quant_type = "nf4",
+ bnb_4bit_compute_dtype = dtype,
+ )
+ pass
+
+ # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12
+ # RoPE Scaling's max_position_embeddings must be updated
+ max_position_embeddings = max(max_seq_length, model_max_seq_length)
+ kwargs.pop("attn_implementation", None); # No need since we auto call it
+
+ # Cannot be None, since HF now checks for the config
+ if load_in_4bit: kwargs["quantization_config"] = bnb_config
+
+ model = AutoModelForCausalLM.from_pretrained(
+ model_name,
+ device_map = device_map,
+ torch_dtype = dtype,
+ # quantization_config = bnb_config,
+ token = token,
+ max_position_embeddings = max_position_embeddings,
+ trust_remote_code = trust_remote_code,
+ attn_implementation = "eager",
+ **kwargs,
+ )
+ # Return old flag
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer
+ # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress!
+ post_check = check_nvidia()
+
+ # Counteract saved tokenizers
+ tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
+ tokenizer = load_correct_tokenizer(
+ tokenizer_name = tokenizer_name,
+ model_max_length = max_position_embeddings,
+ padding_side = "right",
+ token = token,
+ trust_remote_code = trust_remote_code,
+ fix_tokenizer = fix_tokenizer,
+ )
+
+ model, tokenizer = patch_tokenizer(model, tokenizer)
+ model, tokenizer = model_patcher.post_patch(model, tokenizer)
+
+ # Patch up QKV / O and MLP
+ for idx, layer in enumerate(model.model.layers):
+ layer.self_attn.apply_qkv = original_apply_qkv
+ layer.self_attn.apply_o = original_apply_o
+ pass
+
+ # Patch Trainer
+ from transformers.trainer import Trainer
+ try:
+ if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop":
+ inner_training_loop = inspect.getsource(Trainer._inner_training_loop)
+ Trainer._original_training_loop = inner_training_loop
+ else:
+ inner_training_loop = Trainer._original_training_loop
+ except:
+ raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+ pass
+
+ if ((post_check - pre_check) >= 1).sum() > 1:
+ raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+
+ import transformers.trainer
+ items_in_trainer = dir(transformers.trainer)
+ good_items = []
+ for item in items_in_trainer:
+ # TODO: Support Deepspeed
+ if item.startswith(("deepspeed", "xm", "met", "smp")): continue
+ if item in inner_training_loop: good_items.append(item)
+ pass
+ exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals())
+
+ start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
+ end = inner_training_loop.find("\n\n", start)
+ original_debug = inner_training_loop[start:end]
+ spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
+ front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)
+
+ # Cannot use \\ since it will cause a SyntaxWarning in Python 3.12
+ # Instead use chr(92) == \\
+ debug_info = """debug_info = \\
+ f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\
+ f" {chr(92)}{chr(92)} /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\
+ f"O^O/ {chr(92)}_/ {chr(92)} Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\
+ f"{chr(92)} / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\
+ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
+ logger.warning(debug_info)
+ import subprocess, re, gc, numpy as np
+ a = np.array([0,])
+ try:
+ a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
+ a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)
+ a = np.array([int(x.decode('utf-8'))/1024 for x in a])
+ except:
+ if not torch.cuda.is_available():
+ raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')
+ if ((a - PRE_CHECK) >= 1).sum() > 1:
+ raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()"""
+
+ debug_info = debug_info.split('\n')
+ debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]])
+ inner_training_loop = inner_training_loop.replace(original_debug, debug_info)
+
+ debug_info = """n_total_devices = total_train_batch_size // \\
+ args.gradient_accumulation_steps // self._train_batch_size
+ if n_total_devices > 1:
+ logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!')
+ debug_info ="""
+ debug_info = debug_info.split('\n')
+ debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]])
+ inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1)
+
+ front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0)
+ inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE)
+ inner_training_loop = inner_training_loop.replace(
+ "train_dataloader = tpu_spmd_dataloader(train_dataloader)",
+ "raise RuntimeError('Unsloth: TPUs are not yet supported!')"
+ )
+ inner_training_loop = inner_training_loop.replace(
+ "self.accelerator.free_memory()",
+ "self.accelerator.free_memory()\n" + \
+ front_spaces + "if self.is_deepspeed_enabled:"\
+ "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1,
+ )
+
+ check_batches = """train_dataloader = self.get_train_dataloader()
+ ga = args.gradient_accumulation_steps
+ bsz = self._train_batch_size
+ total_batches = bsz * ga * args.world_size
+ n_total_devices = total_batches // ga // bsz
+ if n_total_devices > 1:
+ logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!')
+ divisor = n_total_devices / 1
+ bsz = self._train_batch_size = max(int(bsz / divisor), 1)
+ if total_batches // ga // bsz > 1:
+ divisor = n_total_devices / 1
+ ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)"""
+ check_batches = check_batches.split('\n')
+ check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]])
+ inner_training_loop = inner_training_loop.replace(
+ "train_dataloader = self.get_train_dataloader()",
+ check_batches, 1,
+ )
+ inner_training_loop = inner_training_loop.replace(
+ "_inner_training_loop",
+ "_fast_inner_training_loop", 1,
+ )
+ exec(inner_training_loop, globals())
+
+ Trainer._inner_training_loop = _fast_inner_training_loop
+ inner_training_loop = inner_training_loop.replace(
+ "is_torch_tpu_available()",
+ "False",
+ )
+ if "n_total_devices >" not in inner_training_loop:
+ raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+ pass
+ inner_training_loop = inner_training_loop.replace(
+ "is_sagemaker_mp_enabled()",
+ "False",
+ )
+ exec(inner_training_loop, globals())
+ Trainer._inner_training_loop = _fast_inner_training_loop
+
+ # Save max_seq_length
+ model.max_seq_length = max_position_embeddings
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ internal_model.max_seq_length = max_position_embeddings
+ internal_model = internal_model.model
+ pass
+ internal_model.max_seq_length = max_position_embeddings
+
+ # We check the tokenizer first for errors
+ if fix_tokenizer:
+ tokenizer = check_tokenizer(
+ model = model,
+ tokenizer = tokenizer,
+ model_name = model_name,
+ model_max_length = max_position_embeddings,
+ padding_side = "right",
+ token = token,
+ )
+ pass
+ patch_saving_functions(tokenizer)
+
+ # Fix up config for transformers uploading PEFT
+ # Not necessary anymore since we require transformers>=4.37!
+ if False:
+ name = model.config._name_or_path
+ if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
+ name = name[:len(name) - len("-bnb-4bit")]
+ model.config.update({"_name_or_path" : name})
+ pass
+ pass
+
+ # Log Unsloth version for future fastpaths for inference
+ model.config.update({"unsloth_version" : __version__})
+
+ # Add save modules
+ patch_saving_functions(model)
+ Trainer._inner_training_loop = _fast_inner_training_loop
+
+ # Fix gradient accumulation
+ patch_gradient_accumulation_fix(Trainer)
+
+ # Save tokenizer for inference purposes
+ tokenizer.padding_side = "left" # Force inference
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ internal_model._saved_temp_tokenizer = tokenizer
+ internal_model = internal_model.model
+ pass
+ internal_model._saved_temp_tokenizer = tokenizer
+
+ return model, tokenizer
+ pass
+
+
+ @staticmethod
+ def post_patch(model, tokenizer):
+ model, tokenizer = patch_model_and_tokenizer(model, tokenizer, downcast_rope = True)
+ return model, tokenizer
+ pass
+
+
+ @staticmethod
+ def get_peft_model(
+ model,
+ r = 16,
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+ "gate_proj", "up_proj", "down_proj"],
+ lora_alpha = 16,
+ lora_dropout = 0,
+ bias = "none",
+ layers_to_transform = None,
+ layers_pattern = None,
+ use_gradient_checkpointing = True,
+ random_state = 3407,
+ max_seq_length = 2048, # not used anymore
+ use_rslora = False,
+ modules_to_save = None,
+ init_lora_weights = True,
+ loftq_config = {},
+ temporary_location = "_unsloth_temporary_saved_buffers",
+ **kwargs,
+ ):
+ transformers_set_seed(random_state)
+
+ if type(r) is not int:
+ raise TypeError(f"Unsloth: Rank of {str(r)} must be an integer.")
+ if r <= 0:
+ raise TypeError(f"Unsloth: Rank of {str(r)} must be larger than 0.")
+
+ if isinstance(model, PeftModelForCausalLM):
+ # Check if exactly the same and then pass through!
+ assert(hasattr(model, "peft_config"))
+
+ peft_config = model.peft_config["default"].to_dict()
+ check_parameters = [
+ "r", "lora_alpha", "lora_dropout",
+ "bias", "layers_to_transform", "layers_pattern",
+ "use_rslora", "init_lora_weights",
+ ]
+ check_all = True
+ for param in check_parameters:
+ check_all = check_all and (peft_config[param] == eval(param))
+ pass
+
+ # Check save_modules
+ old_target_modules = list(peft_config["target_modules"])
+ modules_to_save = peft_config["modules_to_save"]
+ if modules_to_save is None: modules_to_save = {}
+ modules_to_save = list(modules_to_save)
+ old_target_modules += modules_to_save
+
+ # Combine all
+ new_target_modules = list(target_modules) + \
+ list(modules_to_save if modules_to_save is not None else [])
+
+ # Now check!
+ new_target_modules = set(new_target_modules)
+ check_all = check_all and (
+ len(set(old_target_modules) ^ new_target_modules) == 0
+ )
+
+ check_all = check_all and (
+ (loftq_config == {} or loftq_config is None) and \
+ (peft_config["loftq_config"] == {} or peft_config["loftq_config"] is None)
+ )
+
+ if check_all:
+ # Simply pass through!
+ logger.warning(
+ "Unsloth: Already have LoRA adapters! We shall skip this step."
+ )
+
+ # Offload!
+ # [TODO] First offload lm_head and embed_tokens to CPU (should be disk!!)
+ if "embed_tokens" in new_target_modules:
+ print("Unsloth: Training embed_tokens in mixed precision to save VRAM")
+
+ dtype = model.model.model.embed_tokens.modules_to_save.default.weight.dtype
+ model.model.model.embed_tokens.modules_to_save.default\
+ .to(device = "cuda:0", dtype=(dtype if (dtype != torch.float16) else torch.float32), non_blocking = True)
+ model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True)
+
+ # [TODO] Move old embed_tokens to CPU - should be disk!
+ model.model.model.embed_tokens.original_module\
+ .to(device = "cpu", non_blocking = True)
+ model.model.model.embed_tokens.original_module.requires_grad_(False)
+ pass
+
+ if "lm_head" in new_target_modules:
+ print("Unsloth: Training lm_head in mixed precision to save VRAM")
+
+                    dtype = model.model.lm_head.modules_to_save.default.weight.dtype
+ model.model.lm_head.modules_to_save.default\
+ .to(device = "cuda:0", dtype=(dtype if (dtype != torch.float16) else torch.float32), non_blocking = True)
+ model.model.lm_head.modules_to_save.default.requires_grad_(True)
+
+ # [TODO] Move old lm_head to CPU - should be disk!
+ model.model.lm_head.original_module\
+ .to(device = "cpu", non_blocking = True)
+ model.model.lm_head.original_module.requires_grad_(False)
+ pass
+
+ return model
+ else:
+ raise TypeError(
+ "Unsloth: Your model already has LoRA adapters. Your new parameters are different."
+ )
+ pass
+ pass
+
+ if loftq_config is None: loftq_config = {}
+
+ signature = str(inspect.signature(LoraConfig))
+ SUPPORTS_LOFTQ = "loftq_config" in signature
+ SUPPORTS_RSLORA = "use_rslora" in signature
+
+ assert(max_seq_length <= model.max_seq_length)
+
+ if lora_dropout != 0:
+ logger.warning_once(
+ f"Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = {lora_dropout}.\n"\
+ f"Unsloth will patch all other layers, except LoRA matrices, causing a performance hit."
+ )
+ pass
+
+ if bias != "none":
+ logger.warning_once(
+ f"Unsloth: bias = `none` is supported for fast patching. You are using bias = {bias}.\n"\
+ f"Unsloth will patch all other layers, except LoRA matrices, causing a performance hit."
+ )
+ pass
+
+ if not (type(init_lora_weights) is bool or \
+ init_lora_weights == "gaussian" or init_lora_weights == "loftq"):
+ raise ValueError(
+ 'Unsloth: `init_lora_weights` must be either [True, False, "gaussian", "loftq"].'
+ )
+ pass
+
+ if init_lora_weights == "loftq":
+
+ if not SUPPORTS_LOFTQ:
+ import peft
+ raise RuntimeError(
+ f"Unsloth: Your PEFT version of {peft.__version__} does not support LoftQ init.\n"\
+ "Please install PEFT 0.7.2 or higher.\n"\
+                    "You can also install from source: `pip install git+https://github.com/huggingface/peft.git`"
+ )
+ pass
+
+ if loftq_config == {}:
+ from peft import LoftQConfig
+ logger.warning_once(
+ "Unsloth: init_lora_weights = `loftq` is set, but `loftq_config` is None.\n"\
+ "We shall use `loftq_config = LoftQConfig(loftq_bits = 4, loftq_iter = 1)`."
+ )
+ loftq_config = LoftQConfig(loftq_bits = 4, loftq_iter = 1)
+ pass
+
+ if hasattr(model.config, "quantization_config"):
+ raise ValueError(
+ "Unsloth: You are using `loftq` init, yet `load_in_4bit = True` was set.\n"\
+ "Reload your model without any quantization by setting `load_in_4bit = False`."
+ )
+ pass
+ pass
+
+ assert(type(use_rslora) is bool)
+ if use_rslora:
+ if not SUPPORTS_RSLORA:
+ # We manually check for PEFT
+ import peft
+ raise RuntimeError(
+ f"Unsloth: Your PEFT version of {peft.__version__} does not support `use_rslora`.\n"\
+ "Please install PEFT 0.7.2 or higher.\n"\
+                    "You can also install from source: `pip install git+https://github.com/huggingface/peft.git`"
+ )
+ pass
+ pass
+
+ accepted_modules = frozenset(("q_proj", "k_proj", "v_proj", "o_proj",
+ "gate_proj", "up_proj", "down_proj",),)
+ model.config.update({"unsloth_version" : __version__})
+
+ if type(modules_to_save) is tuple:
+ modules_to_save = list(modules_to_save)
+ pass
+
+ train_lm_head = False
+ train_embed_tokens = False
+ final_modules = []
+ for module in target_modules:
+ if module == "lm_head":
+ # logger.warning_once(
+ # "Unsloth: `lm_head` should be placed in `modules_to_save` and not `target_modules`. "\
+ # "Luckily, we shall do it for you!"
+ # )
+ train_lm_head = True
+ if modules_to_save is None: modules_to_save = ["lm_head"]
+ else: modules_to_save.append("lm_head")
+
+ elif module == "embed_tokens":
+ # logger.warning_once(
+ # "Unsloth: `embed_tokens` should be placed in `modules_to_save` and not `target_modules`. "\
+ # "Luckily, we shall do it for you!"
+ # )
+ train_embed_tokens = True
+ if modules_to_save is None: modules_to_save = ["embed_tokens"]
+ else: modules_to_save.append("embed_tokens")
+
+ else:
+ try:
+ assert(module in accepted_modules)
+ final_modules.append(module)
+ except AssertionError as e:
+ final_modules.append(module)
+ print(
+ "Unsloth: You added custom modules, but Unsloth hasn't optimized for this.\n"\
+ "Beware - your finetuning might be noticeably slower!"
+ )
+ pass
+ pass
+ pass
+
+ # Check if we added new tokens!
+ if hasattr(model, "_need_to_train_embeddings"):
+ if not train_lm_head or not train_embed_tokens:
+ print(
+ "Unsloth: You added new tokens but did not specify if you wanted to "\
+                    "train the lm_head and embed_tokens.\nWe must turn them on for you."
+ )
+ train_lm_head = True
+ train_embed_tokens = True
+
+ if modules_to_save is None: modules_to_save = ["embed_tokens"]
+ else: modules_to_save.append("embed_tokens")
+
+ if modules_to_save is None: modules_to_save = ["lm_head"]
+ else: modules_to_save.append("lm_head")
+ pass
+ pass
+
+ # Check for Llama-3
+ # if hasattr(model._saved_temp_tokenizer, "_using_llama3_template"):
+ # if not train_embed_tokens and not train_lm_head:
+ # raise RuntimeError("")
+
+ # First fix untrained tokens
+ # Wrong - can cause reserved tokens to pop out!!
+ # if train_embed_tokens or train_lm_head:
+ # fix_untrained_tokens(model, eps = 1e-16)
+ # pass
+
+ # Check modules_to_save
+ if modules_to_save is not None:
+ for module in modules_to_save:
+ if module == "lm_head":
+ train_lm_head = True
+ elif module == "embed_tokens":
+ train_embed_tokens = True
+ else:
+ raise TypeError(
+                        f"Unsloth: Module = {module} is not allowed. Only 'lm_head' and 'embed_tokens' are allowed."
+ )
+ pass
+ pass
+ if isinstance(modules_to_save, (tuple, list)):
+ modules_to_save = list(set(modules_to_save))
+ pass
+
+ # Get LoRA
+ arguments = dict(
+ r = r,
+ lora_alpha = lora_alpha,
+ target_modules = final_modules,
+ lora_dropout = lora_dropout,
+ bias = bias,
+ task_type = TaskType.CAUSAL_LM,
+ layers_to_transform = layers_to_transform,
+ init_lora_weights = init_lora_weights,
+ loftq_config = loftq_config,
+ use_rslora = use_rslora,
+ modules_to_save = modules_to_save,
+ **kwargs,
+ )
+ if not SUPPORTS_LOFTQ: del arguments["loftq_config"]
+ if not SUPPORTS_RSLORA: del arguments["use_rslora"]
+
+ _saved_temp_tokenizer = model._saved_temp_tokenizer
+
+ lora_config = LoraConfig(**arguments)
+
+ # First offload lm_head and embed_tokens to disk
+ input_embeddings_device = model. get_input_embeddings().weight.device
+ output_embeddings_device = model.get_output_embeddings().weight.device
+
+ if use_gradient_checkpointing == "unsloth":
+ if train_embed_tokens:
+ print("Unsloth: Offloading input_embeddings to disk to save VRAM")
+ offload_input_embeddings(model, temporary_location)
+ pass
+
+ # Remove old items to save VRAM
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ pass
+
+ if train_lm_head:
+ print("Unsloth: Offloading output_embeddings to disk to save VRAM")
+ offload_output_embeddings(model, temporary_location)
+ pass
+
+ # Remove old items to save VRAM
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ pass
+ pass
+
+ model = _get_peft_model(model, lora_config)
+
+ model._saved_temp_tokenizer = _saved_temp_tokenizer
+
+ model = FastLlamaModel.patch_peft_model(model, use_gradient_checkpointing)
+
+ # Now patch lm_head and embed_tokens
+ if train_embed_tokens:
+ print("Unsloth: Training embed_tokens in mixed precision to save VRAM")
+ assert(hasattr(model.model.model.embed_tokens, "modules_to_save"))
+
+ dtype = model.model.model.embed_tokens.modules_to_save.default.weight.dtype
+ model.model.model.embed_tokens.modules_to_save.default\
+ .to(device = "cuda:0", dtype=(dtype if (dtype != torch.float16) else torch.float32), non_blocking = True)
+ model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True)
+ pass
+
+ if train_lm_head:
+ print("Unsloth: Training lm_head in mixed precision to save VRAM")
+ assert(hasattr(model.model.lm_head, "modules_to_save"))
+
+ dtype = model.model.lm_head.modules_to_save.default.weight.dtype
+ model.model.lm_head.modules_to_save.default\
+ .to(device = "cuda:0", dtype=(dtype if (dtype != torch.float16) else torch.float32), non_blocking = True)
+ model.model.lm_head.modules_to_save.default.requires_grad_(True)
+ pass
+
+ # Patch tokenizer to pad to the right
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "right"
+ pass
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "right"
+ pass
+
+ # Clear deleted GPU items
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ pass
+
+ return model
+ pass
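+    # Illustrative usage (a sketch with hypothetical hyperparameters):
+    #     model = FastLlamaModel.get_peft_model(
+    #         model, r = 16, lora_alpha = 16, lora_dropout = 0, bias = "none",
+    #         use_gradient_checkpointing = "unsloth", random_state = 3407,
+    #     )
+    # Keeping lora_dropout = 0 and bias = "none" lets `patch_peft_model` route the QKV, O and
+    # MLP projections through the fast LoRA autograd kernels.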
+
+
+ @staticmethod
+ def patch_peft_model(
+ model,
+ use_gradient_checkpointing = True,
+ ):
+ if not isinstance(model, PeftModelForCausalLM):
+ raise TypeError(
+ "Unsloth: Your model needs to call `.get_peft_model` first!"
+ )
+ pass
+
+ # Get activation function
+ model_type = model.config.model_type
+
+ if model_type == "llama": apply_lora_mlp = apply_lora_mlp_swiglu
+ elif model_type == "mistral": apply_lora_mlp = apply_lora_mlp_swiglu
+ elif model_type == "qwen2": apply_lora_mlp = apply_lora_mlp_swiglu
+ elif model_type == "gemma": apply_lora_mlp = apply_lora_mlp_geglu_approx
+ elif model_type == "gemma2": apply_lora_mlp = apply_lora_mlp_geglu_approx
+ elif model_type == "cohere": apply_lora_mlp = apply_lora_mlp_swiglu
+ elif model_type == "granite": apply_lora_mlp = apply_lora_mlp_swiglu
+ else:
+ raise NotImplementedError(f"Unsloth: {model_type} is not yet implemented!")
+ pass
+
+ model = prepare_model_for_kbit_training(
+ model,
+ use_gradient_checkpointing = use_gradient_checkpointing,
+ use_reentrant = True,
+ )
+
+ # Fix up config for transformers uploading PEFT
+ for active_adapter in model.peft_config.keys():
+            # Not necessary since we require transformers >= 4.37
+ if False:
+ name = model.peft_config[active_adapter].base_model_name_or_path
+ if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
+ name = name[:len(name) - len("-bnb-4bit")]
+ model.peft_config[active_adapter].base_model_name_or_path = name
+ pass
+ # Add revision to enable future fast inference paths
+            # [TODO] Bugs out! See https://github.com/unslothai/unsloth/issues/492
+ # model.peft_config[active_adapter].revision = f"unsloth"
+ pass
+
+ from transformers.trainer import Trainer
+ if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop":
+ raise RuntimeError(
+ 'Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so '\
+ 'enabling it will require much more work, so we have to prioritize. Please understand!\n'\
+ 'We do have a separate beta version, which you can contact us about!\n'\
+ 'Thank you for your understanding and we appreciate it immensely!'
+ )
+ pass
+
+ # Fix loftq issues
+ # loftq_config must not = None, but rather {}
+ all_configs = model.peft_config
+ for key, current_config in all_configs.items():
+ if hasattr(current_config, "loftq_config") and current_config.loftq_config is None:
+ new_args = current_config.__dict__
+ new_args["loftq_config"] = {}
+ current_config = current_config.__class__(**new_args)
+ all_configs[key] = current_config
+ pass
+ pass
+
+ # Do patching
+ n_mlp = 0
+ n_qkv = 0
+ n_o = 0
+ import types
+
+ active_adapter = model.active_adapters[0] if \
+ hasattr(model, "active_adapters") else model.active_adapter
+
+ # Get dropout and bias
+ lora_dropout = model.peft_config[active_adapter].lora_dropout
+ bias = model.peft_config[active_adapter].bias
+
+ # We also do not inplace edit QKV for Cohere!
+ from functools import partial
+ _apply_lora_mlp = \
+ partial(apply_lora_mlp, inplace = False) \
+ if model_type == "cohere" else \
+ apply_lora_mlp
+ pass
+
+ if lora_dropout == 0 and bias == "none":
+ for idx, layer in enumerate(model.model.model.layers):
+
+ # MLP patching
+ gate_proj = layer.mlp.gate_proj
+ up_proj = layer.mlp. up_proj
+ down_proj = layer.mlp.down_proj
+
+ if hasattr(gate_proj, "lora_A") and \
+ hasattr( up_proj, "lora_A") and \
+ hasattr(down_proj, "lora_A") and \
+ (getattr(gate_proj, "base_layer", gate_proj).bias is None) and \
+ (getattr( up_proj, "base_layer", up_proj).bias is None) and \
+ (getattr(down_proj, "base_layer", down_proj).bias is None) and \
+ (len(getattr(gate_proj, "lora_magnitude_vector", []) or []) == 0) and \
+ (len(getattr( up_proj, "lora_magnitude_vector", []) or []) == 0) and \
+ (len(getattr(down_proj, "lora_magnitude_vector", []) or []) == 0):
+
+ # https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module
+ layer.mlp.forward = types.MethodType(_apply_lora_mlp, layer.mlp)
+ n_mlp += 1
+ else:
+ logger.warning_once(
+ "Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters\n"\
+ "are not enabled or a bias term (like in Qwen) is used."
+ )
+ pass
+
+ # QKV attention patching
+ q_proj = layer.self_attn.q_proj
+ k_proj = layer.self_attn.k_proj
+ v_proj = layer.self_attn.v_proj
+ if hasattr(q_proj, "lora_A") and \
+ hasattr(k_proj, "lora_A") and \
+ hasattr(v_proj, "lora_A") and \
+ (getattr(q_proj, "base_layer", q_proj).bias is None) and \
+ (getattr(k_proj, "base_layer", k_proj).bias is None) and \
+ (getattr(v_proj, "base_layer", v_proj).bias is None) and \
+ (len(getattr(q_proj, "lora_magnitude_vector", []) or []) == 0) and \
+ (len(getattr(k_proj, "lora_magnitude_vector", []) or []) == 0) and \
+ (len(getattr(v_proj, "lora_magnitude_vector", []) or []) == 0):
+
+ layer.self_attn.apply_qkv = apply_lora_qkv
+ n_qkv += 1
+ else:
+ if model_type == "qwen2": n_qkv += 1
+ else:
+ logger.warning_once(
+ "Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters\n"\
+ "are not enabled or a bias term (like in Qwen) is used."
+ )
+ pass
+ pass
+
+ # O attention patching
+ o_proj = layer.self_attn.o_proj
+ if hasattr(o_proj, "lora_A") and \
+ (getattr(o_proj, "base_layer", o_proj).bias is None) and \
+ (len(getattr(o_proj, "lora_magnitude_vector", []) or []) == 0):
+
+ layer.self_attn.apply_o = apply_lora_o
+ n_o += 1
+ else:
+ logger.warning_once(
+ "Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters\n"\
+ "are not enabled or a bias term (like in Qwen) is used."
+ )
+ pass
+ pass
+ pass
+
+ logger.warning_once(
+ f"Unsloth {__version__} patched {len(model.model.model.layers)} layers with "\
+ f"{n_qkv} QKV layers, {n_o} O layers and {n_mlp} MLP layers.",
+ )
+ patch_saving_functions(model)
+
+ # Patch cross entropy loss labels
+ # Fixes https://github.com/unslothai/unsloth/issues/10
+ max_seq_length = model.max_seq_length
+ extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0")
+ model.model.extra_ignored_labels = extra_ignored_labels
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ internal_model.max_seq_length = max_seq_length
+ internal_model = internal_model.model
+ pass
+ internal_model.max_seq_length = max_seq_length
+
+ # Patch tokenizer to pad to the right
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "right"
+ pass
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "right"
+ pass
+
+ # Clear deleted GPU items
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ pass
+ return model
+ pass
+
+
+ @staticmethod
+ def for_inference(model):
+ # if model.config.model_type == "qwen2":
+ # FastLlamaModel.for_training(model)
+ # return
+ # pass
+
+ internal_model = model
+ internal_model.gradient_checkpointing = False
+ internal_model.training = False
+
+ while hasattr(internal_model, "model"):
+ internal_model = internal_model.model
+ internal_model.gradient_checkpointing = False
+ internal_model.training = False
+ pass
+ if hasattr(internal_model, "training"):
+ internal_model.training = False
+ pass
+
+ # Also check if lm_head / embeddings are trained
+ internal_model = model
+ while not hasattr(internal_model, "lm_head"):
+ internal_model = internal_model.model
+ pass
+ lm_head = internal_model.lm_head.weight
+ device_type = lm_head.device.type
+ dtype = model.config.torch_dtype
+
+ if type(dtype) is str:
+ if dtype == "float16": dtype = torch.float16
+ elif dtype == "bfloat16": dtype = torch.bfloat16
+ pass
+
+ # Wrap model.generate
+ if model.generate.__name__ != "_fast_generate":
+ model._unwrapped_old_generate = model.generate
+ model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model)
+ pass
+
+ # Patch tokenizer to pad to the left
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "left"
+ pass
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "left"
+ pass
+
+ # Also disable training for embeddings for NEFTune
+ if hasattr(model, "get_input_embeddings"):
+ embeddings = model.get_input_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = False
+ pass
+ if hasattr(model, "get_output_embeddings"):
+ embeddings = model.get_output_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = False
+ pass
+
+ return model
+ pass
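+    # Illustrative note: `for_inference(model)` wraps `model.generate` with `_fast_generate`,
+    # switches the saved tokenizer to left padding and disables gradient checkpointing;
+    # `for_training(model)` below reverses each of these changes.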
+
+
+ @staticmethod
+ def for_training(model, use_gradient_checkpointing = True):
+ internal_model = model
+ internal_model.gradient_checkpointing = use_gradient_checkpointing
+ internal_model.training = True
+
+ # Delete all fast inference loras
+ for param in model.parameters():
+ if hasattr(param, "_fast_lora"):
+ del param._fast_lora
+ pass
+
+ while hasattr(internal_model, "model"):
+ internal_model = internal_model.model
+ internal_model.gradient_checkpointing = use_gradient_checkpointing
+ internal_model.training = True
+ pass
+ if hasattr(internal_model, "training"):
+ internal_model.training = True
+ pass
+
+ # Also revert model.generate
+ if hasattr(model, "_unwrapped_old_generate"):
+ model.generate = model._unwrapped_old_generate
+ del model._unwrapped_old_generate
+ pass
+
+ # Patch tokenizer to pad to the right
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "right"
+ pass
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.padding_side = "right"
+ pass
+
+ # Also re-enable training for embeddings for NEFTune
+ if hasattr(model, "get_input_embeddings"):
+ embeddings = model.get_input_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = True
+ pass
+ if hasattr(model, "get_output_embeddings"):
+ embeddings = model.get_output_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = True
+ pass
+
+ return model
+ pass
+pass
+
diff --git a/unsloth-main/unsloth/models/loader.py b/unsloth-main/unsloth/models/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..113c4fbc70dda9a11a443d05e95685d9d252776f
--- /dev/null
+++ b/unsloth-main/unsloth/models/loader.py
@@ -0,0 +1,560 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ._utils import is_bfloat16_supported, HAS_FLASH_ATTENTION, HAS_FLASH_ATTENTION_SOFTCAPPING
+from .granite import FastGraniteModel
+from .llama import FastLlamaModel, logger
+from .mistral import FastMistralModel
+from .qwen2 import FastQwen2Model
+from .cohere import FastCohereModel
+from transformers import AutoConfig
+from transformers import __version__ as transformers_version
+from peft import PeftConfig, PeftModel
+from .loader_utils import get_model_name
+import os, contextlib, sys
+try:
+ from huggingface_hub.utils import get_token
+except:
+ # Old HF Hub versions <= 0.0.25
+ from huggingface_hub.utils._token import get_token
+pass
+from huggingface_hub import HfFileSystem
+
+# https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
+from unsloth_zoo.utils import Version
+transformers_version = Version(transformers_version)
+SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
+SUPPORTS_GEMMA = transformers_version >= Version("4.38")
+SUPPORTS_GEMMA2 = transformers_version >= Version("4.42")
+SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.2")
+SUPPORTS_LLAMA32 = transformers_version > Version("4.45.0")
+SUPPORTS_GRANITE = transformers_version >= Version("4.46.0")
+if SUPPORTS_GEMMA:
+ from .gemma import FastGemmaModel
+if SUPPORTS_GEMMA2:
+ from .gemma2 import FastGemma2Model
+pass
+import torch
+
+def _get_dtype(dtype):
+ __DTYPE_MAP = {
+ "float32": torch.float32,
+ torch.float32: torch.float32,
+ "float16": torch.float16,
+ torch.float16: torch.float16,
+ "bfloat16": torch.bfloat16,
+ torch.bfloat16: torch.bfloat16,
+ }
+ if dtype is None or dtype == None: return None
+ elif dtype in __DTYPE_MAP: return __DTYPE_MAP[dtype]
+ else:
+ print(f"Unsloth: {dtype} is not recognized, so we'll default to None")
+ return None
+ pass
+pass
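+# Illustrative note: `_get_dtype` accepts either a string or a torch dtype, so
+# _get_dtype("bfloat16") and _get_dtype(torch.bfloat16) both return torch.bfloat16, while an
+# unrecognised value such as "fp8" falls back to None with a printed warning.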
+
+
+class FastLanguageModel(FastLlamaModel):
+ @staticmethod
+ def from_pretrained(
+ model_name = "unsloth/llama-3-8b-bnb-4bit",
+ max_seq_length = None,
+ dtype = None,
+ load_in_4bit = True,
+ token = None,
+ device_map = "sequential",
+ rope_scaling = None,
+ fix_tokenizer = True,
+ trust_remote_code = False,
+ use_gradient_checkpointing = "unsloth",
+ resize_model_vocab = None,
+ revision = None,
+ *args, **kwargs,
+ ):
+ if token is None: token = get_token()
+
+ old_model_name = model_name
+ model_name = get_model_name(model_name, load_in_4bit)
+
+ # First check if it's a normal model via AutoConfig
+ from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled
+ was_disabled = are_progress_bars_disabled()
+ disable_progress_bars()
+
+ autoconfig_error = None
+ peft_error = None
+ try:
+ model_config = AutoConfig.from_pretrained(
+ model_name,
+ token = token,
+ revision = revision,
+ trust_remote_code = trust_remote_code,
+ )
+ is_model = True
+ except Exception as error:
+ autoconfig_error = str(error)
+ is_model = False
+ try:
+ peft_config = PeftConfig.from_pretrained(
+ model_name,
+ token = token,
+ revision = revision,
+ trust_remote_code = trust_remote_code,
+ )
+ is_peft = True
+ except Exception as error:
+ peft_error = str(error)
+ is_peft = False
+ pass
+
+ # Both config.json and adapter_config.json should not exist!
+
+ # Old transformers versions check
+ both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32
+
+ # New transformers need to check manually.
+ if SUPPORTS_LLAMA32:
+ # Check if folder exists locally
+ if os.path.isdir(model_name):
+ exist_adapter_config = os.path.exists(os.path.join(model_name, "adapter_config.json"))
+ exist_config = os.path.exists(os.path.join(model_name, "config.json"))
+ both_exist = exist_adapter_config and exist_config
+ else:
+ # Because HfFileSystem assumes linux paths, we need to set the path with forward slashes, even on Windows.
+ files = HfFileSystem(token = token).glob(f"{model_name}/*.json")
+ files = (os.path.split(x)[-1] for x in files)
+ if sum(x == "adapter_config.json" or x == "config.json" for x in files) >= 2:
+ both_exist = True
+ pass
+ pass
+ pass
+
+ # Error out if both LoRA and normal model config exists.
+ if both_exist:
+ raise RuntimeError(
+ "Unsloth: Your repo has a LoRA adapter and a base model.\n"\
+ "You have 2 files `config.json` and `adapter_config.json`.\n"\
+ "We must only allow one config file.\n"\
+ "Please separate the LoRA and base models to 2 repos."
+ )
+
+ elif not is_model and not is_peft:
+ error = autoconfig_error or peft_error
+ # Old transformers version
+ if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31:
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support new RoPE scaling methods.\n"\
+ f"This includes Llama 3.1. The minimum required version is 4.43.2\n"\
+ f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ raise RuntimeError(autoconfig_error or peft_error)
+ pass
+
+ # Get base model for PEFT:
+ if is_peft:
+ # Check base model again for PEFT
+ model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit)
+ model_config = AutoConfig.from_pretrained(
+ model_name,
+ token = token,
+ revision = revision,
+ trust_remote_code = trust_remote_code,
+ )
+ pass
+
+ if not was_disabled: enable_progress_bars()
+
+ model_type = model_config.model_type
+
+ if model_type == "llama":
+ scaling_type = None
+ if getattr(model_config, "rope_scaling", None) is not None:
+ scaling_type1 = model_config.rope_scaling.get("type", None)
+ scaling_type2 = model_config.rope_scaling.get("rope_type", None)
+ scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2
+ pass
+
+ if scaling_type == "llama3" and not SUPPORTS_LLAMA31:
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\
+ f"The minimum required version is 4.43.2\n"\
+ f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+
+ dispatch_model = FastLlamaModel
+
+ elif model_type == "mistral": dispatch_model = FastMistralModel
+ elif model_type == "gemma":
+ if not SUPPORTS_GEMMA:
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
+ f"The minimum required version is 4.38.\n"\
+ f'Try `pip install --upgrade "transformers>=4.38"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ dispatch_model = FastGemmaModel
+ elif model_type == "gemma2":
+ if not SUPPORTS_GEMMA2:
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
+ f"The minimum required version is 4.42.3.\n"\
+ f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ # Also check for softcapping support in flash-attn which is faster!
+ if is_bfloat16_supported() and not HAS_FLASH_ATTENTION:
+ print(
+ "Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!\n"\
+ "To install flash-attn, do the below:\n"\
+ '\npip install --no-deps --upgrade "flash-attn>=2.6.3"'
+ )
+ elif HAS_FLASH_ATTENTION and not HAS_FLASH_ATTENTION_SOFTCAPPING:
+ print(
+ "Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!\n"\
+ "Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!\n"\
+ "To update flash-attn, do the below:\n"\
+ '\npip install --no-deps --upgrade "flash-attn>=2.6.3"'
+ )
+
+ dispatch_model = FastGemma2Model
+ elif model_type == "qwen2":
+ dispatch_model = FastQwen2Model
+ elif model_type == "cohere":
+ dispatch_model = FastCohereModel
+ elif model_type == "granite":
+ dispatch_model = FastGraniteModel
+ else:
+ raise NotImplementedError(
+ f"Unsloth: {model_name} not supported yet!\n"\
+ "Maybe you're doing vision finetuning? Please use FastVisionModel instead!\n"\
+ "Otherwise, make an issue to https://github.com/unslothai/unsloth!",
+ )
+ pass
+
+        # Check if this is a local model since the tokenizer gets overwritten
+ if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \
+ os.path.exists(os.path.join(old_model_name, "tokenizer.json")) and \
+ os.path.exists(os.path.join(old_model_name, "special_tokens_map.json")):
+
+ tokenizer_name = old_model_name
+ else:
+ tokenizer_name = None
+ pass
+
+ model, tokenizer = dispatch_model.from_pretrained(
+ model_name = model_name,
+ max_seq_length = max_seq_length,
+ dtype = _get_dtype(dtype),
+ load_in_4bit = load_in_4bit,
+ token = token,
+ device_map = device_map,
+ rope_scaling = rope_scaling,
+ fix_tokenizer = fix_tokenizer,
+ model_patcher = dispatch_model,
+ tokenizer_name = tokenizer_name,
+ trust_remote_code = trust_remote_code,
+ revision = revision if not is_peft else None,
+ *args, **kwargs,
+ )
+
+ if resize_model_vocab is not None:
+ model.resize_token_embeddings(resize_model_vocab)
+ pass
+
+ # In case the model supports tagging, add the unsloth tag.
+ if hasattr(model, "add_model_tags"):
+ model.add_model_tags(["unsloth",])
+ pass
+ if hasattr(tokenizer, "add_model_tags"):
+ tokenizer.add_model_tags(["unsloth",])
+ pass
+
+ if load_in_4bit:
+ # Fix up bitsandbytes config
+ quantization_config = \
+ {
+ # Sometimes torch_dtype is not a string!!
+ "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"],
+ "bnb_4bit_quant_type" : "nf4",
+ "bnb_4bit_use_double_quant" : True,
+ "llm_int8_enable_fp32_cpu_offload" : False,
+ "llm_int8_has_fp16_weight" : False,
+ "llm_int8_skip_modules" : None,
+ "llm_int8_threshold" : 6.0,
+ "load_in_4bit" : True,
+ "load_in_8bit" : False,
+ "quant_method" : "bitsandbytes",
+ }
+ model.config.update({"quantization_config" : quantization_config})
+ pass
+
+ if is_peft:
+ # From https://github.com/huggingface/peft/issues/184
+ # Now add PEFT adapters
+ model.enable_input_require_grads()
+ model = PeftModel.from_pretrained(
+ model,
+ old_model_name,
+ token = token,
+ revision = revision,
+ is_trainable = True,
+ trust_remote_code = trust_remote_code,
+ )
+ # Patch it as well!
+ model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing)
+ pass
+ return model, tokenizer
+ pass
+pass
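+# Minimal usage sketch (kept as a comment so importing this module has no side
+# effects; assumes the package's top-level `from unsloth import FastLanguageModel`
+# export):
+#
+#     model, tokenizer = FastLanguageModel.from_pretrained(
+#         model_name     = "unsloth/llama-3-8b-bnb-4bit",
+#         max_seq_length = 2048,
+#         dtype          = None,   # normalized by _get_dtype, auto-picked downstream
+#         load_in_4bit   = True,
+#     )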
+
+
+from ._utils import (
+ patch_compiling_bitsandbytes,
+ patch_model_and_tokenizer,
+ prepare_model_for_kbit_training,
+ patch_unsloth_smart_gradient_checkpointing,
+ patch_compiled_autograd,
+ process_vision_info,
+ unsloth_compile_transformers,
+)
+from ..kernels import (
+ patch_loss_functions,
+ post_patch_loss_function,
+)
+from .vision import FastBaseVisionModel
+
+
+class FastVisionModel(FastBaseVisionModel):
+ @staticmethod
+ def from_pretrained(
+ model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
+ max_seq_length = None, # [TODO] No effect
+ dtype = None,
+ load_in_4bit = True,
+ token = None,
+ device_map = "sequential",
+ rope_scaling = None, # [TODO] No effect
+ fix_tokenizer = True, # [TODO] No effect
+ trust_remote_code = False,
+ use_gradient_checkpointing = "unsloth",
+ resize_model_vocab = None, # [TODO] No effect
+ revision = None,
+ return_logits = False, # Return logits
+ fullgraph = True, # No graph breaks
+ *args, **kwargs,
+ ):
+ if token is None: token = get_token()
+
+ patch_compiled_autograd()
+ patch_compiling_bitsandbytes()
+ if use_gradient_checkpointing == "unsloth":
+ patch_unsloth_smart_gradient_checkpointing()
+
+ old_model_name = model_name
+ model_name = get_model_name(model_name, load_in_4bit)
+
+ # First check if it's a normal model via AutoConfig
+ from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled
+ was_disabled = are_progress_bars_disabled()
+ disable_progress_bars()
+
+ autoconfig_error = None
+ peft_error = None
+ try:
+ model_config = AutoConfig.from_pretrained(
+ model_name,
+ token = token,
+ revision = revision,
+ trust_remote_code = trust_remote_code,
+ )
+ is_model = True
+ except Exception as error:
+ autoconfig_error = str(error)
+ is_model = False
+ try:
+ peft_config = PeftConfig.from_pretrained(
+ model_name,
+ token = token,
+ revision = revision,
+ trust_remote_code = trust_remote_code,
+ )
+ is_peft = True
+ except Exception as error:
+ peft_error = str(error)
+ is_peft = False
+ pass
+
+ # Both config.json and adapter_config.json should not exist!
+
+ # Old transformers versions check
+ both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32
+
+ # New transformers need to check manually.
+ if SUPPORTS_LLAMA32:
+ # Check if folder exists locally
+ if os.path.isdir(model_name):
+ exist_adapter_config = os.path.exists(os.path.join(model_name, "adapter_config.json"))
+ exist_config = os.path.exists(os.path.join(model_name, "config.json"))
+ both_exist = exist_adapter_config and exist_config
+ else:
+                # Because HfFileSystem assumes linux paths, we need to set the path with forward slashes, even on Windows.
+                files = HfFileSystem(token = token).glob(f"{model_name}/*.json")
+ files = (os.path.split(x)[-1] for x in files)
+ if sum(x == "adapter_config.json" or x == "config.json" for x in files) >= 2:
+ both_exist = True
+ pass
+ pass
+ pass
+
+ # Error out if both LoRA and normal model config exists.
+ if both_exist:
+ raise RuntimeError(
+ "Unsloth: Your repo has a LoRA adapter and a base model.\n"\
+ "You have 2 files `config.json` and `adapter_config.json`.\n"\
+                "Only one config file is allowed.\n"\
+                "Please separate the LoRA adapter and the base model into 2 repos."
+ )
+
+ elif not is_model and not is_peft:
+ error = autoconfig_error or peft_error
+ # Old transformers version
+ if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31:
+ raise ImportError(
+ f"Unsloth: Your transformers version of {transformers_version} does not support new RoPE scaling methods.\n"\
+ f"This includes Llama 3.1. The minimum required version is 4.43.2\n"\
+ f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\
+ f"to obtain the latest transformers build, then restart this session."\
+ )
+ raise RuntimeError(autoconfig_error or peft_error)
+ pass
+
+ # Get base model for PEFT:
+ if is_peft:
+ # Check base model again for PEFT
+ model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit)
+ model_config = AutoConfig.from_pretrained(
+ model_name,
+ token = token,
+ revision = revision,
+ trust_remote_code = trust_remote_code,
+ )
+ pass
+
+ if not was_disabled: enable_progress_bars()
+
+ with contextlib.redirect_stdout(open(os.devnull, "w")):
+ patch_loss_functions(torch_compile = False)
+ model_types = unsloth_compile_transformers(
+ model_name = model_name,
+ sdpa_dynamic_mask = True,
+ sdpa_bool_masks = True,
+ sdpa_gqa_replace = True,
+ sdpa_dynamic_compile = True,
+ compile_attention = True,
+ disable_causal_masks = True,
+ compile_torch_modules = True,
+ compile_custom_modules = True,
+ compile_function_calls = True,
+ fuse_lm_head = True,
+ gradient_checkpointing = True,
+ manual_replacements = True,
+ fast_lora_forwards = False,
+ fast_residual_stream = False,
+ accurate_accumulation = True,
+ epilogue_fusion = True,
+ max_autotune = False,
+ shape_padding = True,
+ cudagraphs = False,
+ debug = False,
+ fullgraph = fullgraph,
+ import_from_cache = False,
+ disable = False,
+ return_logits = return_logits,
+ )
+ pass
+
+        # Check if this is a local model since the tokenizer gets overwritten
+ if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \
+ os.path.exists(os.path.join(old_model_name, "tokenizer.json")) and \
+ os.path.exists(os.path.join(old_model_name, "special_tokens_map.json")):
+
+ tokenizer_name = old_model_name
+ else:
+ tokenizer_name = None
+ pass
+
+ model, tokenizer = FastBaseVisionModel.from_pretrained(
+ model_name = model_name,
+ max_seq_length = max_seq_length,
+ dtype = _get_dtype(dtype),
+ load_in_4bit = load_in_4bit,
+ token = token,
+ device_map = device_map,
+ trust_remote_code = trust_remote_code,
+ revision = revision if not is_peft else None,
+ model_types = model_types,
+ tokenizer_name = tokenizer_name,
+ *args, **kwargs,
+ )
+
+ if resize_model_vocab is not None:
+ model.resize_token_embeddings(resize_model_vocab)
+ pass
+
+ # In case the model supports tagging, add the unsloth tag.
+ if hasattr(model, "add_model_tags"):
+ model.add_model_tags(["unsloth",])
+ pass
+ if hasattr(tokenizer, "add_model_tags"):
+ tokenizer.add_model_tags(["unsloth",])
+ pass
+
+ if load_in_4bit:
+ # Fix up bitsandbytes config
+ quantization_config = \
+ {
+ # Sometimes torch_dtype is not a string!!
+ "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"],
+ "bnb_4bit_quant_type" : "nf4",
+ "bnb_4bit_use_double_quant" : True,
+ "llm_int8_enable_fp32_cpu_offload" : False,
+ "llm_int8_has_fp16_weight" : False,
+ "llm_int8_skip_modules" : None,
+ "llm_int8_threshold" : 6.0,
+ "load_in_4bit" : True,
+ "load_in_8bit" : False,
+ "quant_method" : "bitsandbytes",
+ }
+ model.config.update({"quantization_config" : quantization_config})
+ pass
+
+ if is_peft:
+ # From https://github.com/huggingface/peft/issues/184
+ # Now add PEFT adapters
+ model.enable_input_require_grads()
+ model = PeftModel.from_pretrained(
+ model,
+ old_model_name,
+ token = token,
+ revision = revision,
+ is_trainable = True,
+ trust_remote_code = trust_remote_code,
+ )
+ # Patch it as well!
+ model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)
+ pass
+ return model, tokenizer
+ pass
+pass
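+# Minimal usage sketch for the vision path (kept as a comment; assumes the
+# package's top-level `from unsloth import FastVisionModel` export):
+#
+#     model, tokenizer = FastVisionModel.from_pretrained(
+#         model_name   = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
+#         load_in_4bit = True,
+#     )
+#     model = FastVisionModel.get_peft_model(
+#         model,
+#         finetune_vision_layers   = True,
+#         finetune_language_layers = True,
+#     )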
diff --git a/unsloth-main/unsloth/models/loader_utils.py b/unsloth-main/unsloth/models/loader_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b778b7e95b37ea910a58262f536cab913e785d32
--- /dev/null
+++ b/unsloth-main/unsloth/models/loader_utils.py
@@ -0,0 +1,120 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit
+# https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
+from packaging.version import Version
+from transformers import __version__ as transformers_version
+transformers_version = Version(transformers_version)
+SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
+
+
+def __get_model_name(
+ model_name,
+ load_in_4bit = True,
+ INT_TO_FLOAT_MAPPER = None,
+ FLOAT_TO_INT_MAPPER = None,
+ MAP_TO_UNSLOTH_16bit = None,
+):
+ model_name = str(model_name)
+ lower_model_name = model_name.lower()
+
+ if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER:
+
+ model_name = INT_TO_FLOAT_MAPPER[lower_model_name]
+ print(
+ f"Unsloth: Your transformers version of {transformers_version} does not support native "\
+ f"4bit loading.\nThe minimum required version is 4.37.\n"\
+ f'Try `pip install --upgrade "transformers>=4.37"`\n'\
+ f"to obtain the latest transformers build, then restart this session.\n"\
+ f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)."
+ )
+ return model_name
+
+ elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER:
+
+ new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name]
+ # logger.warning_once(
+ # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\
+ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead."
+ # )
+ return new_model_name
+
+ elif not load_in_4bit and lower_model_name in MAP_TO_UNSLOTH_16bit:
+
+ new_model_name = MAP_TO_UNSLOTH_16bit[lower_model_name]
+ return new_model_name
+
+ elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER:
+
+ new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name]
+ # logger.warning_once(
+ # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\
+ # f"We shall load `{new_model_name}` for 4x faster loading."
+ # )
+ return new_model_name
+ pass
+
+ return None
+pass
+
+
+def _get_new_mapper():
+ try:
+ import requests
+ new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py"
+ with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text
+ new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):]
+ new_mapper = new_mapper\
+ .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\
+ .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER")\
+ .replace("MAP_TO_UNSLOTH_16bit", "NEW_MAP_TO_UNSLOTH_16bit")
+
+ exec(new_mapper, globals())
+ return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit
+ except:
+ return {}, {}, {}
+ pass
+pass
+
+
+def get_model_name(model_name, load_in_4bit = True):
+ new_model_name = __get_model_name(
+ model_name = model_name,
+ load_in_4bit = load_in_4bit,
+ INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER,
+ FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER,
+ MAP_TO_UNSLOTH_16bit = MAP_TO_UNSLOTH_16bit,
+ )
+ if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum():
+ # Try checking if a new Unsloth version allows it!
+ NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit = _get_new_mapper()
+ upgraded_model_name = __get_model_name(
+ model_name = model_name,
+ load_in_4bit = load_in_4bit,
+ INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER,
+ FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER,
+ MAP_TO_UNSLOTH_16bit = NEW_MAP_TO_UNSLOTH_16bit,
+ )
+ if upgraded_model_name is not None:
+ raise NotImplementedError(
+ f"Unsloth: {model_name} is not supported in your current Unsloth version! Please update Unsloth via:\n\n"\
+ 'pip uninstall unsloth unsloth_zoo -y\n'\
+ 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n'\
+ 'pip install --upgrade --no-cache-dir "git+https://github.com/unslothai/unsloth-zoo.git"\n'\
+ )
+ pass
+ pass
+ return new_model_name if new_model_name is not None else model_name
+pass
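+# Illustrative resolutions (repo names taken from mapper.py in this same PR;
+# assumes a transformers version with native 4bit support):
+#   get_model_name("meta-llama/Meta-Llama-3-8B", load_in_4bit = True)
+#       -> "unsloth/llama-3-8b-bnb-4bit"
+#   get_model_name("unsloth/llama-3-8b-bnb-4bit", load_in_4bit = False)
+#       -> "unsloth/llama-3-8b"
+#   get_model_name("some-org/unknown-model")
+#       -> "some-org/unknown-model"  (unchanged, unless the online mapper knows it,
+#          in which case an upgrade hint is raised)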
diff --git a/unsloth-main/unsloth/models/mapper.py b/unsloth-main/unsloth/models/mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..41f7444643d5f417f280a393286fc652eb8d0335
--- /dev/null
+++ b/unsloth-main/unsloth/models/mapper.py
@@ -0,0 +1,559 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    "INT_TO_FLOAT_MAPPER",
+    "FLOAT_TO_INT_MAPPER",
+    "MAP_TO_UNSLOTH_16bit",
+]
+
+__INT_TO_FLOAT_MAPPER = \
+{
+ "unsloth/mistral-7b-bnb-4bit" : (
+ "unsloth/mistral-7b",
+ "mistralai/Mistral-7B-v0.1",
+ ),
+ "unsloth/llama-2-7b-bnb-4bit" : (
+ "unsloth/llama-2-7b",
+ "meta-llama/Llama-2-7b-hf",
+ ),
+ "unsloth/llama-2-13b-bnb-4bit" : (
+ "unsloth/llama-2-13b",
+ "meta-llama/Llama-2-13b-hf",
+ ),
+ "unsloth/codellama-34b-bnb-4bit" : (
+ "codellama/CodeLlama-34b-hf",
+ ),
+ "unsloth/zephyr-sft-bnb-4bit" : (
+ "unsloth/zephyr-sft",
+ "HuggingFaceH4/mistral-7b-sft-beta",
+ ),
+ "unsloth/tinyllama-bnb-4bit" : (
+ "unsloth/tinyllama",
+ "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ ),
+ "unsloth/tinyllama-chat-bnb-4bit" : (
+ "unsloth/tinyllama-chat",
+ "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+ ),
+ "unsloth/mistral-7b-instruct-v0.1-bnb-4bit" : (
+ "unsloth/mistral-7b-instruct-v0.1",
+ "mistralai/Mistral-7B-Instruct-v0.1",
+ ),
+ "unsloth/mistral-7b-instruct-v0.2-bnb-4bit" : (
+ "unsloth/mistral-7b-instruct-v0.2",
+ "mistralai/Mistral-7B-Instruct-v0.2",
+ ),
+ "unsloth/llama-2-7b-chat-bnb-4bit" : (
+ "unsloth/llama-2-7b-chat",
+ "meta-llama/Llama-2-7b-chat-hf",
+ ),
+ "unsloth/codellama-7b-bnb-4bit" : (
+ "unsloth/codellama-7b",
+ "codellama/CodeLlama-7b-hf",
+ ),
+ "unsloth/codellama-13b-bnb-4bit" : (
+ "codellama/CodeLlama-13b-hf",
+ ),
+ "unsloth/yi-6b-bnb-4bit" : (
+ "unsloth/yi-6b",
+ "01-ai/Yi-6B",
+ ),
+ "unsloth/solar-10.7b-bnb-4bit" : (
+ "upstage/SOLAR-10.7B-v1.0",
+ ),
+ "unsloth/gemma-7b-bnb-4bit" : (
+ "unsloth/gemma-7b",
+ "google/gemma-7b",
+ ),
+ "unsloth/gemma-2b-bnb-4bit" : (
+ "unsloth/gemma-2b",
+ "google/gemma-2b",
+ ),
+ "unsloth/gemma-7b-it-bnb-4bit" : (
+ "unsloth/gemma-7b-it",
+ "google/gemma-7b-it",
+ ),
+    "unsloth/gemma-2b-it-bnb-4bit" : (
+ "unsloth/gemma-2b-it",
+ "google/gemma-2b-it",
+ ),
+ "unsloth/mistral-7b-v0.2-bnb-4bit" : (
+ "unsloth/mistral-7b-v0.2",
+ "alpindale/Mistral-7B-v0.2-hf",
+ ),
+ "unsloth/gemma-1.1-2b-it-bnb-4bit" : (
+ "unsloth/gemma-1.1-2b-it",
+ "google/gemma-1.1-2b-it",
+ ),
+ "unsloth/gemma-1.1-7b-it-bnb-4bit" : (
+ "unsloth/gemma-1.1-7b-it",
+ "google/gemma-1.1-7b-it",
+ ),
+ "unsloth/Starling-LM-7B-beta-bnb-4bit" : (
+ "unsloth/Starling-LM-7B-beta",
+ "Nexusflow/Starling-LM-7B-beta",
+ ),
+ "unsloth/Hermes-2-Pro-Mistral-7B-bnb-4bit" : (
+ "unsloth/Hermes-2-Pro-Mistral-7B",
+ "NousResearch/Hermes-2-Pro-Mistral-7B",
+ ),
+ "unsloth/OpenHermes-2.5-Mistral-7B-bnb-4bit" : (
+ "unsloth/OpenHermes-2.5-Mistral-7B",
+ "teknium/OpenHermes-2.5-Mistral-7B",
+ ),
+ "unsloth/codegemma-2b-bnb-4bit" : (
+ "unsloth/codegemma-2b",
+ "google/codegemma-2b",
+ ),
+ "unsloth/codegemma-7b-bnb-4bit" : (
+ "unsloth/codegemma-7b",
+ "google/codegemma-7b",
+ ),
+ "unsloth/codegemma-7b-it-bnb-4bit" : (
+ "unsloth/codegemma-7b-it",
+ "google/codegemma-7b-it",
+ ),
+ "unsloth/llama-3-8b-bnb-4bit" : (
+ "unsloth/llama-3-8b",
+ "meta-llama/Meta-Llama-3-8B",
+ ),
+ "unsloth/llama-3-8b-Instruct-bnb-4bit" : (
+ "unsloth/llama-3-8b-Instruct",
+ "meta-llama/Meta-Llama-3-8B-Instruct",
+ ),
+ "unsloth/llama-3-70b-bnb-4bit" : (
+ "meta-llama/Meta-Llama-3-70B",
+ ),
+ "unsloth/llama-3-70b-Instruct-bnb-4bit" : (
+ "meta-llama/Meta-Llama-3-70B-Instruct",
+ ),
+ "unsloth/Phi-3-mini-4k-instruct-bnb-4bit" : (
+ "unsloth/Phi-3-mini-4k-instruct",
+ "microsoft/Phi-3-mini-4k-instruct",
+ ),
+ "unsloth/mistral-7b-v0.3-bnb-4bit" : (
+ "unsloth/mistral-7b-v0.3",
+ "mistralai/Mistral-7B-v0.3",
+ ),
+ "unsloth/mistral-7b-instruct-v0.3-bnb-4bit" : (
+ "unsloth/mistral-7b-instruct-v0.3",
+ "mistralai/Mistral-7B-Instruct-v0.3",
+ ),
+ "unsloth/Phi-3-medium-4k-instruct-bnb-4bit" : (
+ "unsloth/Phi-3-medium-4k-instruct",
+ "microsoft/Phi-3-medium-4k-instruct",
+ ),
+ "unsloth/Qwen2-0.5B-bnb-4bit" : (
+ "unsloth/Qwen2-0.5B",
+ "Qwen/Qwen2-0.5B",
+ ),
+ "unsloth/Qwen2-0.5B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2-0.5B-Instruct",
+ "Qwen/Qwen2-0.5B-Instruct",
+ ),
+ "unsloth/Qwen2-1.5B-bnb-4bit" : (
+ "unsloth/Qwen2-1.5B",
+ "Qwen/Qwen2-1.5B",
+ ),
+ "unsloth/Qwen2-1.5B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2-1.5B-Instruct",
+ "Qwen/Qwen2-1.5B-Instruct",
+ ),
+ "unsloth/Qwen2-7B-bnb-4bit" : (
+ "unsloth/Qwen2-7B",
+ "Qwen/Qwen2-7B",
+ ),
+ "unsloth/Qwen2-7B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2-7B-Instruct",
+ "Qwen/Qwen2-7B-Instruct",
+ ),
+ "unsloth/Qwen2-70B-bnb-4bit" : (
+ "Qwen/Qwen2-70B",
+ ),
+ "unsloth/Qwen2-70B-Instruct-bnb-4bit" : (
+ "Qwen/Qwen2-70B-Instruct",
+ ),
+ "mistralai/Codestral-22B-v0.1" : (
+ "mistral-community/Codestral-22B-v0.1",
+ ),
+ "unsloth/gemma-2-9b-bnb-4bit" : (
+ "unsloth/gemma-2-9b",
+ "google/gemma-2-9b",
+ ),
+ "unsloth/gemma-2-27b-bnb-4bit" : (
+ "unsloth/gemma-2-27b",
+ "google/gemma-2-27b",
+ ),
+ "unsloth/gemma-2-9b-it-bnb-4bit" : (
+ "unsloth/gemma-2-9b-it",
+ "google/gemma-2-9b-it",
+ ),
+ "unsloth/gemma-2-27b-it-bnb-4bit" : (
+ "unsloth/gemma-2-27b-it",
+ "google/gemma-2-27b-it",
+ ),
+ "unsloth/Phi-3-mini-4k-instruct-v0-bnb-4bit" : ( # Old Phi pre July
+ "unsloth/Phi-3-mini-4k-instruct-v0",
+ ),
+ "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit" : ( # New 12b Mistral models
+ "unsloth/Mistral-Nemo-Instruct-2407",
+ "mistralai/Mistral-Nemo-Instruct-2407",
+ ),
+ "unsloth/Mistral-Nemo-Base-2407-bnb-4bit" : ( # New 12b Mistral models
+ "unsloth/Mistral-Nemo-Base-2407",
+ "mistralai/Mistral-Nemo-Base-2407",
+ ),
+ "unsloth/Meta-Llama-3.1-8B-bnb-4bit" : (
+ "unsloth/Meta-Llama-3.1-8B",
+ "meta-llama/Meta-Llama-3.1-8B",
+ ),
+ "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : (
+ "unsloth/Meta-Llama-3.1-8B-Instruct",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ ),
+ "unsloth/Meta-Llama-3.1-70B-bnb-4bit" : (
+ "unsloth/Meta-Llama-3.1-70B",
+ "meta-llama/Meta-Llama-3.1-70B",
+ ),
+ "unsloth/Meta-Llama-3.1-405B-bnb-4bit" : (
+ "meta-llama/Meta-Llama-3.1-405B",
+ ),
+ "unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit" : (
+ "meta-llama/Meta-Llama-3.1-405B-Instruct",
+ ),
+ "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : (
+ "unsloth/Meta-Llama-3.1-70B-Instruct",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct",
+ ),
+ "unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : (
+ "mistralai/Mistral-Large-Instruct-2407",
+ ),
+ "unsloth/gemma-2-2b-bnb-4bit" : (
+ "unsloth/gemma-2-2b",
+ "google/gemma-2-2b",
+ ),
+ "unsloth/gemma-2-2b-it-bnb-4bit" : (
+ "unsloth/gemma-2-2b-it",
+ "google/gemma-2-2b-it",
+ ),
+ "unsloth/Phi-3.5-mini-instruct-bnb-4bit" : (
+ "unsloth/Phi-3.5-mini-instruct",
+ "microsoft/Phi-3.5-mini-instruct",
+ ),
+ "unsloth/c4ai-command-r-08-2024-bnb-4bit" : (
+ "CohereForAI/c4ai-command-r-08-2024",
+ ),
+ "unsloth/c4ai-command-r-plus-08-2024-bnb-4bit" : (
+ "CohereForAI/c4ai-command-r-plus-08-2024",
+ ),
+ "unsloth/Llama-3.1-Storm-8B-bnb-4bit" : (
+ "unsloth/Llama-3.1-Storm-8B",
+ "akjindal53244/Llama-3.1-Storm-8B",
+ ),
+ "unsloth/Hermes-3-Llama-3.1-8B-bnb-4bit" : (
+ "unsloth/Hermes-3-Llama-3.1-8B",
+ "NousResearch/Hermes-3-Llama-3.1-8B",
+ ),
+ "unsloth/Hermes-3-Llama-3.1-70B-bnb-4bit" : (
+ "unsloth/Hermes-3-Llama-3.1-70B",
+ "NousResearch/Hermes-3-Llama-3.1-70B",
+ ),
+ "unsloth/Hermes-3-Llama-3.1-405B-bnb-4bit" : (
+ "NousResearch/Hermes-3-Llama-3.1-405B",
+ ),
+ "unsloth/SmolLM-135M-bnb-4bit" : (
+ "unsloth/SmolLM-135M",
+ "HuggingFaceTB/SmolLM-135M",
+ ),
+ "unsloth/SmolLM-360M-bnb-4bit" : (
+ "unsloth/SmolLM-360M",
+ "HuggingFaceTB/SmolLM-360M",
+ ),
+ "unsloth/SmolLM-1.7B-bnb-4bit" : (
+ "unsloth/SmolLM-1.7B",
+ "HuggingFaceTB/SmolLM-1.7B",
+ ),
+ "unsloth/SmolLM-135M-Instruct-bnb-4bit" : (
+ "unsloth/SmolLM-135M-Instruct",
+ "HuggingFaceTB/SmolLM-135M-Instruct",
+ ),
+ "unsloth/SmolLM-360M-Instruct-bnb-4bit" : (
+ "unsloth/SmolLM-360M-Instruct",
+ "HuggingFaceTB/SmolLM-360M-Instruct",
+ ),
+ "unsloth/SmolLM-1.7B-Instruct-bnb-4bit" : (
+ "unsloth/SmolLM-1.7B-Instruct",
+ "HuggingFaceTB/SmolLM-1.7B-Instruct",
+ ),
+ "unsloth/Mistral-Small-Instruct-2409-bnb-4bit" : (
+ "unsloth/Mistral-Small-Instruct-2409",
+ "mistralai/Mistral-Small-Instruct-2409",
+ ),
+ "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-0.5B-Instruct",
+ "Qwen/Qwen2.5-0.5B-Instruct",
+ ),
+ "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-1.5B-Instruct",
+ "Qwen/Qwen2.5-1.5B-Instruct",
+ ),
+ "unsloth/Qwen2.5-3B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-3B-Instruct",
+ "Qwen/Qwen2.5-3B-Instruct",
+ ),
+ "unsloth/Qwen2.5-7B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-7B-Instruct",
+ "Qwen/Qwen2.5-7B-Instruct",
+ ),
+ "unsloth/Qwen2.5-14B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-14B-Instruct",
+ "Qwen/Qwen2.5-14B-Instruct",
+ ),
+ "unsloth/Qwen2.5-32B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-32B-Instruct",
+ "Qwen/Qwen2.5-32B-Instruct",
+ ),
+ "unsloth/Qwen2.5-72B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-72B-Instruct",
+ "Qwen/Qwen2.5-72B-Instruct",
+ ),
+ "unsloth/Qwen2.5-0.5B-bnb-4bit" : (
+ "unsloth/Qwen2.5-0.5B",
+ "Qwen/Qwen2.5-0.5B",
+ ),
+ "unsloth/Qwen2.5-1.5B-bnb-4bit" : (
+ "unsloth/Qwen2.5-1.5B",
+ "Qwen/Qwen2.5-1.5B",
+ ),
+ "unsloth/Qwen2.5-3B-bnb-4bit" : (
+ "unsloth/Qwen2.5-3B",
+ "Qwen/Qwen2.5-3B",
+ ),
+ "unsloth/Qwen2.5-7B-bnb-4bit" : (
+ "unsloth/Qwen2.5-7B",
+ "Qwen/Qwen2.5-7B",
+ ),
+ "unsloth/Qwen2.5-14B-bnb-4bit" : (
+ "unsloth/Qwen2.5-14B",
+ "Qwen/Qwen2.5-14B",
+ ),
+ "unsloth/Qwen2.5-32B-bnb-4bit" : (
+ "unsloth/Qwen2.5-32B",
+ "Qwen/Qwen2.5-32B",
+ ),
+ "unsloth/Qwen2.5-72B-bnb-4bit" : (
+ "unsloth/Qwen2.5-72B",
+ "Qwen/Qwen2.5-72B",
+ ),
+ "unsloth/Qwen2.5-Math-1.5B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Math-1.5B",
+ "Qwen/Qwen2.5-Math-1.5B",
+ ),
+ "unsloth/Qwen2.5-Math-7B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Math-7B",
+ "Qwen/Qwen2.5-Math-7B",
+ ),
+ "unsloth/Qwen2.5-Math-72B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Math-72B",
+ "Qwen/Qwen2.5-Math-72B",
+ ),
+ "unsloth/Qwen2.5-Math-1.5B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Math-1.5B-Instruct",
+ "Qwen/Qwen2.5-Math-1.5B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Math-7B-Instruct",
+ "Qwen/Qwen2.5-Math-7B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Math-72B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Math-72B-Instruct",
+ "Qwen/Qwen2.5-Math-72B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Coder-0.5B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-0.5B",
+ "Qwen/Qwen2.5-Coder-0.5B",
+ ),
+ "unsloth/Qwen2.5-Coder-1.5B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-1.5B",
+ "Qwen/Qwen2.5-Coder-1.5B",
+ ),
+ "unsloth/Qwen2.5-Coder-3B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-3B",
+ "Qwen/Qwen2.5-Coder-3B",
+ ),
+ "unsloth/Qwen2.5-Coder-7B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-7B",
+ "Qwen/Qwen2.5-Coder-7B",
+ ),
+ "unsloth/Qwen2.5-Coder-14B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-14B",
+ "Qwen/Qwen2.5-Coder-14B",
+ ),
+ "unsloth/Qwen2.5-Coder-32B-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-32B",
+ "Qwen/Qwen2.5-Coder-32B",
+ ),
+ "unsloth/Qwen2.5-Coder-0.5B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-0.5B-Instruct",
+ "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Coder-1.5B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-1.5B-Instruct",
+ "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-3B-Instruct",
+ "Qwen/Qwen2.5-Coder-3B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-7B-Instruct",
+ "Qwen/Qwen2.5-Coder-7B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-14B-Instruct",
+ "Qwen/Qwen2.5-Coder-14B-Instruct",
+ ),
+ "unsloth/Qwen2.5-Coder-32B-Instruct-bnb-4bit" : (
+ "unsloth/Qwen2.5-Coder-32B-Instruct",
+ "Qwen/Qwen2.5-Coder-32B-Instruct",
+ ),
+ "unsloth/Llama-3.2-1B-bnb-4bit" : (
+ "unsloth/Llama-3.2-1B",
+ "meta-llama/Llama-3.2-1B",
+ ),
+ "unsloth/Llama-3.2-3B-bnb-4bit" : (
+ "unsloth/Llama-3.2-3B",
+ "meta-llama/Llama-3.2-3B",
+ ),
+ "unsloth/Llama-3.2-1B-Instruct-bnb-4bit" : (
+ "unsloth/Llama-3.2-1B-Instruct",
+ "meta-llama/Llama-3.2-1B-Instruct",
+ ),
+ "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" : (
+ "unsloth/Llama-3.2-3B-Instruct",
+ "meta-llama/Llama-3.2-3B-Instruct",
+ ),
+ "unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit" : (
+ "unsloth/Llama-3.1-Nemotron-70B-Instruct",
+ "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ ),
+ "unsloth/Qwen2-VL-2B-Instruct-unsloth-bnb-4bit" : (
+ "unsloth/Qwen2-VL-2B-Instruct",
+ "Qwen/Qwen2-VL-2B-Instruct",
+ "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
+ ),
+ "unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit" : (
+ "unsloth/Qwen2-VL-7B-Instruct",
+ "Qwen/Qwen2-VL-7B-Instruct",
+ "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
+ ),
+ "unsloth/Qwen2-VL-72B-Instruct-unsloth-bnb-4bit" : (
+ "unsloth/Qwen2-VL-72B-Instruct",
+ "Qwen/Qwen2-VL-72B-Instruct",
+ "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",
+ ),
+ "unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit" : (
+ "unsloth/Llama-3.2-11B-Vision-Instruct",
+ "meta-llama/Llama-3.2-11B-Vision-Instruct",
+ "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
+ ),
+ "unsloth/Llama-3.2-90B-Vision-Instruct-unsloth-bnb-4bit" : (
+ "unsloth/Llama-3.2-90B-Vision-Instruct",
+ "meta-llama/Llama-3.2-90B-Vision-Instruct",
+ "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
+ ),
+ "unsloth/Llama-3.2-11B-Vision-unsloth-bnb-4bit" : (
+ "unsloth/Llama-3.2-11B-Vision",
+ "meta-llama/Llama-3.2-11B-Vision",
+ "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
+ ),
+ "unsloth/Llama-3.2-90B-Vision-unsloth-bnb-4bit" : (
+ "unsloth/Llama-3.2-90B-Vision",
+ "meta-llama/Llama-3.2-90B-Vision",
+ "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
+ ),
+ "unsloth/Pixtral-12B-2409-unsloth-bnb-4bit" : (
+ "unsloth/Pixtral-12B-2409",
+ "mistralai/Pixtral-12B-2409",
+ "unsloth/Pixtral-12B-2409-bnb-4bit",
+ ),
+ "unsloth/Pixtral-12B-2409-Base-bnb-4bit" : (
+ "unsloth/Pixtral-12B-Base-2409",
+ "mistralai/Pixtral-12B-Base-2409",
+ ),
+ "unsloth/llava-1.5-7b-hf-bnb-4bit" : (
+ "unsloth/llava-1.5-7b-hf",
+ "llava-hf/llava-1.5-7b-hf",
+ ),
+ "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit" : (
+ "unsloth/llava-v1.6-mistral-7b-hf",
+ "llava-hf/llava-v1.6-mistral-7b-hf",
+ ),
+ "unsloth/Llama-3.1-Tulu-3-8B-bnb-4bit" : (
+ "unsloth/Llama-3.1-Tulu-3-8B",
+ "allenai/Llama-3.1-Tulu-3-8B",
+ ),
+ "unsloth/Llama-3.1-Tulu-3-70B-bnb-4bit" : (
+ "unsloth/Llama-3.1-Tulu-3-70B",
+ "allenai/Llama-3.1-Tulu-3-70B",
+ ),
+ "unsloth/QwQ-32B-Preview-bnb-4bit" : (
+ "unsloth/QwQ-32B-Preview",
+ "Qwen/QwQ-32B-Preview",
+ ),
+ "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" : (
+ "unsloth/Llama-3.3-70B-Instruct",
+ "meta-llama/Llama-3.3-70B-Instruct",
+ ),
+}
+
+INT_TO_FLOAT_MAPPER = {}
+FLOAT_TO_INT_MAPPER = {}
+MAP_TO_UNSLOTH_16bit = {}
+
+for key, values in __INT_TO_FLOAT_MAPPER.items():
+ INT_TO_FLOAT_MAPPER[key] = values[0]
+
+ for value in values:
+ FLOAT_TO_INT_MAPPER[value] = key
+ pass
+
+ # Map to Unsloth version for 16bit versions
+ if len(values) == 2:
+ if values[0].startswith("unsloth"):
+ MAP_TO_UNSLOTH_16bit[values[1]] = values[0]
+ MAP_TO_UNSLOTH_16bit[values[1].lower()] = values[0]
+ pass
+ elif len(values) == 3:
+ # Dynamic Unsloth quantization
+ if values[0].startswith("unsloth"):
+ MAP_TO_UNSLOTH_16bit[values[1]] = values[0]
+ MAP_TO_UNSLOTH_16bit[values[1].lower()] = values[0]
+ MAP_TO_UNSLOTH_16bit[values[2]] = values[0]
+ MAP_TO_UNSLOTH_16bit[values[2].lower()] = values[0]
+ pass
+ pass
+
+ # Get lowercased
+ lowered_key = key.lower()
+ INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower()
+
+ for value in values:
+ FLOAT_TO_INT_MAPPER[value.lower()] = lowered_key
+ pass
+pass
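+# Example of the derived lookups for a single entry (illustrative):
+#   INT_TO_FLOAT_MAPPER["unsloth/llama-3-8b-bnb-4bit"] == "unsloth/llama-3-8b"
+#   FLOAT_TO_INT_MAPPER["meta-llama/Meta-Llama-3-8B"]  == "unsloth/llama-3-8b-bnb-4bit"
+#   MAP_TO_UNSLOTH_16bit["meta-llama/Meta-Llama-3-8B"] == "unsloth/llama-3-8b"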
diff --git a/unsloth-main/unsloth/models/mistral.py b/unsloth-main/unsloth/models/mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6c6946664f239f128ea118b1e5a6050a76b4e94
--- /dev/null
+++ b/unsloth-main/unsloth/models/mistral.py
@@ -0,0 +1,363 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .llama import *
+import os
+from ._utils import __version__
+from .llama import (
+ LlamaRotaryEmbedding,
+ LlamaLinearScalingRotaryEmbedding,
+)
+from transformers.models.mistral.modeling_mistral import (
+ MistralAttention,
+ MistralDecoderLayer,
+ MistralModel,
+ MistralForCausalLM,
+)
+# For Pytorch 2.1.1
+try:
+ from transformers.models.mistral.modeling_mistral import (
+ MistralSdpaAttention,
+ MistralFlashAttention2,
+ )
+except:
+ MistralSdpaAttention = MistralAttention
+ MistralFlashAttention2 = MistralAttention
+pass
+
+
+def MistralAttention_fast_forward(
+ self,
+ hidden_states: torch.Tensor,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ padding_mask: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ *args, **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+
+ # Clear inference
+ if hasattr(self, "paged_attention"):
+ del self.paged_attention_K
+ del self.paged_attention_V
+ del self.paged_attention
+ del self.temp_QA
+ del self.temp_KV
+ del self.RH_Q
+ del self.attention
+ pass
+
+ bsz, q_len, _ = hidden_states.size()
+
+ n_heads = self.num_heads
+ n_groups = self.num_key_value_groups
+ n_kv_heads = self.num_key_value_heads
+ head_dim = self.head_dim
+ assert(n_kv_heads * n_groups == n_heads)
+
+ Q, K, V = self.apply_qkv(self, hidden_states)
+ Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+ K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+ V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+
+ kv_seq_len = K.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ # Extend RoPE dynamically to fit in VRAM
+ self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len)
+
+ if position_ids is None:
+ cos = self.rotary_emb.cos_cached
+ sin = self.rotary_emb.sin_cached
+ Q, K = fast_rope_embedding(Q, K, cos, sin)
+ else:
+ cos, sin = self.rotary_emb(V, seq_len = kv_seq_len)
+ Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids)
+ pass
+
+ if past_key_value is not None:
+ K = torch.cat([past_key_value[0], K], dim = 2)
+ V = torch.cat([past_key_value[1], V], dim = 2)
+ pass
+ past_key_value = (K, V) if use_cache else None
+
+ # Attention module
+ if (not HAS_FLASH_ATTENTION and attention_mask is None):
+ # Xformers memory efficient attention
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+ K_M = V_M = bsz * kv_seq_len
+ Q_M = bsz * q_len
+
+ has_swa = isinstance(causal_mask, xformers.attn_bias.BlockDiagonalCausalMask)
+
+ # Group query attention
+ K = K .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ V = V .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
+ K = K.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
+ V = V.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
+ if hidden_states.requires_grad:
+ K = K.reshape(bsz, kv_seq_len, n_heads, head_dim)
+ V = V.reshape(bsz, kv_seq_len, n_heads, head_dim)
+
+ if has_swa:
+ Q = Q.view(1, Q_M, n_heads, head_dim)
+ K = K.view(1, K_M, n_heads, head_dim)
+ V = V.view(1, V_M, n_heads, head_dim)
+ pass
+ else:
+ # Xformers does support the forward pass though
+ Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
+
+ if has_swa:
+ Q = Q.view(1, Q_M, n_kv_heads, n_groups, head_dim)
+ K = K.view(1, K_M, n_kv_heads, n_groups, head_dim)
+ V = V.view(1, V_M, n_kv_heads, n_groups, head_dim)
+ pass
+ pass
+
+ A = xformers_attention(Q, K, V, attn_bias = causal_mask)
+ A = A.view(bsz, q_len, n_heads, head_dim)
+
+ elif HAS_FLASH_ATTENTION and attention_mask is None:
+ Q = Q.transpose(1, 2)
+ K = K.transpose(1, 2)
+ V = V.transpose(1, 2)
+ sw = getattr(self.config, "sliding_window", None)
+ sw = kv_seq_len if (sw is None or sw == "null") else sw
+ window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw)
+ A = flash_attn_func(Q, K, V, causal = True, window_size = window)
+ else:
+ # Grouped query attention
+ # if n_groups != 1:
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
+ K = K.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ V = V.reshape(bsz, n_heads, kv_seq_len, head_dim)
+ # pass
+        # Must be contiguous or else results are wrong!
+ # https://github.com/pytorch/pytorch/issues/112577
+ Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
+ # Needs (batch_size, n_heads, seq_len, head_dim)
+        # is_causal and attention_mask must not both be set!
+ A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, is_causal = False)
+ # Go back to (batch_size, seq_len, n_heads, head_dim)
+ A = A.transpose(1, 2).contiguous()
+ pass
+
+ attn_output = A.reshape(bsz, q_len, n_heads*head_dim)
+ attn_output = self.apply_o(self, attn_output)
+ attn_weights = None
+ return attn_output, attn_weights, past_key_value
+pass
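+# Shape sketch for the grouped-query (SDPA) branch above, for reference:
+#   K, V start as (bsz, n_kv_heads, kv_seq_len, head_dim); inserting a group axis and
+#   expanding gives (bsz, n_kv_heads, n_groups, kv_seq_len, head_dim), which reshapes
+#   to (bsz, n_heads, kv_seq_len, head_dim) since n_kv_heads * n_groups == n_heads.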
+
+
+def MistralForCausalLM_fast_forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ causal_mask: Optional[BlockDiagonalCausalMask] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ num_logits_to_keep: Optional[int] = 0,
+ *args, **kwargs,
+) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ if causal_mask is None and past_key_values is None:
+ bsz, q_len = input_ids.shape
+ sliding_window = getattr(self.config, "sliding_window", None)
+ if sliding_window is None or sliding_window == "null" or sliding_window <= 0:
+ causal_mask = xformers.attn_bias.LowerTriangularMask()
+ elif q_len <= sliding_window:
+ causal_mask = xformers.attn_bias.LowerTriangularMask()
+ else:
+ # Fix from https://github.com/Rypo
+ causal_mask = xformers.attn_bias.BlockDiagonalCausalMask\
+ .from_seqlens([q_len]*bsz)\
+ .make_local_attention(window_size = sliding_window)
+ pass
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+ self.model._has_no_labels = labels is None
+
+ if past_key_values is not None:
+ outputs = LlamaModel_fast_forward_inference(
+ self,
+ input_ids,
+ past_key_values,
+ position_ids = position_ids,
+ attention_mask = attention_mask,
+ )
+ else:
+ outputs = self.model(
+ input_ids=input_ids,
+ causal_mask=causal_mask,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ pass
+
+ hidden_states = outputs[0]
+ bsz, q_len, hd = hidden_states.shape
+ lm_head = self.lm_head.weight
+ if bsz == 1 and q_len == 1:
+ logits = torch.mv(lm_head, hidden_states.ravel().to(lm_head.dtype))
+ logits = logits.unsqueeze(0).unsqueeze(0)
+ elif num_logits_to_keep != 0:
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(lm_head.dtype))
+ else:
+ logits = self.lm_head(hidden_states.to(lm_head.dtype))
+ pass
+ logits = logits.to(self.config.torch_dtype)
+
+ loss = None
+ if labels is not None:
+ shift_logits = logits
+ if not hasattr(self, "extra_ignored_labels"):
+ # Fixes https://github.com/unslothai/unsloth/issues/10
+ self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda:0")
+ pass
+
+ shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]]))
+ loss = fast_cross_entropy_loss(
+ logits = shift_logits,
+ labels = shift_labels,
+ n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None),
+ )
+ pass
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+pass
+
+
+# Transformers had to update for Mistral Nemo 12b since Attention is (5120, 4096) now.
+def patch_mistral_nemo_attention(function):
+ function = function.replace(
+ "(self.head_dim * self.num_heads) != self.hidden_size",
+ "False",
+ )
+ function = function.replace(
+ "self.head_dim = self.hidden_size // self.num_heads",
+ "self.head_dim = config.head_dim",
+ )
+ function = function.replace(
+ "self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)",
+ "self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)",
+ )
+ return function
+pass
+
+
+class FastMistralModel(FastLlamaModel):
+
+ @staticmethod
+ def pre_patch():
+ init_name, function = patch_linear_scaling(
+ model_name = "mistral",
+ rope_module = LlamaRotaryEmbedding,
+ scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
+ attention_module = MistralAttention,
+ )
+ # Just for Mistral Nemo models!
+ if function is not None:
+ function = patch_mistral_nemo_attention(function)
+ exec(function, globals())
+ MistralAttention.__init__ = eval(init_name)
+ pass
+ MistralAttention .forward = MistralAttention_fast_forward
+ MistralSdpaAttention .forward = MistralAttention_fast_forward
+ MistralFlashAttention2.forward = MistralAttention_fast_forward
+ MistralDecoderLayer .forward = LlamaDecoderLayer_fast_forward
+ MistralModel .forward = LlamaModel_fast_forward
+ MistralForCausalLM .forward = MistralForCausalLM_fast_forward
+ PeftModelForCausalLM .forward = PeftModelForCausalLM_fast_forward
+ fix_prepare_inputs_for_generation(MistralForCausalLM)
+
+ # Solves https://github.com/unslothai/unsloth/issues/168
+ # Static KV Cache was introduced in 4.38.0, causing training to be much slower.
+        # Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
+ # https://github.com/huggingface/transformers/pull/27931
+ # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
+ import transformers.models.mistral.modeling_mistral
+ transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding = LlamaRotaryEmbedding
+ return
+ pass
+
+
+ @staticmethod
+ def from_pretrained(
+ model_name = "unsloth/mistral-7b-bnb-4bit",
+ max_seq_length = None,
+ dtype = None,
+ load_in_4bit = True,
+ token = None,
+ device_map = "sequential",
+ rope_scaling = None, # Mistral does not support RoPE scaling
+ fix_tokenizer = True,
+ model_patcher = None,
+ tokenizer_name = None,
+ trust_remote_code = False,
+ **kwargs,
+ ):
+ return FastLlamaModel.from_pretrained(
+ model_name = model_name,
+ max_seq_length = max_seq_length,
+ dtype = dtype,
+ load_in_4bit = load_in_4bit,
+ token = token,
+ device_map = device_map,
+ rope_scaling = rope_scaling,
+ fix_tokenizer = fix_tokenizer,
+ model_patcher = FastMistralModel,
+ tokenizer_name = tokenizer_name,
+ trust_remote_code = trust_remote_code,
+ **kwargs,
+ )
+ pass
+pass
diff --git a/unsloth-main/unsloth/models/qwen2.py b/unsloth-main/unsloth/models/qwen2.py
new file mode 100644
index 0000000000000000000000000000000000000000..82de1951b8905b3a1d780c40bf5c02aedca09a09
--- /dev/null
+++ b/unsloth-main/unsloth/models/qwen2.py
@@ -0,0 +1,102 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .llama import *
+from .llama import (
+ LlamaRotaryEmbedding,
+ LlamaLinearScalingRotaryEmbedding,
+)
+from transformers.models.qwen2.modeling_qwen2 import (
+ Qwen2Attention,
+ Qwen2DecoderLayer,
+ Qwen2Model,
+ Qwen2ForCausalLM,
+)
+# For Pytorch 2.1.1
+try:
+ from transformers.models.qwen2.modeling_qwen2 import (
+ Qwen2SdpaAttention,
+ Qwen2FlashAttention2,
+ )
+except:
+ Qwen2SdpaAttention = Qwen2Attention
+ Qwen2FlashAttention2 = Qwen2Attention
+pass
+
+
+class FastQwen2Model(FastLlamaModel):
+
+ @staticmethod
+ def pre_patch():
+ init_name, function = patch_linear_scaling(
+ model_name = "qwen2",
+ rope_module = LlamaRotaryEmbedding,
+ scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
+ attention_module = Qwen2Attention,
+ )
+ if init_name is not None:
+ exec(function, globals())
+ Qwen2Attention.__init__ = eval(init_name)
+ pass
+ Qwen2Attention .forward = LlamaAttention_fast_forward
+ Qwen2SdpaAttention .forward = LlamaAttention_fast_forward
+ Qwen2FlashAttention2.forward = LlamaAttention_fast_forward
+ Qwen2DecoderLayer .forward = LlamaDecoderLayer_fast_forward
+ Qwen2Model .forward = LlamaModel_fast_forward
+ Qwen2ForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference)
+ PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
+ fix_prepare_inputs_for_generation(Qwen2ForCausalLM)
+
+ # Solves https://github.com/unslothai/unsloth/issues/168
+ # Static KV Cache was introduced in 4.38.0, causing training to be much slower.
+        # Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
+ # https://github.com/huggingface/transformers/pull/27931
+ # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
+ import transformers.models.qwen2.modeling_qwen2
+ transformers.models.qwen2.modeling_qwen2.Qwen2RotaryEmbedding = LlamaRotaryEmbedding
+ return
+ pass
+
+
+ @staticmethod
+ def from_pretrained(
+ model_name = "Qwen/Qwen2-7B",
+ max_seq_length = 4096,
+ dtype = None,
+ load_in_4bit = True,
+ token = None,
+ device_map = "sequential",
+ rope_scaling = None, # Qwen2 does not support RoPE scaling
+ fix_tokenizer = True,
+ model_patcher = None,
+ tokenizer_name = None,
+ trust_remote_code = False,
+ **kwargs,
+ ):
+ return FastLlamaModel.from_pretrained(
+ model_name = model_name,
+ max_seq_length = max_seq_length,
+ dtype = dtype,
+ load_in_4bit = load_in_4bit,
+ token = token,
+ device_map = device_map,
+ rope_scaling = rope_scaling,
+ fix_tokenizer = fix_tokenizer,
+ model_patcher = FastQwen2Model,
+ tokenizer_name = tokenizer_name,
+ trust_remote_code = trust_remote_code,
+ **kwargs,
+ )
+ pass
+pass
diff --git a/unsloth-main/unsloth/models/vision.py b/unsloth-main/unsloth/models/vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dc4b88dfabad17bf418749c95dc50894f59ad34
--- /dev/null
+++ b/unsloth-main/unsloth/models/vision.py
@@ -0,0 +1,434 @@
+# Unsloth Zoo - Utilities for Unsloth
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import torch
+from transformers import (
+ BitsAndBytesConfig,
+ AutoModelForVision2Seq,
+ AutoProcessor,
+)
+from .llama import *
+from ..kernels import (
+ post_patch_loss_function,
+)
+from ._utils import __version__
+from peft import LoraConfig, TaskType, get_peft_model
+from transformers import set_seed as transformers_set_seed
+from unsloth_zoo.peft_utils import (
+ get_peft_regex,
+ SKIP_QUANTIZATION_MODULES,
+)
+from triton import __version__ as triton_version
+
+__all__ = [
+ "FastBaseVisionModel",
+]
+
+def _wrap_fast_inference(generate, device_type, dtype, model):
+ # Wraps inference with bfloat16 / float16
+ @torch.inference_mode
+ def _fast_generate(*args, **kwargs):
+ # For num_logits_to_keep
+ # kwargs["num_logits_to_keep"] = 1
+
+ # Remove token_type_ids
+ kwargs.pop("token_type_ids", None)
+
+ # Check pad_token
+ model_eos_token_id = getattr(model.config, "eos_token_id", None)
+ if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
+ model_eos_token_id = model_eos_token_id[0]
+
+ kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)
+
+ try:
+ kwargs["pixel_values"] = kwargs["pixel_values"].to(model.dtype)
+ except:
+ pass
+
+ # Autocasted
+ with torch.autocast(device_type = device_type, dtype = dtype):
+ output = generate(*args, **kwargs)
+ pass
+ return output
+ pass
+ return _fast_generate
+pass
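+# Illustrative application of the wrapper above (the exact call site lives in the
+# inference setup and is an assumption here):
+#
+#     model.generate = _wrap_fast_inference(
+#         model.generate, device_type = "cuda", dtype = torch.bfloat16, model = model,
+#     )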
+
+
+class FastBaseVisionModel:
+
+ @staticmethod
+ def from_pretrained(
+ model_name = "unsloth/llama-3-8b-bnb-4bit",
+ max_seq_length = None,
+ dtype = None,
+ load_in_4bit = True,
+ token = None,
+ device_map = "sequential",
+ trust_remote_code = False,
+ model_types = None,
+ tokenizer_name = None,
+ **kwargs,
+ ):
+ if trust_remote_code:
+ print(
+ "Unsloth: WARNING `trust_remote_code` is True.\n"\
+ "Are you certain you want to do remote code execution?"
+ )
+ pass
+ if token is None: token = get_token()
+ SUPPORTS_BFLOAT16 = is_bfloat16_supported()
+ gpu_stats = torch.cuda.get_device_properties(0)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+
+ statistics = \
+ f"==((====))== Unsloth {__version__}: Fast {model_types[0].title()} vision patching. Transformers: {transformers_version}.\n"\
+ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
+ f"O^O/ \_/ \\ Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
+ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
+ f' "-____-" Free Apache license: http://github.com/unslothai/unsloth'
+ print(statistics)
+
+ # Warn about fast transfers
+ old_hf_transfer = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0")
+ if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1":
+            print("Unsloth: Fast downloading is enabled - ignore the red download progress bars!")
+ pass
+        # Enable fast transfers for the download; the original flag is restored after loading.
+        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+ get_statistics() # For debugging - we use a download counter to see if environments are not breaking
+
+ if dtype is None:
+ dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
+ elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16:
+ logger.warning_once("Device does not support bfloat16. Will change to float16.")
+ dtype = torch.float16
+
+ assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32)
+
+ # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress!
+ pre_check = check_nvidia()
+
+ bnb_config = None
+ if load_in_4bit:
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit = True,
+ bnb_4bit_use_double_quant = True,
+ bnb_4bit_quant_type = "nf4",
+ bnb_4bit_compute_dtype = dtype,
+ llm_int8_skip_modules = SKIP_QUANTIZATION_MODULES,
+ )
+ pass
+
+        kwargs.pop("attn_implementation", None) # No need since we auto call it
+
+ # Cannot be None, since HF now checks for the config
+ if load_in_4bit: kwargs["quantization_config"] = bnb_config
+
+ model = AutoModelForVision2Seq.from_pretrained(
+ model_name,
+ device_map = device_map,
+ torch_dtype = dtype,
+ # quantization_config = bnb_config,
+ token = token,
+ trust_remote_code = trust_remote_code,
+ # attn_implementation = "sdpa", [TODO] Pixtral for eg fails
+ **kwargs,
+ )
+ # Return old flag
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer
+ # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress!
+ post_check = check_nvidia()
+
+ # Counteract saved tokenizers
+ tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
+ tokenizer = AutoProcessor.from_pretrained(
+ tokenizer_name,
+ padding_side = "right",
+ token = token,
+ )
+ # Add padding side as well
+ tokenizer.tokenizer.padding_side = "right"
+
+ model, tokenizer = patch_tokenizer(model, tokenizer)
+ model = post_patch_loss_function(model)
+
+ # Fix up config for transformers uploading PEFT
+ # Not necessary anymore since we require transformers>=4.37!
+ if False:
+ name = model.config._name_or_path
+ if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
+ name = name[:len(name) - len("-bnb-4bit")]
+ model.config.update({"_name_or_path" : name})
+ pass
+ pass
+
+ # Log Unsloth version for future fastpaths for inference
+ if hasattr(model, "config"):
+ model.config.update({"unsloth_version" : __version__})
+ pass
+ patch_saving_functions(model, vision = True)
+ patch_saving_functions(tokenizer, vision = True)
+
+ # Fix gradient accumulation
+ from transformers.trainer import Trainer
+ patch_gradient_accumulation_fix(Trainer)
+
+ # Save tokenizer for inference purposes
+ tokenizer.padding_side = "left" # Force inference
+ tokenizer.tokenizer.padding_side = "left" # Force inference
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ internal_model._saved_temp_tokenizer = tokenizer
+ internal_model = internal_model.model
+ pass
+ internal_model._saved_temp_tokenizer = tokenizer
+
+ return model, tokenizer
+ pass
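+ # Minimal usage sketch (illustration only; assumes the public FastVisionModel wrapper
+ # that re-exports these static methods, and the model id below is just an example):
+ #   from unsloth import FastVisionModel
+ #   model, tokenizer = FastVisionModel.from_pretrained(
+ #       "unsloth/Llama-3.2-11B-Vision-Instruct", load_in_4bit = True,
+ #   )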
+
+
+ @staticmethod
+ def get_peft_model(
+ model,
+ r = 16,
+ target_modules = None,
+ lora_alpha = 16,
+ lora_dropout = 0,
+ bias = "none",
+ finetune_vision_layers = True,
+ finetune_language_layers = True,
+ finetune_attention_modules = True,
+ finetune_mlp_modules = True,
+ layers_to_transform = None,
+ layers_pattern = None,
+ use_gradient_checkpointing = True,
+ random_state = 3407,
+ max_seq_length = 2048, # not used anymore
+ use_rslora = False,
+ modules_to_save = None,
+ init_lora_weights = True,
+ loftq_config = {},
+ temporary_location = "_unsloth_temporary_saved_buffers",
+ **kwargs,
+ ):
+ transformers_set_seed(random_state)
+
+ if type(r) is not int:
+ raise TypeError(f"Unsloth: Rank of {str(r)} must be an integer.")
+ if r <= 0:
+ raise TypeError(f"Unsloth: Rank of {str(r)} must be larger than 0.")
+
+ if isinstance(model, PeftModelForCausalLM):
+ raise RuntimeError("Unsloth: You already added LoRA adapters to your model!")
+
+ if target_modules == "all-linear":
+ finetune_vision_layers = True
+ finetune_language_layers = True
+ finetune_attention_modules = True
+ finetune_mlp_modules = True
+ pass
+ if target_modules is None:
+ target_modules = get_peft_regex(
+ model,
+ finetune_vision_layers = finetune_vision_layers,
+ finetune_language_layers = finetune_language_layers,
+ finetune_attention_modules = finetune_attention_modules,
+ finetune_mlp_modules = finetune_mlp_modules,
+ )
+ else:
+ assert(type(target_modules) in (list, tuple,))
+ pass
+
+ # Clear deleted GPU items
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ pass
+
+ lora_config = LoraConfig(
+ r = r,
+ lora_alpha = lora_alpha,
+ target_modules = target_modules,
+ lora_dropout = lora_dropout,
+ bias = bias,
+ task_type = TaskType.CAUSAL_LM,
+ )
+ model = prepare_model_for_kbit_training(
+ model,
+ use_gradient_checkpointing = use_gradient_checkpointing,
+ )
+ model = get_peft_model(model, lora_config)
+
+ model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)
+
+ # Clear deleted GPU items
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ pass
+ patch_saving_functions(model, vision = True)
+
+ return model
+ pass
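+ # Sketch of attaching LoRA adapters (same FastVisionModel wrapper assumption as above;
+ # the values shown are this method's defaults):
+ #   model = FastVisionModel.get_peft_model(
+ #       model, r = 16, lora_alpha = 16,
+ #       finetune_vision_layers = True, finetune_language_layers = True,
+ #   )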
+
+
+ @staticmethod
+ def patch_peft_model(
+ model,
+ use_gradient_checkpointing = True,
+ ):
+ if not isinstance(model, PeftModelForCausalLM):
+ raise TypeError(
+ "Unsloth: Your model needs to call `.get_peft_model` first!"
+ )
+ pass
+
+ model = prepare_model_for_kbit_training(
+ model,
+ use_gradient_checkpointing = use_gradient_checkpointing,
+ use_reentrant = True,
+ )
+
+ from transformers.trainer import Trainer
+ if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop":
+ raise RuntimeError(
+ 'Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so '\
+ 'enabling it will require much more work, so we have to prioritize. Please understand!\n'\
+ 'We do have a separate beta version, which you can contact us about!\n'\
+ 'Thank you for your understanding and we appreciate it immensely!'
+ )
+ pass
+ patch_saving_functions(model, vision = True)
+
+ # Patch tokenizer to pad to the right
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+ pass
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+ pass
+
+ # Clear deleted GPU items
+ for _ in range(3):
+ gc.collect()
+ torch.cuda.empty_cache()
+ pass
+ return model
+ pass
+
+
+ @staticmethod
+ def for_inference(model):
+ model.gradient_checkpointing = False
+ model.training = False
+
+ for name, module in model.named_modules():
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = False
+ if hasattr(module, "training"):
+ module.training = False
+ pass
+
+ dtype = model.config.torch_dtype
+ if type(dtype) is str:
+ if dtype == "float16": dtype = torch.float16
+ elif dtype == "bfloat16": dtype = torch.bfloat16
+ pass
+ device_type = model.device.type
+
+ # Wrap model.generate
+ if model.generate.__name__ != "_fast_generate":
+ model._unwrapped_old_generate = model.generate
+ model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model)
+ pass
+
+ # Patch tokenizer to pad to the left
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
+ pass
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
+ pass
+
+ # Also disable training for embeddings for NEFTune
+ if hasattr(model, "get_input_embeddings"):
+ embeddings = model.get_input_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = False
+ pass
+ if hasattr(model, "get_output_embeddings"):
+ embeddings = model.get_output_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = False
+ pass
+
+ return model
+ pass
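+ # Typical toggle pattern (sketch): switch to inference mode before generation and back
+ # to training mode before resuming finetuning, e.g.
+ #   FastBaseVisionModel.for_inference(model)
+ #   outputs = model.generate(**inputs, max_new_tokens = 64)
+ #   FastBaseVisionModel.for_training(model)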
+
+
+ @staticmethod
+ def for_training(model, use_gradient_checkpointing = True):
+ model.gradient_checkpointing = use_gradient_checkpointing
+ model.training = True
+
+ for name, module in model.named_modules():
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = use_gradient_checkpointing
+ if hasattr(module, "training"):
+ module.training = True
+ pass
+
+ # Also revert model.generate
+ if hasattr(model, "_unwrapped_old_generate"):
+ model.generate = model._unwrapped_old_generate
+ del model._unwrapped_old_generate
+ pass
+
+ # Patch tokenizer to pad to the right
+ internal_model = model
+ while hasattr(internal_model, "model"):
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+ pass
+ internal_model = internal_model.model
+ pass
+ if hasattr(internal_model, "_saved_temp_tokenizer"):
+ internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+ pass
+
+ # Also re-enable training for embeddings for NEFTune
+ if hasattr(model, "get_input_embeddings"):
+ embeddings = model.get_input_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = True
+ pass
+ if hasattr(model, "get_output_embeddings"):
+ embeddings = model.get_output_embeddings()
+ if hasattr(embeddings, "training"): embeddings.training = True
+ pass
+
+ return model
+ pass
+pass
diff --git a/unsloth-main/unsloth/save.py b/unsloth-main/unsloth/save.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ba1928c42e369cd580ad4b1b809ebd66371773
--- /dev/null
+++ b/unsloth-main/unsloth/save.py
@@ -0,0 +1,2349 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unsloth_zoo.utils import Version
+from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
+from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+from peft.tuners.lora import Linear as Peft_Linear
+from typing import Optional, Callable, Union, List
+import torch
+import os
+import shutil
+import pickle
+import gc
+from transformers.models.llama.modeling_llama import logger
+from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters_bias
+import subprocess
+import psutil
+import re
+from .tokenizer_utils import fix_sentencepiece_gguf
+from huggingface_hub import HfApi
+try:
+ from huggingface_hub.utils import get_token
+except:
+ # Older huggingface_hub versions
+ from huggingface_hub.utils._token import get_token
+pass
+from pathlib import Path
+
+__all__ = [
+ "print_quantization_methods",
+ "unsloth_save_model",
+ "save_to_gguf",
+ "patch_saving_functions",
+ "create_huggingface_repo",
+]
+
+# llama.cpp specific targets - all takes 90s. Below takes 60s
+LLAMA_CPP_TARGETS = ["llama-quantize", "llama-export-lora", "llama-cli",]
+
+# Check environments
+keynames = "\n" + "\n".join(os.environ.keys())
+IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames
+IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames
+KAGGLE_TMP = "/tmp"
+del keynames
+
+# Weights
+LLAMA_WEIGHTS = (
+ "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj",
+ "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
+)
+LLAMA_LAYERNORMS = (
+ "input_layernorm", "post_attention_layernorm",
+ "pre_feedforward_layernorm", "post_feedforward_layernorm",
+)
+
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19
+# From https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html
+ALLOWED_QUANTS = \
+{
+ "not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
+ "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
+ "quantized" : "Recommended. Slow conversion. Fast inference, small files.",
+ "f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
+ "bf16" : "Bfloat16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
+ "f16" : "Float16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
+ "q8_0" : "Fast conversion. High resource use, but generally acceptable.",
+ "q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
+ "q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
+ "q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+ "q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+ "q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+ "q3_k_s" : "Uses Q3_K for all tensors",
+ "q4_0" : "Original quant method, 4-bit.",
+ "q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
+ "q4_k_s" : "Uses Q4_K for all tensors",
+ "q4_k" : "alias for q4_k_m",
+ "q5_k" : "alias for q5_k_m",
+ "q5_0" : "Higher accuracy, higher resource usage and slower inference.",
+ "q5_1" : "Even higher accuracy, resource usage and slower inference.",
+ "q5_k_s" : "Uses Q5_K for all tensors",
+ "q6_k" : "Uses Q8_K for all tensors",
+ # "iq2_xxs" : "2.06 bpw quantization", # Not supported sadly
+ # "iq2_xs" : "2.31 bpw quantization",
+ # "iq3_xxs" : "3.06 bpw quantization",
+ "q3_k_xs" : "3-bit extra small quantization",
+}
+
+def print_quantization_methods():
+ for key, value in ALLOWED_QUANTS.items():
+ print(f'"{key}" ==> {value}')
+ pass
+pass
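+
+# Example (sketch): list the supported quantization methods from Python
+#   from unsloth.save import print_quantization_methods
+#   print_quantization_methods()   # e.g. "q4_k_m" ==> Recommended. Uses Q6_K for half of ...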
+
+
+def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencepiece_temp"):
+ if not hasattr(model, "_saved_temp_tokenizer"): return False
+
+ temp_tokenizer = model._saved_temp_tokenizer
+ sentencepiece_model = False
+ file_location = os.path.join(temporary_location, temp_tokenizer.name_or_path)
+ created_folder = False
+ if not os.path.exists(file_location):
+ created_folder = True
+ os.makedirs(file_location)
+ pass
+ temp_tokenizer.save_pretrained(file_location)
+ if os.path.isfile(f"{file_location}/tokenizer.model"):
+ sentencepiece_model = True
+ pass
+ if created_folder:
+ shutil.rmtree(file_location, ignore_errors = True)
+ return sentencepiece_model
+pass
+
+
+def _free_cached_model(model):
+ from huggingface_hub import scan_cache_dir
+ cached_repos = list(scan_cache_dir().repos)
+
+ # Go through every cached repo, and delete the one that matches the model we want to save.
+ # Can save 4GB of disk space - useful for Kaggle systems.
+ for cached_repo in cached_repos:
+ if cached_repo.repo_id == model.config._name_or_path:
+ remove_cache_commit = list(cached_repo.revisions)[0].commit_hash
+ delete_strategy = scan_cache_dir().delete_revisions(remove_cache_commit,)
+
+ logger.warning_once(
+ "Unsloth: Will remove a cached repo with size " + \
+ delete_strategy.expected_freed_size_str,
+ )
+
+ delete_strategy.execute()
+ pass
+ pass
+pass
+
+
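+# Note added for clarity: for a LoRA layer the merged weight is W_merged = W + s * (B @ A),
+# with A of shape (r, in_features) and B of shape (out_features, r). _merge_lora dequantizes W,
+# works on its transpose in float32, and applies the equivalent update W.T += s * (A.T @ B.T)
+# via addmm_ before casting back to the original dtype.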
+def _merge_lora(layer, name):
+
+ bias = getattr(layer, "bias", None)
+ if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)):
+ # Is LoRA so we need to merge!
+ W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer)
+ if quant_state is not None:
+ dtype = quant_state.dtype if type(quant_state) is not list else quant_state[2]
+ W = fast_dequantize(W, quant_state)
+ else:
+ dtype = W.dtype
+ W = W.to(torch.float32).t()
+ # W = W.t()
+
+ if A is not None:
+ # sAB = (A.t().to(torch.float32) @ (s * B.t().to(torch.float32)))
+ # W += sAB
+ W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s)
+ # W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s)
+ # if not torch.isfinite(W).all():
+ maximum_element = torch.max(W.min().abs(), W.max())
+ if not torch.isfinite(maximum_element).item():
+ raise ValueError(f"Unsloth: Merge failed.\n{name} has some elements = infinity.")
+ pass
+ W = W.t().to(dtype)
+ else:
+ W = layer.weight
+ return W, bias
+pass
+
+
+def fast_save_pickle(shard, name):
+ # Use this if # CPUs is <= 2
+ print(f"Unsloth: Saving {name}...")
+ torch.save(
+ shard,
+ name,
+ # HIGHEST_PROTOCOL seems to not work with Pytorch!
+ # pickle_module = pickle,
+ # pickle_protocol = pickle.HIGHEST_PROTOCOL,
+ )
+ return
+pass
+
+
+@torch.inference_mode
+def unsloth_save_model(
+ model,
+ tokenizer,
+ save_directory : Union[str, os.PathLike],
+ save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
+ push_to_hub : bool = False,
+ token : Optional[Union[str, bool]] = None,
+ is_main_process : bool = True,
+ state_dict : Optional[dict] = None,
+ save_function : Callable = torch.save,
+ max_shard_size : Union[int, str] = "5GB",
+ safe_serialization : bool = True,
+ variant : Optional[str] = None,
+ save_peft_format : bool = True,
+
+ # Push to hub
+ use_temp_dir : Optional[bool] = None,
+ commit_message : Optional[str] = "Trained with Unsloth",
+ private : Optional[bool] = None,
+ create_pr : bool = False,
+ revision : str = None,
+ commit_description : str = "Upload model trained with Unsloth 2x faster",
+ tags : List[str] = None,
+
+ # Our functions
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.9,
+):
+ if token is None: token = get_token()
+
+ if commit_message is None: commit_message = ""
+ if "Unsloth" not in commit_message:
+ commit_message += " (Trained with Unsloth)"
+ commit_message = commit_message.lstrip()
+
+ if commit_description is None:
+ commit_description = "Upload model trained with Unsloth 2x faster"
+ elif "Unsloth 2x faster" not in commit_description:
+ commit_description += " (Trained with Unsloth 2x faster)"
+ pass
+
+ if save_method == "merged_4bit":
+ raise RuntimeError(
+ "Unsloth: Merging into 4bit will cause your model to lose accuracy if you plan\n"\
+ "to merge to GGUF or others later on. I suggest you to do this as a final step\n"\
+ "if you're planning to do multiple saves.\n"\
+ "If you are certain, change `save_method` to `merged_4bit_forced`."
+ )
+ elif save_method == "merged_4bit_forced":
+ save_method = "merged_4bit"
+ pass
+
+ save_pretrained_settings = dict(locals())
+ for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
+ del save_pretrained_settings[deletion]
+ pass
+
+ # First check for a token!
+ if push_to_hub:
+ from huggingface_hub import whoami
+ try:
+ username = whoami(token = token)["name"]
+ except:
+ raise RuntimeError(
+ "Unsloth: Please supply a token!\n"\
+ "Go to https://huggingface.co/settings/tokens"
+ )
+ pass
+ pass
+
+ assert(maximum_memory_usage > 0 and maximum_memory_usage <= 0.95)
+
+ # Clean memory up first
+ for _ in range(3):
+ torch.cuda.empty_cache()
+ gc.collect()
+ pass
+
+ save_method = save_method.lower().replace(" ", "_")
+ if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
+ raise RuntimeError(
+ "Unsloth: You must select one of 3 options when saving models:\n"\
+ '"lora" ==> This is the fastest and easiet. Just saves LoRA modules.\n'\
+ '"merged_16bit" ==> This merges LoRA weights and saves to float16. Needed for llama.cpp / GGUF.\n'\
+ '"merged_4bit" ==> This merges LoRA weights and saves to 4bit. Useful for DPO / inference.'
+ )
+ pass
+
+ if save_method == "merged_4bit":
+
+ print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
+ print("This might take 5 minutes...")
+
+ # Counteract no LoRA adapters!
+ if hasattr(model, "merge_and_unload"):
+ model = model.merge_and_unload()
+ pass
+ print("Done.")
+ pass
+
+ if tags is not None:
+ assert(isinstance(tags, (list, tuple)))
+ tags = list(tags) + ["unsloth",]
+ else:
+ tags = ["unsloth",]
+ pass
+ save_pretrained_settings["tags"] = tags
+
+ if ((save_method == "lora") or (save_method == "merged_4bit")) and push_to_hub:
+ if token is None:
+ raise RuntimeError(
+ "Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
+ "Go to https://huggingface.co/settings/tokens."
+ )
+ pass
+
+ if save_method == "lora":
+ print("Unsloth: Saving LoRA adapters. Please wait...")
+ elif save_method == "merged_4bit":
+ print("Unsloth: Saving 4bit Bitsandbytes model. Please wait...")
+ pass
+
+ # Update model tag
+ _ = upload_to_huggingface(
+ model, save_directory, token,
+ "finetuned", "trl", file_location = None,
+ old_username = None, private = private,
+ )
+
+ getattr(model, "original_push_to_hub", tokenizer.push_to_hub)\
+ (
+ repo_id = save_directory,
+ use_temp_dir = use_temp_dir,
+ commit_message = commit_message,
+ private = private,
+ token = token,
+ max_shard_size = max_shard_size,
+ create_pr = create_pr,
+ safe_serialization = safe_serialization,
+ revision = revision,
+ commit_description = commit_description,
+ tags = tags,
+ )
+ if tokenizer is not None:
+ # Set padding side to left for inference
+ old_padding_side = tokenizer.padding_side
+ tokenizer.padding_side = "left"
+
+ getattr(tokenizer, "original_push_to_hub", tokenizer.push_to_hub)\
+ (
+ repo_id = save_directory,
+ use_temp_dir = use_temp_dir,
+ commit_message = commit_message,
+ private = private,
+ token = token,
+ max_shard_size = max_shard_size,
+ create_pr = create_pr,
+ safe_serialization = safe_serialization,
+ revision = revision,
+ commit_description = commit_description,
+ tags = tags,
+ )
+
+ # Revert back padding side
+ tokenizer.padding_side = old_padding_side
+ pass
+
+ if hasattr(model, "config"):
+ print(f"Saved {save_method} model to https://huggingface.co/" + save_directory)
+ pass
+ return save_directory, None
+ pass
+
+ # Tokenizer has different saving arguments
+ tokenizer_save_settings = \
+ {
+ "save_directory" : save_pretrained_settings["save_directory"],
+ "legacy_format" : None,
+ "filename_prefix" : None,
+ "push_to_hub" : save_pretrained_settings["push_to_hub"],
+ "private" : save_pretrained_settings["private"],
+ "token" : save_pretrained_settings["token"],
+ }
+
+ # Check if PEFT Model or not - if yes, 3 levels. If not 2 levels.
+ from peft import PeftModelForCausalLM
+ if isinstance(model, PeftModelForCausalLM):
+ internal_model = model.model
+ else:
+ internal_model = model
+ pass
+
+ # Cannot be converted properly!
+ if (save_method == "merged_4bit") or (save_method == "lora") or (
+ not hasattr(model, "model") or \
+ not hasattr(internal_model.model, "layers")
+ ):
+ # Do general saving
+ # Edit save_pretrained_settings
+ # [TODO] _create_repo has errors due to **kwargs getting accepted
+ # commit_description does not seem to work?
+ what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
+ if save_pretrained_settings["push_to_hub"] is False else \
+ ("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
+ for deletion in what_to_delete:
+ del save_pretrained_settings[deletion]
+ pass
+ if hasattr(model, "add_model_tags"):
+ model.add_model_tags(["unsloth",])
+
+ # Update model tag
+ if push_to_hub:
+ _ = upload_to_huggingface(
+ model, save_pretrained_settings["save_directory"], token,
+ "finetuned", "trl", file_location = None,
+ old_username = None, private = private,
+ )
+ pass
+
+ if tokenizer is not None:
+ print("Unsloth: Saving tokenizer...", end = "")
+
+ # Set padding side to left for inference
+ old_padding_side = tokenizer.padding_side
+ tokenizer.padding_side = "left"
+
+ tokenizer.save_pretrained(**tokenizer_save_settings)
+
+ # Revert back padding side
+ tokenizer.padding_side = old_padding_side
+
+ print(" Done.")
+ else:
+ print()
+
+ print("Unsloth: Saving model...", end = "")
+ if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
+
+ # [TODO] Is this correct?
+ if save_method == "lora":
+ save_pretrained_settings["selected_adapters"] = None
+ pass
+
+ model.save_pretrained(**save_pretrained_settings)
+
+ if push_to_hub and hasattr(model, "config"):
+ print("Saved to https://huggingface.co/" + save_pretrained_settings["save_directory"])
+ pass
+
+ print(" Done.")
+ return save_directory, None
+ pass
+
+ # If push_to_hub, we must remove the .../ part of a repo
+ username = None
+ if push_to_hub and "/" in save_directory:
+
+ # +1 solves absolute path issues
+ new_save_directory = save_directory
+ username = new_save_directory[:new_save_directory.find("/")]
+ new_save_directory = new_save_directory[new_save_directory.find("/")+1:]
+ if IS_KAGGLE_ENVIRONMENT:
+ new_save_directory = os.path.join(KAGGLE_TMP, new_save_directory[new_save_directory.find("/")+1:])
+ logger.warning_once(
+ "Unsloth: You are pushing to hub in Kaggle environment.\n"\
+ f"To save memory, we shall move {save_directory} to {new_save_directory}"
+ )
+ else:
+ logger.warning_once(
+ f"Unsloth: You are pushing to hub, but you passed your HF username = {username}.\n"\
+ f"We shall truncate {save_directory} to {new_save_directory}"
+ )
+
+ save_pretrained_settings["save_directory"] = new_save_directory
+ tokenizer_save_settings ["save_directory"] = new_save_directory
+ save_directory = new_save_directory
+ pass
+
+ print("Unsloth: Merging 4bit and LoRA weights to 16bit...")
+
+ # Determine max RAM usage minus sharding
+ max_ram = psutil.virtual_memory().available
+ sharded_ram_usage = 5 * 1024 * 1024 * 1024
+ if type(max_shard_size) is str:
+ gb_found = re.match(r"([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
+ mb_found = re.match(r"([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
+ if gb_found: sharded_ram_usage = int(gb_found.group(1)) * 1024 * 1024 * 1024
+ elif mb_found: sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024
+ elif type(max_shard_size) is int:
+ # An integer max_shard_size is already a size in bytes
+ sharded_ram_usage = max_shard_size
+ pass
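+ # Worked example (added note): max_shard_size = "5GB" gives sharded_ram_usage = 5 * 1024**3
+ # bytes and "500MB" gives 500 * 1024**2 bytes; this budget is subtracted from available RAM below.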
+
+ # Switch to our fast saving modules if it's a slow PC!
+ n_cpus = psutil.cpu_count(logical = False)
+ if n_cpus is None: n_cpus = psutil.cpu_count()
+ if n_cpus is None: n_cpus = 1
+
+ if safe_serialization is None:
+ safe_serialization = True
+ save_pretrained_settings["safe_serialization"] = safe_serialization
+
+ elif safe_serialization and (n_cpus <= 2):
+ logger.warning_once(
+ f"Unsloth: You have {n_cpus} CPUs. Using `safe_serialization` is 10x slower.\n"\
+ f"We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.\n"\
+ f"To force `safe_serialization`, set it to `None` instead.",
+ )
+ safe_serialization = False
+ save_function = fast_save_pickle
+ save_pretrained_settings["safe_serialization"] = safe_serialization
+ save_pretrained_settings["save_function"] = save_function
+ pass
+
+ # Only safe_serialization uses more RAM
+ if safe_serialization:
+ max_ram -= sharded_ram_usage
+ else:
+ max_ram -= sharded_ram_usage*0.25 # Uses much less
+ pass
+
+ max_ram = int(max(0, max_ram) * maximum_memory_usage)
+ print(f"Unsloth: Will use up to "\
+ f"{round(max_ram/1024/1024/1024, 2)} out of "\
+ f"{round(psutil.virtual_memory().total/1024/1024/1024, 2)} RAM for saving.")
+
+ # Move temporary_location to /tmp in Kaggle
+ if IS_KAGGLE_ENVIRONMENT:
+ temporary_location = os.path.join(KAGGLE_TMP, temporary_location)
+
+ # Max directory for disk saving
+ if not os.path.exists(temporary_location):
+ os.makedirs(temporary_location)
+ pass
+
+ # Check if Kaggle or Colab, since only 20GB of Disk space allowed.
+ if IS_KAGGLE_ENVIRONMENT or IS_COLAB_ENVIRONMENT:
+ # We free up 4GB of space
+ logger.warning_once(
+ "Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded\n"\
+ "model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab."
+ )
+ _free_cached_model(internal_model)
+ pass
+
+ # HF also uses a OrderedDict
+ from collections import OrderedDict
+ state_dict = OrderedDict()
+
+ torch_dtype = internal_model.config.torch_dtype
+ if type(torch_dtype) is str:
+ if torch_dtype == "float16": torch_dtype = torch.float16
+ elif torch_dtype == "bfloat16": torch_dtype = torch.bfloat16
+ pass
+
+ # Check modules to save float32 dtype
+ state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data.to(torch_dtype)
+
+ max_vram = int(torch.cuda.get_device_properties(0).total_memory * maximum_memory_usage)
+
+ print("Unsloth: Saving model... This might take 5 minutes ...")
+
+ from tqdm import tqdm as ProgressBar
+ for j, layer in enumerate(ProgressBar(internal_model.model.layers)):
+ for item in LLAMA_WEIGHTS:
+ proj = eval(f"layer.{item}")
+ name = f"model.layers.{j}.{item}.weight"
+ W, bias = _merge_lora(proj, name)
+
+ # Bias term
+ if bias is not None:
+ state_dict[f"model.layers.{j}.{item}.bias"] = bias
+ pass
+
+ if (torch.cuda.memory_allocated() + W.nbytes) < max_vram:
+ # Save to GPU memory
+ state_dict[name] = W
+ # [TODO] Saving to RAM seems to leak memory???
+ # elif (max_ram - W.nbytes) > 0:
+ # # Save to CPU memory
+ # logger.warning_once(f"We will save to RAM and not VRAM now.")
+ # state_dict[name] = W.to("cpu", non_blocking = True, copy = True)
+ # max_ram = max(max_ram - W.nbytes, 0)
+ else:
+ # Save to Disk
+ logger.warning_once("\nWe will save to Disk and not RAM now.")
+ filename = os.path.join(temporary_location, f"{name}.pt")
+ torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,)
+ # weights_only = True weirdly fails?
+ state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True, weights_only = False)
+ pass
+ for item in LLAMA_LAYERNORMS:
+ try:
+ # Skip for Gemma 2
+ state_dict[f"model.layers.{j}.{item}.weight"] = eval(f"layer.{item}.weight.data")
+ except:
+ continue
+ pass
+ pass
+
+ state_dict["model.norm.weight"] = internal_model.model.norm.weight.data
+ # Check for modules_to_save float32 dtype
+
+ # Check for tied weights
+ if internal_model.model.embed_tokens.weight.data_ptr() != internal_model.lm_head.weight.data_ptr():
+ state_dict["lm_head.weight"] = internal_model.lm_head.weight.data.to(torch_dtype)
+ pass
+
+ # All tensors MUST be type torch.Tensor and not torch.nn.parameter.Parameter
+ for key, value in state_dict.items():
+ if hasattr(value, "data"): state_dict[key] = value = value.data
+ if type(value) is not torch.Tensor:
+ logger.warning_once(f"Unsloth: {key} is not a Tensor but a {type(value)}.")
+ pass
+ pass
+
+ # Edit save_pretrained_settings
+ # [TODO] _create_repo has errors due to **kwargs getting accepted
+ save_pretrained_settings["state_dict"] = state_dict
+
+ # commit_description does not seem to work?
+ what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
+ if not push_to_hub else \
+ ("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
+ for deletion in what_to_delete:
+ del save_pretrained_settings[deletion]
+ pass
+ if hasattr(model, "add_model_tags"):
+ model.add_model_tags(["unsloth",])
+
+ # Update model tag
+ if push_to_hub:
+ _ = upload_to_huggingface(
+ model, save_pretrained_settings["save_directory"], token,
+ "finetuned", "trl", file_location = None,
+ old_username = username, private = private,
+ )
+ pass
+
+ # First check if we're pushing to an organization!
+ save_directory = save_pretrained_settings["save_directory"]
+
+ if save_pretrained_settings["push_to_hub"]:
+ new_save_directory, new_username = _determine_username(save_directory, username, token)
+
+ if token is not None:
+ from huggingface_hub import whoami
+ actual_username = whoami(token = token)["name"]
+ else:
+ actual_username = username
+ pass
+
+ # Check if pushing to an organization
+ if save_pretrained_settings["push_to_hub"] and (username != actual_username):
+ print(f"Unsloth: Saving to organization with address {new_save_directory}")
+ # We upload everything at the end!
+ tokenizer_save_settings["push_to_hub"] = False
+ tokenizer_save_settings["save_directory"] = new_save_directory
+ pass
+
+ # Save tokenizer
+ if tokenizer is not None:
+ print("Unsloth: Saving tokenizer...", end = "")
+
+ # Set padding side to left for inference
+ old_padding_side = tokenizer.padding_side
+ tokenizer.padding_side = "left"
+
+ tokenizer.save_pretrained(**tokenizer_save_settings)
+
+ # Revert back padding side
+ tokenizer.padding_side = old_padding_side
+
+ print(" Done.")
+ else:
+ print()
+ pass
+
+ # Since merged, edit quantization_config
+ old_config = model.config
+ new_config = model.config.to_dict()
+ if "quantization_config" in new_config:
+ del new_config["quantization_config"]
+ original_model = model
+ new_config = type(model.config).from_dict(new_config)
+ while hasattr(original_model, "model"):
+ original_model = original_model.model
+ original_model.config = new_config
+ model.config = new_config
+
+ # Save!
+ # [TODO] --> is this correct?
+ # save_pretrained_settings["selected_adapters"] = None
+
+ # Check if pushing to an organization
+ if save_pretrained_settings["push_to_hub"] and (username != actual_username):
+ print(f"Unsloth: Saving to organization with address {new_save_directory}")
+ # Pushing to organization!
+ # Sadly .save_pretrained doesn't work :(
+ # We first save it via .save_pretrained, then upload manually!
+ save_pretrained_settings["save_directory"] = new_save_directory
+ save_pretrained_settings["push_to_hub"] = False
+ internal_model.save_pretrained(**save_pretrained_settings)
+
+ # Now manually go through each file and upload them manually!
+ filenames = os.listdir(new_save_directory)
+
+ hf_api = HfApi(token = save_pretrained_settings["token"])
+
+ print("Unsloth: Uploading all files... Please wait...")
+ hf_api.upload_folder(
+ folder_path = new_save_directory,
+ path_in_repo = ".",
+ repo_id = new_save_directory,
+ repo_type = "model",
+ commit_message = "(Trained with Unsloth)",
+ ignore_patterns = "*.md",
+ )
+ else:
+ internal_model.save_pretrained(**save_pretrained_settings)
+ pass
+
+ # Revert config back
+ original_model = model
+ while hasattr(original_model, "model"):
+ original_model = original_model.model
+ original_model.config = old_config
+ model.config = old_config
+ print("Done.")
+
+ if push_to_hub and hasattr(model, "config"):
+ print(f"Saved merged model to https://huggingface.co/{username}/{save_directory.lstrip('/').split('/')[-1]}")
+ pass
+
+ save_pretrained_settings["state_dict"] = None
+
+ for j, (key, value) in enumerate(state_dict.items()):
+ state_dict[key] = None
+ if j % 10 == 0:
+ torch.cuda.empty_cache()
+ gc.collect()
+ pass
+ pass
+ state_dict = None
+ del state_dict
+ torch.cuda.empty_cache()
+ gc.collect()
+
+ # Remove temporary location
+ import shutil
+ shutil.rmtree(temporary_location, ignore_errors = True)
+
+ for _ in range(3):
+ torch.cuda.empty_cache()
+ gc.collect()
+ return save_directory, username
+pass
+
+
+def install_llama_cpp_clone_non_blocking():
+ full_command = ["git", "clone", "--recursive", "https://github.com/ggerganov/llama.cpp"]
+ run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
+ return run_installer
+pass
+
+
+def install_llama_cpp_make_non_blocking():
+ # https://github.com/ggerganov/llama.cpp/issues/7062
+ # Weirdly GPU conversion for GGUF breaks??
+ # env = { **os.environ, "LLAMA_CUDA": "1", }
+ # Force make clean
+ check = os.system("make clean -C llama.cpp")
+ IS_CMAKE = False
+ if check == 0:
+ # Uses old MAKE
+ n_jobs = max(int(psutil.cpu_count()*1.5), 1)
+ full_command = ["make", "all", "-j"+str(n_jobs), "-C", "llama.cpp"]
+ IS_CMAKE = False
+ else:
+ # Uses new CMAKE
+ n_jobs = max(int(psutil.cpu_count()), 1) # Use fewer jobs since the CMake build is ~1.5x faster
+ check = os.system("cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=OFF -DLLAMA_CURL=ON")
+ if check != 0:
+ raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp using os.system(...) with error {check}. Please report this ASAP!")
+ pass
+ # f"cmake --build llama.cpp/build --config Release -j{psutil.cpu_count()*2} --clean-first --target {' '.join(LLAMA_CPP_TARGETS)}",
+ full_command = [
+ "cmake", "--build", "llama.cpp/build",
+ "--config", "Release",
+ "-j"+str(n_jobs),
+ "--clean-first",
+ "--target",
+ ] + LLAMA_CPP_TARGETS
+ IS_CMAKE = True
+ pass
+ # https://github.com/ggerganov/llama.cpp/issues/7062
+ # Weirdly GPU conversion for GGUF breaks??
+ # run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
+ run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
+ return run_installer, IS_CMAKE
+pass
+
+
+def install_python_non_blocking(packages = []):
+ full_command = ["pip", "install"] + packages
+ run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
+ return run_installer
+pass
+
+
+def try_execute(commands, force_complete = False):
+ for command in commands:
+ with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
+ for line in sp.stdout:
+ line = line.decode("utf-8", errors = "replace")
+ if "undefined reference" in line:
+ raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp with {line}. Please report this ASAP!")
+ elif "deprecated" in line:
+ return "CMAKE"
+ elif "Unknown argument" in line:
+ raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp with {line}. Please report this ASAP!")
+ elif "***" in line:
+ raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp with {line}. Please report this ASAP!")
+ print(line, flush = True, end = "")
+ pass
+ if force_complete and sp.returncode is not None and sp.returncode != 0:
+ raise subprocess.CalledProcessError(sp.returncode, sp.args)
+ pass
+ pass
+ return None
+pass
+
+
+def install_llama_cpp_old(version = -10):
+ # Download the 10th latest release since the latest might be broken!
+ # FALLBACK mechanism
+ releases = subprocess.check_output(["git", "ls-remote", "--tags", "https://github.com/ggerganov/llama.cpp.git"])
+ releases = releases.decode("utf-8").replace("\t", " ").split("\n")
+ for i, x in enumerate(releases):
+ if "refs/tags/b" not in x: break
+ releases = releases[:i]
+ latest = releases[-1]
+ version = releases[version].split(" ")[0]
+
+ # Check if the llama.cpp exists
+ if os.path.exists("llama.cpp"):
+ print(
+ "**[WARNING]** You have a llama.cpp directory which is broken.\n"\
+ "Unsloth will DELETE the broken directory and install a new one.\n"\
+ "Press CTRL + C / cancel this if this is wrong. We shall wait 30 seconds.\n"
+ )
+ import time
+ for i in range(30):
+ print(f"**[WARNING]** Deleting llama.cpp directory... {30-i} seconds left.")
+ time.sleep(1)
+ import shutil
+ shutil.rmtree("llama.cpp", ignore_errors = True)
+ pass
+
+ # Clone a specific commit
+ # Also don't use the GPU!
+ commands = [
+ "git clone --recursive https://github.com/ggerganov/llama.cpp",
+ f"cd llama.cpp && git reset --hard {version} && git clean -df",
+ ]
+ try_execute(commands)
+
+ # Try using MAKE
+ commands = [
+ "make clean -C llama.cpp",
+ f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
+ ]
+ if try_execute(commands) == "CMAKE":
+ # Instead use CMAKE
+ commands = [
+ "cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=OFF -DLLAMA_CURL=ON",
+ f"cmake --build llama.cpp/build --config Release -j{psutil.cpu_count()*2} --clean-first --target {' '.join(LLAMA_CPP_TARGETS)}",
+ "cp llama.cpp/build/bin/llama-* llama.cpp",
+ "rm -rf llama.cpp/build",
+ ]
+ try_execute(commands)
+ pass
+
+ # Check if successful
+ if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"):
+ raise RuntimeError(
+ "Unsloth: The file 'llama.cpp/llama-quantize' or `llama.cpp/quantize` does not exist.\n"\
+ "But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
+ )
+ pass
+pass
+
+
+def install_llama_cpp_blocking(use_cuda = False):
+ # https://github.com/ggerganov/llama.cpp/issues/7062
+ # Weirdly GPU conversion for GGUF breaks??
+ # use_cuda = "LLAMA_CUDA=1" if use_cuda else ""
+
+ commands = [
+ "git clone --recursive https://github.com/ggerganov/llama.cpp",
+ "pip install gguf protobuf",
+ ]
+ if os.path.exists("llama.cpp"): return
+ try_execute(commands)
+
+ commands = [
+ "make clean -C llama.cpp",
+ # https://github.com/ggerganov/llama.cpp/issues/7062
+ # Weirdly GPU conversion for GGUF breaks??
+ # f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp",
+ f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
+ ]
+ if try_execute(commands) == "CMAKE":
+ # Instead use CMAKE
+ commands = [
+ "cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=OFF -DLLAMA_CURL=ON",
+ f"cmake --build llama.cpp/build --config Release -j{psutil.cpu_count()*2} --clean-first --target {' '.join(LLAMA_CPP_TARGETS)}",
+ "cp llama.cpp/build/bin/llama-* llama.cpp",
+ "rm -rf llama.cpp/build",
+ ]
+ try_execute(commands)
+ pass
+pass
+
+
+def get_executable(executables):
+ # Get system locations (System Path).split(system separator)
+ system_directories = os.environ.get("PATH").split(os.pathsep)
+
+ for directory in system_directories:
+ for executable in executables:
+ path = os.path.join(directory, executable)
+ # Check if the executable exists and is executable
+ if os.path.exists(path) and os.access(path, os.X_OK): return path
+ pass
+ pass
+ return None
+pass
+
+
+def save_to_gguf(
+ model_type : str,
+ model_dtype : str,
+ is_sentencepiece : bool = False,
+ model_directory : str = "unsloth_finetuned_model",
+ quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"]
+ first_conversion : str = None,
+ _run_installer = None, # Non blocking install of llama.cpp
+):
+ # logger.warning(
+ # "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\
+ # "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\
+ # "Please be patient - GGUF saving should still work, but might not work as well."
+ # )
+ assert(model_dtype == "float16" or model_dtype == "bfloat16")
+ model_dtype = "f16" if model_dtype == "float16" else "bf16"
+
+ # Convert quantization_method to list
+ if isinstance(quantization_method, list): pass
+ elif isinstance(quantization_method, str): quantization_method = [ quantization_method, ]
+ elif isinstance(quantization_method, tuple): quantization_method = list(quantization_method)
+ else:
+ raise TypeError("Unsloth: quantization_method can only be a string or a list of strings")
+ pass
+
+ # Check if bfloat16 is supported
+ if model_dtype == "bf16" and not torch.cuda.is_bf16_supported():
+ logger.warning(
+ "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
+ "We shall switch instead to f16."
+ )
+ model_dtype = "f16"
+ pass
+
+ # Check first_conversion as well
+ if first_conversion is None:
+ first_conversion = model_dtype
+ pass
+
+ # Check I quants
+ for quant_method in quantization_method:
+ if quant_method.startswith("iq2"):
+ raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!")
+ pass
+
+ # Careful convert.py is only for Llama / Mistral based archs
+ use_fast_convert = False
+ if not is_sentencepiece: use_fast_convert = False # Llama-3
+ elif model_type == "llama": use_fast_convert = True
+ elif model_type == "mistral": use_fast_convert = True
+ pass
+ logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.")
+
+ # Map quant methods
+ new_quantization_method = []
+ for quant_method in quantization_method:
+ if quant_method == "not_quantized": quant_method = model_dtype
+ elif quant_method == "fast_quantized": quant_method = "q8_0"
+ elif quant_method == "quantized": quant_method = "q4_k_m"
+ elif quant_method is None: quant_method = "q8_0"
+
+ # Check if wrong method
+ if quant_method not in ALLOWED_QUANTS.keys():
+ error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n"
+ for key, value in ALLOWED_QUANTS.items():
+ error += f"[{key}] => {value}\n"
+ raise RuntimeError(error)
+ pass
+
+ new_quantization_method.append(quant_method)
+ pass
+ quantization_method = new_quantization_method
+
+ print_info = \
+ f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\
+ f" \\\ /| [0] Installing llama.cpp might take 3 minutes.\n"\
+ f"O^O/ \_/ \\ [1] Converting HF to GGUF 16bits might take 3 minutes.\n"\
+ f"\ / [2] Converting GGUF 16bits to {quantization_method} might take 10 minutes each.\n"\
+ f' "-____-" In total, you will have to wait at least 16 minutes.\n'
+ print(print_info)
+
+ # Check first_conversion format
+ if first_conversion == "f16" : pass
+ elif first_conversion == "bf16" : pass
+ elif first_conversion == "f32" : pass
+ elif first_conversion == "q8_0" : pass
+ else:
+ raise RuntimeError(
+ f"Unsloth: `first_conversion` can only be one of ['f16', 'bf16', 'f32', 'q8_0'] and not `{first_conversion}`."
+ )
+ pass
+
+ # Determine whether the system already has llama.cpp installed and the scripts are executable
+ quantize_location = get_executable(["llama-quantize", "quantize"])
+ convert_location = get_executable(["convert-hf-to-gguf.py", "convert_hf_to_gguf.py"])
+
+ error = 0
+ if quantize_location is not None and convert_location is not None:
+ print("Unsloth: llama.cpp found in the system. We shall skip installation.")
+ else:
+ print("Unsloth: Installing llama.cpp. This might take 3 minutes...")
+ if _run_installer is not None:
+ _run_installer, IS_CMAKE = _run_installer
+
+ error = _run_installer.wait()
+ # Check if successful
+ if error != 0:
+ print(f"Unsloth: llama.cpp error code = {error}.")
+ install_llama_cpp_old(-10)
+ pass
+
+ if IS_CMAKE:
+ # CMAKE needs to do some extra steps
+ print("Unsloth: CMAKE detected. Finalizing some steps for installation.")
+
+ check = os.system("cp llama.cpp/build/bin/llama-* llama.cpp")
+ if check != 0: raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
+ check = os.system("rm -rf llama.cpp/build")
+ if check != 0: raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
+ pass
+ else:
+ error = 0
+ install_llama_cpp_blocking()
+ pass
+
+ # Careful llama.cpp/quantize changed to llama.cpp/llama-quantize
+ # and llama.cpp/main changed to llama.cpp/llama-cli
+ # See https://github.com/ggerganov/llama.cpp/pull/7809
+ quantize_location = None
+ if os.path.exists("llama.cpp/quantize"):
+ quantize_location = "llama.cpp/quantize"
+ elif os.path.exists("llama.cpp/llama-quantize"):
+ quantize_location = "llama.cpp/llama-quantize"
+ else:
+ raise RuntimeError(
+ "Unsloth: The file 'llama.cpp/llama-quantize' or 'llama.cpp/quantize' does not exist.\n"\
+ "But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
+ )
+ pass
+
+ # See https://github.com/unslothai/unsloth/pull/730
+ # Filenames changed again!
+ convert_location = None
+ if os.path.exists("llama.cpp/convert-hf-to-gguf.py"):
+ convert_location = "llama.cpp/convert-hf-to-gguf.py"
+ elif os.path.exists("llama.cpp/convert_hf_to_gguf.py"):
+ convert_location = "llama.cpp/convert_hf_to_gguf.py"
+ else:
+ raise RuntimeError(
+ "Unsloth: The file 'llama.cpp/convert-hf-to-gguf.py' or 'llama.cpp/convert_hf_to_gguf.py' does not exist.\n"\
+ "But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
+ )
+ pass
+ pass
+
+ # Determine maximum first_conversion state
+ if first_conversion == "f32" : strength = 3
+ elif first_conversion == "f16" : strength = 2
+ elif first_conversion == "bf16" : strength = 1
+ elif first_conversion == "q8_0" : strength = 0
+
+ for quant_method in quantization_method:
+ if quant_method == "f32": strength = max(strength, 3)
+ elif quant_method == "f16": strength = max(strength, 2)
+ elif quant_method == "bf16": strength = max(strength, 1)
+ elif quant_method == "q8_0": strength = max(strength, 0)
+ else:
+ # Quantized models must have f16 as the default argument
+ if first_conversion == "f32" : pass
+ elif first_conversion == "f16" : pass
+ elif first_conversion == "bf16" : pass
+ elif first_conversion == "q8_0":
+ logger.warning_once(
+ "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\
+ "but saves disk space!"
+ )
+ # first_conversion = "f16"
+ pass
+ pass
+ pass
+
+ # If only q8_0:
+ if len(quantization_method) == 1 and quantization_method[0] == "q8_0":
+ strength = 0
+ pass
+
+ if strength >= 3: first_conversion = "f32"
+ elif strength >= 2: first_conversion = "f16"
+ elif strength >= 1: first_conversion = "bf16"
+ else: first_conversion = "q8_0"
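+ # Worked example (added note): with first_conversion = "f16" and
+ # quantization_method = ["q4_k_m", "q8_0"], strength stays 2, so the intermediate GGUF is
+ # written once in f16 and the q4_k_m / q8_0 files are quantized from that file below.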
+
+ # Non Llama/Mistral models could historically only use f32 or f16
+ if not use_fast_convert and \
+ first_conversion not in ("f16", "bf16", "f32"):
+
+ pass
+ # Latest llama.cpp works for all models for q8_0!
+
+ # logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.")
+ # first_conversion = "f16"
+ pass
+
+ # Check if bfloat16 is supported
+ if first_conversion == "bf16" and not torch.cuda.is_bf16_supported():
+ logger.warning(
+ "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
+ "We shall switch instead to f16."
+ )
+ first_conversion = "f16"
+ pass
+
+ n_cpus = psutil.cpu_count()
+ if n_cpus is None: n_cpus = 1
+ n_cpus *= 2
+ # Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
+
+ final_location = str((Path(model_directory) / f"unsloth.{first_conversion.upper()}.gguf").absolute())
+
+ print(f"Unsloth: [1] Converting model at {model_directory} into {first_conversion} GGUF format.\n"\
+ f"The output location will be {final_location}\n"\
+ "This might take 3 minutes...")
+
+ # We first check if tokenizer.model exists in the model_directory
+ if os.path.exists(f"{model_directory}/tokenizer.model"):
+ vocab_type = "spm,hfft,bpe"
+ # Fix Sentencepiece model as well!
+ fix_sentencepiece_gguf(model_directory)
+ else:
+ vocab_type = "bpe"
+ pass
+
+ # convert.py is deprecated!
+ use_fast_convert = False
+ if use_fast_convert:
+ command = f"python llama.cpp/convert.py {model_directory} "\
+ f"--outfile {final_location} --vocab-type {vocab_type} "\
+ f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab"
+ else:
+ command = f"python {convert_location} {model_directory} "\
+ f"--outfile {final_location} "\
+ f"--outtype {first_conversion}"
+ pass
+
+ try_execute([command,], force_complete = True)
+
+ # Check if quantization succeeded!
+ if not os.path.isfile(final_location):
+ if IS_KAGGLE_ENVIRONMENT:
+ if not Path(final_location).resolve().is_relative_to(Path('/tmp').resolve()):
+ raise RuntimeError(
+ f"Unsloth: Quantization failed for {final_location}\n"\
+ "You are in a Kaggle environment, which might be the reason this is failing.\n"\
+ "Kaggle only provides 20GB of disk space in the working directory.\n"\
+ "Merging to 16bit for 7b models use 16GB of space.\n"\
+ "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
+ "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
+ "You can try saving it to the `/tmp` directory for larger disk space.\n"\
+ "I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
+ )
+ else:
+ raise RuntimeError(
+ f"Unsloth: Quantization failed for {final_location}\n"\
+ "You might have to compile llama.cpp yourself, then run this again.\n"\
+ "You do not need to close this Python program. Run the following commands in a new terminal:\n"\
+ "You must run this in the same folder as you're saving your model.\n"\
+ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
+ "cd llama.cpp && make clean && make all -j\n"\
+ "Once that's done, redo the quantization."
+ )
+ pass
+ pass
+ print(f"Unsloth: Conversion completed! Output location: {final_location}")
+
+ full_precision_location = final_location
+
+ all_saved_locations = [full_precision_location,]
+ # Convert each type!
+ for quant_method in quantization_method:
+ if quant_method != first_conversion:
+ print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This might take 20 minutes...")
+ final_location = str((Path(model_directory) / f"unsloth.{quant_method.upper()}.gguf").absolute())
+
+ command = f"./{quantize_location} {full_precision_location} "\
+ f"{final_location} {quant_method} {n_cpus}"
+
+ try_execute([command,], force_complete = True)
+
+ # Check if quantization succeeded!
+ if not os.path.isfile(final_location):
+ if IS_KAGGLE_ENVIRONMENT:
+ if not Path(final_location).resolve().is_relative_to(Path('/tmp').resolve()):
+ raise RuntimeError(
+ f"Unsloth: Quantization failed for {final_location}\n"\
+ "You are in a Kaggle environment, which might be the reason this is failing.\n"\
+ "Kaggle only provides 20GB of disk space in the working directory.\n"\
+ "Merging to 16bit for 7b models use 16GB of space.\n"\
+ "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
+ "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
+ "You can try saving it to the `/tmp` directory for larger disk space.\n"\
+ "I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
+ )
+ else:
+ raise RuntimeError(
+ "Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\
+ "You do not need to close this Python program. Run the following commands in a new terminal:\n"\
+ "You must run this in the same folder as you're saving your model.\n"\
+ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
+ "cd llama.cpp && make clean && make all -j\n"\
+ "Once that's done, redo the quantization."
+ )
+ pass
+ pass
+
+ print(f"Unsloth: Conversion completed! Output location: {final_location}")
+ all_saved_locations.append(final_location)
+ pass
+ pass
+
+ # Finally check if first_conversion (f16, bf16 etc) was in the list of actual quant methods
+ full_precision_seen = first_conversion in frozenset(quantization_method)
+
+ return all_saved_locations, full_precision_seen
+pass
+
+
+def unsloth_save_pretrained_merged(
+ self,
+ save_directory : Union[str, os.PathLike],
+ tokenizer = None,
+ save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
+ push_to_hub : bool = False,
+ token : Optional[Union[str, bool]] = None,
+ is_main_process : bool = True,
+ state_dict : Optional[dict] = None,
+ save_function : Callable = torch.save,
+ max_shard_size : Union[int, str] = "5GB",
+ safe_serialization : bool = True,
+ variant : Optional[str] = None,
+ save_peft_format : bool = True,
+ tags : List[str] = None,
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.75,
+):
+ """
+ Same as .save_pretrained(...) except 4bit weights are automatically
+ converted to float16 with as little overhead as possible.
+
+ Choose `save_method` to be one of:
+ 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
+ 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
+ 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
+ """
+ if tokenizer is None:
+ logger.warning_once(
+ "Unsloth: You're not saving a tokenizer as well?\n"\
+ "You can do it separately via `tokenizer.save_pretrained(...)`"
+ )
+ pass
+
+ arguments = dict(locals())
+ arguments["model"] = self
+ del arguments["self"]
+ unsloth_save_model(**arguments)
+ for _ in range(3):
+ gc.collect()
+pass
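+
+# Usage sketch (added illustration; assumed to be bound on patched models as
+# model.save_pretrained_merged, and the directory name is a placeholder):
+#   model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")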
+
+
+def unsloth_push_to_hub_merged(
+ self,
+ repo_id : str,
+ tokenizer = None,
+ save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
+ use_temp_dir : Optional[bool] = None,
+ commit_message : Optional[str] = "Trained with Unsloth",
+ private : Optional[bool] = None,
+ token : Union[bool, str, None] = None,
+ max_shard_size : Union[int, str, None] = "5GB",
+ create_pr : bool = False,
+ safe_serialization : bool = True,
+ revision : str = None,
+ commit_description : str = "Upload model trained with Unsloth 2x faster",
+ tags : Optional[List[str]] = None,
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.75,
+):
+ """
+ Same as .push_to_hub(...) except 4bit weights are automatically
+ converted to float16 with as little overhead as possible.
+
+ Choose `save_method` to be one of:
+ 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
+ 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
+ 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
+ """
+ if tokenizer is None:
+ logger.warning_once(
+ "Unsloth: You're not saving a tokenizer as well?\n"\
+ "You can do it separately via `tokenizer.push_to_hub(...)`"
+ )
+ pass
+
+ arguments = dict(locals())
+ arguments["model"] = self
+ arguments["save_directory"] = repo_id
+ arguments["push_to_hub"] = True
+ del arguments["self"]
+ del arguments["repo_id"]
+ unsloth_save_model(**arguments)
+ for _ in range(3):
+ gc.collect()
+pass
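+
+# Usage sketch (added illustration; assumed to be bound on patched models as
+# model.push_to_hub_merged, with placeholder repo id and token):
+#   model.push_to_hub_merged("your-username/your-model", tokenizer,
+#                            save_method = "merged_16bit", token = "hf_...")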
+
+
+MODEL_CARD = \
+"""---
+base_model: {base_model}
+tags:
+- text-generation-inference
+- transformers
+- unsloth
+- {model_type}
+- {extra}
+license: apache-2.0
+language:
+- en
+---
+
+# Uploaded {method} model
+
+- **Developed by:** {username}
+- **License:** apache-2.0
+- **Finetuned from model :** {base_model}
+
+This {model_type} model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
+
+[ ](https://github.com/unslothai/unsloth)
+"""
+
+
+def _determine_username(save_directory, old_username, token):
+ username = ""
+ save_directory = save_directory.lstrip("./")
+ if "/" not in save_directory:
+ from huggingface_hub import whoami
+ try:
+ username = whoami(token = token)["name"]
+ if type(old_username) is str and username != old_username:
+ username = old_username
+ pass
+ save_directory = f"{username}/{save_directory}"
+ except:
+ raise RuntimeError(f"Unsloth: {save_directory} is not a Huggingface directory.")
+ else:
+ username = save_directory.split("/")[0]
+ pass
+ return save_directory, username
+pass
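+
+# Example (added note): _determine_username("my-model", None, token) resolves to
+# ("<whoami-name>/my-model", "<whoami-name>"), while _determine_username("org/my-model", None, token)
+# returns ("org/my-model", "org") without calling the Hub.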
+
+
+def create_huggingface_repo(
+ model,
+ save_directory,
+ token = None,
+ private = False,
+):
+ if token is None:
+ token = get_token()
+ pass
+ save_directory, username = _determine_username(save_directory, "", token)
+
+ from huggingface_hub import create_repo
+ try:
+ create_repo(
+ repo_id = save_directory,
+ token = token,
+ repo_type = "model",
+ exist_ok = False,
+ private = private,
+ )
+
+ # Create model card
+ from huggingface_hub import ModelCard
+ content = MODEL_CARD.format(
+ username = username,
+ base_model = model.config._name_or_path,
+ model_type = model.config.model_type,
+ method = "",
+ extra = "unsloth",
+ )
+ card = ModelCard(content)
+ card.push_to_hub(save_directory, token = token)
+ except:
+ pass
+ hf_api = HfApi(token = token)
+ return save_directory, hf_api
+pass
+
+
+def upload_to_huggingface(
+ model,
+ save_directory,
+ token,
+ method,
+ extra = "",
+ file_location = None,
+ old_username = None,
+ private = None,
+ create_config = True,
+):
+ save_directory, username = _determine_username(save_directory, old_username, token)
+
+ from huggingface_hub import create_repo
+ try:
+ create_repo(
+ repo_id = save_directory,
+ token = token,
+ repo_type = "model",
+ exist_ok = False,
+ private = private,
+ )
+
+ # Create model card
+ from huggingface_hub import ModelCard
+ content = MODEL_CARD.format(
+ username = username,
+ base_model = model.config._name_or_path,
+ model_type = model.config.model_type,
+ method = "",
+ extra = extra,
+ )
+ card = ModelCard(content)
+ card.push_to_hub(save_directory, token = token)
+ except:
+ pass
+
+ if file_location is not None:
+ # Now upload file
+ hf_api = HfApi(token = token)
+
+ if "/" in file_location:
+ uploaded_location = file_location[file_location.rfind("/")+1:]
+ else:
+ uploaded_location = file_location
+ pass
+
+        # Find TensorBoard tfevents files and upload them
+ import glob
+ ftevent_files = glob.glob("*out.tfevents*", recursive = True)
+ if len(ftevent_files) > 0:
+ print("Unsloth: Uploading tensorboard files... Please wait...", file_location + "*out.tfevents*")
+ for ftevent_file in ftevent_files:
+ hf_api.upload_file(
+ path_or_fileobj = ftevent_file,
+ path_in_repo = ftevent_file.replace(file_location, ""),
+ repo_id = save_directory,
+ repo_type = "model",
+ commit_message = "(Trained with Unsloth)",
+ )
+ pass
+ pass
+
+ hf_api.upload_file(
+ path_or_fileobj = file_location,
+ path_in_repo = uploaded_location,
+ repo_id = save_directory,
+ repo_type = "model",
+ commit_message = "(Trained with Unsloth)",
+ )
+
+ # We also upload a config.json file
+ if create_config:
+ import json
+ with open("_temporary_unsloth_config.json", "w") as file:
+ json.dump({"model_type" : model.config.model_type}, file, indent = 4)
+ pass
+ hf_api.upload_file(
+ path_or_fileobj = "_temporary_unsloth_config.json",
+ path_in_repo = "config.json",
+ repo_id = save_directory,
+ repo_type = "model",
+ commit_message = "(Trained with Unsloth)",
+ )
+ os.remove("_temporary_unsloth_config.json")
+ pass
+ pass
+ return username
+pass
+
+
+def fix_tokenizer_bos_token(tokenizer):
+ # Check if BOS added already, then warn
+ fix_bos_token = False
+ chat_template = getattr(tokenizer, "chat_template", None)
+
+ if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)):
+ if chat_template is not None and \
+ (
+ tokenizer.bos_token in chat_template or \
+ "{bos_token}" in chat_template.replace(" ", "") or \
+ "{bos_token+" in chat_template.replace(" ", "")
+ ):
+
+ fix_bos_token = True
+ logger.warning(
+ "Unsloth: ##### The current model auto adds a BOS token.\n"\
+ "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily."
+ )
+
+ # Remove {{bos_token}}
+ new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template)
+ # Remove {{bos_token +
+ new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template)
+
+ tokenizer.chat_template = new_chat_template
+
+ pass
+ pass
+ return fix_bos_token, chat_template
+pass
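+# Worked example of the BOS fix above (illustrative chat template only):
+# a template such as
+#     "{{ bos_token }}{% for message in messages %}...{% endfor %}"
+# is temporarily rewritten to
+#     "{% for message in messages %}...{% endfor %}"
+# so the BOS token is not emitted twice when the tokenizer already adds one.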
+
+
+def create_ollama_modelfile(tokenizer, gguf_location):
+ """
+ Creates an Ollama Modelfile.
+ Use ollama.create(model = "new_ollama_model", modelfile = modelfile)
+ """
+ modelfile = getattr(tokenizer, "_ollama_modelfile", None)
+ if modelfile is None: return None
+
+ FILE_LOCATION_REPLACER = "⚫@✅#🦥__FILE_LOCATION__⚡@🦥#⛵"
+ EOS_TOKEN_REPLACER = "⚫@✅#🦥__EOS_TOKEN__⚡@🦥#⛵"
+ LEFT_BRACKET_REPLACER = "⚫@✅#🦥"
+ RIGHT_BRACKET_REPLACER = "⚡@🦥#⛵"
+
+ # Fixes https://github.com/unslothai/unsloth/issues/1087
+ # We must convert all {'s and }'s but keep {__FILE_LOCATION__} intact
+ modelfile = modelfile\
+ .replace("{__FILE_LOCATION__}", FILE_LOCATION_REPLACER)\
+ .replace("{__EOS_TOKEN__}", EOS_TOKEN_REPLACER)\
+ .replace("{", LEFT_BRACKET_REPLACER)\
+ .replace("}", RIGHT_BRACKET_REPLACER)
+
+ # Revert {__FILE_LOCATION__} back
+ modelfile = modelfile\
+ .replace(FILE_LOCATION_REPLACER, "{__FILE_LOCATION__}")\
+ .replace(EOS_TOKEN_REPLACER, "{__EOS_TOKEN__}")
+
+ if "__EOS_TOKEN__" in modelfile:
+ modelfile = modelfile.format(
+ __FILE_LOCATION__ = gguf_location,
+ __EOS_TOKEN__ = tokenizer.eos_token,
+ )
+ else:
+ modelfile = modelfile.format(
+ __FILE_LOCATION__ = gguf_location,
+ )
+ pass
+
+ modelfile = modelfile\
+ .replace("⚫@✅#🦥", "{")\
+ .replace("⚡@🦥#⛵", "}")\
+ .rstrip()
+
+ return modelfile
+pass
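+# Illustrative usage (a sketch: the GGUF path and Ollama model name are
+# hypothetical, and the `ollama` Python package is assumed to be installed):
+#
+#     modelfile = create_ollama_modelfile(tokenizer, "model-unsloth.Q4_K_M.gguf")
+#     if modelfile is not None:
+#         import ollama
+#         ollama.create(model = "new_ollama_model", modelfile = modelfile)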
+
+
+def unsloth_save_pretrained_gguf(
+ self,
+ save_directory : Union[str, os.PathLike],
+ tokenizer = None,
+ quantization_method : str = "fast_quantized",
+ first_conversion : str = None,
+ push_to_hub : bool = False,
+ token : Optional[Union[str, bool]] = None,
+ private : Optional[bool] = None,
+ is_main_process : bool = True,
+ state_dict : Optional[dict] = None,
+ save_function : Callable = torch.save,
+ max_shard_size : Union[int, str] = "5GB",
+ safe_serialization : bool = True,
+ variant : Optional[str] = None,
+ save_peft_format : bool = True,
+ tags : List[str] = None,
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.85,
+):
+ """
+    Same as .save_pretrained(...) except 4bit weights are automatically
+    converted to float16 and then converted to GGUF / llama.cpp format.
+
+    Choose `quantization_method` to be one of:
+ "not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
+ "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
+ "quantized" : "Recommended. Slow conversion. Fast inference, small files.",
+ "f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
+ "f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
+ "q8_0" : "Fast conversion. High resource use, but generally acceptable.",
+ "q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
+ "q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
+ "q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+ "q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+ "q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+ "q3_k_s" : "Uses Q3_K for all tensors",
+ "q4_0" : "Original quant method, 4-bit.",
+ "q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
+ "q4_k_s" : "Uses Q4_K for all tensors",
+ "q4_k" : "alias for q4_k_m",
+ "q5_k" : "alias for q5_k_m",
+ "q5_0" : "Higher accuracy, higher resource usage and slower inference.",
+ "q5_1" : "Even higher accuracy, resource usage and slower inference.",
+ "q5_k_s" : "Uses Q5_K for all tensors",
+ "q6_k" : "Uses Q8_K for all tensors",
+ "iq2_xxs" : "2.06 bpw quantization",
+ "iq2_xs" : "2.31 bpw quantization",
+ "iq3_xxs" : "3.06 bpw quantization",
+ "q3_k_xs" : "3-bit extra small quantization",
+ """
+ if tokenizer is None:
+ raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
+
+ arguments = dict(locals())
+ arguments["model"] = self
+ arguments["tokenizer"] = tokenizer
+ arguments["push_to_hub"] = False # We save ourselves
+ arguments["save_method"] = "merged_16bit" # Must be 16bit
+ del arguments["self"]
+ del arguments["quantization_method"]
+ del arguments["first_conversion"]
+
+ # Fix tokenizer adding an extra BOS token at the front
+ fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
+
+ # Non blocking install GGUF first
+ if not os.path.exists("llama.cpp"):
+
+ if IS_KAGGLE_ENVIRONMENT:
+ # Kaggle is weird - no blocking installs, and no CUDA?
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ python_install.wait()
+ install_llama_cpp_blocking(use_cuda = False)
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ makefile = None
+ else:
+ git_clone = install_llama_cpp_clone_non_blocking()
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ git_clone.wait()
+ makefile = install_llama_cpp_make_non_blocking()
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ python_install.wait()
+ pass
+ else:
+ try:
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ makefile = None
+ except:
+ # Retry by recloning llama.cpp
+ if IS_KAGGLE_ENVIRONMENT:
+ # Kaggle is weird - no blocking installs, and no CUDA?
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ python_install.wait()
+ install_llama_cpp_blocking(use_cuda = False)
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ makefile = None
+ else:
+ git_clone = install_llama_cpp_clone_non_blocking()
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ git_clone.wait()
+ makefile = install_llama_cpp_make_non_blocking()
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ python_install.wait()
+ pass
+ pass
+ pass
+
+ # Use old chat template if the bos is removed
+ if fix_bos_token:
+ tokenizer.chat_template = old_chat_template
+ pass
+
+ for _ in range(3):
+ gc.collect()
+
+ model_dtype = self.config.torch_dtype
+ model_type = self.config.model_type
+ if type(model_dtype) is str:
+ assert(model_dtype == "float16" or model_dtype == "bfloat16")
+ elif model_dtype == torch.float16:
+ model_dtype = "float16"
+ elif model_dtype == torch.bfloat16:
+ model_dtype = "bfloat16"
+ else:
+ raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
+ pass
+
+ is_sentencepiece_model = check_if_sentencepiece_model(self)
+
+ # Save to GGUF
+ all_file_locations, want_full_precision = save_to_gguf(
+ model_type, model_dtype, is_sentencepiece_model,
+ new_save_directory, quantization_method, first_conversion, makefile,
+ )
+
+ # Save Ollama modelfile
+ modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
+ modelfile_location = None
+ if modelfile is not None:
+ modelfile_location = os.path.join(new_save_directory, "Modelfile")
+ with open(modelfile_location, "w") as file:
+ file.write(modelfile)
+ pass
+ print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
+ pass
+
+ if fix_bos_token:
+ logger.warning(
+ "Unsloth: ##### The current model auto adds a BOS token.\n"\
+ "Unsloth: ##### We removed it in GGUF's chat template for you."
+ )
+ pass
+
+ if push_to_hub:
+ print("Unsloth: Uploading GGUF to Huggingface Hub...")
+
+        # If full precision is not needed, skip the first (full-precision) file
+ if not want_full_precision: all_file_locations = all_file_locations[1:]
+
+ for file_location in all_file_locations:
+ username = upload_to_huggingface(
+ self, save_directory, token,
+ "GGUF converted", "gguf", file_location, old_username, private,
+ )
+ link = f"{username}/{new_save_directory.lstrip('/.')}" \
+ if username not in new_save_directory else \
+ new_save_directory.lstrip('/.')
+ print(f"Saved GGUF to https://huggingface.co/{link}")
+ pass
+
+ # Save modelfile
+ if modelfile_location is not None:
+ username = upload_to_huggingface(
+ self, save_directory, token,
+ "GGUF converted", "gguf", modelfile_location, old_username, private,
+ )
+ print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
+ pass
+ pass
+pass
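+# Illustrative usage of the local GGUF save above (a sketch: the directory name
+# and the already-loaded `model` / `tokenizer` objects are hypothetical):
+#
+#     model.save_pretrained_gguf(
+#         "gguf_output",                     # hypothetical local directory
+#         tokenizer,
+#         quantization_method = "q4_k_m",    # any method documented in the docstring
+#     )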
+
+
+def unsloth_push_to_hub_gguf(
+ self,
+ repo_id : str,
+ tokenizer = None,
+ quantization_method : str = "fast_quantized",
+ first_conversion : str = None,
+ use_temp_dir : Optional[bool] = None,
+ commit_message : Optional[str] = "Trained with Unsloth",
+ private : Optional[bool] = None,
+ token : Union[bool, str, None] = None,
+ max_shard_size : Union[int, str, None] = "5GB",
+ create_pr : bool = False,
+ safe_serialization : bool = True,
+ revision : str = None,
+ commit_description : str = "Upload model trained with Unsloth 2x faster",
+ tags : Optional[List[str]] = None,
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.85,
+):
+ """
+    Same as .push_to_hub(...) except 4bit weights are automatically
+    converted to float16 and then converted to GGUF / llama.cpp format.
+
+    Choose `quantization_method` to be one of:
+ "not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
+ "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
+ "quantized" : "Recommended. Slow conversion. Fast inference, small files.",
+ "f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
+ "f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
+ "q8_0" : "Fast conversion. High resource use, but generally acceptable.",
+ "q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
+ "q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
+ "q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+ "q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+ "q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+ "q3_k_s" : "Uses Q3_K for all tensors",
+ "q4_0" : "Original quant method, 4-bit.",
+ "q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
+ "q4_k_s" : "Uses Q4_K for all tensors",
+ "q5_0" : "Higher accuracy, higher resource usage and slower inference.",
+ "q5_1" : "Even higher accuracy, resource usage and slower inference.",
+ "q5_k_s" : "Uses Q5_K for all tensors",
+ "q6_k" : "Uses Q8_K for all tensors",
+ """
+ if tokenizer is None:
+ raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
+
+ arguments = dict(locals())
+ arguments["model"] = self
+ arguments["tokenizer"] = tokenizer
+ arguments["save_directory"] = repo_id
+ arguments["push_to_hub"] = False # We save ourselves
+ arguments["save_method"] = "merged_16bit" # Must be 16bit
+ del arguments["self"]
+ del arguments["repo_id"]
+ del arguments["quantization_method"]
+ del arguments["first_conversion"]
+
+ # Fix tokenizer adding an extra BOS token at the front
+ fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
+
+ # Non blocking install GGUF first
+ if not os.path.exists("llama.cpp"):
+
+ if IS_KAGGLE_ENVIRONMENT:
+ # Kaggle is weird - no blocking installs, and no CUDA?
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ python_install.wait()
+ install_llama_cpp_blocking(use_cuda = False)
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ makefile = None
+ else:
+ git_clone = install_llama_cpp_clone_non_blocking()
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ git_clone.wait()
+ makefile = install_llama_cpp_make_non_blocking()
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ python_install.wait()
+ pass
+ else:
+ try:
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ makefile = None
+ except:
+ # Retry by recloning llama.cpp
+ if IS_KAGGLE_ENVIRONMENT:
+ # Kaggle is weird - no blocking installs, and no CUDA?
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ python_install.wait()
+ install_llama_cpp_blocking(use_cuda = False)
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ makefile = None
+ else:
+ git_clone = install_llama_cpp_clone_non_blocking()
+ python_install = install_python_non_blocking(["gguf", "protobuf"])
+ git_clone.wait()
+ makefile = install_llama_cpp_make_non_blocking()
+ new_save_directory, old_username = unsloth_save_model(**arguments)
+ python_install.wait()
+ pass
+ pass
+ pass
+
+ # Use old chat template if the bos is removed
+ if fix_bos_token:
+ tokenizer.chat_template = old_chat_template
+ pass
+
+ for _ in range(3):
+ gc.collect()
+
+ model_dtype = self.config.torch_dtype
+ model_type = self.config.model_type
+ if type(model_dtype) is str:
+ assert(model_dtype == "float16" or model_dtype == "bfloat16")
+ elif model_dtype == torch.float16:
+ model_dtype = "float16"
+ elif model_dtype == torch.bfloat16:
+ model_dtype = "bfloat16"
+ else:
+ raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
+ pass
+
+ is_sentencepiece_model = check_if_sentencepiece_model(self)
+
+ # Save to GGUF
+ all_file_locations, want_full_precision = save_to_gguf(
+ model_type, model_dtype, is_sentencepiece_model,
+ new_save_directory, quantization_method, first_conversion, makefile,
+ )
+
+ # Save Ollama modelfile
+ modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
+ modelfile_location = None
+ if modelfile is not None:
+ modelfile_location = os.path.join(new_save_directory, "Modelfile")
+ with open(modelfile_location, "w") as file:
+ file.write(modelfile)
+ pass
+ print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
+ pass
+
+    # If full precision is not needed, skip the first (full-precision) file
+ if not want_full_precision: all_file_locations = all_file_locations[1:]
+
+ for file_location in all_file_locations:
+ print("Unsloth: Uploading GGUF to Huggingface Hub...")
+ username = upload_to_huggingface(
+ self, repo_id, token,
+ "GGUF converted", "gguf", file_location, old_username, private,
+ )
+ link = f"{username}/{new_save_directory.lstrip('/.')}" \
+ if username not in new_save_directory else \
+ new_save_directory.lstrip('/.')
+
+ print(f"Saved GGUF to https://huggingface.co/{link}")
+ pass
+
+ # Save modelfile
+ if modelfile_location is not None:
+ username = upload_to_huggingface(
+ self, repo_id, token,
+ "GGUF converted", "gguf", modelfile_location, old_username, private,
+ )
+ print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
+ pass
+
+ if fix_bos_token:
+ logger.warning(
+ "Unsloth: ##### The current model auto adds a BOS token.\n"\
+ "Unsloth: ##### We removed it in GGUF's chat template for you."
+ )
+ pass
+pass
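+# Illustrative usage of the GGUF push above (a sketch: the repo id and token
+# are hypothetical placeholders):
+#
+#     model.push_to_hub_gguf(
+#         "your-username/finetuned-model-gguf",  # hypothetical repo id
+#         tokenizer,
+#         quantization_method = "q4_k_m",
+#         token = "hf_...",
+#     )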
+
+# Helper function to save LoRA adapters to a custom directory
+def save_lora_to_custom_dir(model, tokenizer, save_directory):
+ # Create the custom directory if it doesn't exist
+ os.makedirs(save_directory, exist_ok=True)
+
+ # Call the unsloth_save_model function with the custom directory
+ unsloth_save_model(
+ model,
+ tokenizer,
+ save_directory=save_directory,
+ save_method="lora",
+ push_to_hub=False,
+ )
+
+# Method attached to the model class to convert LoRA adapters to GGML and push them to the Hugging Face Hub
+def unsloth_convert_lora_to_ggml_and_push_to_hub(
+ self,
+ tokenizer,
+ repo_id: str,
+ use_temp_dir: Optional[bool] = None,
+ commit_message: Optional[str] = "Converted LoRA to GGML with Unsloth",
+ private: Optional[bool] = None,
+ token: Union[bool, str, None] = None,
+ create_pr: bool = False,
+ revision: str = None,
+ commit_description: str = "Convert LoRA to GGML format using Unsloth",
+ temporary_location: str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage: float = 0.85,
+):
+ if not os.path.exists("llama.cpp"):
+ if IS_KAGGLE_ENVIRONMENT:
+ python_install = install_python_non_blocking(["protobuf"])
+ python_install.wait()
+ install_llama_cpp_blocking(use_cuda=False)
+ makefile = None
+ else:
+ git_clone = install_llama_cpp_clone_non_blocking()
+ python_install = install_python_non_blocking(["protobuf"])
+ git_clone.wait()
+ makefile = install_llama_cpp_make_non_blocking()
+ python_install.wait()
+ else:
+ makefile = None
+
+ for _ in range(3):
+ gc.collect()
+
+ lora_directory_push = "lora-to-ggml-push"
+ save_lora_to_custom_dir(self, tokenizer, lora_directory_push)
+
+ model_type = self.config.model_type
+ output_file = os.path.join(lora_directory_push, "ggml-adapter-model.bin")
+
+ print(f"Unsloth: Converting auto-saved LoRA adapters at {lora_directory_push} to GGML format.")
+ print(f"The output file will be {output_file}")
+
+ command = f"python3 llama.cpp/convert-lora-to-ggml.py {lora_directory_push} {output_file} llama"
+
+ try:
+ with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
+ for line in sp.stdout:
+ print(line, end="", flush=True)
+ for line in sp.stderr:
+ print(line, end="", flush=True)
+ sp.wait()
+ if sp.returncode != 0:
+ raise subprocess.CalledProcessError(sp.returncode, command)
+ except subprocess.CalledProcessError as e:
+ print(f"Error: Conversion failed with return code {e.returncode}")
+ return
+
+ print(f"Unsloth: Conversion completed! Output file: {output_file}")
+
+ print("Unsloth: Uploading GGML file to Hugging Face Hub...")
+ username = upload_to_huggingface(
+ self, repo_id, token,
+ "GGML converted LoRA", "ggml", output_file, None, private,
+ )
+ link = f"{repo_id.lstrip('/')}"
+ print("Unsloth: Done.")
+ print(f"Converted LoRA to GGML and uploaded to https://huggingface.co/{link}")
+ print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
+
+def unsloth_convert_lora_to_ggml_and_save_locally(
+ self,
+ save_directory: str, # Added parameter for the folder name
+ tokenizer,
+ temporary_location: str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage: float = 0.85,
+):
+ if not os.path.exists("llama.cpp"):
+ if IS_KAGGLE_ENVIRONMENT:
+ python_install = install_python_non_blocking(["protobuf"])
+ python_install.wait()
+ install_llama_cpp_blocking(use_cuda=False)
+ makefile = None
+ else:
+ git_clone = install_llama_cpp_clone_non_blocking()
+ python_install = install_python_non_blocking(["protobuf"])
+ git_clone.wait()
+ makefile = install_llama_cpp_make_non_blocking()
+ python_install.wait()
+ else:
+ makefile = None
+
+ for _ in range(3):
+ gc.collect()
+
+ # Use the provided save_directory for local saving
+ save_lora_to_custom_dir(self, tokenizer, save_directory)
+
+ model_type = self.config.model_type
+ output_file = os.path.join(save_directory, "ggml-adapter-model.bin")
+
+ print(f"Unsloth: Converting auto-saved LoRA adapters at {save_directory} to GGML format.")
+ print(f"The output file will be {output_file}")
+
+ command = f"python3 llama.cpp/convert-lora-to-ggml.py {save_directory} {output_file} llama"
+
+ try:
+ with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
+ for line in sp.stdout:
+ print(line, end="", flush=True)
+ for line in sp.stderr:
+ print(line, end="", flush=True)
+ sp.wait()
+ if sp.returncode != 0:
+ raise subprocess.CalledProcessError(sp.returncode, command)
+ except subprocess.CalledProcessError as e:
+ print(f"Error: Conversion failed with return code {e.returncode}")
+ return
+ print("Unsloth: Done.")
+ print(f"Unsloth: Conversion completed! Output file: {output_file}")
+ print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
+pass
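+# Illustrative usage of the two GGML conversion helpers above (a sketch: the
+# output directory, repo id, and token are hypothetical placeholders):
+#
+#     model.save_pretrained_ggml("ggml_lora_output", tokenizer)
+#     model.push_to_hub_ggml(tokenizer, "your-username/lora-ggml", token = "hf_...")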
+
+
+from .models.loader_utils import get_model_name
+from unsloth_zoo.saving_utils import merge_and_overwrite_lora
+
+@torch.inference_mode
+def unsloth_generic_save(
+ model,
+ tokenizer,
+ save_directory : Union[str, os.PathLike] = "unsloth_finetuned_merge",
+ save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
+ push_to_hub : bool = False,
+ token : Optional[Union[str, bool]] = None,
+ is_main_process : bool = True,
+ state_dict : Optional[dict] = None,
+ save_function : Callable = torch.save,
+ max_shard_size : Union[int, str] = "5GB",
+ safe_serialization : bool = True,
+ variant : Optional[str] = None,
+ save_peft_format : bool = True,
+
+ # Push to hub
+ use_temp_dir : Optional[bool] = None,
+ commit_message : Optional[str] = "Trained with Unsloth",
+ private : Optional[bool] = None,
+ create_pr : bool = False,
+ revision : str = None,
+ commit_description : str = "Upload model trained with Unsloth 2x faster",
+ tags : List[str] = None,
+
+ # Our functions
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.9,
+):
+ if token is None and push_to_hub: token = get_token()
+ merge_and_overwrite_lora(
+ get_model_name,
+ model = model,
+ tokenizer = tokenizer,
+ save_directory = save_directory,
+ push_to_hub = push_to_hub,
+ private = private,
+ token = token,
+ output_dtype = None,
+ low_disk_space_usage = False,
+ use_temp_file = False,
+ )
+ return
+pass
+
+
+def unsloth_generic_save_pretrained_merged(
+ self,
+ save_directory : Union[str, os.PathLike],
+ tokenizer = None,
+ save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
+ push_to_hub : bool = False,
+ token : Optional[Union[str, bool]] = None,
+ is_main_process : bool = True,
+ state_dict : Optional[dict] = None,
+ save_function : Callable = torch.save,
+ max_shard_size : Union[int, str] = "5GB",
+ safe_serialization : bool = True,
+ variant : Optional[str] = None,
+ save_peft_format : bool = True,
+ tags : List[str] = None,
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.75,
+):
+ """
+    Same as .save_pretrained(...) except 4bit weights are automatically
+    converted to float16 with as little overhead as possible.
+
+    Choose `save_method` to be one of:
+ 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
+ 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
+ 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
+ """
+ if tokenizer is None:
+ logger.warning_once(
+ "Unsloth: You're not saving a tokenizer as well?\n"\
+ "You can do it separately via `tokenizer.save_pretrained(...)`"
+ )
+ pass
+
+ arguments = dict(locals())
+ arguments["model"] = self
+ del arguments["self"]
+ unsloth_generic_save(**arguments)
+ for _ in range(3):
+ gc.collect()
+pass
+
+
+def unsloth_generic_push_to_hub_merged(
+ self,
+ repo_id : str,
+ tokenizer = None,
+ save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
+ use_temp_dir : Optional[bool] = None,
+ commit_message : Optional[str] = "Trained with Unsloth",
+ private : Optional[bool] = None,
+ token : Union[bool, str, None] = None,
+ max_shard_size : Union[int, str, None] = "5GB",
+ create_pr : bool = False,
+ safe_serialization : bool = True,
+ revision : str = None,
+ commit_description : str = "Upload model trained with Unsloth 2x faster",
+ tags : Optional[List[str]] = None,
+ temporary_location : str = "_unsloth_temporary_saved_buffers",
+ maximum_memory_usage : float = 0.75,
+):
+ """
+    Same as .push_to_hub(...) except 4bit weights are automatically
+    converted to float16 with as little overhead as possible.
+
+    Choose `save_method` to be one of:
+ 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
+ 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
+ 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
+ """
+ if tokenizer is None:
+ logger.warning_once(
+ "Unsloth: You're not saving a tokenizer as well?\n"\
+ "You can do it separately via `tokenizer.push_to_hub(...)`"
+ )
+ pass
+
+ arguments = dict(locals())
+ arguments["model"] = self
+ arguments["save_directory"] = repo_id
+ arguments["push_to_hub"] = True
+ del arguments["self"]
+ del arguments["repo_id"]
+ unsloth_generic_save(**arguments)
+ for _ in range(3):
+ gc.collect()
+pass
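+# Illustrative usage of the generic merged save / push (used for e.g. vision
+# models; the paths, repo id, and token below are hypothetical placeholders):
+#
+#     model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")
+#     model.push_to_hub_merged("your-username/merged-model", tokenizer, token = "hf_...")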
+
+
+def not_implemented_save(*args, **kwargs):
+ raise NotImplementedError("Unsloth: Sorry GGUF is currently not supported for vision models!")
+pass
+
+
+def patch_saving_functions(model, vision = False):
+ import inspect
+ import types
+ from typing import Callable, Optional, Union, List
+
+ # And now re add our saving methods!
+ if model.push_to_hub.__name__ == "unsloth_push_to_hub":
+ original_push_to_hub = model.original_push_to_hub
+ else:
+ original_push_to_hub = model.push_to_hub
+ pass
+
+ signature = str(inspect.signature(original_push_to_hub)).replace("NoneType", "None")
+ signature = signature[1:]
+ signature = re.sub("", "torch.save", signature)
+ docs = original_push_to_hub.__doc__.encode("utf-8").decode("utf-8")
+
+ push_to_hub_text = f'''def unsloth_push_to_hub(self, {signature}:
+ """
+ {docs}
+ """
+ arguments = dict(locals())
+ del arguments["self"]
+ if "tags" in arguments and arguments["tags"] is not None:
+ assert(isinstance(arguments["tags"], (list, tuple)))
+ arguments["tags"] = list(arguments["tags"]) + ["unsloth",]
+ elif "tags" in arguments:
+ arguments["tags"] = ["unsloth",]
+ elif hasattr(self, "add_model_tags"):
+ self.add_model_tags(["unsloth",])
+
+ if "commit_message" in arguments:
+ commit_message = arguments["commit_message"]
+ if commit_message is not None:
+ if not commit_message.endswith(" "): commit_message += " "
+ if "Unsloth" not in commit_message:
+ commit_message += "(Trained with Unsloth)"
+ else:
+ commit_message = "Upload model trained with Unsloth"
+ arguments["commit_message"] = commit_message
+
+ if "commit_description" in arguments:
+ commit_description = arguments["commit_description"]
+ if commit_description is not None:
+ if not commit_description.endswith(" "): commit_description += " "
+ if "Unsloth" not in commit_description:
+ commit_description += "(Trained with Unsloth 2x faster)"
+ else:
+ commit_description = "Upload model trained with Unsloth 2x faster"
+ arguments["commit_description"] = commit_description
+
+ # Update model tag
+ if hasattr(self, "config"):
+ _ = upload_to_huggingface(
+ self, arguments["repo_id"], arguments["token"],
+ "finetuned", "trl", file_location = None,
+ old_username = None, private = arguments["private"],
+ )
+ pass
+
+ try:
+ self.original_push_to_hub(**arguments)
+ except:
+ del arguments["tags"]
+ self.original_push_to_hub(**arguments)
+ pass
+
+ if hasattr(self, "config"):
+ print("Saved model to https://huggingface.co/" + arguments["repo_id"])
+ pass
+ '''
+ exec(push_to_hub_text, globals())
+
+ original_model = model
+ while True:
+
+ if original_model.push_to_hub.__name__ != "unsloth_push_to_hub":
+ original_model.original_push_to_hub = original_model.push_to_hub
+ original_model.push_to_hub = types.MethodType(unsloth_push_to_hub, original_model)
+ if hasattr(original_model, "add_model_tags"):
+ original_model.add_model_tags(["unsloth",])
+ pass
+ pass
+
+ if hasattr(original_model, "model"): original_model = original_model.model
+ else: break
+ pass
+
+ # Add saving methods to top level model
+ if not vision:
+ if hasattr(model, "config"):
+ # Counteract tokenizers
+ model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model)
+ model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model)
+ model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model)
+ model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model)
+ model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model)
+ model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model)
+ pass
+ else:
+ # Vision only 1 option
+ model.push_to_hub_merged = types.MethodType(unsloth_generic_push_to_hub_merged, model)
+ model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model)
+ model.push_to_hub_gguf = types.MethodType(not_implemented_save, model)
+ model.save_pretrained_gguf = types.MethodType(not_implemented_save, model)
+ pass
+ return model
+pass
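+# After patching, the model exposes the Unsloth saving methods attached above
+# (a sketch; `model` and `tokenizer` are hypothetical, already-loaded objects):
+#
+#     model = patch_saving_functions(model)
+#     model.save_pretrained_merged("output_dir", tokenizer, save_method = "lora")
+#     model.push_to_hub_gguf("your-username/model-gguf", tokenizer, token = "hf_...")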
diff --git a/unsloth-main/unsloth/tokenizer_utils.py b/unsloth-main/unsloth/tokenizer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..384b4bbca56f81eaa1a7eba9f03ce9b810abd606
--- /dev/null
+++ b/unsloth-main/unsloth/tokenizer_utils.py
@@ -0,0 +1,1061 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import AutoTokenizer
+from transformers.convert_slow_tokenizer import convert_slow_tokenizer
+from transformers import PreTrainedTokenizerFast
+import re
+import os
+from transformers.models.llama.modeling_llama import logger
+from peft import PeftModelForCausalLM
+import torch
+import itertools
+import collections
+import numpy as np
+import gc
+import subprocess
+
+from unsloth_zoo.tokenizer_utils import (
+ mean_of_trained_tokens,
+ add_new_tokens,
+ fix_untrained_tokens,
+)
+from unsloth_zoo.training_utils import (
+ fix_zero_training_loss,
+)
+
+__all__ = [
+ "load_correct_tokenizer",
+ "fix_sentencepiece_tokenizer",
+ "check_tokenizer",
+ "add_new_tokens",
+ "fix_sentencepiece_gguf",
+]
+
+
+IGNORED_TOKENIZER_CHECKING = frozenset((
+ "CodeLlamaTokenizerFast",
+ "CodeLlamaTokenizer",
+))
+
+
+IGNORED_TOKENIZER_NAMES = [
+ # Qwen Coder did not train on tool calling. Math did!
+ "unsloth/Qwen2.5-Coder-1.5B-Instruct",
+ "unsloth/Qwen2.5-Coder-7B-Instruct",
+]
+IGNORED_TOKENIZER_NAMES = frozenset(
+ [x.lower() for x in IGNORED_TOKENIZER_NAMES] + \
+ [x.lower()+"-bnb-4bit" for x in IGNORED_TOKENIZER_NAMES]
+)
+
+# Check environments
+keynames = "\n" + "\n".join(os.environ.keys())
+IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames
+IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames
+KAGGLE_TMP = "/tmp"
+del keynames
+
+
+def try_fix_tokenizer(tokenizer, prepend = True):
+
+ if hasattr(tokenizer, "_tokenizer"):
+ converted_tokenizer = tokenizer._tokenizer
+ else:
+ converted_tokenizer = convert_slow_tokenizer(tokenizer)
+ pass
+
+ tokenizer_string = converted_tokenizer.to_str()
+
+    # Llama prepends ▁ (so "apple" becomes "▁apple"). Sometimes this is wrong!!
+ prepend_text = '{"type":"Prepend","prepend":"▁"},'
+ if not prepend and prepend_text in tokenizer_string:
+ tokenizer_string = tokenizer_string.replace(prepend_text, "", 1)
+ pass
+
+ dir_names = dir(tokenizer)
+ # Get eos_token, bos_token etc
+ token_names = [x for x in dir_names if x.endswith("_token") and x.count("_") == 1]
+
+ for token_name in token_names:
+ token = getattr(tokenizer, token_name, None)
+ if token is None: continue
+ token_id = getattr(tokenizer, token_name + "_id", None)
+
+ # Locate the token's id mapping in the string
+ find_text = f'"id":{token_id},"content":"'
+ start = tokenizer_string.find(find_text) + len(find_text)
+ if start == -1: continue
+ end = tokenizer_string.find('",', start)
+
+ bad_token = tokenizer_string[start : end]
+ # Check if token is the actual same one - if not, edit it
+ if bad_token != token:
+ bad_text = f'{find_text}{bad_token}",'
+ good_text = f'{find_text}{token}",'
+ tokenizer_string = tokenizer_string.replace(bad_text, good_text, 1)
+
+ # And replace vocab section
+ bad_text = f'"{bad_token}":{token_id},'
+ good_text = f'"{token}":{token_id},'
+ tokenizer_string = tokenizer_string.replace(bad_text, good_text, 1)
+ pass
+ pass
+
+ fixed_tokenizer = converted_tokenizer.from_str(tokenizer_string)
+ return fixed_tokenizer
+pass
+
+
+def get_sorted_dict(dictionary):
+ sorted_keys = sorted(dictionary.values())
+ inverted_dictionary = { value : key for key, value in dictionary.items() }
+
+ sorted_dictionary = {}
+ for key in sorted_keys:
+ value = inverted_dictionary[key]
+ sorted_dictionary[value] = key
+ return sorted_dictionary
+pass
+
+
+def convert_to_fast_tokenizer(
+ slow_tokenizer,
+ temporary_location = "_unsloth_sentencepiece_temp",
+):
+ is_fast = getattr(slow_tokenizer, "is_fast", False)
+ if is_fast: return slow_tokenizer
+
+ try:
+ tokenizer_name = slow_tokenizer.__class__.__name__
+ lowered_tokenizer_name = tokenizer_name.lower()
+ if lowered_tokenizer_name.endswith("tokenizer"):
+ class_name = lowered_tokenizer_name[:-len("tokenizer")]
+ FastTokenizer = eval(
+ f'__import__(f"transformers.models.{class_name}").{tokenizer_name}Fast'
+ )
+ else:
+ FastTokenizer = PreTrainedTokenizerFast
+ except:
+ FastTokenizer = PreTrainedTokenizerFast
+ pass
+
+ # Get all arguments (bos_token, etc)
+ docs = FastTokenizer.__doc__
+ docs = docs[docs.find("Args:"):]
+ args = re.findall(r"\n[\s]+([^\s]{1,}) \(", docs, flags = re.MULTILINE)
+ args = [x for x in args if not x.endswith("_file")]
+
+ # Also some missing maybe!
+ docs = PreTrainedTokenizerFast.__doc__
+ docs = docs[docs.find("Args:"):]
+ args2 = re.findall(r"\n[\s]+([^\s]{1,}) \(", docs, flags = re.MULTILINE)
+ args2 = [x for x in args2 if not x.endswith("_file")]
+ args = list(set(args + args2))
+
+ kwargs = {}
+ for arg in args: kwargs[arg] = getattr(slow_tokenizer, arg, None)
+ kwargs["tokenizer_object"] = try_fix_tokenizer(slow_tokenizer, prepend = True)
+ fast_tokenizer = FastTokenizer( **kwargs )
+
+ # Check if they're similar!
+ sorted_slow_tokenizer = get_sorted_dict(slow_tokenizer.get_vocab())
+ sorted_fast_tokenizer = get_sorted_dict(fast_tokenizer.get_vocab())
+
+ check_vocab = (sorted_slow_tokenizer == sorted_fast_tokenizer)
+ check_special = (slow_tokenizer.all_special_tokens == fast_tokenizer.all_special_tokens)
+
+ # Failure so return slow_tokenizer
+ if not check_vocab or not check_special: return slow_tokenizer
+
+ # Now confirm if they match
+ if not assert_same_tokenization(slow_tokenizer, fast_tokenizer):
+        # Maybe remove the ▁ prepend (e.g. "▁apple")?
+ kwargs["tokenizer_object"] = try_fix_tokenizer(slow_tokenizer, prepend = False)
+ fast_tokenizer = FastTokenizer( **kwargs )
+ if not assert_same_tokenization(slow_tokenizer, fast_tokenizer):
+ # Failure :(
+ return slow_tokenizer
+ pass
+ pass
+
+ # Also tokenizer.model is missing!
+ name = slow_tokenizer.name_or_path.replace("/", "_")
+ if not os.path.exists(temporary_location):
+ os.makedirs(temporary_location)
+ pass
+ new_location = f"{temporary_location}/{name}"
+ slow_tokenizer.save_pretrained(new_location)
+ fast_tokenizer.save_pretrained(new_location)
+
+ # Now load it!
+ fast_tokenizer = AutoTokenizer.from_pretrained(new_location)
+ if assert_same_tokenization(slow_tokenizer, fast_tokenizer):
+ return fast_tokenizer
+ return slow_tokenizer
+pass
+
+
+# Check Mistral chat template without BOS / EOS
+mistral_template = \
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{% if messages[1]['role'] == 'user' %}"\
+ "{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[2:] %}"\
+ "{% else %}"\
+ "{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% endif %}"\
+ "{% else %}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '[INST] ' + message['content'] + ' [/INST]' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ message['content'] }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"
+pass
+
+# Check Llama chat template without BOS / EOS
+llama_template = \
+ "{% if messages[0]['role'] == 'system' %}"\
+ "{% if messages[1]['role'] == 'user' %}"\
+ "{{ '[INST] <>\n' + messages[0]['content'] + '\n< >\n\n' + messages[1]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[2:] %}"\
+ "{% else %}"\
+ "{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
+ "{% set loop_messages = messages[1:] %}"\
+ "{% endif %}"\
+ "{% else %}"\
+ "{% set loop_messages = messages %}"\
+ "{% endif %}"\
+ "{% for message in loop_messages %}"\
+ "{% if message['role'] == 'user' %}"\
+ "{{ '[INST] ' + message['content'].strip() + ' [/INST]' }}"\
+ "{% elif message['role'] == 'assistant' %}"\
+ "{{ ' ' + message['content'].strip() + ' ' }}"\
+ "{% else %}"\
+ "{{ raise_exception('Only user and assistant roles are supported!') }}"\
+ "{% endif %}"\
+ "{% endfor %}"
+pass
+
+
+def assert_same_tokenization(slow_tokenizer, fast_tokenizer):
+ # Get eos_token, bos_token etc
+ dir_names = dir(slow_tokenizer)
+ special_tokens = list(filter(None, (
+ getattr(slow_tokenizer, x) for x in dir_names
+ if x.endswith("_token") and x.count("_") == 1
+ )))
+ all_special_tokens = list(set(special_tokens + slow_tokenizer.all_special_tokens))
+
+ # Check if chat template is enabled!
+ check_chat_template1 = True
+ check_chat_template2 = True
+ check_chat_template3 = True
+
+ """
+    Weirdly, Mistral tokenizers are actually correct??
+    I.e. the checks below would actually load Mistral v1 and v3 incorrectly!
+
+ slow_chat_template = getattr(slow_tokenizer, "chat_template", None)
+ fast_chat_template = getattr(fast_tokenizer, "chat_template", None)
+ messages = [
+ {"role": "user", "content": " What is 2+2? "},
+ {"role": "assistant", "content": " It's 4. "},
+ ]
+ # Check the tokenizer's own chat template
+ if slow_chat_template is not None and fast_chat_template is not None:
+ check_chat_template1 = \
+ slow_tokenizer.apply_chat_template(messages) == \
+ fast_tokenizer.apply_chat_template(messages)
+ pass
+
+ # Check Mistral chat template without BOS / EOS
+ slow_tokenizer.chat_template = mistral_template
+ fast_tokenizer.chat_template = mistral_template
+ check_chat_template2 = \
+ slow_tokenizer.apply_chat_template(messages) == \
+ fast_tokenizer.apply_chat_template(messages)
+ pass
+
+ # Check Llama chat template without BOS / EOS
+ slow_tokenizer.chat_template = llama_template
+ fast_tokenizer.chat_template = llama_template
+ check_chat_template3 = \
+ slow_tokenizer.apply_chat_template(messages) == \
+ fast_tokenizer.apply_chat_template(messages)
+ pass
+
+ # Combine them all and revert chat templates
+ slow_tokenizer.chat_template = slow_chat_template
+ fast_tokenizer.chat_template = fast_chat_template
+ """
+ check_chat_template = check_chat_template1 and check_chat_template2 and check_chat_template3
+
+ # Try special tokens
+ try:
+ string = "\n".join(all_special_tokens) + \
+ "A quick brown fox jumps over the lazy dog!!\n\nHi\n\n" + \
+ "".join(all_special_tokens)
+ check_special_tokens = \
+ slow_tokenizer(string).input_ids == \
+ fast_tokenizer(string).input_ids
+
+ return check_chat_template and check_special_tokens
+ except:
+ # For eg see https://github.com/unslothai/unsloth/issues/292
+ # Sometimes tokenizer has weird tokens, causing a combined tokenization to fail.
+ # [TODO] We temporarily disable this for CodeLlama tokenizers
+ if slow_tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING:
+ return check_chat_template
+ else:
+ return False
+ pass
+pass
+
+
+def fix_sentencepiece_tokenizer(
+ old_tokenizer,
+ new_tokenizer,
+ token_mapping,
+ temporary_location = "_unsloth_sentencepiece_temp",
+):
+ # From https://github.com/google/sentencepiece/issues/121
+ # We need to manually edit the sentencepiece tokenizer!
+ from transformers.utils import sentencepiece_model_pb2
+
+ if not os.path.exists(temporary_location):
+ os.makedirs(temporary_location)
+ pass
+
+ # Check if tokenizer.model exists
+ if not os.path.isfile(f"{temporary_location}/tokenizer.model"):
+ return new_tokenizer
+ pass
+
+ # First save the old tokenizer
+ old_tokenizer.save_pretrained(temporary_location)
+
+ tokenizer_file = sentencepiece_model_pb2.ModelProto()
+ tokenizer_file.ParseFromString(open(f"{temporary_location}/tokenizer.model", "rb").read())
+
+ # Now save the new tokenizer
+ new_tokenizer.save_pretrained(temporary_location)
+
+ # Now correct the old tokenizer's .model file
+ for old_token, new_token in token_mapping.items():
+ ids = old_tokenizer([old_token], add_special_tokens = False).input_ids
+ ids = ids[0]
+ if (len(ids) != 1):
+ # Skip this token!
+ print(f"Skip mapping {old_token} to {new_token} since {new_token} is already in the tokenizer!")
+ continue
+ pass
+ ids = ids[0]
+ # [TODO] Hack for Starling - try except
+ try:
+ tokenizer_piece = tokenizer_file.pieces[ids]
+ except:
+ continue
+ assert(tokenizer_piece.piece == old_token)
+ tokenizer_piece.piece = new_token
+ pass
+
+ # And now write it
+ with open(f"{temporary_location}/tokenizer.model", "wb") as file:
+ file.write(tokenizer_file.SerializeToString())
+ pass
+
+ # And load it!
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(
+ temporary_location,
+ eos_token = new_tokenizer.eos_token,
+ pad_token = new_tokenizer.pad_token,
+ )
+ return tokenizer
+pass
+
+
+def fix_sentencepiece_gguf(saved_location):
+ """
+ Fixes sentencepiece tokenizers which did not extend the vocabulary with
+ user defined tokens.
+ Inspiration from https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py
+ """
+ from copy import deepcopy
+ from transformers.utils import sentencepiece_model_pb2
+ import json
+ from enum import IntEnum
+
+ class SentencePieceTokenTypes(IntEnum):
+ NORMAL = 1
+ UNKNOWN = 2
+ CONTROL = 3
+ USER_DEFINED = 4
+ UNUSED = 5
+ BYTE = 6
+ pass
+
+ # Load tokenizer.model
+ tokenizer_file = sentencepiece_model_pb2.ModelProto()
+ if not os.path.isfile(f"{saved_location}/tokenizer.model"): return
+ tokenizer_file.ParseFromString(open(f"{saved_location}/tokenizer.model", "rb").read())
+ sentence_piece_size = len(tokenizer_file.pieces)
+
+ # Load added_tokens_json
+ if not os.path.isfile(f"{saved_location}/added_tokens.json"): return
+ with open(f"{saved_location}/added_tokens.json", "r", encoding = "utf-8") as file:
+ added_tokens_json = json.load(file)
+ pass
+ if len(added_tokens_json) == 0: return
+
+ added_tokens_json = dict(sorted(added_tokens_json.items(), key = lambda item: item[1]))
+ new_size = sentence_piece_size + len(added_tokens_json)
+
+ # Confirm added_tokens_json is correct
+ added_tokens_ids = np.array(list(added_tokens_json.values()))
+ diff = np.diff(added_tokens_ids)
+ if (diff.min() != 1 or diff.max() != 1): return
+ if (added_tokens_ids.min() != sentence_piece_size): return
+
+ # Edit sentence piece tokens with added_tokens_json
+ logger.warning(
+ f"Unsloth: Extending {saved_location}/tokenizer.model with added_tokens.json.\n"\
+ f"Originally tokenizer.model is of size ({sentence_piece_size}).\n"\
+ f"But we need to extend to sentencepiece vocab size ({new_size})."
+ )
+ new_tokens = deepcopy(tokenizer_file.pieces[-len(added_tokens_ids):])
+ for new_token, added_token in zip(new_tokens, added_tokens_json.keys()):
+ new_token.piece = added_token.encode("utf-8")
+ new_token.score = -1000.0
+ new_token.type = SentencePieceTokenTypes.USER_DEFINED
+ pass
+
+ tokenizer_file.pieces.extend(new_tokens)
+
+ with open(f"{saved_location}/tokenizer.model", "wb") as file:
+ file.write(tokenizer_file.SerializeToString())
+ pass
+
+ # Add padding tokens
+ # actual_vocab_size = model.config.vocab_size
+ # padding = actual_vocab_size - len(tokenizer_file.pieces)
+ return
+pass
+
+
+def _load_correct_tokenizer(
+ tokenizer_name,
+ model_max_length = None,
+ padding_side = "right",
+ token = None,
+ trust_remote_code = False,
+ cache_dir = "huggingface_tokenizers_cache",
+ fix_tokenizer = True,
+):
+ if IS_COLAB_ENVIRONMENT:
+ cache_dir = cache_dir
+ elif IS_KAGGLE_ENVIRONMENT:
+        # Kaggle's /tmp seems to have an 80GB limit!
+        # Let's utilize it
+ cache_dir = os.path.join(KAGGLE_TMP, cache_dir)
+ else:
+ cache_dir = None
+ pass
+
+ # Try loading the slow tokenizer. If it fails, then try Fast only
+ # Mainly to solve Deepseek models with no tokenizer.model file
+ slow_tokenizer = None
+ try:
+ slow_tokenizer = AutoTokenizer.from_pretrained(
+ tokenizer_name,
+ model_max_length = model_max_length,
+ padding_side = padding_side,
+ token = token,
+ trust_remote_code = trust_remote_code,
+ # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373
+ use_fast = False,
+ legacy = False,
+ from_slow = True,
+ cache_dir = cache_dir,
+ )
+ except:
+ pass
+ # print(
+ # f"Unsloth: {tokenizer_name} has no tokenizer.model file.\n"\
+ # "Just informing you about this - this is not a critical error."
+ # )
+ pass
+
+ fast_tokenizer = AutoTokenizer.from_pretrained(
+ tokenizer_name,
+ model_max_length = model_max_length,
+ padding_side = padding_side,
+ token = token,
+ trust_remote_code = trust_remote_code,
+ cache_dir = cache_dir,
+ )
+
+ if not fix_tokenizer or tokenizer_name in IGNORED_TOKENIZER_NAMES:
+ return fast_tokenizer
+ # Ignore Mistral ones - they're a bit weird to handle!
+ elif "mistral" in tokenizer_name.lower():
+ return fast_tokenizer
+ elif slow_tokenizer is not None:
+ if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"):
+ fast_tokenizer.add_bos_token = slow_tokenizer.add_bos_token
+ if hasattr(fast_tokenizer, "add_eos_token") and hasattr(slow_tokenizer, "add_eos_token"):
+ fast_tokenizer.add_eos_token = slow_tokenizer.add_eos_token
+
+ # Confirm if slow and fast are equivalent!
+ if assert_same_tokenization(slow_tokenizer, fast_tokenizer):
+ return fast_tokenizer
+ else:
+ logger.warning(f"Unsloth: Will load {tokenizer_name} as a legacy tokenizer.")
+ return convert_to_fast_tokenizer(slow_tokenizer)
+ pass
+ else:
+ return fast_tokenizer
+ pass
+pass
+
+
+def load_correct_tokenizer(
+ tokenizer_name,
+ model_max_length = None,
+ padding_side = "right",
+ token = None,
+ trust_remote_code = False,
+ cache_dir = "huggingface_tokenizers_cache",
+ fix_tokenizer = True,
+):
+ tokenizer = _load_correct_tokenizer(
+ tokenizer_name = tokenizer_name,
+ model_max_length = model_max_length,
+ padding_side = padding_side,
+ token = token,
+ trust_remote_code = trust_remote_code,
+ cache_dir = cache_dir,
+ fix_tokenizer = fix_tokenizer,
+ )
+
+ ### 1. Fixup tokenizer's chat_template
+ old_chat_template = getattr(tokenizer, "chat_template", None)
+
+    # Ignore Mistral type models since they don't have an add_generation_prompt
+ if "mistral" in str(getattr(tokenizer, "name_or_path", "")).lower():
+ chat_template = old_chat_template
+
+ # Also check Llama-2 old style models
+ elif old_chat_template is not None and \
+ "[/INST]" in old_chat_template and "[INST]" in old_chat_template and \
+ "bos_token" in old_chat_template and "eos_token" in old_chat_template:
+
+ chat_template = old_chat_template
+
+ else:
+ chat_template = fix_chat_template(tokenizer)
+ if old_chat_template is not None and chat_template is None:
+ raise RuntimeError(
+ "Unsloth: Fixing chat template failed - please file a report immediately!"
+ )
+ pass
+ pass
+
+ tokenizer.chat_template = chat_template
+ return tokenizer
+pass
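+# Illustrative usage (a sketch; the model name is one already referenced in this
+# file, and a token may be needed for gated or private repositories):
+#
+#     tokenizer = load_correct_tokenizer(
+#         "unsloth/llama-2-7b-bnb-4bit",
+#         model_max_length = 4096,
+#         padding_side = "right",
+#     )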
+
+
+def _find_end_position(template, endfor, endif):
+ where_endfor = template.find(endfor)
+ where_endif = template.find(endif)
+ if where_endfor == where_endif == -1:
+ return None
+ elif where_endfor > where_endif:
+ return endfor
+ else:
+ return endif
+ pass
+pass
+
+
+def _fix_chat_template(chat_template):
+ endfor = "{% endfor %}"
+ endif = "{% endif %}"
+ chosen_end = _find_end_position(chat_template, endfor, endif)
+ if chosen_end is None:
+ endfor = "{%- endfor %}"
+ endif = "{%- endif %}"
+ chosen_end = _find_end_position(chat_template, endfor, endif)
+ if chosen_end is None:
+ return chat_template
+
+ where = chat_template.find(chosen_end)
+
+ after_endfor = chat_template[where + len(chosen_end):]
+
+ dash = "-" if chosen_end.startswith("{%-") else ""
+
+ if "{%" + dash + " if" not in after_endfor and "{%" + dash + " set " not in after_endfor and \
+ after_endfor.startswith("{{") and after_endfor.endswith("}}") and \
+ after_endfor.count("{{") == 1 and after_endfor.count("}}") == 1:
+
+ after_endfor = "{%" + dash + " if add_generation_prompt %}" + after_endfor + endif
+
+ chat_template = chat_template[:where + len(chosen_end)] + after_endfor
+ pass
+ return chat_template
+pass
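+# Worked example of the fix above (illustrative template only): a template
+# ending in
+#     "{% endfor %}{{ '### Response:\n' }}"
+# becomes
+#     "{% endfor %}{% if add_generation_prompt %}{{ '### Response:\n' }}{% endif %}"
+# so the trailing generation prompt is only emitted when add_generation_prompt
+# is requested.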
+
+
+def fix_chat_template(tokenizer):
+ chat_template = getattr(tokenizer, "chat_template", None)
+ if chat_template is None: return None
+
+ ### 1. Check if add_generation_prompt works
+ # Check for ShareGPT style first
+ is_sharegpt = None
+ try:
+ messages = [
+ {"role": "user", "content": "Who are you?"},
+ ]
+ tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False)
+ is_sharegpt = False
+ except:
+ try:
+ messages = [
+ {"from": "human", "value": "Who are you?"},
+ ]
+ tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False)
+ is_sharegpt = True
+ except:
+ is_sharegpt = None
+ pass
+ pass
+
+ # Not ShareGPT or HF style - just return
+ if is_sharegpt is None: return chat_template
+
+ # Tokenize
+ messages = [
+ {"role": "user", "content": "Who are you?"} \
+ if not is_sharegpt else \
+ {"from": "human", "value": "Who are you?"}
+ ]
+ no = tokenizer.apply_chat_template(messages, add_generation_prompt = False, tokenize = False)
+ yes = tokenizer.apply_chat_template(messages, add_generation_prompt = True, tokenize = False)
+
+ if no == yes:
+ # SAME?! That's not good! We check for add_generation_prompt
+ if "{% if add_generation_prompt %}" not in chat_template and \
+ "{%- if add_generation_prompt %}" not in chat_template:
+ # Try fixing it by adding it
+ new_chat_template = _fix_chat_template(chat_template)
+ if "{% if add_generation_prompt %}" not in new_chat_template and \
+ "{%- if add_generation_prompt %}" not in new_chat_template:
+ raise RuntimeError(
+ f"Unsloth: The tokenizer `{tokenizer.name_or_path}`\n"\
+ "does not have a {% if add_generation_prompt %} for generation purposes.\n"\
+ "Please file a bug report immediately - thanks!"
+ )
+ else:
+ logger.warning_once(
+ "Unsloth: We successfully patched the tokenizer to add a {% if add_generation_prompt %} to the chat_template.\n"\
+ "This is not a bug, but please notify the Unsloth maintainers - thanks!"
+ )
+ chat_template = new_chat_template
+ pass
+ else:
+ raise RuntimeError(
+ f"Unsloth: The tokenizer `{tokenizer.name_or_path}`\n"\
+ "has a {% if add_generation_prompt %} for generation purposes, but wasn't provided correctly.\n"\
+ "Please file a bug report immediately - thanks!"
+ )
+ pass
+ pass
+ return chat_template
+pass
+
+
+def check_tokenizer(
+ model,
+ tokenizer,
+ model_name = "unsloth/llama-2-7b-bnb-4bit",
+ model_max_length = 4096,
+ padding_side = "right",
+ token = None,
+ _reload = True,
+):
+ # Checks tokenizer for out of bounds ids.
+ # Mainly a fix for https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
+    # which had a token with id = 32002.
+ # See https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha/discussions/25
+ # Seems like the Fast tokenizer in Rust breaks things!
+
+ # We ignore some of them!
+ if tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING:
+ return tokenizer
+ pass
+
+ max_embedding_size = model.model.embed_tokens.weight.shape[0]
+ added_tokens_fast = tokenizer.added_tokens_decoder
+ added_tokens_fast = {index : str(value) for index, value in added_tokens_fast.items()}
+ sorted_keys = sorted(added_tokens_fast)
+ added_tokens_fast = {key : added_tokens_fast[key] for key in sorted_keys}
+
+ for j, index in enumerate(added_tokens_fast.keys()):
+ if index >= max_embedding_size:
+ bad_indices = list(added_tokens_fast.keys ())[j:]
+ bad_tokens = list(added_tokens_fast.values())[j:]
+ if not _reload:
+ # Try removing the token
+ added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()]
+ special_tokens = tokenizer.special_tokens_map
+ import itertools
+ special_tokens = frozenset(
+ itertools.chain.from_iterable(
+ [x] if type(x) is str else x for x in special_tokens.values()
+ )
+ )
+ can_be_removed1 = [x for x in bad_tokens if x not in special_tokens]
+ can_be_removed2 = [x for x in can_be_removed1 if x in tokenizer._added_tokens_encoder.keys()]
+
+                # Check if the extra tokens can in fact be removed!
+ can_be_removed = \
+ (len(can_be_removed1) == len(bad_tokens)) and \
+ (len(can_be_removed2) == len(bad_tokens))
+
+ # Check if sep_token or other generic types
+ remove_generic = False
+ try_mapper = []
+ if not can_be_removed:
+ names = dir(tokenizer)
+ names = (x for x in names if x.endswith("_token") and x.count("_") == 1)
+ generic_tokens = [(x, getattr(tokenizer, x, None)) for x in names]
+
+ try_removal = []
+ for token in bad_tokens:
+ for (name_token, check_token) in generic_tokens:
+ if check_token == token:
+ try_removal.append(token)
+ try_mapper.append(name_token)
+ pass
+ pass
+ pass
+
+ # Recheck!
+ can_be_removed = (len(try_removal) == len(bad_tokens))
+ if can_be_removed: remove_generic = True
+ can_be_removed1 = bad_tokens
+ pass
+
+ if can_be_removed:
+ # Yes it can be fixed!
+ for j, bad_token in enumerate(can_be_removed1):
+ remove_id = tokenizer._added_tokens_encoder[bad_token]
+ del tokenizer._added_tokens_decoder[remove_id]
+ del tokenizer._added_tokens_encoder[bad_token]
+
+ if remove_generic and (try_removal[j] == bad_token):
+ # Remove sep token for example
+ setattr(tokenizer, try_mapper[j], None)
+ setattr(tokenizer, try_mapper[j] + "_id", None)
+ pass
+ pass
+ # Confirm 1 more time!
+ if max(tokenizer.added_tokens_decoder.keys()) < max_embedding_size:
+ logger.warning_once(
+ f"Unsloth loaded a broken tokenizer `{model_name}`, but managed to repair it!\n"\
+ f"Tokens {bad_tokens} with ids {bad_indices} exceeds the max vocab size of {max_embedding_size}.\n"\
+ "We removed these bad tokens. If you think this is incorrect, fix your tokenizer first."
+ )
+ return convert_to_fast_tokenizer(tokenizer)
+ pass
+ pass
+
+ # :( Failure
+ raise RuntimeError(
+ f"Unsloth tried to load `{model_name}`, but cannot succeed.\n"\
+ f"Tokens {bad_tokens} with ids {bad_indices} exceeds the max vocab size of {max_embedding_size}.\n"\
+ f"Fix your tokenizer since it'll perform out of bounds memory accesses."
+ )
+ pass
+
+ if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT:
+ cache_dir = "huggingface_tokenizers_cache"
+ else:
+ cache_dir = None
+ pass
+
+ # Sometimes slow tokenizer does not work like Deepseek
+ try:
+ # Try slow tokenizer which can fix things!
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_name,
+ model_max_length = model_max_length,
+ padding_side = padding_side,
+ token = token,
+ # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373
+ use_fast = False,
+ legacy = False,
+ from_slow = True,
+ cache_dir = cache_dir,
+ )
+ return check_tokenizer(
+ model = model,
+ tokenizer = tokenizer,
+ model_name = model_name,
+ model_max_length = model_max_length,
+ padding_side = padding_side,
+ token = token,
+ _reload = False,
+ )
+ except:
+ # Tokenizer has out of bounds issues and we can't
+ # load the slow tokenizer version :(
+ logger.warning_once(
+ "Unsloth: Tokenizer is most likely buggy, and Unsloth failed to repair it.\n"\
+ "It will still work, but beware of out of bounds memory accesses.\n"\
+ "Please file an issue on the model owner's repo about this issue."
+ )
+ return tokenizer
+ pass
+ pass
+ pass
+ return convert_to_fast_tokenizer(tokenizer)
+pass
+
+
+def check_nvidia():
+ # Unsloth doesn't work yet on AMD devices - we're working on it!
+ output = np.array([0,])
+ try:
+ output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
+ output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
+ output = np.array([int(x.decode('utf-8'))/1024 for x in output])
+ except:
+ if not torch.cuda.is_available():
+ raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
+ return output
+pass
+PRE_CHECK = check_nvidia()
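+# PRE_CHECK records per-GPU memory usage (in GB) at import time. The patched
+# trainers below compare a fresh nvidia-smi reading against it: if more than one
+# GPU gained >= 1GB of memory, a multi-GPU setup is assumed and training is refused.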
+
+
+import inspect
+from inspect import getsource
+import trl.trainer.sft_trainer
+from trl.trainer.sft_trainer import *
+from transformers.trainer import *
+try:
+ from trl.trainer.sft_trainer import neftune_post_forward_hook
+except:
+ def neftune_post_forward_hook(module, input, output):
+ """
+ Implements the NEFTune forward pass for the model using forward hooks. Note this works only for
+ torch.nn.Embedding layers. This method is slightly adapted from the original source code
+ that can be found here: https://github.com/neelsjain/NEFTune
+
+ Simply add it to your model as follows:
+ ```python
+ model = ...
+ model.embed_tokens.neftune_noise_alpha = 0.1
+ model.embed_tokens.register_forward_hook(neftune_post_forward_hook)
+ ```
+
+ Args:
+ module (`torch.nn.Module`):
+ The embedding module where the hook is attached. Note that you need to set
+ `module.neftune_noise_alpha` to the desired noise alpha value.
+ input (`torch.Tensor`):
+ The input tensor to the model.
+ output (`torch.Tensor`):
+ The output tensor of the model (i.e. the embeddings).
+ """
+ if module.training:
+ dims = torch.tensor(output.size(1) * output.size(2))
+ mag_norm = module.neftune_noise_alpha / torch.sqrt(dims)
+ output = output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm)
+ return output
+ pass
+pass
+
+
+def patch_trl_tokenizer_processing_class(trainer_name):
+ # Newer TRL versions removed the `tokenizer` argument!
+ # We add it back!
+ exec(f"from trl import {trainer_name}", globals())
+ if str(eval(f"{trainer_name}").__name__).startswith("Unsloth"): return None
+ parameters = eval(f"inspect.signature({trainer_name}).parameters")
+ if "tokenizer" in parameters: return None
+
+ args = {
+ key : \
+ value.default \
+ if type(value.default) is not str else \
+ f"'{value.default}'" \
+ for key, value in parameters.items()
+ }
+ args["tokenizer"] = None
+ new_args = args.copy()
+ del new_args["tokenizer"]
+ del new_args["processing_class"]
+ new_args = ",\n".join(f"{' '*12}{key} = {key}" for key in new_args) + \
+ f",\n{' '*12}processing_class = tokenizer if tokenizer else processing_class"
+ args = ",\n".join(f"{' '*8}{key} = {value}" for key, value in args.items())
+ args = f"def __init__(\n" + f"{' '*8}self,\n" + args + "):"
+ args += f"\n{' '*8}\n{' '*8}super().__init__(\n{new_args}\n{' '*8})"
+ new_class = f"""class Unsloth{trainer_name}({trainer_name}):\n{' '*4}{args}\n"""
+ return new_class
+pass
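+
+# A rough sketch of the source string this builds for, say, SFTTrainer - purely
+# illustrative, since the real parameter list comes from the inspected signature:
+#
+#   class UnslothSFTTrainer(SFTTrainer):
+#       def __init__(
+#           self,
+#           model = None,
+#           args = None,
+#           ...,
+#           processing_class = None,
+#           tokenizer = None):
+#           super().__init__(
+#               model = model,
+#               args = args,
+#               ...,
+#               processing_class = tokenizer if tokenizer else processing_class
+#           )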
+
+
+def patch_sft_trainer_tokenizer():
+ """
+ Patches the trainer with changes
+ """
+ for function_name, replacer in (
+ ("_prepare_non_packed_dataloader", "def tokenize(element):",),
+ # ("_prepare_packed_dataloader", "if dataset_text_field is not None",),
+ ):
+ function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}"))
+ where = function.find("def")
+ function = function.split("\n")
+ function = "\n".join(x[where:] for x in function)
+
+ check_text = \
+ "\n"\
+ "if 'tokenizer' not in locals(): tokenizer = processing_class\n"\
+ "if 'formatting_func' not in locals(): raise RuntimeError('Unsloth: Please file a bug report - `formatting_func` does not exist!')\n"\
+ "if 'dataset_text_field' not in locals(): raise RuntimeError('Unsloth: Please file a bug report - `dataset_text_field` does not exist!')\n"\
+ "test_text = dataset[0][dataset_text_field] if (formatting_func is None and dataset_text_field is not None) else formatting_func(dataset[0])[0]\n"\
+ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\
+ "chat_template = '' if chat_template is None else chat_template\n"\
+ "has_bos_token_already = (test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template) "\
+ "if getattr(tokenizer, 'bos_token', None) is not None else False\n"\
+ "add_special_tokens = False if has_bos_token_already else add_special_tokens\n\n"
+
+ check_text = check_text.split("\n")
+ check_text = "\n".join(" "*where + x for x in check_text)
+
+ function = function.replace(replacer, check_text + replacer)
+ exec(function, globals())
+
+ exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals())
+ pass
+
+ # Patch train with fix_untrained_tokens
+ for path_to_trainer in \
+ ("sft_trainer.SFTTrainer", "dpo_trainer.DPOTrainer", "kto_trainer.KTOTrainer"):
+
+ function_name, replacer = "train", "if resume_from_checkpoint is False:"
+ function = getsource(eval(f"trl.trainer.{path_to_trainer}.{function_name}"))
+ where = function.find("def")
+ function = function.split("\n")
+ function = "\n".join(x[where:] for x in function)
+
+ check_text = \
+ "\n"\
+ "import subprocess, re, gc, numpy as np\n"\
+ "a = np.array([0,])\n"\
+ "try:\n"\
+ " a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\
+ " a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
+ " a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
+ "except:\n"\
+ " if not torch.cuda.is_available():\n"\
+ " raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
+ "if ((a - PRE_CHECK) >= 1).sum() > 1:\n"\
+ " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"\
+ "for _ in range(3):\n"\
+ " gc.collect()\n"\
+ " torch.cuda.empty_cache()\n"\
+ "pass\n"\
+ "\n"\
+ "tokenizer = self.processing_class if hasattr(self, 'processing_class') else self.tokenizer\n"\
+ "fix_untrained_tokens(self.model, tokenizer, self.train_dataset, IGNORED_TOKENIZER_NAMES, eps = 1e-16)\n\n"\
+ "fix_zero_training_loss(self.model, tokenizer, self.train_dataset)\n\n"
+
+ # Warn on gradient accumulation steps if it's used
+ check_text += \
+ "\n"\
+ "try:\n"\
+ " gradient_accumulation_steps = self.args.gradient_accumulation_steps\n"\
+ " if type(gradient_accumulation_steps) is int and gradient_accumulation_steps > 1:\n"\
+ " from transformers import __version__ as transformers_version\n"\
+ " from packaging.version import Version\n"\
+ " if Version(transformers_version) <= Version('4.45.2'):\n"\
+ " print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\\n'\\\n"\
+ " '`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`')\n"\
+ "except:\n"\
+ " pass\n"\
+ "\n\n"
+
+ # Add NEFTune since it doesn't seem to work otherwise - we need to manually inject it.
+ check_text += \
+ "\n"\
+ "if hasattr(self, 'neftune_hook_handle'):\n"\
+ " self.neftune_hook_handle.remove()\n"\
+ " if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle\n"\
+ "\n"\
+ "if getattr(self, 'neftune_noise_alpha', None) is not None:\n"\
+ " self.model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha\n"\
+ " self.neftune_hook_handle = self.model.get_input_embeddings().register_forward_hook(neftune_post_forward_hook)\n"\
+ "pass\n"\
+ "\n"
+
+ # DPO also tokenizes non-numeric columns unexpectedly - delete them!
+ check_text += \
+ "\n"\
+ "if hasattr(self.train_dataset, 'column_names'):\n"\
+ " column_names = set(self.train_dataset.column_names)\n"\
+ " check = ['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask',\n"\
+ " 'chosen_labels', 'rejected_input_ids', 'rejected_attention_mask', 'rejected_labels',\n"\
+ " 'prompt_input_ids', 'prompt_attention_mask']\n"\
+ " if all(x in column_names for x in check):\n"\
+ " self.train_dataset = self.train_dataset.remove_columns(['chosen', 'rejected', 'prompt'])\n"\
+ " del check, column_names\n"\
+ "\n"
+
+ check_text = check_text.split("\n")
+ check_text = "\n".join(" "*where + x for x in check_text)
+
+ function = function.replace(replacer, check_text + replacer)
+ exec(function, globals())
+
+ exec(f"trl.trainer.{path_to_trainer}.{function_name} = {function_name}", globals())
+ pass
+pass
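+
+# The patches above rewrite the trainers' source text and re-exec it, so the
+# injected checks (GPU count, untrained tokens, NEFTune hook, DPO column cleanup)
+# run inside the original function bodies rather than as outer wrappers.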
+
+# Fix TRL trainers whose `tokenizer` argument was removed (replaced by `processing_class`)
+for trainer_name in ("SFTTrainer", "DPOTrainer", "KTOTrainer"):
+ trainer_text = patch_trl_tokenizer_processing_class(trainer_name)
+ if trainer_text is None: continue
+ try:
+ exec(trainer_text, globals())
+ except:
+ raise RuntimeError(f"Unsloth: Please file a bug report! Error patching {trainer_name}")
+ exec(f"trl.trainer.{trainer_name} = Unsloth{trainer_name}", globals())
+pass
+
+# Finally patch TRL tokenizer handling
+patch_sft_trainer_tokenizer()
diff --git a/unsloth-main/unsloth/trainer.py b/unsloth-main/unsloth/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..012be4b0cbdac062c177c119226cb24b62792f2e
--- /dev/null
+++ b/unsloth-main/unsloth/trainer.py
@@ -0,0 +1,226 @@
+# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+from typing import Optional
+from functools import wraps
+
+import trl
+import inspect
+from trl import SFTTrainer
+from . import is_bfloat16_supported
+from unsloth_zoo.training_utils import (
+ unsloth_train as _unsloth_train,
+)
+from unsloth_zoo.vision_utils import (
+ UnslothVisionDataCollator,
+)
+from packaging.version import Version
+import dataclasses
+
+__all__ = [
+ "UnslothTrainingArguments",
+ "UnslothTrainer",
+ "unsloth_train",
+ "_patch_trl_trainer",
+ "UnslothVisionDataCollator",
+]
+
+# Unsloth gradient accumulation fix:
+from transformers import __version__ as transformers_version
+if Version(transformers_version) > Version("4.45.2"):
+ def unsloth_train(trainer, *args, **kwargs):
+ return trainer.train(*args, **kwargs)
+ pass
+else:
+ def unsloth_train(trainer, *args, **kwargs):
+ if len(args) != 0 or len(kwargs) != 0:
+ raise RuntimeError(
+ "Unsloth: Our custom gradient accumulation fixed trainer does not support other arguments.\n"\
+ "If you want to use our fix inside of HF, please update `transformers` to the latest version via:\n"\
+ '`pip uninstall transformers -y && pip install --upgrade --no-cache-dir transformers`'
+ )
+ print(
+ "Unsloth: Using our custom gradient accumulation fixed trainer, which is not feature complete.\n"\
+ "If you want to use our fix inside of HF, please update `transformers` to the latest version via:\n"\
+ '`pip uninstall transformers -y && pip install --upgrade --no-cache-dir transformers`'
+ )
+ return _unsloth_train(trainer)
+ pass
+pass
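+
+# Illustrative usage (assumes a `trainer` has already been constructed elsewhere):
+#   trainer_stats = unsloth_train(trainer)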
+
+try:
+ from trl import SFTConfig as TrainingArguments
+except:
+ from transformers import TrainingArguments
+pass
+@dataclass
+class UnslothTrainingArguments(TrainingArguments):
+ embedding_learning_rate : Optional[float] = field(
+ default = None,
+ metadata = {"help" : "Different learning rates for embeddings and lm_head."}
+ )
+pass
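+
+# `embedding_learning_rate` is consumed by UnslothTrainer.create_optimizer below:
+# when set, embedding / lm_head parameters get their own optimizer group with this
+# learning rate while everything else keeps `learning_rate`.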
+
+
+def _create_unsloth_optimizer(
+ model,
+ optimizer_cls,
+ optimizer_kwargs,
+ embedding_lr = 5e-5,
+):
+ lr = optimizer_kwargs["lr"]
+ weight_decay = optimizer_kwargs.get("weight_decay", 0.0)
+
+ param_groups = \
+ {
+ "non_embeddings" : {},
+ "embeddings" : {},
+ }
+
+ for name, param in model.named_parameters():
+ if not param.requires_grad: continue
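+ # PEFT saves fully-trained modules (e.g. embed_tokens / lm_head listed in
+ # `modules_to_save`) under `<module>.modules_to_save.default.weight`, so this
+ # check routes them to the separate embedding learning rate.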
+ if name.endswith("modules_to_save.default.weight"):
+ partial_name = name[:-len(".modules_to_save.default.weight")]
+ partial_name = partial_name[partial_name.rfind(".")+1:]
+ print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {partial_name}.")
+ param_groups["embeddings"] [name] = param
+ else:
+ param_groups["non_embeddings"][name] = param
+ pass
+ pass
+
+ optimizer_grouped_parameters = [
+ {
+ "params" : list(param_groups["non_embeddings"].values()),
+ "weight_decay" : weight_decay,
+ "lr" : lr,
+ },
+ {
+ "params" : list(param_groups["embeddings"].values()),
+ "weight_decay" : weight_decay,
+ "lr" : embedding_lr,
+ },
+ ]
+ optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
+ return optimizer
+pass
+
+
+class UnslothTrainer(SFTTrainer):
+ def create_optimizer(self):
+ embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None)
+ if embedding_learning_rate is None: return super().create_optimizer()
+
+ if self.optimizer is None:
+ optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args)
+ self.optimizer = _create_unsloth_optimizer(
+ self.model,
+ optimizer_cls,
+ optimizer_kwargs,
+ embedding_learning_rate,
+ )
+ pass
+ return self.optimizer
+ pass
+pass
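+
+# Minimal usage sketch (model / tokenizer / dataset are assumed to exist already;
+# the hyper-parameters are illustrative):
+#   trainer = UnslothTrainer(
+#       model = model,
+#       tokenizer = tokenizer,
+#       train_dataset = dataset,
+#       args = UnslothTrainingArguments(
+#           output_dir = "outputs",
+#           per_device_train_batch_size = 2,
+#           learning_rate = 2e-4,
+#           embedding_learning_rate = 2e-5,
+#       ),
+#   )
+#   trainer.train()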
+
+# From `trl>=0.13.0`, several parameters are passed through the config class instead of the trainer.
+# We patch the trainers to make the transition smooth.
+def _backwards_compatible_trainer(trainer_class, config_class):
+ original_init = trainer_class.__init__
+
+ @wraps(original_init)
+ def new_init(self, *args, **kwargs):
+ # All Trainer `tokenizer` arguments are now called `processing_class`
+ trainer_params = set(inspect.signature(original_init).parameters.keys())
+
+ if "processing_class" in trainer_params and "tokenizer" in kwargs:
+ kwargs["processing_class"] = kwargs.pop("tokenizer")
+ pass
+
+ if ("args" in kwargs) and (Version(trl.__version__) >= Version("0.13.0.dev0")):
+ training_args = kwargs.pop("args", None)
+
+ # Get parameters that Trainer.__init__ actually expects
+ trainer_params.remove('self')
+ trainer_params.remove('args')
+
+ # Get fields that should be passed to Config init
+ config_fields = {
+ field.name: field for field in dataclasses.fields(config_class)
+ if field.init
+ }
+
+ # Create config dict with valid fields from training_args
+ config_dict = {
+ name: getattr(training_args, name)
+ for name in config_fields
+ if hasattr(training_args, name)
+ }
+
+ # Get parameters that exist in Config but not in TrainingArguments
+ from transformers import TrainingArguments
+ moved_params = \
+ set(inspect.signature(config_class) .parameters.keys()) - \
+ set(inspect.signature(TrainingArguments).parameters.keys())
+
+ # Separate kwargs into trainer kwargs and config kwargs
+ trainer_kwargs = {}
+ additional_config_kwargs = {}
+
+ for key, value in kwargs.items():
+ if key in trainer_params: trainer_kwargs[key] = value
+ elif key in moved_params or key in config_fields:
+ additional_config_kwargs[key] = value
+ else:
+ # Unknown kwargs are also forwarded to the config; config_class raises on invalid keys
+ additional_config_kwargs[key] = value
+ pass
+ pass
+
+ # Update config_dict with additional kwargs
+ config_dict.update(additional_config_kwargs)
+
+ # Create Config with all the collected parameters
+ config = config_class(**config_dict)
+
+ # Reconstruct kwargs for Trainer
+ kwargs = trainer_kwargs
+ kwargs["args"] = config
+ pass
+ original_init(self, *args, **kwargs)
+ pass
+ return new_init
+pass
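+
+# Illustrative effect on a hypothetical old-style call once the patch is applied:
+#   DPOTrainer(model = model, tokenizer = tokenizer, args = training_args, beta = 0.1)
+# `tokenizer` is renamed to `processing_class`, and loose kwargs such as `beta`
+# (which moved onto DPOConfig in newer TRL) are folded into a DPOConfig that
+# replaces `args` before the original __init__ runs.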
+
+
+def _patch_trl_trainer():
+ import trl
+ if hasattr(trl, "__UNSLOTH_BACKWARDS_COMPATIBLE__"): return
+ if Version(trl.__version__) <= Version("0.11.0"): return
+
+ import trl.trainer
+ trl_classes = dir(trl.trainer)
+ trl_trainers = set(x[:-len("Trainer")] for x in trl_classes if x.endswith("Trainer"))
+ trl_configs = set(x[:-len("Config")] for x in trl_classes if x.endswith("Config"))
+ trl_classes = list(trl_trainers & trl_configs)
+
+ for x in trl_classes:
+ try: exec(f"trl.{x}Trainer.__init__ = _backwards_compatible_trainer(trl.{x}Trainer, trl.{x}Config)", globals())
+ except: continue
+ pass
+
+ trl.__UNSLOTH_BACKWARDS_COMPATIBLE__ = True
+pass
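+
+# _patch_trl_trainer() walks every matching trl.<Name>Trainer / trl.<Name>Config
+# pair and wraps its __init__ as above; the module-level flag ensures the patch
+# is applied at most once per process.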