shanjay commited on
Commit
9667edf
1 Parent(s): c25a9ff

Upload folder using huggingface_hub

Browse files
Files changed (29) hide show
  1. .ipynb_checkpoints/07.dsc-mgc-v2-checkpoint.ipynb +0 -0
  2. .ipynb_checkpoints/07.dscv4-checkpoint.ipynb +0 -0
  3. .ipynb_checkpoints/Prediction-mgc-checkpoint.csv +0 -0
  4. .ipynb_checkpoints/Prediction-mgc-checkpoint.json +0 -0
  5. .ipynb_checkpoints/Test-checkpoint.ipynb +6 -0
  6. .ipynb_checkpoints/Test-mgc-Copy1-checkpoint.ipynb +1177 -0
  7. .ipynb_checkpoints/Test-mgc-f-checkpoint.ipynb +866 -0
  8. .ipynb_checkpoints/Testv3-checkpoint.ipynb +831 -0
  9. .ipynb_checkpoints/Testv4-checkpoint.ipynb +698 -0
  10. .ipynb_checkpoints/ds1000-train-cleaned-checkpoint.json +0 -0
  11. 07.dsc-mgc-v2.ipynb +0 -0
  12. 07.dscv4.ipynb +0 -0
  13. Prediction-mgc.csv +0 -0
  14. Prediction-mgc.json +0 -0
  15. Test-mgc-Copy1.ipynb +1177 -0
  16. Test-mgc-f.ipynb +0 -0
  17. Test.ipynb +725 -0
  18. Testv3.ipynb +831 -0
  19. Testv4.ipynb +866 -0
  20. ds1000-test-cleaned.json +0 -0
  21. ds1000-train-cleaned.json +0 -0
  22. experiments/runs/Dec11_06-38-12_114d9a2e28a3/1702276693.5141723/events.out.tfevents.1702276693.114d9a2e28a3.6724.1 +3 -0
  23. experiments/runs/Dec11_06-38-12_114d9a2e28a3/events.out.tfevents.1702276693.114d9a2e28a3.6724.0 +3 -0
  24. experiments/runs/Dec11_06-44-21_114d9a2e28a3/1702277061.7964196/events.out.tfevents.1702277061.114d9a2e28a3.9175.1 +3 -0
  25. experiments/runs/Dec11_06-44-21_114d9a2e28a3/events.out.tfevents.1702277061.114d9a2e28a3.9175.0 +3 -0
  26. experiments/runs/Dec11_06-47-03_114d9a2e28a3/1702277223.9722266/events.out.tfevents.1702277223.114d9a2e28a3.10257.1 +3 -0
  27. experiments/runs/Dec11_06-47-03_114d9a2e28a3/events.out.tfevents.1702277223.114d9a2e28a3.10257.0 +3 -0
  28. trained-model/adapter_config.json +20 -0
  29. trained-model/adapter_model.bin +3 -0
.ipynb_checkpoints/07.dsc-mgc-v2-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/07.dscv4-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/Prediction-mgc-checkpoint.csv ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/Prediction-mgc-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/Test-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
.ipynb_checkpoints/Test-mgc-Copy1-checkpoint.ipynb ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "\n",
14
+ "===================================BUG REPORT===================================\n",
15
+ "Welcome to bitsandbytes. For bug reports, please run\n",
16
+ "\n",
17
+ "python -m bitsandbytes\n",
18
+ "\n",
19
+ " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
20
+ "================================================================================\n",
21
+ "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n",
22
+ "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
23
+ "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
24
+ "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
25
+ "CUDA SETUP: Detected CUDA version 121\n",
26
+ "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib64'), PosixPath('/usr/local/nvidia/lib')}\n",
34
+ " warn(msg)\n",
35
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
36
+ " warn(msg)\n",
37
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n",
38
+ " warn(msg)\n",
39
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('//g.notebooksg.jarvislabs.net'), PosixPath('https')}\n",
40
+ " warn(msg)\n",
41
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
42
+ " warn(msg)\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "import json\n",
48
+ "import os\n",
49
+ "from pprint import pprint\n",
50
+ "\n",
51
+ "import bitsandbytes as bnb\n",
52
+ "import pandas as pd\n",
53
+ "import torch\n",
54
+ "import torch.nn as nn\n",
55
+ "\n",
56
+ "import transformers\n",
57
+ "from datasets import load_dataset\n",
58
+ "from huggingface_hub import notebook_login\n",
59
+ "from peft import (\n",
60
+ " LoraConfig,\n",
61
+ " PeftConfig,\n",
62
+ " PeftModel,\n",
63
+ " get_peft_model,\n",
64
+ " prepare_model_for_kbit_training,\n",
65
+ ")\n",
66
+ "from transformers import (\n",
67
+ " AutoConfig,\n",
68
+ " AutoModelForCausalLM,\n",
69
+ " AutoTokenizer,\n",
70
+ " BitsAndBytesConfig,\n",
71
+ ")\n",
72
+ "import warnings\n",
73
+ "warnings.filterwarnings(\"ignore\")\n",
74
+ "\n",
75
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 3,
81
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "3edf6ee054e9464eb510d3aff9d1dc5f",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ }
98
+ ],
99
+ "source": [
100
+ "notebook_login()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 4,
106
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
111
+ " data = json.load(json_file)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 5,
117
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "{'answer': 'import pandas as pd\\n'\n",
125
+ " '\\n'\n",
126
+ " '\\n'\n",
127
+ " 'index = range(14)\\n'\n",
128
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
129
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
130
+ " 'def g(df):\\n'\n",
131
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
132
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
133
+ " ' for i in range(len(df)):\\n'\n",
134
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
135
+ " ' return df\\n'\n",
136
+ " '\\n'\n",
137
+ " 'df = g(df.copy())\\n'\n",
138
+ " 'result = df\\n'\n",
139
+ " 'print(result)',\n",
140
+ " 'question': 'Problem:\\n'\n",
141
+ " 'I have the following dataframe:\\n'\n",
142
+ " 'index = range(14)\\n'\n",
143
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
144
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
145
+ " '\\n'\n",
146
+ " '\\n'\n",
147
+ " 'How can I fill the zeros with the maximun between previous and '\n",
148
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
149
+ " 'not just for \"NaN\"?. \\n'\n",
150
+ " 'The output should look like:\\n'\n",
151
+ " ' A\\n'\n",
152
+ " '0 1\\n'\n",
153
+ " '1 2\\n'\n",
154
+ " '2 2\\n'\n",
155
+ " '3 2\\n'\n",
156
+ " '4 4\\n'\n",
157
+ " '5 4\\n'\n",
158
+ " '6 6\\n'\n",
159
+ " '7 8\\n'\n",
160
+ " '8 8\\n'\n",
161
+ " '9 8\\n'\n",
162
+ " '10 8\\n'\n",
163
+ " '11 8\\n'\n",
164
+ " '12 2\\n'\n",
165
+ " '13 1'}\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "pprint(data[0])"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 6,
176
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
181
+ " json.dump(data, f)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 7,
187
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>question</th>\n",
212
+ " <th>answer</th>\n",
213
+ " </tr>\n",
214
+ " </thead>\n",
215
+ " <tbody>\n",
216
+ " <tr>\n",
217
+ " <th>0</th>\n",
218
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
219
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
220
+ " </tr>\n",
221
+ " <tr>\n",
222
+ " <th>1</th>\n",
223
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
224
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>2</th>\n",
228
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
229
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>3</th>\n",
233
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
234
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
239
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
240
+ " </tr>\n",
241
+ " </tbody>\n",
242
+ "</table>\n",
243
+ "</div>"
244
+ ],
245
+ "text/plain": [
246
+ " question \\\n",
247
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
248
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
249
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
250
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
251
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
252
+ "\n",
253
+ " answer \n",
254
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
255
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
256
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
257
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
258
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
259
+ ]
260
+ },
261
+ "execution_count": 7,
262
+ "metadata": {},
263
+ "output_type": "execute_result"
264
+ }
265
+ ],
266
+ "source": [
267
+ "pd.DataFrame(data).head()"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 8,
273
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "bnb_config = BitsAndBytesConfig(\n",
278
+ " load_in_4bit=True,\n",
279
+ " bnb_4bit_use_double_quant=True,\n",
280
+ " bnb_4bit_quant_type=\"nf4\",\n",
281
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
282
+ ")"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 9,
288
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "data": {
293
+ "application/vnd.jupyter.widget-view+json": {
294
+ "model_id": "2be27a54d3e14399a41c46cd9c423399",
295
+ "version_major": 2,
296
+ "version_minor": 0
297
+ },
298
+ "text/plain": [
299
+ "Loading checkpoint shards: 0%| | 0/6 [00:00<?, ?it/s]"
300
+ ]
301
+ },
302
+ "metadata": {},
303
+ "output_type": "display_data"
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at ise-uiuc/Magicoder-S-DS-6.7B and are newly initialized: ['model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq']\n",
310
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "PEFT_MODEL = \"shanjay/mgc-ds\"\n",
316
+ "\n",
317
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
318
+ "model = AutoModelForCausalLM.from_pretrained(\n",
319
+ " config.base_model_name_or_path,\n",
320
+ " return_dict=True,\n",
321
+ " quantization_config=bnb_config,\n",
322
+ " device_map=\"auto\",\n",
323
+ " trust_remote_code=True,\n",
324
+ ")\n",
325
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
326
+ "tokenizer.pad_token = tokenizer.eos_token\n",
327
+ "\n",
328
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": 26,
334
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "generation_config = model.generation_config\n",
339
+ "generation_config.max_new_tokens = 400\n",
340
+ "generation_config.temperature = 0.7\n",
341
+ "generation_config.top_p = 0.7\n",
342
+ "generation_config.num_return_sequences = 1\n",
343
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
344
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": 27,
350
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
351
+ "metadata": {},
352
+ "outputs": [],
353
+ "source": [
354
+ "DEVICE = \"cuda:0\""
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 12,
360
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
361
+ "metadata": {},
362
+ "outputs": [
363
+ {
364
+ "name": "stdout",
365
+ "output_type": "stream",
366
+ "text": [
367
+ "<instruction>: How can I create a dataframe?\n",
368
+ "<output>: import pandas as pd\n",
369
+ "\n",
370
+ "\n",
371
+ "\n",
372
+ "\n",
373
+ "\n",
374
+ "\n",
375
+ "\n",
376
+ "\n",
377
+ "\n",
378
+ "\n",
379
+ "\n",
380
+ "\n",
381
+ "\n",
382
+ "\n",
383
+ "\n",
384
+ "\n",
385
+ "\n",
386
+ "\n",
387
+ "\n",
388
+ "\n",
389
+ "\n",
390
+ "\n",
391
+ "\n",
392
+ "\n",
393
+ "\n",
394
+ "\n",
395
+ "\n",
396
+ "\n",
397
+ "\n",
398
+ "\n",
399
+ "\n",
400
+ "\n",
401
+ "\n",
402
+ "\n",
403
+ "\n",
404
+ "\n",
405
+ "\n",
406
+ "\n",
407
+ "\n",
408
+ "\n",
409
+ "\n",
410
+ "\n",
411
+ "\n",
412
+ "\n",
413
+ "\n",
414
+ "\n",
415
+ "\n",
416
+ "\n",
417
+ "\n",
418
+ "\n",
419
+ "\n",
420
+ "\n",
421
+ "\n",
422
+ "\n",
423
+ "\n",
424
+ "\n",
425
+ "\n",
426
+ "\n",
427
+ "\n",
428
+ "\n",
429
+ "\n",
430
+ "\n",
431
+ "\n",
432
+ "\n",
433
+ "\n",
434
+ "\n",
435
+ "\n",
436
+ "\n",
437
+ "\n",
438
+ "\n",
439
+ "\n",
440
+ "\n",
441
+ "\n",
442
+ "\n",
443
+ "\n",
444
+ "\n",
445
+ "\n",
446
+ "\n",
447
+ "\n",
448
+ "\n",
449
+ "\n",
450
+ "\n",
451
+ "\n",
452
+ "\n",
453
+ "\n",
454
+ "\n",
455
+ "\n",
456
+ "\n",
457
+ "\n",
458
+ "\n",
459
+ "\n",
460
+ "\n",
461
+ "\n",
462
+ "\n",
463
+ "\n",
464
+ "\n",
465
+ "\n",
466
+ "\n",
467
+ "\n",
468
+ "\n",
469
+ "\n",
470
+ "\n",
471
+ "\n",
472
+ "\n",
473
+ "\n",
474
+ "\n",
475
+ "\n",
476
+ "\n",
477
+ "\n",
478
+ "\n",
479
+ "\n",
480
+ "\n",
481
+ "\n",
482
+ "\n",
483
+ "\n",
484
+ "\n",
485
+ "\n",
486
+ "\n",
487
+ "\n",
488
+ "\n",
489
+ "\n",
490
+ "\n",
491
+ "\n",
492
+ "\n",
493
+ "\n",
494
+ "\n",
495
+ "\n",
496
+ "\n",
497
+ "\n",
498
+ "\n",
499
+ "\n",
500
+ "\n",
501
+ "\n",
502
+ "\n",
503
+ "\n",
504
+ "\n",
505
+ "\n",
506
+ "\n",
507
+ "\n",
508
+ "\n",
509
+ "\n",
510
+ "\n",
511
+ "\n",
512
+ "\n",
513
+ "\n",
514
+ "\n",
515
+ "\n",
516
+ "\n",
517
+ "\n",
518
+ "\n",
519
+ "\n",
520
+ "\n",
521
+ "\n",
522
+ "\n",
523
+ "\n",
524
+ "\n",
525
+ "\n",
526
+ "\n",
527
+ "\n",
528
+ "\n",
529
+ "\n",
530
+ "\n",
531
+ "\n",
532
+ "\n",
533
+ "\n",
534
+ "\n",
535
+ "\n",
536
+ "\n",
537
+ "\n",
538
+ "\n",
539
+ "\n",
540
+ "\n",
541
+ "\n",
542
+ "\n",
543
+ "\n",
544
+ "\n",
545
+ "\n",
546
+ "\n",
547
+ "\n",
548
+ "\n",
549
+ "\n",
550
+ "\n",
551
+ "\n",
552
+ "\n",
553
+ "\n",
554
+ "\n",
555
+ "\n",
556
+ "\n",
557
+ "\n",
558
+ "\n",
559
+ "\n",
560
+ "\n",
561
+ "\n",
562
+ "\n",
563
+ "CPU times: user 26.5 s, sys: 177 ms, total: 26.7 s\n",
564
+ "Wall time: 26.7 s\n"
565
+ ]
566
+ }
567
+ ],
568
+ "source": [
569
+ "%%time\n",
570
+ "prompt = f\"\"\"\n",
571
+ "<instruction>: How can I create a dataframe?\n",
572
+ "<output>:\n",
573
+ "\"\"\".strip()\n",
574
+ "\n",
575
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
576
+ "with torch.inference_mode():\n",
577
+ " outputs = model.generate(\n",
578
+ " input_ids=encoding.input_ids,\n",
579
+ " attention_mask=encoding.attention_mask,\n",
580
+ " generation_config=generation_config,\n",
581
+ " )\n",
582
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": 28,
588
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
589
+ "metadata": {},
590
+ "outputs": [],
591
+ "source": [
592
+ "def generate_response(question: str) -> str:\n",
593
+ " prompt = f\"\"\"\n",
594
+ "<instruction>: {question}\n",
595
+ "<output>:\n",
596
+ "\"\"\".strip()\n",
597
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
598
+ " with torch.inference_mode():\n",
599
+ " outputs = model.generate(\n",
600
+ " input_ids=encoding.input_ids,\n",
601
+ " attention_mask=encoding.attention_mask,\n",
602
+ " generation_config=generation_config,\n",
603
+ " )\n",
604
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
605
+ "\n",
606
+ " assistant_start = \"<output>:\"\n",
607
+ " response_start = response.find(assistant_start)\n",
608
+ " return response[response_start + len(assistant_start) :].strip()"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 29,
614
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
615
+ "metadata": {},
616
+ "outputs": [
617
+ {
618
+ "name": "stdout",
619
+ "output_type": "stream",
620
+ "text": [
621
+ "import pandas as pd\n"
622
+ ]
623
+ }
624
+ ],
625
+ "source": [
626
+ "prompt = \"How can I create a dataframe?\"\n",
627
+ "print(generate_response(prompt))"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "code",
632
+ "execution_count": 30,
633
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
634
+ "metadata": {},
635
+ "outputs": [
636
+ {
637
+ "name": "stdout",
638
+ "output_type": "stream",
639
+ "text": [
640
+ "import pandas as pd\n",
641
+ "\n",
642
+ "\n",
643
+ "\n",
644
+ "\n",
645
+ "\n",
646
+ "df1 = pd.DataFrame({'A': ['A', 'B', 'C', 'D'],\n",
647
+ " 'B': [1, 2, 3, 4]})\n",
648
+ "df2 = pd.DataFrame({'A': ['A', 'B', 'C', 'E'],\n",
649
+ " 'B': [1, 2, 3, 5]})\n",
650
+ "# merge df1 and df2 on column 'A'\n",
651
+ "# SOLUTION START\n",
652
+ "\n",
653
+ "<output>: import pandas as pd\n",
654
+ "\n",
655
+ "\n",
656
+ "\n",
657
+ "\n",
658
+ "\n",
659
+ "df1 = pd.DataFrame({'A': ['A', 'B', 'C', 'D'],\n",
660
+ " 'B': [1, 2, 3, 4]})\n",
661
+ "df2 = pd.DataFrame({'A': ['A', 'B', 'C', 'E'],\n",
662
+ " 'B': [1, 2, 3, 5]})\n",
663
+ "# merge df1 and df2 on column 'A'\n",
664
+ "result = pd.merge(df1, df2, on='A')\n",
665
+ "print(result)\n"
666
+ ]
667
+ }
668
+ ],
669
+ "source": [
670
+ "prompt = \"How to merge two dataframes?\"\n",
671
+ "print(generate_response(prompt))"
672
+ ]
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "execution_count": 16,
677
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
678
+ "metadata": {},
679
+ "outputs": [
680
+ {
681
+ "name": "stdout",
682
+ "output_type": "stream",
683
+ "text": [
684
+ "import pandas as pd\n",
685
+ "\n",
686
+ "\n",
687
+ "name = ['joy', 'shan']\n",
688
+ "roll_no = [1, 2]\n",
689
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
690
+ "print(df)\n"
691
+ ]
692
+ }
693
+ ],
694
+ "source": [
695
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
696
+ "print(generate_response(prompt))"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": 31,
702
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
703
+ "metadata": {},
704
+ "outputs": [
705
+ {
706
+ "name": "stdout",
707
+ "output_type": "stream",
708
+ "text": [
709
+ "import matplotlib.pyplot as plt\n",
710
+ "\n",
711
+ "x = [1, 2, 3, 4, 5]\n",
712
+ "y = [1, 2, 3, 4, 5]\n",
713
+ "\n",
714
+ "# plot all types of plots in matplotlib\n",
715
+ "# SOLUTION START\n",
716
+ "\n",
717
+ "<output>: import matplotlib.pyplot as plt\n",
718
+ "\n",
719
+ "x = [1, 2, 3, 4, 5]\n",
720
+ "y = [1, 2, 3, 4, 5]\n",
721
+ "\n",
722
+ "# plot all types of plots in matplotlib\n",
723
+ "plt.plot(x, y, label=\"plot\")\n",
724
+ "plt.scatter(x, y, label=\"scatter\")\n",
725
+ "plt.bar(x, y, label=\"bar\")\n",
726
+ "plt.hist(x, y, label=\"hist\")\n",
727
+ "plt.boxplot(x, y, label=\"boxplot\")\n",
728
+ "plt.show()\n",
729
+ "<output>: import matplotlib.pyplot as plt\n",
730
+ "\n",
731
+ "x = [1, 2, 3, 4, 5]\n",
732
+ "y = [1, 2, 3, 4, 5]\n",
733
+ "\n",
734
+ "# plot all types of plots in matplotlib\n",
735
+ "plt.plot(x, y, label=\"plot\")\n",
736
+ "plt.scatter(x, y, label=\"scatter\")\n",
737
+ "plt.bar(x, y, label=\"bar\")\n",
738
+ "plt.hist(x, y, label=\"hist\")\n",
739
+ "plt.boxplot(x, y, label=\"boxplot\")\n",
740
+ "plt.show()\n",
741
+ "<output>: import matplotlib.pyplot as plt\n",
742
+ "\n",
743
+ "x = [1, 2, 3, 4, 5]\n"
744
+ ]
745
+ }
746
+ ],
747
+ "source": [
748
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
749
+ "print(generate_response(prompt))"
750
+ ]
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "execution_count": 32,
755
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
756
+ "metadata": {},
757
+ "outputs": [
758
+ {
759
+ "name": "stdout",
760
+ "output_type": "stream",
761
+ "text": [
762
+ "import pandas as pd\n",
763
+ "\n",
764
+ "\n",
765
+ "df = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
766
+ " 'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
767
+ "def g(df):\n",
768
+ " df['TIME'] = pd.to_datetime(df['TIME'])\n",
769
+ " df['RANK'] = df.groupby('ID')['TIME'].rank(ascending=True)\n",
770
+ " return df\n",
771
+ "\n",
772
+ "df = g(df.copy())\n",
773
+ "print(df)\n",
774
+ "<output>: import pandas as pd\n",
775
+ "\n",
776
+ "\n",
777
+ "df = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
778
+ " 'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
779
+ "def g(df):\n",
780
+ " df['TIME'] = pd.to_datetime(df['TIME'])\n"
781
+ ]
782
+ }
783
+ ],
784
+ "source": [
785
+ "prompt = \"\"\"Problem:\n",
786
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
787
+ "ID TIME\n",
788
+ "01 2018-07-11 11:12:20\n",
789
+ "01 2018-07-12 12:00:23\n",
790
+ "01 2018-07-13 12:00:00\n",
791
+ "02 2019-09-11 11:00:00\n",
792
+ "02 2019-09-12 12:00:00\n",
793
+ "\n",
794
+ "\n",
795
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
796
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
797
+ "\n",
798
+ "\n",
799
+ "but get an error:\n",
800
+ "'NoneType' object is not callable\n",
801
+ "\n",
802
+ "\n",
803
+ "If i replace datetime to numbers, it works.... any solutions?\n",
804
+ "\"\"\"\n",
805
+ "print(generate_response(prompt))"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": 33,
811
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
812
+ "metadata": {},
813
+ "outputs": [
814
+ {
815
+ "name": "stdout",
816
+ "output_type": "stream",
817
+ "text": [
818
+ "import pandas as pd\n",
819
+ "\n",
820
+ "\n",
821
+ "index = range(14)\n",
822
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
823
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
824
+ "def g(df):\n",
825
+ " df['A'] = df['A'].replace(0, np.nan)\n",
826
+ " df['A'] = df['A'].fillna(method='ffill')\n",
827
+ " df['A'] = df['A'].fillna(method='bfill')\n",
828
+ " return df\n",
829
+ "\n",
830
+ "df = g(df.copy())\n",
831
+ "result = df\n",
832
+ "print(result)\n",
833
+ "<output>: import pandas as pd\n",
834
+ "import numpy as np\n",
835
+ "\n",
836
+ "\n",
837
+ "index = range(14)\n",
838
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
839
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
840
+ "def g(df):\n",
841
+ " df['A'] = df['A'].replace(0, np.nan)\n",
842
+ " df['A'] = df['A'].fillna(method='ffill')\n",
843
+ " df['A'] = df['A'].fillna(method='bfill')\n",
844
+ " return df\n",
845
+ "\n",
846
+ "df = g(df.copy())\n",
847
+ "result = df\n",
848
+ "print(result)\n",
849
+ "<output>: import pandas as pd\n",
850
+ "import numpy as np\n",
851
+ "\n",
852
+ "\n",
853
+ "index = range(14)\n",
854
+ "data = [1, 0, 0, 2, 0, 4\n"
855
+ ]
856
+ }
857
+ ],
858
+ "source": [
859
+ "prompt = \"\"\"Problem:\n",
860
+ "I have the following dataframe:\n",
861
+ "index = range(14)\n",
862
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
863
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
864
+ "\n",
865
+ "\n",
866
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
867
+ "The output should look like:\n",
868
+ " A\n",
869
+ "0 1\n",
870
+ "1 2\n",
871
+ "2 2\n",
872
+ "3 2\n",
873
+ "4 4\n",
874
+ "5 4\n",
875
+ "6 6\n",
876
+ "7 8\n",
877
+ "8 8\n",
878
+ "9 8\n",
879
+ "10 8\n",
880
+ "11 8\n",
881
+ "12 2\n",
882
+ "13 1\n",
883
+ "\"\"\"\n",
884
+ "\n",
885
+ "print(generate_response(prompt))"
886
+ ]
887
+ },
888
+ {
889
+ "cell_type": "code",
890
+ "execution_count": 34,
891
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
892
+ "metadata": {},
893
+ "outputs": [
894
+ {
895
+ "name": "stdout",
896
+ "output_type": "stream",
897
+ "text": [
898
+ "Problem:\n",
899
+ "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n",
900
+ "import pandas as pd\n",
901
+ "import numpy as np\n",
902
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
903
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
904
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
905
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
906
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
907
+ "\n",
908
+ "\n",
909
+ " users keywords_0 keywords_1 keywords_2 keywords_3\n",
910
+ "0 Hu Tao a d NaN f\n",
911
+ "1 Zhongli NaN e NaN NaN\n",
912
+ "2 Xingqiu c NaN b g\n",
913
+ "\n",
914
+ "\n",
915
+ "Want to accomplish the following:\n",
916
+ " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n",
917
+ "0 Hu Tao a d NaN f a-d-f\n",
918
+ "1 Zhongli NaN e NaN NaN e\n",
919
+ "2 Xingqiu c NaN b g c-b-g\n",
920
+ "\n",
921
+ "\n",
922
+ "Pseudo code:\n",
923
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
924
+ "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n",
925
+ "\n",
926
+ "\n",
927
+ "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n"
928
+ ]
929
+ }
930
+ ],
931
+ "source": [
932
+ "print(data[5]['question'])"
933
+ ]
934
+ },
935
+ {
936
+ "cell_type": "code",
937
+ "execution_count": 35,
938
+ "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1",
939
+ "metadata": {},
940
+ "outputs": [
941
+ {
942
+ "name": "stdout",
943
+ "output_type": "stream",
944
+ "text": [
945
+ "import pandas as pd\n",
946
+ "import numpy as np\n",
947
+ "\n",
948
+ "\n",
949
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
950
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
951
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
952
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
953
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
954
+ "import numpy as np\n",
955
+ "def g(df):\n",
956
+ " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n",
957
+ " return df\n",
958
+ "\n",
959
+ "df = g(df.copy())\n",
960
+ "result = df\n",
961
+ "print(result)\n"
962
+ ]
963
+ }
964
+ ],
965
+ "source": [
966
+ "print(data[5]['answer'])"
967
+ ]
968
+ },
969
+ {
970
+ "cell_type": "code",
971
+ "execution_count": 36,
972
+ "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0",
973
+ "metadata": {},
974
+ "outputs": [
975
+ {
976
+ "name": "stdout",
977
+ "output_type": "stream",
978
+ "text": [
979
+ "import pandas as pd\n",
980
+ "import numpy as np\n",
981
+ "\n",
982
+ "\n",
983
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
984
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
985
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
986
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
987
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
988
+ "\n",
989
+ "\n",
990
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
991
+ "def f(cols):\n",
992
+ " return \"-\".join(cols)\n",
993
+ "\n",
994
+ "\n",
995
+ "df[\"keywords_all\"] = df.apply(lambda row: f(row[cols]), axis=1)\n",
996
+ "\n",
997
+ "\n",
998
+ "print(df)\n",
999
+ "<output>: import pandas as pd\n",
1000
+ "import numpy as np\n",
1001
+ "\n",
1002
+ "\n",
1003
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
1004
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
1005
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
1006
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
1007
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
1008
+ "\n",
1009
+ "\n",
1010
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
1011
+ "def f(cols):\n",
1012
+ " return \"-\".join(cols)\n",
1013
+ "\n",
1014
+ "\n",
1015
+ "df[\"keywords_all\"] = df.apply(lambda\n"
1016
+ ]
1017
+ }
1018
+ ],
1019
+ "source": [
1020
+ "prompt = data[5]['question']\n",
1021
+ "print(generate_response(prompt))"
1022
+ ]
1023
+ },
1024
+ {
1025
+ "cell_type": "code",
1026
+ "execution_count": 37,
1027
+ "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545",
1028
+ "metadata": {},
1029
+ "outputs": [
1030
+ {
1031
+ "name": "stdout",
1032
+ "output_type": "stream",
1033
+ "text": [
1034
+ "import pandas as pd\n",
1035
+ "\n",
1036
+ "\n",
1037
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1038
+ " 'B': [None, 2, 3, 4, 5],\n",
1039
+ " 'C': [1, 2, 3, 4, 5]})\n",
1040
+ "df = df.dropna()\n",
1041
+ "print(df)\n",
1042
+ "<output>: import pandas as pd\n",
1043
+ "\n",
1044
+ "\n",
1045
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1046
+ " 'B': [None, 2, 3, 4, 5],\n",
1047
+ " 'C': [1, 2, 3, 4, 5]})\n",
1048
+ "df = df.dropna()\n",
1049
+ "print(df)\n",
1050
+ "<output>: import pandas as pd\n",
1051
+ "\n",
1052
+ "\n",
1053
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1054
+ " 'B': [None, 2, 3, 4, 5],\n",
1055
+ " 'C': [1, 2, 3, 4, 5]})\n",
1056
+ "df = df.dropna()\n",
1057
+ "print(df)\n",
1058
+ "<output>: import pandas as pd\n",
1059
+ "\n",
1060
+ "\n",
1061
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1062
+ " 'B': [None, 2, 3, 4, 5],\n",
1063
+ " 'C': [1, 2, 3, 4, 5]})\n",
1064
+ "df = df.dropna()\n",
1065
+ "print(df)\n",
1066
+ "<output>: import pandas as pd\n",
1067
+ "\n",
1068
+ "\n",
1069
+ "df = pd.DataFrame({'A': [1, 2, None,\n"
1070
+ ]
1071
+ }
1072
+ ],
1073
+ "source": [
1074
+ "prompt = \"How to remove null valued rows?\"\n",
1075
+ "print(generate_response(prompt))"
1076
+ ]
1077
+ },
1078
+ {
1079
+ "cell_type": "code",
1080
+ "execution_count": 39,
1081
+ "id": "5ca085f6-30fc-4e50-a436-673f3baa75af",
1082
+ "metadata": {},
1083
+ "outputs": [
1084
+ {
1085
+ "name": "stdout",
1086
+ "output_type": "stream",
1087
+ "text": [
1088
+ "import numpy as np\n",
1089
+ "import pandas as pd\n",
1090
+ "import matplotlib.pyplot as plt\n",
1091
+ "import seaborn as sns\n",
1092
+ "import sklearn\n",
1093
+ "from sklearn.linear_model import LogisticRegression\n",
1094
+ "from sklearn.model_selection import train_test_split\n",
1095
+ "\n",
1096
+ "\n",
1097
+ "X, y = load_data()\n",
1098
+ "\n",
1099
+ "# Split the data into training and test sets\n",
1100
+ "# Split the data into training and test sets\n",
1101
+ "# Split the data into training and test sets\n",
1102
+ "# Train a Logistic Regression model on the training data\n",
1103
+ "# Print the accuracy of the model on the test data\n",
1104
+ "# SOLUTION START\n",
1105
+ "\n",
1106
+ "<output>: import numpy as np\n",
1107
+ "import pandas as pd\n",
1108
+ "import matplotlib.pyplot as plt\n",
1109
+ "import seaborn as sns\n",
1110
+ "import sklearn\n",
1111
+ "from sklearn.linear_model import LogisticRegression\n",
1112
+ "from sklearn.model_selection import train_test_split\n",
1113
+ "\n",
1114
+ "\n",
1115
+ "X, y = load_data()\n",
1116
+ "\n",
1117
+ "# Split the data into training and test sets\n",
1118
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
1119
+ "# Train a Logistic Regression model on the training data\n",
1120
+ "model = LogisticRegression()\n",
1121
+ "model.fit(X_train, y_train)\n",
1122
+ "# Print the accuracy of the model on the test data\n",
1123
+ "print(model.score(X_test, y_test))\n",
1124
+ "<output>: import numpy as np\n",
1125
+ "import pandas as pd\n",
1126
+ "import matplotlib.pyplot as plt\n",
1127
+ "import seaborn as sns\n",
1128
+ "import sklearn\n",
1129
+ "from sklearn.linear_model import LogisticRegression\n",
1130
+ "from sklearn.model_selection import train_test_split\n"
1131
+ ]
1132
+ }
1133
+ ],
1134
+ "source": [
1135
+ "prompt = \"How to train a Logistic Regression model?\"\n",
1136
+ "print(generate_response(prompt))"
1137
+ ]
1138
+ },
1139
+ {
1140
+ "cell_type": "code",
1141
+ "execution_count": null,
1142
+ "id": "146527ff-5d37-42c7-b06b-45c1aa224d17",
1143
+ "metadata": {},
1144
+ "outputs": [],
1145
+ "source": []
1146
+ },
1147
+ {
1148
+ "cell_type": "code",
1149
+ "execution_count": null,
1150
+ "id": "84f671f3-7bd6-4a7c-81e9-758052b424cf",
1151
+ "metadata": {},
1152
+ "outputs": [],
1153
+ "source": []
1154
+ }
1155
+ ],
1156
+ "metadata": {
1157
+ "kernelspec": {
1158
+ "display_name": "Python 3 (ipykernel)",
1159
+ "language": "python",
1160
+ "name": "python3"
1161
+ },
1162
+ "language_info": {
1163
+ "codemirror_mode": {
1164
+ "name": "ipython",
1165
+ "version": 3
1166
+ },
1167
+ "file_extension": ".py",
1168
+ "mimetype": "text/x-python",
1169
+ "name": "python",
1170
+ "nbconvert_exporter": "python",
1171
+ "pygments_lexer": "ipython3",
1172
+ "version": "3.10.13"
1173
+ }
1174
+ },
1175
+ "nbformat": 4,
1176
+ "nbformat_minor": 5
1177
+ }
.ipynb_checkpoints/Test-mgc-f-checkpoint.ipynb ADDED
@@ -0,0 +1,866 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "\n",
14
+ "===================================BUG REPORT===================================\n",
15
+ "Welcome to bitsandbytes. For bug reports, please run\n",
16
+ "\n",
17
+ "python -m bitsandbytes\n",
18
+ "\n",
19
+ " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
20
+ "================================================================================\n",
21
+ "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n",
22
+ "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
23
+ "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
24
+ "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
25
+ "CUDA SETUP: Detected CUDA version 121\n",
26
+ "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib'), PosixPath('/usr/local/nvidia/lib64')}\n",
34
+ " warn(msg)\n",
35
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
36
+ " warn(msg)\n",
37
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n",
38
+ " warn(msg)\n",
39
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('https'), PosixPath('//g.notebooksg.jarvislabs.net')}\n",
40
+ " warn(msg)\n",
41
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
42
+ " warn(msg)\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "import json\n",
48
+ "import os\n",
49
+ "from pprint import pprint\n",
50
+ "\n",
51
+ "import bitsandbytes as bnb\n",
52
+ "import pandas as pd\n",
53
+ "import torch\n",
54
+ "import torch.nn as nn\n",
55
+ "\n",
56
+ "import transformers\n",
57
+ "from datasets import load_dataset\n",
58
+ "from huggingface_hub import notebook_login\n",
59
+ "from peft import (\n",
60
+ " LoraConfig,\n",
61
+ " PeftConfig,\n",
62
+ " PeftModel,\n",
63
+ " get_peft_model,\n",
64
+ " prepare_model_for_kbit_training,\n",
65
+ ")\n",
66
+ "from transformers import (\n",
67
+ " AutoConfig,\n",
68
+ " AutoModelForCausalLM,\n",
69
+ " AutoTokenizer,\n",
70
+ " BitsAndBytesConfig,\n",
71
+ ")\n",
72
+ "import warnings\n",
73
+ "warnings.filterwarnings(\"ignore\")\n",
74
+ "\n",
75
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 2,
81
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "2316598db91e443f8c4e1cc615a9507f",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ }
98
+ ],
99
+ "source": [
100
+ "notebook_login()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 3,
106
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
111
+ " data = json.load(json_file)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 4,
117
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "{'answer': 'import pandas as pd\\n'\n",
125
+ " '\\n'\n",
126
+ " '\\n'\n",
127
+ " 'index = range(14)\\n'\n",
128
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
129
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
130
+ " 'def g(df):\\n'\n",
131
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
132
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
133
+ " ' for i in range(len(df)):\\n'\n",
134
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
135
+ " ' return df\\n'\n",
136
+ " '\\n'\n",
137
+ " 'df = g(df.copy())\\n'\n",
138
+ " 'result = df\\n'\n",
139
+ " 'print(result)',\n",
140
+ " 'question': 'Problem:\\n'\n",
141
+ " 'I have the following dataframe:\\n'\n",
142
+ " 'index = range(14)\\n'\n",
143
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
144
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
145
+ " '\\n'\n",
146
+ " '\\n'\n",
147
+ " 'How can I fill the zeros with the maximun between previous and '\n",
148
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
149
+ " 'not just for \"NaN\"?. \\n'\n",
150
+ " 'The output should look like:\\n'\n",
151
+ " ' A\\n'\n",
152
+ " '0 1\\n'\n",
153
+ " '1 2\\n'\n",
154
+ " '2 2\\n'\n",
155
+ " '3 2\\n'\n",
156
+ " '4 4\\n'\n",
157
+ " '5 4\\n'\n",
158
+ " '6 6\\n'\n",
159
+ " '7 8\\n'\n",
160
+ " '8 8\\n'\n",
161
+ " '9 8\\n'\n",
162
+ " '10 8\\n'\n",
163
+ " '11 8\\n'\n",
164
+ " '12 2\\n'\n",
165
+ " '13 1'}\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "pprint(data[0])"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 5,
176
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
181
+ " json.dump(data, f)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>question</th>\n",
212
+ " <th>answer</th>\n",
213
+ " </tr>\n",
214
+ " </thead>\n",
215
+ " <tbody>\n",
216
+ " <tr>\n",
217
+ " <th>0</th>\n",
218
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
219
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
220
+ " </tr>\n",
221
+ " <tr>\n",
222
+ " <th>1</th>\n",
223
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
224
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>2</th>\n",
228
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
229
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>3</th>\n",
233
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
234
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
239
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
240
+ " </tr>\n",
241
+ " </tbody>\n",
242
+ "</table>\n",
243
+ "</div>"
244
+ ],
245
+ "text/plain": [
246
+ " question \\\n",
247
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
248
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
249
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
250
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
251
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
252
+ "\n",
253
+ " answer \n",
254
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
255
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
256
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
257
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
258
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
259
+ ]
260
+ },
261
+ "execution_count": 6,
262
+ "metadata": {},
263
+ "output_type": "execute_result"
264
+ }
265
+ ],
266
+ "source": [
267
+ "pd.DataFrame(data).head()"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 7,
273
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "bnb_config = BitsAndBytesConfig(\n",
278
+ " load_in_4bit=True,\n",
279
+ " bnb_4bit_use_double_quant=True,\n",
280
+ " bnb_4bit_quant_type=\"nf4\",\n",
281
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
282
+ ")"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 8,
288
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "data": {
293
+ "application/vnd.jupyter.widget-view+json": {
294
+ "model_id": "7e1406ca2f5f4c0dbf0a581edebc9a6b",
295
+ "version_major": 2,
296
+ "version_minor": 0
297
+ },
298
+ "text/plain": [
299
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
300
+ ]
301
+ },
302
+ "metadata": {},
303
+ "output_type": "display_data"
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at deepseek-ai/deepseek-coder-6.7b-instruct and are newly initialized: ['model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq']\n",
310
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
311
+ ]
312
+ },
313
+ {
314
+ "data": {
315
+ "application/vnd.jupyter.widget-view+json": {
316
+ "model_id": "2b9911223be04aeabbc44875a5929afa",
317
+ "version_major": 2,
318
+ "version_minor": 0
319
+ },
320
+ "text/plain": [
321
+ "adapter_model.bin: 0%| | 0.00/33.6M [00:00<?, ?B/s]"
322
+ ]
323
+ },
324
+ "metadata": {},
325
+ "output_type": "display_data"
326
+ }
327
+ ],
328
+ "source": [
329
+ "PEFT_MODEL = \"shanjay/ds-dsc-v4\"\n",
330
+ "\n",
331
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
332
+ "model = AutoModelForCausalLM.from_pretrained(\n",
333
+ " config.base_model_name_or_path,\n",
334
+ " return_dict=True,\n",
335
+ " quantization_config=bnb_config,\n",
336
+ " device_map=\"auto\",\n",
337
+ " trust_remote_code=True,\n",
338
+ ")\n",
339
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
340
+ "tokenizer.pad_token = tokenizer.eos_token\n",
341
+ "\n",
342
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 9,
348
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "generation_config = model.generation_config\n",
353
+ "generation_config.max_new_tokens = 200\n",
354
+ "generation_config.temperature = 0.7\n",
355
+ "generation_config.top_p = 0.7\n",
356
+ "generation_config.num_return_sequences = 1\n",
357
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
358
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 10,
364
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "DEVICE = \"cuda:0\""
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 11,
374
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
375
+ "metadata": {},
376
+ "outputs": [
377
+ {
378
+ "name": "stdout",
379
+ "output_type": "stream",
380
+ "text": [
381
+ "<instruction>: How can I create a dataframe?\n",
382
+ "<output>: import pandas as pd\n",
383
+ "\n",
384
+ "\n",
385
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
386
+ "print(df)\n",
387
+ " A B\n",
388
+ "0 1 4\n",
389
+ "1 2 5\n",
390
+ "2 3 6\n",
391
+ "<output>: import pandas as pd\n",
392
+ "\n",
393
+ "\n",
394
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
395
+ "print(df)\n",
396
+ " A B\n",
397
+ "0 1 4\n",
398
+ "1 2 5\n",
399
+ "2 3 6\n",
400
+ "<output>: import pandas as pd\n",
401
+ "\n",
402
+ "\n",
403
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
404
+ "print(df)\n",
405
+ " A\n",
406
+ "CPU times: user 26.8 s, sys: 346 ms, total: 27.1 s\n",
407
+ "Wall time: 27.2 s\n"
408
+ ]
409
+ }
410
+ ],
411
+ "source": [
412
+ "%%time\n",
413
+ "prompt = f\"\"\"\n",
414
+ "<instruction>: How can I create a dataframe?\n",
415
+ "<output>:\n",
416
+ "\"\"\".strip()\n",
417
+ "\n",
418
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
419
+ "with torch.inference_mode():\n",
420
+ " outputs = model.generate(\n",
421
+ " input_ids=encoding.input_ids,\n",
422
+ " attention_mask=encoding.attention_mask,\n",
423
+ " generation_config=generation_config,\n",
424
+ " )\n",
425
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 12,
431
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "def generate_response(question: str) -> str:\n",
436
+ " prompt = f\"\"\"\n",
437
+ "<instruction>: {question}\n",
438
+ "<output>:\n",
439
+ "\"\"\".strip()\n",
440
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
441
+ " with torch.inference_mode():\n",
442
+ " outputs = model.generate(\n",
443
+ " input_ids=encoding.input_ids,\n",
444
+ " attention_mask=encoding.attention_mask,\n",
445
+ " generation_config=generation_config,\n",
446
+ " )\n",
447
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
448
+ "\n",
449
+ " assistant_start = \"<output>:\"\n",
450
+ " response_start = response.find(assistant_start)\n",
451
+ " return response[response_start + len(assistant_start) :].strip()"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 13,
457
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
458
+ "metadata": {},
459
+ "outputs": [
460
+ {
461
+ "name": "stdout",
462
+ "output_type": "stream",
463
+ "text": [
464
+ "import pandas as pd\n",
465
+ "\n",
466
+ "\n",
467
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
468
+ "print(df)\n",
469
+ " A B\n",
470
+ "0 1 4\n",
471
+ "1 2 5\n",
472
+ "2 3 6\n",
473
+ "<output>: import pandas as pd\n",
474
+ "\n",
475
+ "\n",
476
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
477
+ "print(df)\n",
478
+ " A B\n",
479
+ "0 1 4\n",
480
+ "1 2 5\n",
481
+ "2 3 6\n",
482
+ "<output>: import pandas as pd\n",
483
+ "\n",
484
+ "\n",
485
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
486
+ "print(df)\n",
487
+ " A\n"
488
+ ]
489
+ }
490
+ ],
491
+ "source": [
492
+ "prompt = \"How can I create a dataframe?\"\n",
493
+ "print(generate_response(prompt))"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 14,
499
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
500
+ "metadata": {},
501
+ "outputs": [
502
+ {
503
+ "name": "stdout",
504
+ "output_type": "stream",
505
+ "text": [
506
+ "import pandas as pd\n",
507
+ "\n",
508
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
509
+ "df2 = pd.DataFrame({'C': [7, 8, 9], 'D': [10, 11, 12]})\n",
510
+ "\n",
511
+ "# merge df1 and df2\n",
512
+ "result = ...\n",
513
+ "\n",
514
+ "print(result)\n",
515
+ "\n",
516
+ "# Expected output\n",
517
+ "# A B C D\n",
518
+ "# 0 1 4 7 10\n",
519
+ "# 1 2 5 8 11\n",
520
+ "# 2 3 6 9 12\n",
521
+ "<output>: import pandas as pd\n",
522
+ "\n",
523
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]\n"
524
+ ]
525
+ }
526
+ ],
527
+ "source": [
528
+ "prompt = \"How to merge two dataframes?\"\n",
529
+ "print(generate_response(prompt))"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 15,
535
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
536
+ "metadata": {},
537
+ "outputs": [
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "import pandas as pd\n",
543
+ "\n",
544
+ "\n",
545
+ "name=['joy','shan']\n",
546
+ "roll_no=[1,2]\n",
547
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
548
+ "print(df)\n",
549
+ "\n",
550
+ "\n",
551
+ " name roll_no\n",
552
+ "0 joy 1\n",
553
+ "1 shan 2\n",
554
+ "<output>: import pandas as pd\n",
555
+ "\n",
556
+ "\n",
557
+ "name=['joy','shan']\n",
558
+ "roll_no=[1,2]\n",
559
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
560
+ "print(df)\n",
561
+ "\n",
562
+ "\n",
563
+ " name roll_no\n",
564
+ "0 joy 1\n",
565
+ "1 shan 2\n",
566
+ "<output>: import pandas as pd\n",
567
+ "\n",
568
+ "\n",
569
+ "name=['joy','shan']\n",
570
+ "roll_no=[1,2]\n",
571
+ "df = pd.DataFrame({\n"
572
+ ]
573
+ }
574
+ ],
575
+ "source": [
576
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
577
+ "print(generate_response(prompt))"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 16,
583
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
584
+ "metadata": {},
585
+ "outputs": [
586
+ {
587
+ "name": "stdout",
588
+ "output_type": "stream",
589
+ "text": [
590
+ "import matplotlib.pyplot as plt\n",
591
+ "\n",
592
+ "x = range(10)\n",
593
+ "y = range(10)\n",
594
+ "\n",
595
+ "plt.plot(x, y, label='line')\n",
596
+ "plt.scatter(x, y, label='scatter')\n",
597
+ "plt.bar(x, y, label='bar')\n",
598
+ "plt.hist(x, y, label='hist')\n",
599
+ "plt.legend()\n",
600
+ "plt.show()\n",
601
+ "<output>: import matplotlib.pyplot as plt\n",
602
+ "\n",
603
+ "x = range(10)\n",
604
+ "y = range(10)\n",
605
+ "\n",
606
+ "plt.plot(x, y, label='line')\n",
607
+ "plt.scatter(x, y, label='scatter')\n",
608
+ "plt.bar(x, y, label='bar')\n",
609
+ "plt.hist(x, y, label='hist')\n",
610
+ "pl\n"
611
+ ]
612
+ }
613
+ ],
614
+ "source": [
615
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
616
+ "print(generate_response(prompt))"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 19,
622
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "name": "stdout",
627
+ "output_type": "stream",
628
+ "text": [
629
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
630
+ ]
631
+ }
632
+ ],
633
+ "source": [
634
+ "prompt = \"\"\"Problem:\n",
635
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
636
+ "ID TIME\n",
637
+ "01 2018-07-11 11:12:20\n",
638
+ "01 2018-07-12 12:00:23\n",
639
+ "01 2018-07-13 12:00:00\n",
640
+ "02 2019-09-11 11:00:00\n",
641
+ "02 2019-09-12 12:00:00\n",
642
+ "\n",
643
+ "\n",
644
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
645
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
646
+ "\n",
647
+ "\n",
648
+ "but get an error:\n",
649
+ "'NoneType' object is not callable\n",
650
+ "\n",
651
+ "\n",
652
+ "If i replace datetime to numbers, it works.... any solutions?\n",
653
+ "\"\"\"\n",
654
+ "print(generate_response(prompt))"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": 20,
660
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
661
+ "metadata": {},
662
+ "outputs": [
663
+ {
664
+ "name": "stdout",
665
+ "output_type": "stream",
666
+ "text": [
667
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
668
+ ]
669
+ }
670
+ ],
671
+ "source": [
672
+ "prompt = \"\"\"Problem:\n",
673
+ "I have the following dataframe:\n",
674
+ "index = range(14)\n",
675
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
676
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
677
+ "\n",
678
+ "\n",
679
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
680
+ "The output should look like:\n",
681
+ " A\n",
682
+ "0 1\n",
683
+ "1 2\n",
684
+ "2 2\n",
685
+ "3 2\n",
686
+ "4 4\n",
687
+ "5 4\n",
688
+ "6 6\n",
689
+ "7 8\n",
690
+ "8 8\n",
691
+ "9 8\n",
692
+ "10 8\n",
693
+ "11 8\n",
694
+ "12 2\n",
695
+ "13 1\n",
696
+ "\"\"\"\n",
697
+ "\n",
698
+ "print(generate_response(prompt))"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": 27,
704
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
705
+ "metadata": {},
706
+ "outputs": [
707
+ {
708
+ "name": "stdout",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "Problem:\n",
712
+ "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n",
713
+ "import pandas as pd\n",
714
+ "import numpy as np\n",
715
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
716
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
717
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
718
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
719
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
720
+ "\n",
721
+ "\n",
722
+ " users keywords_0 keywords_1 keywords_2 keywords_3\n",
723
+ "0 Hu Tao a d NaN f\n",
724
+ "1 Zhongli NaN e NaN NaN\n",
725
+ "2 Xingqiu c NaN b g\n",
726
+ "\n",
727
+ "\n",
728
+ "Want to accomplish the following:\n",
729
+ " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n",
730
+ "0 Hu Tao a d NaN f a-d-f\n",
731
+ "1 Zhongli NaN e NaN NaN e\n",
732
+ "2 Xingqiu c NaN b g c-b-g\n",
733
+ "\n",
734
+ "\n",
735
+ "Pseudo code:\n",
736
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
737
+ "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n",
738
+ "\n",
739
+ "\n",
740
+ "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n"
741
+ ]
742
+ }
743
+ ],
744
+ "source": [
745
+ "print(data[5]['question'])"
746
+ ]
747
+ },
748
+ {
749
+ "cell_type": "code",
750
+ "execution_count": 28,
751
+ "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1",
752
+ "metadata": {},
753
+ "outputs": [
754
+ {
755
+ "name": "stdout",
756
+ "output_type": "stream",
757
+ "text": [
758
+ "import pandas as pd\n",
759
+ "import numpy as np\n",
760
+ "\n",
761
+ "\n",
762
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
763
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
764
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
765
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
766
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
767
+ "import numpy as np\n",
768
+ "def g(df):\n",
769
+ " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n",
770
+ " return df\n",
771
+ "\n",
772
+ "df = g(df.copy())\n",
773
+ "result = df\n",
774
+ "print(result)\n"
775
+ ]
776
+ }
777
+ ],
778
+ "source": [
779
+ "print(data[5]['answer'])"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": 29,
785
+ "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0",
786
+ "metadata": {},
787
+ "outputs": [
788
+ {
789
+ "name": "stdout",
790
+ "output_type": "stream",
791
+ "text": [
792
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
793
+ ]
794
+ }
795
+ ],
796
+ "source": [
797
+ "prompt = data[5]['question']\n",
798
+ "print(generate_response(prompt))"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": 30,
804
+ "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545",
805
+ "metadata": {},
806
+ "outputs": [
807
+ {
808
+ "name": "stdout",
809
+ "output_type": "stream",
810
+ "text": [
811
+ "import pandas as pd\n",
812
+ "\n",
813
+ "\n",
814
+ "df = pd.DataFrame({'A': [1, 2, 3, None, 5],\n",
815
+ " 'B': [1, 2, 3, None, 5],\n",
816
+ " 'C': [1, 2, 3, None, 5],\n",
817
+ " 'D': [1, 2, 3, None, 5],\n",
818
+ " 'E': [1, 2, 3, None, 5]})\n",
819
+ "\n",
820
+ "df = df.dropna(how='all')\n",
821
+ "print(df)\n",
822
+ "<output>: A B C D E\n",
823
+ "0 1 1 1 1 1\n",
824
+ "1 2 2 2 2 2\n",
825
+ "2 3 3 3 3 3\n",
826
+ "4 5 5 5 5 5\n",
827
+ "<output>: import pand\n"
828
+ ]
829
+ }
830
+ ],
831
+ "source": [
832
+ "prompt = \"How to remove null valued rows?\"\n",
833
+ "print(generate_response(prompt))"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": null,
839
+ "id": "5ca085f6-30fc-4e50-a436-673f3baa75af",
840
+ "metadata": {},
841
+ "outputs": [],
842
+ "source": []
843
+ }
844
+ ],
845
+ "metadata": {
846
+ "kernelspec": {
847
+ "display_name": "Python 3 (ipykernel)",
848
+ "language": "python",
849
+ "name": "python3"
850
+ },
851
+ "language_info": {
852
+ "codemirror_mode": {
853
+ "name": "ipython",
854
+ "version": 3
855
+ },
856
+ "file_extension": ".py",
857
+ "mimetype": "text/x-python",
858
+ "name": "python",
859
+ "nbconvert_exporter": "python",
860
+ "pygments_lexer": "ipython3",
861
+ "version": "3.10.13"
862
+ }
863
+ },
864
+ "nbformat": 4,
865
+ "nbformat_minor": 5
866
+ }
.ipynb_checkpoints/Testv3-checkpoint.ipynb ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "\n",
14
+ "===================================BUG REPORT===================================\n",
15
+ "Welcome to bitsandbytes. For bug reports, please run\n",
16
+ "\n",
17
+ "python -m bitsandbytes\n",
18
+ "\n",
19
+ " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
20
+ "================================================================================\n",
21
+ "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n",
22
+ "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
23
+ "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
24
+ "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
25
+ "CUDA SETUP: Detected CUDA version 121\n",
26
+ "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib'), PosixPath('/usr/local/nvidia/lib64')}\n",
34
+ " warn(msg)\n",
35
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
36
+ " warn(msg)\n",
37
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n",
38
+ " warn(msg)\n",
39
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('https'), PosixPath('//g.notebooksg.jarvislabs.net')}\n",
40
+ " warn(msg)\n",
41
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
42
+ " warn(msg)\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "import json\n",
48
+ "import os\n",
49
+ "from pprint import pprint\n",
50
+ "\n",
51
+ "import bitsandbytes as bnb\n",
52
+ "import pandas as pd\n",
53
+ "import torch\n",
54
+ "import torch.nn as nn\n",
55
+ "\n",
56
+ "import transformers\n",
57
+ "from datasets import load_dataset\n",
58
+ "from huggingface_hub import notebook_login\n",
59
+ "from peft import (\n",
60
+ " LoraConfig,\n",
61
+ " PeftConfig,\n",
62
+ " PeftModel,\n",
63
+ " get_peft_model,\n",
64
+ " prepare_model_for_kbit_training,\n",
65
+ ")\n",
66
+ "from transformers import (\n",
67
+ " AutoConfig,\n",
68
+ " AutoModelForCausalLM,\n",
69
+ " AutoTokenizer,\n",
70
+ " BitsAndBytesConfig,\n",
71
+ ")\n",
72
+ "import warnings\n",
73
+ "warnings.filterwarnings(\"ignore\")\n",
74
+ "\n",
75
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 2,
81
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "2316598db91e443f8c4e1cc615a9507f",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ }
98
+ ],
99
+ "source": [
100
+ "notebook_login()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 3,
106
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
111
+ " data = json.load(json_file)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 4,
117
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "{'answer': 'import pandas as pd\\n'\n",
125
+ " '\\n'\n",
126
+ " '\\n'\n",
127
+ " 'index = range(14)\\n'\n",
128
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
129
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
130
+ " 'def g(df):\\n'\n",
131
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
132
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
133
+ " ' for i in range(len(df)):\\n'\n",
134
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
135
+ " ' return df\\n'\n",
136
+ " '\\n'\n",
137
+ " 'df = g(df.copy())\\n'\n",
138
+ " 'result = df\\n'\n",
139
+ " 'print(result)',\n",
140
+ " 'question': 'Problem:\\n'\n",
141
+ " 'I have the following dataframe:\\n'\n",
142
+ " 'index = range(14)\\n'\n",
143
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
144
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
145
+ " '\\n'\n",
146
+ " '\\n'\n",
147
+ " 'How can I fill the zeros with the maximun between previous and '\n",
148
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
149
+ " 'not just for \"NaN\"?. \\n'\n",
150
+ " 'The output should look like:\\n'\n",
151
+ " ' A\\n'\n",
152
+ " '0 1\\n'\n",
153
+ " '1 2\\n'\n",
154
+ " '2 2\\n'\n",
155
+ " '3 2\\n'\n",
156
+ " '4 4\\n'\n",
157
+ " '5 4\\n'\n",
158
+ " '6 6\\n'\n",
159
+ " '7 8\\n'\n",
160
+ " '8 8\\n'\n",
161
+ " '9 8\\n'\n",
162
+ " '10 8\\n'\n",
163
+ " '11 8\\n'\n",
164
+ " '12 2\\n'\n",
165
+ " '13 1'}\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "pprint(data[0])"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 5,
176
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
181
+ " json.dump(data, f)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>question</th>\n",
212
+ " <th>answer</th>\n",
213
+ " </tr>\n",
214
+ " </thead>\n",
215
+ " <tbody>\n",
216
+ " <tr>\n",
217
+ " <th>0</th>\n",
218
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
219
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
220
+ " </tr>\n",
221
+ " <tr>\n",
222
+ " <th>1</th>\n",
223
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
224
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>2</th>\n",
228
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
229
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>3</th>\n",
233
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
234
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
239
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
240
+ " </tr>\n",
241
+ " </tbody>\n",
242
+ "</table>\n",
243
+ "</div>"
244
+ ],
245
+ "text/plain": [
246
+ " question \\\n",
247
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
248
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
249
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
250
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
251
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
252
+ "\n",
253
+ " answer \n",
254
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
255
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
256
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
257
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
258
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
259
+ ]
260
+ },
261
+ "execution_count": 6,
262
+ "metadata": {},
263
+ "output_type": "execute_result"
264
+ }
265
+ ],
266
+ "source": [
267
+ "pd.DataFrame(data).head()"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 7,
273
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "bnb_config = BitsAndBytesConfig(\n",
278
+ " load_in_4bit=True,\n",
279
+ " bnb_4bit_use_double_quant=True,\n",
280
+ " bnb_4bit_quant_type=\"nf4\",\n",
281
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
282
+ ")"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 8,
288
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "data": {
293
+ "application/vnd.jupyter.widget-view+json": {
294
+ "model_id": "7e1406ca2f5f4c0dbf0a581edebc9a6b",
295
+ "version_major": 2,
296
+ "version_minor": 0
297
+ },
298
+ "text/plain": [
299
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
300
+ ]
301
+ },
302
+ "metadata": {},
303
+ "output_type": "display_data"
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at deepseek-ai/deepseek-coder-6.7b-instruct and are newly initialized: ['model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq']\n",
310
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
311
+ ]
312
+ },
313
+ {
314
+ "data": {
315
+ "application/vnd.jupyter.widget-view+json": {
316
+ "model_id": "2b9911223be04aeabbc44875a5929afa",
317
+ "version_major": 2,
318
+ "version_minor": 0
319
+ },
320
+ "text/plain": [
321
+ "adapter_model.bin: 0%| | 0.00/33.6M [00:00<?, ?B/s]"
322
+ ]
323
+ },
324
+ "metadata": {},
325
+ "output_type": "display_data"
326
+ }
327
+ ],
328
+ "source": [
329
+ "PEFT_MODEL = \"shanjay/ds-dsc-v4\"\n",
330
+ "\n",
331
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
332
+ "model = AutoModelForCausalLM.from_pretrained(\n",
333
+ " config.base_model_name_or_path,\n",
334
+ " return_dict=True,\n",
335
+ " quantization_config=bnb_config,\n",
336
+ " device_map=\"auto\",\n",
337
+ " trust_remote_code=True,\n",
338
+ ")\n",
339
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
340
+ "tokenizer.pad_token = tokenizer.eos_token\n",
341
+ "\n",
342
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 9,
348
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "generation_config = model.generation_config\n",
353
+ "generation_config.max_new_tokens = 200\n",
354
+ "generation_config.temperature = 0.7\n",
355
+ "generation_config.top_p = 0.7\n",
356
+ "generation_config.num_return_sequences = 1\n",
357
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
358
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 10,
364
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "DEVICE = \"cuda:0\""
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 11,
374
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
375
+ "metadata": {},
376
+ "outputs": [
377
+ {
378
+ "name": "stdout",
379
+ "output_type": "stream",
380
+ "text": [
381
+ "<instruction>: How can I create a dataframe?\n",
382
+ "<output>: import pandas as pd\n",
383
+ "\n",
384
+ "\n",
385
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
386
+ "print(df)\n",
387
+ " A B\n",
388
+ "0 1 4\n",
389
+ "1 2 5\n",
390
+ "2 3 6\n",
391
+ "<output>: import pandas as pd\n",
392
+ "\n",
393
+ "\n",
394
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
395
+ "print(df)\n",
396
+ " A B\n",
397
+ "0 1 4\n",
398
+ "1 2 5\n",
399
+ "2 3 6\n",
400
+ "<output>: import pandas as pd\n",
401
+ "\n",
402
+ "\n",
403
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
404
+ "print(df)\n",
405
+ " A\n",
406
+ "CPU times: user 26.8 s, sys: 346 ms, total: 27.1 s\n",
407
+ "Wall time: 27.2 s\n"
408
+ ]
409
+ }
410
+ ],
411
+ "source": [
412
+ "%%time\n",
413
+ "prompt = f\"\"\"\n",
414
+ "<instruction>: How can I create a dataframe?\n",
415
+ "<output>:\n",
416
+ "\"\"\".strip()\n",
417
+ "\n",
418
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
419
+ "with torch.inference_mode():\n",
420
+ " outputs = model.generate(\n",
421
+ " input_ids=encoding.input_ids,\n",
422
+ " attention_mask=encoding.attention_mask,\n",
423
+ " generation_config=generation_config,\n",
424
+ " )\n",
425
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 12,
431
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "def generate_response(question: str) -> str:\n",
436
+ " prompt = f\"\"\"\n",
437
+ "<instruction>: {question}\n",
438
+ "<output>:\n",
439
+ "\"\"\".strip()\n",
440
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
441
+ " with torch.inference_mode():\n",
442
+ " outputs = model.generate(\n",
443
+ " input_ids=encoding.input_ids,\n",
444
+ " attention_mask=encoding.attention_mask,\n",
445
+ " generation_config=generation_config,\n",
446
+ " )\n",
447
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
448
+ "\n",
449
+ " assistant_start = \"<output>:\"\n",
450
+ " response_start = response.find(assistant_start)\n",
451
+ " return response[response_start + len(assistant_start) :].strip()"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 13,
457
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
458
+ "metadata": {},
459
+ "outputs": [
460
+ {
461
+ "name": "stdout",
462
+ "output_type": "stream",
463
+ "text": [
464
+ "import pandas as pd\n",
465
+ "\n",
466
+ "\n",
467
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
468
+ "print(df)\n",
469
+ " A B\n",
470
+ "0 1 4\n",
471
+ "1 2 5\n",
472
+ "2 3 6\n",
473
+ "<output>: import pandas as pd\n",
474
+ "\n",
475
+ "\n",
476
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
477
+ "print(df)\n",
478
+ " A B\n",
479
+ "0 1 4\n",
480
+ "1 2 5\n",
481
+ "2 3 6\n",
482
+ "<output>: import pandas as pd\n",
483
+ "\n",
484
+ "\n",
485
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
486
+ "print(df)\n",
487
+ " A\n"
488
+ ]
489
+ }
490
+ ],
491
+ "source": [
492
+ "prompt = \"How can I create a dataframe?\"\n",
493
+ "print(generate_response(prompt))"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 14,
499
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
500
+ "metadata": {},
501
+ "outputs": [
502
+ {
503
+ "name": "stdout",
504
+ "output_type": "stream",
505
+ "text": [
506
+ "import pandas as pd\n",
507
+ "\n",
508
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
509
+ "df2 = pd.DataFrame({'C': [7, 8, 9], 'D': [10, 11, 12]})\n",
510
+ "\n",
511
+ "# merge df1 and df2\n",
512
+ "result = ...\n",
513
+ "\n",
514
+ "print(result)\n",
515
+ "\n",
516
+ "# Expected output\n",
517
+ "# A B C D\n",
518
+ "# 0 1 4 7 10\n",
519
+ "# 1 2 5 8 11\n",
520
+ "# 2 3 6 9 12\n",
521
+ "<output>: import pandas as pd\n",
522
+ "\n",
523
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]\n"
524
+ ]
525
+ }
526
+ ],
527
+ "source": [
528
+ "prompt = \"How to merge two dataframes?\"\n",
529
+ "print(generate_response(prompt))"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 15,
535
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
536
+ "metadata": {},
537
+ "outputs": [
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "import pandas as pd\n",
543
+ "\n",
544
+ "\n",
545
+ "name=['joy','shan']\n",
546
+ "roll_no=[1,2]\n",
547
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
548
+ "print(df)\n",
549
+ "\n",
550
+ "\n",
551
+ " name roll_no\n",
552
+ "0 joy 1\n",
553
+ "1 shan 2\n",
554
+ "<output>: import pandas as pd\n",
555
+ "\n",
556
+ "\n",
557
+ "name=['joy','shan']\n",
558
+ "roll_no=[1,2]\n",
559
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
560
+ "print(df)\n",
561
+ "\n",
562
+ "\n",
563
+ " name roll_no\n",
564
+ "0 joy 1\n",
565
+ "1 shan 2\n",
566
+ "<output>: import pandas as pd\n",
567
+ "\n",
568
+ "\n",
569
+ "name=['joy','shan']\n",
570
+ "roll_no=[1,2]\n",
571
+ "df = pd.DataFrame({\n"
572
+ ]
573
+ }
574
+ ],
575
+ "source": [
576
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
577
+ "print(generate_response(prompt))"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 16,
583
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
584
+ "metadata": {},
585
+ "outputs": [
586
+ {
587
+ "name": "stdout",
588
+ "output_type": "stream",
589
+ "text": [
590
+ "import matplotlib.pyplot as plt\n",
591
+ "\n",
592
+ "x = range(10)\n",
593
+ "y = range(10)\n",
594
+ "\n",
595
+ "plt.plot(x, y, label='line')\n",
596
+ "plt.scatter(x, y, label='scatter')\n",
597
+ "plt.bar(x, y, label='bar')\n",
598
+ "plt.hist(x, y, label='hist')\n",
599
+ "plt.legend()\n",
600
+ "plt.show()\n",
601
+ "<output>: import matplotlib.pyplot as plt\n",
602
+ "\n",
603
+ "x = range(10)\n",
604
+ "y = range(10)\n",
605
+ "\n",
606
+ "plt.plot(x, y, label='line')\n",
607
+ "plt.scatter(x, y, label='scatter')\n",
608
+ "plt.bar(x, y, label='bar')\n",
609
+ "plt.hist(x, y, label='hist')\n",
610
+ "pl\n"
611
+ ]
612
+ }
613
+ ],
614
+ "source": [
615
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
616
+ "print(generate_response(prompt))"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 19,
622
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "name": "stdout",
627
+ "output_type": "stream",
628
+ "text": [
629
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
630
+ ]
631
+ }
632
+ ],
633
+ "source": [
634
+ "prompt = \"\"\"Problem:\n",
635
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
636
+ "ID TIME\n",
637
+ "01 2018-07-11 11:12:20\n",
638
+ "01 2018-07-12 12:00:23\n",
639
+ "01 2018-07-13 12:00:00\n",
640
+ "02 2019-09-11 11:00:00\n",
641
+ "02 2019-09-12 12:00:00\n",
642
+ "\n",
643
+ "\n",
644
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
645
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
646
+ "\n",
647
+ "\n",
648
+ "but get an error:\n",
649
+ "'NoneType' object is not callable\n",
650
+ "\n",
651
+ "\n",
652
+ "If i replace datetime to numbers, it works.... any solutions?\n",
653
+ "\"\"\"\n",
654
+ "print(generate_response(prompt))"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": 20,
660
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
661
+ "metadata": {},
662
+ "outputs": [
663
+ {
664
+ "name": "stdout",
665
+ "output_type": "stream",
666
+ "text": [
667
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
668
+ ]
669
+ }
670
+ ],
671
+ "source": [
672
+ "prompt = \"\"\"Problem:\n",
673
+ "I have the following dataframe:\n",
674
+ "index = range(14)\n",
675
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
676
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
677
+ "\n",
678
+ "\n",
679
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
680
+ "The output should look like:\n",
681
+ " A\n",
682
+ "0 1\n",
683
+ "1 2\n",
684
+ "2 2\n",
685
+ "3 2\n",
686
+ "4 4\n",
687
+ "5 4\n",
688
+ "6 6\n",
689
+ "7 8\n",
690
+ "8 8\n",
691
+ "9 8\n",
692
+ "10 8\n",
693
+ "11 8\n",
694
+ "12 2\n",
695
+ "13 1\n",
696
+ "\"\"\"\n",
697
+ "\n",
698
+ "print(generate_response(prompt))"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": 27,
704
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
705
+ "metadata": {},
706
+ "outputs": [
707
+ {
708
+ "name": "stdout",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "Problem:\n",
712
+ "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n",
713
+ "import pandas as pd\n",
714
+ "import numpy as np\n",
715
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
716
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
717
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
718
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
719
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
720
+ "\n",
721
+ "\n",
722
+ " users keywords_0 keywords_1 keywords_2 keywords_3\n",
723
+ "0 Hu Tao a d NaN f\n",
724
+ "1 Zhongli NaN e NaN NaN\n",
725
+ "2 Xingqiu c NaN b g\n",
726
+ "\n",
727
+ "\n",
728
+ "Want to accomplish the following:\n",
729
+ " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n",
730
+ "0 Hu Tao a d NaN f a-d-f\n",
731
+ "1 Zhongli NaN e NaN NaN e\n",
732
+ "2 Xingqiu c NaN b g c-b-g\n",
733
+ "\n",
734
+ "\n",
735
+ "Pseudo code:\n",
736
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
737
+ "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n",
738
+ "\n",
739
+ "\n",
740
+ "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n"
741
+ ]
742
+ }
743
+ ],
744
+ "source": [
745
+ "print(data[5]['question'])"
746
+ ]
747
+ },
748
+ {
749
+ "cell_type": "code",
750
+ "execution_count": 28,
751
+ "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1",
752
+ "metadata": {},
753
+ "outputs": [
754
+ {
755
+ "name": "stdout",
756
+ "output_type": "stream",
757
+ "text": [
758
+ "import pandas as pd\n",
759
+ "import numpy as np\n",
760
+ "\n",
761
+ "\n",
762
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
763
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
764
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
765
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
766
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
767
+ "import numpy as np\n",
768
+ "def g(df):\n",
769
+ " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n",
770
+ " return df\n",
771
+ "\n",
772
+ "df = g(df.copy())\n",
773
+ "result = df\n",
774
+ "print(result)\n"
775
+ ]
776
+ }
777
+ ],
778
+ "source": [
779
+ "print(data[5]['answer'])"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": 29,
785
+ "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0",
786
+ "metadata": {},
787
+ "outputs": [
788
+ {
789
+ "name": "stdout",
790
+ "output_type": "stream",
791
+ "text": [
792
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
793
+ ]
794
+ }
795
+ ],
796
+ "source": [
797
+ "prompt = data[5]['question']\n",
798
+ "print(generate_response(prompt))"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": null,
804
+ "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545",
805
+ "metadata": {},
806
+ "outputs": [],
807
+ "source": []
808
+ }
809
+ ],
810
+ "metadata": {
811
+ "kernelspec": {
812
+ "display_name": "Python 3 (ipykernel)",
813
+ "language": "python",
814
+ "name": "python3"
815
+ },
816
+ "language_info": {
817
+ "codemirror_mode": {
818
+ "name": "ipython",
819
+ "version": 3
820
+ },
821
+ "file_extension": ".py",
822
+ "mimetype": "text/x-python",
823
+ "name": "python",
824
+ "nbconvert_exporter": "python",
825
+ "pygments_lexer": "ipython3",
826
+ "version": "3.10.13"
827
+ }
828
+ },
829
+ "nbformat": 4,
830
+ "nbformat_minor": 5
831
+ }
.ipynb_checkpoints/Testv4-checkpoint.ipynb ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "import os\n",
12
+ "from pprint import pprint\n",
13
+ "\n",
14
+ "import bitsandbytes as bnb\n",
15
+ "import pandas as pd\n",
16
+ "import torch\n",
17
+ "import torch.nn as nn\n",
18
+ "\n",
19
+ "import transformers\n",
20
+ "from datasets import load_dataset\n",
21
+ "from huggingface_hub import notebook_login\n",
22
+ "from peft import (\n",
23
+ " LoraConfig,\n",
24
+ " PeftConfig,\n",
25
+ " PeftModel,\n",
26
+ " get_peft_model,\n",
27
+ " prepare_model_for_kbit_training,\n",
28
+ ")\n",
29
+ "from transformers import (\n",
30
+ " AutoConfig,\n",
31
+ " AutoModelForCausalLM,\n",
32
+ " AutoTokenizer,\n",
33
+ " BitsAndBytesConfig,\n",
34
+ ")\n",
35
+ "import warnings\n",
36
+ "warnings.filterwarnings(\"ignore\")\n",
37
+ "\n",
38
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 2,
44
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "data": {
49
+ "application/vnd.jupyter.widget-view+json": {
50
+ "model_id": "b92bb6f7a2784be8bf5cab2ee87292ff",
51
+ "version_major": 2,
52
+ "version_minor": 0
53
+ },
54
+ "text/plain": [
55
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
56
+ ]
57
+ },
58
+ "metadata": {},
59
+ "output_type": "display_data"
60
+ }
61
+ ],
62
+ "source": [
63
+ "notebook_login()"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 3,
69
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
74
+ " data = json.load(json_file)"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 5,
80
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stdout",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "{'answer': 'import pandas as pd\\n'\n",
88
+ " '\\n'\n",
89
+ " '\\n'\n",
90
+ " 'index = range(14)\\n'\n",
91
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
92
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
93
+ " 'def g(df):\\n'\n",
94
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
95
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
96
+ " ' for i in range(len(df)):\\n'\n",
97
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
98
+ " ' return df\\n'\n",
99
+ " '\\n'\n",
100
+ " 'df = g(df.copy())\\n'\n",
101
+ " 'result = df\\n'\n",
102
+ " 'print(result)',\n",
103
+ " 'question': 'Problem:\\n'\n",
104
+ " 'I have the following dataframe:\\n'\n",
105
+ " 'index = range(14)\\n'\n",
106
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
107
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
108
+ " '\\n'\n",
109
+ " '\\n'\n",
110
+ " 'How can I fill the zeros with the maximun between previous and '\n",
111
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
112
+ " 'not just for \"NaN\"?. \\n'\n",
113
+ " 'The output should look like:\\n'\n",
114
+ " ' A\\n'\n",
115
+ " '0 1\\n'\n",
116
+ " '1 2\\n'\n",
117
+ " '2 2\\n'\n",
118
+ " '3 2\\n'\n",
119
+ " '4 4\\n'\n",
120
+ " '5 4\\n'\n",
121
+ " '6 6\\n'\n",
122
+ " '7 8\\n'\n",
123
+ " '8 8\\n'\n",
124
+ " '9 8\\n'\n",
125
+ " '10 8\\n'\n",
126
+ " '11 8\\n'\n",
127
+ " '12 2\\n'\n",
128
+ " '13 1'}\n"
129
+ ]
130
+ }
131
+ ],
132
+ "source": [
133
+ "pprint(data[0])"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 6,
139
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
144
+ " json.dump(data, f)"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 7,
150
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
151
+ "metadata": {},
152
+ "outputs": [
153
+ {
154
+ "data": {
155
+ "text/html": [
156
+ "<div>\n",
157
+ "<style scoped>\n",
158
+ " .dataframe tbody tr th:only-of-type {\n",
159
+ " vertical-align: middle;\n",
160
+ " }\n",
161
+ "\n",
162
+ " .dataframe tbody tr th {\n",
163
+ " vertical-align: top;\n",
164
+ " }\n",
165
+ "\n",
166
+ " .dataframe thead th {\n",
167
+ " text-align: right;\n",
168
+ " }\n",
169
+ "</style>\n",
170
+ "<table border=\"1\" class=\"dataframe\">\n",
171
+ " <thead>\n",
172
+ " <tr style=\"text-align: right;\">\n",
173
+ " <th></th>\n",
174
+ " <th>question</th>\n",
175
+ " <th>answer</th>\n",
176
+ " </tr>\n",
177
+ " </thead>\n",
178
+ " <tbody>\n",
179
+ " <tr>\n",
180
+ " <th>0</th>\n",
181
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
182
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
183
+ " </tr>\n",
184
+ " <tr>\n",
185
+ " <th>1</th>\n",
186
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
187
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>2</th>\n",
191
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
192
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
193
+ " </tr>\n",
194
+ " <tr>\n",
195
+ " <th>3</th>\n",
196
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
197
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
198
+ " </tr>\n",
199
+ " <tr>\n",
200
+ " <th>4</th>\n",
201
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
202
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
203
+ " </tr>\n",
204
+ " </tbody>\n",
205
+ "</table>\n",
206
+ "</div>"
207
+ ],
208
+ "text/plain": [
209
+ " question \\\n",
210
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
211
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
212
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
213
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
214
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
215
+ "\n",
216
+ " answer \n",
217
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
218
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
219
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
220
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
221
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
222
+ ]
223
+ },
224
+ "execution_count": 7,
225
+ "metadata": {},
226
+ "output_type": "execute_result"
227
+ }
228
+ ],
229
+ "source": [
230
+ "pd.DataFrame(data).head()"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 9,
236
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": [
240
+ "bnb_config = BitsAndBytesConfig(\n",
241
+ " load_in_4bit=True,\n",
242
+ " bnb_4bit_use_double_quant=True,\n",
243
+ " bnb_4bit_quant_type=\"nf4\",\n",
244
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
245
+ ")"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 12,
251
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "data": {
256
+ "application/vnd.jupyter.widget-view+json": {
257
+ "model_id": "3d6c5533e9ea48e295b7fdfd96da6d47",
258
+ "version_major": 2,
259
+ "version_minor": 0
260
+ },
261
+ "text/plain": [
262
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
263
+ ]
264
+ },
265
+ "metadata": {},
266
+ "output_type": "display_data"
267
+ },
268
+ {
269
+ "name": "stderr",
270
+ "output_type": "stream",
271
+ "text": [
272
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at deepseek-ai/deepseek-coder-6.7b-instruct and are newly initialized: ['model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq']\n",
273
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
274
+ ]
275
+ }
276
+ ],
277
+ "source": [
278
+ "PEFT_MODEL = \"shanjay/ds-dsc\"\n",
279
+ "\n",
280
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
281
+ "model = AutoModelForCausalLM.from_pretrained(\n",
282
+ " config.base_model_name_or_path,\n",
283
+ " return_dict=True,\n",
284
+ " quantization_config=bnb_config,\n",
285
+ " device_map=\"auto\",\n",
286
+ " trust_remote_code=True,\n",
287
+ ")\n",
288
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
289
+ "tokenizer.pad_token = tokenizer.eos_token\n",
290
+ "\n",
291
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 13,
297
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "generation_config = model.generation_config\n",
302
+ "generation_config.max_new_tokens = 200\n",
303
+ "generation_config.temperature = 0.7\n",
304
+ "generation_config.top_p = 0.7\n",
305
+ "generation_config.num_return_sequences = 1\n",
306
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
307
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 14,
313
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "DEVICE = \"cuda:0\""
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 15,
323
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
324
+ "metadata": {},
325
+ "outputs": [
326
+ {
327
+ "name": "stdout",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "<instruction>: How can I create a dataframe?\n",
331
+ "<output>:\n",
332
+ "import pandas as pd\n",
333
+ "import numpy as np\n",
334
+ "\n",
335
+ "df = pd.DataFrame(\n",
336
+ " {\n",
337
+ " \"A\": np.array([1, 2, 3]),\n",
338
+ " \"B\": np.array([4, 5, 6]),\n",
339
+ " \"C\": np.array([7, 8, 9]),\n",
340
+ " }\n",
341
+ ")\n",
342
+ "</output>\n",
343
+ "BEGIN SOLUTION\n",
344
+ "<output>\n",
345
+ "[1]\n",
346
+ "<code>\n",
347
+ "[python]\n",
348
+ "# Your code here\n",
349
+ "</code>\n",
350
+ "</output>\n",
351
+ "END SOLUTION\n",
352
+ "<output>\n",
353
+ "[1]\n",
354
+ "<code>\n",
355
+ "[python]\n",
356
+ "print(df)\n",
357
+ "</code>\n",
358
+ "</output>\n",
359
+ "\n",
360
+ "<assistant>: df = pd.DataFrame(\n",
361
+ " {\n",
362
+ " \"A\": np.array([1, 2, 3]),\n",
363
+ "CPU times: user 27.4 s, sys: 372 ms, total: 27.8 s\n",
364
+ "Wall time: 27.9 s\n"
365
+ ]
366
+ }
367
+ ],
368
+ "source": [
369
+ "%%time\n",
370
+ "prompt = f\"\"\"\n",
371
+ "<instruction>: How can I create a dataframe?\n",
372
+ "<output>:\n",
373
+ "\"\"\".strip()\n",
374
+ "\n",
375
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
376
+ "with torch.inference_mode():\n",
377
+ " outputs = model.generate(\n",
378
+ " input_ids=encoding.input_ids,\n",
379
+ " attention_mask=encoding.attention_mask,\n",
380
+ " generation_config=generation_config,\n",
381
+ " )\n",
382
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 16,
388
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
389
+ "metadata": {},
390
+ "outputs": [],
391
+ "source": [
392
+ "def generate_response(question: str) -> str:\n",
393
+ " prompt = f\"\"\"\n",
394
+ "<instruction>: {question}\n",
395
+ "<output>:\n",
396
+ "\"\"\".strip()\n",
397
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
398
+ " with torch.inference_mode():\n",
399
+ " outputs = model.generate(\n",
400
+ " input_ids=encoding.input_ids,\n",
401
+ " attention_mask=encoding.attention_mask,\n",
402
+ " generation_config=generation_config,\n",
403
+ " )\n",
404
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
405
+ "\n",
406
+ " assistant_start = \"<output>:\"\n",
407
+ " response_start = response.find(assistant_start)\n",
408
+ " return response[response_start + len(assistant_start) :].strip()"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 17,
414
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
415
+ "metadata": {},
416
+ "outputs": [
417
+ {
418
+ "name": "stdout",
419
+ "output_type": "stream",
420
+ "text": [
421
+ "import pandas as pd\n",
422
+ "import numpy as np\n",
423
+ "\n",
424
+ "df = pd.DataFrame(\n",
425
+ " {\n",
426
+ " \"A\": np.array([1, 2, 3]),\n",
427
+ " \"B\": np.array([4, 5, 6]),\n",
428
+ " \"C\": np.array([7, 8, 9]),\n",
429
+ " }\n",
430
+ ")\n",
431
+ "</output>\n",
432
+ "BEGIN SOLUTION\n",
433
+ "<output>\n",
434
+ "[1]\n",
435
+ "<code>\n",
436
+ "[python]\n",
437
+ "# Your code here\n",
438
+ "</code>\n",
439
+ "</output>\n",
440
+ "END SOLUTION\n",
441
+ "<output>\n",
442
+ "[1]\n",
443
+ "<code>\n",
444
+ "[python]\n",
445
+ "print(df)\n",
446
+ "</code>\n",
447
+ "</output>\n",
448
+ "\n",
449
+ "<assistant>: df = pd.DataFrame(\n",
450
+ " {\n",
451
+ " \"A\": np.array([1, 2, 3]),\n"
452
+ ]
453
+ }
454
+ ],
455
+ "source": [
456
+ "prompt = \"How can I create a dataframe?\"\n",
457
+ "print(generate_response(prompt))"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": 18,
463
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
464
+ "metadata": {},
465
+ "outputs": [
466
+ {
467
+ "name": "stdout",
468
+ "output_type": "stream",
469
+ "text": [
470
+ "import pandas as pd\n",
471
+ "import numpy as np\n",
472
+ "\n",
473
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
474
+ "df2 = pd.DataFrame({'A': [4, 5, 6], 'B': [7, 8, 9]})\n",
475
+ "</output>\n",
476
+ "<assistant>: df = pd.concat([df1, df2])\n",
477
+ "</assistant>\n",
478
+ "<output>: df\n",
479
+ "</output>\n",
480
+ "<code>\n",
481
+ "import pandas as pd\n",
482
+ "import numpy as np\n",
483
+ "\n",
484
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
485
+ "df2 = pd.DataFrame({'A': [4, 5, 6],\n"
486
+ ]
487
+ }
488
+ ],
489
+ "source": [
490
+ "prompt = \"How to merge two dataframes?\"\n",
491
+ "print(generate_response(prompt))"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": 19,
497
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
498
+ "metadata": {},
499
+ "outputs": [
500
+ {
501
+ "name": "stdout",
502
+ "output_type": "stream",
503
+ "text": [
504
+ "<code>\n",
505
+ "import pandas as pd\n",
506
+ "import numpy as np\n",
507
+ "\n",
508
+ "name=np.array(['joy','shan'])\n",
509
+ "roll_no=np.array([1,2])\n",
510
+ "</code>\n",
511
+ "BEGIN SOLUTION\n",
512
+ "<code>\n",
513
+ "[insert]\n",
514
+ "</code>\n",
515
+ "END SOLUTION\n",
516
+ "<code>\n",
517
+ "print(df)\n",
518
+ "</code>\n",
519
+ "<assistant>: df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
520
+ "</assistant>\n",
521
+ "<code>\n",
522
+ "print(df)\n",
523
+ "</code>\n",
524
+ "\n",
525
+ "<assistant>: df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
526
+ "print(df)\n",
527
+ "</assistant>\n",
528
+ "<code>\n",
529
+ "print(df)\n",
530
+ "</code>\n",
531
+ "<assistant>: df = pd.\n"
532
+ ]
533
+ }
534
+ ],
535
+ "source": [
536
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
537
+ "print(generate_response(prompt))"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "code",
542
+ "execution_count": 20,
543
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
544
+ "metadata": {},
545
+ "outputs": [
546
+ {
547
+ "name": "stdout",
548
+ "output_type": "stream",
549
+ "text": [
550
+ "[ ]: import matplotlib.pyplot as plt\n",
551
+ "import numpy as np\n",
552
+ "\n",
553
+ "x = np.linspace(0, 10, 100)\n",
554
+ "y = np.sin(x)\n",
555
+ "\n",
556
+ "# your code here\n",
557
+ "</output>\n",
558
+ "<assistant>: plt.plot(x, y)\n",
559
+ "plt.show()\n",
560
+ "</assistant>\n",
561
+ "<output>: [ ]: plt.plot(x, y)\n",
562
+ "plt.show()\n",
563
+ "</output>\n",
564
+ "<assistant>: plt.plot(x, y)\n",
565
+ "plt.show()\n",
566
+ "</assistant>\n",
567
+ "<output>: [ ]: plt.plot(x, y)\n",
568
+ "plt.show()\n",
569
+ "</output>\n",
570
+ "<assistant>: plt.plot(x, y)\n",
571
+ "plt.show()\n",
572
+ "</assistant>\n",
573
+ "<output>\n"
574
+ ]
575
+ }
576
+ ],
577
+ "source": [
578
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
579
+ "print(generate_response(prompt))"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": 21,
585
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
586
+ "metadata": {},
587
+ "outputs": [
588
+ {
589
+ "name": "stdout",
590
+ "output_type": "stream",
591
+ "text": [
592
+ "import pandas as pd\n",
593
+ "import numpy as np\n",
594
+ "\n",
595
+ "data = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
596
+ "'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
597
+ "\n",
598
+ "data['TIME'] = pd.to_datetime(data['TIME'])\n",
599
+ "\n",
600
+ "</output>\n",
601
+ "BEGIN SOLUTION\n",
602
+ "<output>\n",
603
+ "[insert]\n",
604
+ "</output>\n"
605
+ ]
606
+ }
607
+ ],
608
+ "source": [
609
+ "prompt = \"\"\"Problem:\n",
610
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
611
+ "ID TIME\n",
612
+ "01 2018-07-11 11:12:20\n",
613
+ "01 2018-07-12 12:00:23\n",
614
+ "01 2018-07-13 12:00:00\n",
615
+ "02 2019-09-11 11:00:00\n",
616
+ "02 2019-09-12 12:00:00\n",
617
+ "\n",
618
+ "\n",
619
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
620
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
621
+ "\n",
622
+ "\n",
623
+ "but get an error:\n",
624
+ "'NoneType' object is not callable\n",
625
+ "\n",
626
+ "\n",
627
+ "If i replace datetime to numbers, it works.... any solutions?\n",
628
+ "\"\"\"\n",
629
+ "print(generate_response(prompt))"
630
+ ]
631
+ },
632
+ {
633
+ "cell_type": "code",
634
+ "execution_count": null,
635
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
636
+ "metadata": {},
637
+ "outputs": [],
638
+ "source": [
639
+ "prompt = \"\"\"Problem:\n",
640
+ "I have the following dataframe:\n",
641
+ "index = range(14)\n",
642
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
643
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
644
+ "\n",
645
+ "\n",
646
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
647
+ "The output should look like:\n",
648
+ " A\n",
649
+ "0 1\n",
650
+ "1 2\n",
651
+ "2 2\n",
652
+ "3 2\n",
653
+ "4 4\n",
654
+ "5 4\n",
655
+ "6 6\n",
656
+ "7 8\n",
657
+ "8 8\n",
658
+ "9 8\n",
659
+ "10 8\n",
660
+ "11 8\n",
661
+ "12 2\n",
662
+ "13 1\n",
663
+ "\"\"\"\n",
664
+ "\n",
665
+ "print(generate_response(prompt))"
666
+ ]
667
+ },
668
+ {
669
+ "cell_type": "code",
670
+ "execution_count": null,
671
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
672
+ "metadata": {},
673
+ "outputs": [],
674
+ "source": []
675
+ }
676
+ ],
677
+ "metadata": {
678
+ "kernelspec": {
679
+ "display_name": "Python 3 (ipykernel)",
680
+ "language": "python",
681
+ "name": "python3"
682
+ },
683
+ "language_info": {
684
+ "codemirror_mode": {
685
+ "name": "ipython",
686
+ "version": 3
687
+ },
688
+ "file_extension": ".py",
689
+ "mimetype": "text/x-python",
690
+ "name": "python",
691
+ "nbconvert_exporter": "python",
692
+ "pygments_lexer": "ipython3",
693
+ "version": "3.10.13"
694
+ }
695
+ },
696
+ "nbformat": 4,
697
+ "nbformat_minor": 5
698
+ }
.ipynb_checkpoints/ds1000-train-cleaned-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
07.dsc-mgc-v2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
07.dscv4.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Prediction-mgc.csv ADDED
The diff for this file is too large to render. See raw diff
 
Prediction-mgc.json ADDED
The diff for this file is too large to render. See raw diff
 
Test-mgc-Copy1.ipynb ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "\n",
14
+ "===================================BUG REPORT===================================\n",
15
+ "Welcome to bitsandbytes. For bug reports, please run\n",
16
+ "\n",
17
+ "python -m bitsandbytes\n",
18
+ "\n",
19
+ " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
20
+ "================================================================================\n",
21
+ "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n",
22
+ "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
23
+ "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
24
+ "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
25
+ "CUDA SETUP: Detected CUDA version 121\n",
26
+ "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib64'), PosixPath('/usr/local/nvidia/lib')}\n",
34
+ " warn(msg)\n",
35
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
36
+ " warn(msg)\n",
37
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n",
38
+ " warn(msg)\n",
39
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('//g.notebooksg.jarvislabs.net'), PosixPath('https')}\n",
40
+ " warn(msg)\n",
41
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
42
+ " warn(msg)\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "import json\n",
48
+ "import os\n",
49
+ "from pprint import pprint\n",
50
+ "\n",
51
+ "import bitsandbytes as bnb\n",
52
+ "import pandas as pd\n",
53
+ "import torch\n",
54
+ "import torch.nn as nn\n",
55
+ "\n",
56
+ "import transformers\n",
57
+ "from datasets import load_dataset\n",
58
+ "from huggingface_hub import notebook_login\n",
59
+ "from peft import (\n",
60
+ " LoraConfig,\n",
61
+ " PeftConfig,\n",
62
+ " PeftModel,\n",
63
+ " get_peft_model,\n",
64
+ " prepare_model_for_kbit_training,\n",
65
+ ")\n",
66
+ "from transformers import (\n",
67
+ " AutoConfig,\n",
68
+ " AutoModelForCausalLM,\n",
69
+ " AutoTokenizer,\n",
70
+ " BitsAndBytesConfig,\n",
71
+ ")\n",
72
+ "import warnings\n",
73
+ "warnings.filterwarnings(\"ignore\")\n",
74
+ "\n",
75
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 3,
81
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "3edf6ee054e9464eb510d3aff9d1dc5f",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ }
98
+ ],
99
+ "source": [
100
+ "notebook_login()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 4,
106
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
111
+ " data = json.load(json_file)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 5,
117
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "{'answer': 'import pandas as pd\\n'\n",
125
+ " '\\n'\n",
126
+ " '\\n'\n",
127
+ " 'index = range(14)\\n'\n",
128
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
129
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
130
+ " 'def g(df):\\n'\n",
131
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
132
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
133
+ " ' for i in range(len(df)):\\n'\n",
134
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
135
+ " ' return df\\n'\n",
136
+ " '\\n'\n",
137
+ " 'df = g(df.copy())\\n'\n",
138
+ " 'result = df\\n'\n",
139
+ " 'print(result)',\n",
140
+ " 'question': 'Problem:\\n'\n",
141
+ " 'I have the following dataframe:\\n'\n",
142
+ " 'index = range(14)\\n'\n",
143
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
144
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
145
+ " '\\n'\n",
146
+ " '\\n'\n",
147
+ " 'How can I fill the zeros with the maximun between previous and '\n",
148
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
149
+ " 'not just for \"NaN\"?. \\n'\n",
150
+ " 'The output should look like:\\n'\n",
151
+ " ' A\\n'\n",
152
+ " '0 1\\n'\n",
153
+ " '1 2\\n'\n",
154
+ " '2 2\\n'\n",
155
+ " '3 2\\n'\n",
156
+ " '4 4\\n'\n",
157
+ " '5 4\\n'\n",
158
+ " '6 6\\n'\n",
159
+ " '7 8\\n'\n",
160
+ " '8 8\\n'\n",
161
+ " '9 8\\n'\n",
162
+ " '10 8\\n'\n",
163
+ " '11 8\\n'\n",
164
+ " '12 2\\n'\n",
165
+ " '13 1'}\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "pprint(data[0])"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 6,
176
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
181
+ " json.dump(data, f)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 7,
187
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>question</th>\n",
212
+ " <th>answer</th>\n",
213
+ " </tr>\n",
214
+ " </thead>\n",
215
+ " <tbody>\n",
216
+ " <tr>\n",
217
+ " <th>0</th>\n",
218
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
219
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
220
+ " </tr>\n",
221
+ " <tr>\n",
222
+ " <th>1</th>\n",
223
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
224
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>2</th>\n",
228
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
229
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>3</th>\n",
233
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
234
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
239
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
240
+ " </tr>\n",
241
+ " </tbody>\n",
242
+ "</table>\n",
243
+ "</div>"
244
+ ],
245
+ "text/plain": [
246
+ " question \\\n",
247
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
248
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
249
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
250
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
251
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
252
+ "\n",
253
+ " answer \n",
254
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
255
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
256
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
257
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
258
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
259
+ ]
260
+ },
261
+ "execution_count": 7,
262
+ "metadata": {},
263
+ "output_type": "execute_result"
264
+ }
265
+ ],
266
+ "source": [
267
+ "pd.DataFrame(data).head()"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 8,
273
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "bnb_config = BitsAndBytesConfig(\n",
278
+ " load_in_4bit=True,\n",
279
+ " bnb_4bit_use_double_quant=True,\n",
280
+ " bnb_4bit_quant_type=\"nf4\",\n",
281
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
282
+ ")"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 9,
288
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "data": {
293
+ "application/vnd.jupyter.widget-view+json": {
294
+ "model_id": "2be27a54d3e14399a41c46cd9c423399",
295
+ "version_major": 2,
296
+ "version_minor": 0
297
+ },
298
+ "text/plain": [
299
+ "Loading checkpoint shards: 0%| | 0/6 [00:00<?, ?it/s]"
300
+ ]
301
+ },
302
+ "metadata": {},
303
+ "output_type": "display_data"
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at ise-uiuc/Magicoder-S-DS-6.7B and are newly initialized: ['model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq']\n",
310
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "PEFT_MODEL = \"shanjay/mgc-ds\"\n",
316
+ "\n",
317
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
318
+ "model = AutoModelForCausalLM.from_pretrained(\n",
319
+ " config.base_model_name_or_path,\n",
320
+ " return_dict=True,\n",
321
+ " quantization_config=bnb_config,\n",
322
+ " device_map=\"auto\",\n",
323
+ " trust_remote_code=True,\n",
324
+ ")\n",
325
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
326
+ "tokenizer.pad_token = tokenizer.eos_token\n",
327
+ "\n",
328
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": 26,
334
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "generation_config = model.generation_config\n",
339
+ "generation_config.max_new_tokens = 400\n",
340
+ "generation_config.temperature = 0.7\n",
341
+ "generation_config.top_p = 0.7\n",
342
+ "generation_config.num_return_sequences = 1\n",
343
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
344
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": 27,
350
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
351
+ "metadata": {},
352
+ "outputs": [],
353
+ "source": [
354
+ "DEVICE = \"cuda:0\""
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 12,
360
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
361
+ "metadata": {},
362
+ "outputs": [
363
+ {
364
+ "name": "stdout",
365
+ "output_type": "stream",
366
+ "text": [
367
+ "<instruction>: How can I create a dataframe?\n",
368
+ "<output>: import pandas as pd\n",
369
+ "\n",
370
+ "\n",
371
+ "\n",
372
+ "\n",
373
+ "\n",
374
+ "\n",
375
+ "\n",
376
+ "\n",
377
+ "\n",
378
+ "\n",
379
+ "\n",
380
+ "\n",
381
+ "\n",
382
+ "\n",
383
+ "\n",
384
+ "\n",
385
+ "\n",
386
+ "\n",
387
+ "\n",
388
+ "\n",
389
+ "\n",
390
+ "\n",
391
+ "\n",
392
+ "\n",
393
+ "\n",
394
+ "\n",
395
+ "\n",
396
+ "\n",
397
+ "\n",
398
+ "\n",
399
+ "\n",
400
+ "\n",
401
+ "\n",
402
+ "\n",
403
+ "\n",
404
+ "\n",
405
+ "\n",
406
+ "\n",
407
+ "\n",
408
+ "\n",
409
+ "\n",
410
+ "\n",
411
+ "\n",
412
+ "\n",
413
+ "\n",
414
+ "\n",
415
+ "\n",
416
+ "\n",
417
+ "\n",
418
+ "\n",
419
+ "\n",
420
+ "\n",
421
+ "\n",
422
+ "\n",
423
+ "\n",
424
+ "\n",
425
+ "\n",
426
+ "\n",
427
+ "\n",
428
+ "\n",
429
+ "\n",
430
+ "\n",
431
+ "\n",
432
+ "\n",
433
+ "\n",
434
+ "\n",
435
+ "\n",
436
+ "\n",
437
+ "\n",
438
+ "\n",
439
+ "\n",
440
+ "\n",
441
+ "\n",
442
+ "\n",
443
+ "\n",
444
+ "\n",
445
+ "\n",
446
+ "\n",
447
+ "\n",
448
+ "\n",
449
+ "\n",
450
+ "\n",
451
+ "\n",
452
+ "\n",
453
+ "\n",
454
+ "\n",
455
+ "\n",
456
+ "\n",
457
+ "\n",
458
+ "\n",
459
+ "\n",
460
+ "\n",
461
+ "\n",
462
+ "\n",
463
+ "\n",
464
+ "\n",
465
+ "\n",
466
+ "\n",
467
+ "\n",
468
+ "\n",
469
+ "\n",
470
+ "\n",
471
+ "\n",
472
+ "\n",
473
+ "\n",
474
+ "\n",
475
+ "\n",
476
+ "\n",
477
+ "\n",
478
+ "\n",
479
+ "\n",
480
+ "\n",
481
+ "\n",
482
+ "\n",
483
+ "\n",
484
+ "\n",
485
+ "\n",
486
+ "\n",
487
+ "\n",
488
+ "\n",
489
+ "\n",
490
+ "\n",
491
+ "\n",
492
+ "\n",
493
+ "\n",
494
+ "\n",
495
+ "\n",
496
+ "\n",
497
+ "\n",
498
+ "\n",
499
+ "\n",
500
+ "\n",
501
+ "\n",
502
+ "\n",
503
+ "\n",
504
+ "\n",
505
+ "\n",
506
+ "\n",
507
+ "\n",
508
+ "\n",
509
+ "\n",
510
+ "\n",
511
+ "\n",
512
+ "\n",
513
+ "\n",
514
+ "\n",
515
+ "\n",
516
+ "\n",
517
+ "\n",
518
+ "\n",
519
+ "\n",
520
+ "\n",
521
+ "\n",
522
+ "\n",
523
+ "\n",
524
+ "\n",
525
+ "\n",
526
+ "\n",
527
+ "\n",
528
+ "\n",
529
+ "\n",
530
+ "\n",
531
+ "\n",
532
+ "\n",
533
+ "\n",
534
+ "\n",
535
+ "\n",
536
+ "\n",
537
+ "\n",
538
+ "\n",
539
+ "\n",
540
+ "\n",
541
+ "\n",
542
+ "\n",
543
+ "\n",
544
+ "\n",
545
+ "\n",
546
+ "\n",
547
+ "\n",
548
+ "\n",
549
+ "\n",
550
+ "\n",
551
+ "\n",
552
+ "\n",
553
+ "\n",
554
+ "\n",
555
+ "\n",
556
+ "\n",
557
+ "\n",
558
+ "\n",
559
+ "\n",
560
+ "\n",
561
+ "\n",
562
+ "\n",
563
+ "CPU times: user 26.5 s, sys: 177 ms, total: 26.7 s\n",
564
+ "Wall time: 26.7 s\n"
565
+ ]
566
+ }
567
+ ],
568
+ "source": [
569
+ "%%time\n",
570
+ "prompt = f\"\"\"\n",
571
+ "<instruction>: How can I create a dataframe?\n",
572
+ "<output>:\n",
573
+ "\"\"\".strip()\n",
574
+ "\n",
575
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
576
+ "with torch.inference_mode():\n",
577
+ " outputs = model.generate(\n",
578
+ " input_ids=encoding.input_ids,\n",
579
+ " attention_mask=encoding.attention_mask,\n",
580
+ " generation_config=generation_config,\n",
581
+ " )\n",
582
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": 28,
588
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
589
+ "metadata": {},
590
+ "outputs": [],
591
+ "source": [
592
+ "def generate_response(question: str) -> str:\n",
593
+ " prompt = f\"\"\"\n",
594
+ "<instruction>: {question}\n",
595
+ "<output>:\n",
596
+ "\"\"\".strip()\n",
597
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
598
+ " with torch.inference_mode():\n",
599
+ " outputs = model.generate(\n",
600
+ " input_ids=encoding.input_ids,\n",
601
+ " attention_mask=encoding.attention_mask,\n",
602
+ " generation_config=generation_config,\n",
603
+ " )\n",
604
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
605
+ "\n",
606
+ " assistant_start = \"<output>:\"\n",
607
+ " response_start = response.find(assistant_start)\n",
608
+ " return response[response_start + len(assistant_start) :].strip()"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 29,
614
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
615
+ "metadata": {},
616
+ "outputs": [
617
+ {
618
+ "name": "stdout",
619
+ "output_type": "stream",
620
+ "text": [
621
+ "import pandas as pd\n"
622
+ ]
623
+ }
624
+ ],
625
+ "source": [
626
+ "prompt = \"How can I create a dataframe?\"\n",
627
+ "print(generate_response(prompt))"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "code",
632
+ "execution_count": 30,
633
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
634
+ "metadata": {},
635
+ "outputs": [
636
+ {
637
+ "name": "stdout",
638
+ "output_type": "stream",
639
+ "text": [
640
+ "import pandas as pd\n",
641
+ "\n",
642
+ "\n",
643
+ "\n",
644
+ "\n",
645
+ "\n",
646
+ "df1 = pd.DataFrame({'A': ['A', 'B', 'C', 'D'],\n",
647
+ " 'B': [1, 2, 3, 4]})\n",
648
+ "df2 = pd.DataFrame({'A': ['A', 'B', 'C', 'E'],\n",
649
+ " 'B': [1, 2, 3, 5]})\n",
650
+ "# merge df1 and df2 on column 'A'\n",
651
+ "# SOLUTION START\n",
652
+ "\n",
653
+ "<output>: import pandas as pd\n",
654
+ "\n",
655
+ "\n",
656
+ "\n",
657
+ "\n",
658
+ "\n",
659
+ "df1 = pd.DataFrame({'A': ['A', 'B', 'C', 'D'],\n",
660
+ " 'B': [1, 2, 3, 4]})\n",
661
+ "df2 = pd.DataFrame({'A': ['A', 'B', 'C', 'E'],\n",
662
+ " 'B': [1, 2, 3, 5]})\n",
663
+ "# merge df1 and df2 on column 'A'\n",
664
+ "result = pd.merge(df1, df2, on='A')\n",
665
+ "print(result)\n"
666
+ ]
667
+ }
668
+ ],
669
+ "source": [
670
+ "prompt = \"How to merge two dataframes?\"\n",
671
+ "print(generate_response(prompt))"
672
+ ]
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "execution_count": 16,
677
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
678
+ "metadata": {},
679
+ "outputs": [
680
+ {
681
+ "name": "stdout",
682
+ "output_type": "stream",
683
+ "text": [
684
+ "import pandas as pd\n",
685
+ "\n",
686
+ "\n",
687
+ "name = ['joy', 'shan']\n",
688
+ "roll_no = [1, 2]\n",
689
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
690
+ "print(df)\n"
691
+ ]
692
+ }
693
+ ],
694
+ "source": [
695
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
696
+ "print(generate_response(prompt))"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": 31,
702
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
703
+ "metadata": {},
704
+ "outputs": [
705
+ {
706
+ "name": "stdout",
707
+ "output_type": "stream",
708
+ "text": [
709
+ "import matplotlib.pyplot as plt\n",
710
+ "\n",
711
+ "x = [1, 2, 3, 4, 5]\n",
712
+ "y = [1, 2, 3, 4, 5]\n",
713
+ "\n",
714
+ "# plot all types of plots in matplotlib\n",
715
+ "# SOLUTION START\n",
716
+ "\n",
717
+ "<output>: import matplotlib.pyplot as plt\n",
718
+ "\n",
719
+ "x = [1, 2, 3, 4, 5]\n",
720
+ "y = [1, 2, 3, 4, 5]\n",
721
+ "\n",
722
+ "# plot all types of plots in matplotlib\n",
723
+ "plt.plot(x, y, label=\"plot\")\n",
724
+ "plt.scatter(x, y, label=\"scatter\")\n",
725
+ "plt.bar(x, y, label=\"bar\")\n",
726
+ "plt.hist(x, y, label=\"hist\")\n",
727
+ "plt.boxplot(x, y, label=\"boxplot\")\n",
728
+ "plt.show()\n",
729
+ "<output>: import matplotlib.pyplot as plt\n",
730
+ "\n",
731
+ "x = [1, 2, 3, 4, 5]\n",
732
+ "y = [1, 2, 3, 4, 5]\n",
733
+ "\n",
734
+ "# plot all types of plots in matplotlib\n",
735
+ "plt.plot(x, y, label=\"plot\")\n",
736
+ "plt.scatter(x, y, label=\"scatter\")\n",
737
+ "plt.bar(x, y, label=\"bar\")\n",
738
+ "plt.hist(x, y, label=\"hist\")\n",
739
+ "plt.boxplot(x, y, label=\"boxplot\")\n",
740
+ "plt.show()\n",
741
+ "<output>: import matplotlib.pyplot as plt\n",
742
+ "\n",
743
+ "x = [1, 2, 3, 4, 5]\n"
744
+ ]
745
+ }
746
+ ],
747
+ "source": [
748
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
749
+ "print(generate_response(prompt))"
750
+ ]
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "execution_count": 32,
755
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
756
+ "metadata": {},
757
+ "outputs": [
758
+ {
759
+ "name": "stdout",
760
+ "output_type": "stream",
761
+ "text": [
762
+ "import pandas as pd\n",
763
+ "\n",
764
+ "\n",
765
+ "df = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
766
+ " 'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
767
+ "def g(df):\n",
768
+ " df['TIME'] = pd.to_datetime(df['TIME'])\n",
769
+ " df['RANK'] = df.groupby('ID')['TIME'].rank(ascending=True)\n",
770
+ " return df\n",
771
+ "\n",
772
+ "df = g(df.copy())\n",
773
+ "print(df)\n",
774
+ "<output>: import pandas as pd\n",
775
+ "\n",
776
+ "\n",
777
+ "df = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
778
+ " 'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
779
+ "def g(df):\n",
780
+ " df['TIME'] = pd.to_datetime(df['TIME'])\n"
781
+ ]
782
+ }
783
+ ],
784
+ "source": [
785
+ "prompt = \"\"\"Problem:\n",
786
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
787
+ "ID TIME\n",
788
+ "01 2018-07-11 11:12:20\n",
789
+ "01 2018-07-12 12:00:23\n",
790
+ "01 2018-07-13 12:00:00\n",
791
+ "02 2019-09-11 11:00:00\n",
792
+ "02 2019-09-12 12:00:00\n",
793
+ "\n",
794
+ "\n",
795
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
796
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
797
+ "\n",
798
+ "\n",
799
+ "but get an error:\n",
800
+ "'NoneType' object is not callable\n",
801
+ "\n",
802
+ "\n",
803
+ "If i replace datetime to numbers, it works.... any solutions?\n",
804
+ "\"\"\"\n",
805
+ "print(generate_response(prompt))"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": 33,
811
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
812
+ "metadata": {},
813
+ "outputs": [
814
+ {
815
+ "name": "stdout",
816
+ "output_type": "stream",
817
+ "text": [
818
+ "import pandas as pd\n",
819
+ "\n",
820
+ "\n",
821
+ "index = range(14)\n",
822
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
823
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
824
+ "def g(df):\n",
825
+ " df['A'] = df['A'].replace(0, np.nan)\n",
826
+ " df['A'] = df['A'].fillna(method='ffill')\n",
827
+ " df['A'] = df['A'].fillna(method='bfill')\n",
828
+ " return df\n",
829
+ "\n",
830
+ "df = g(df.copy())\n",
831
+ "result = df\n",
832
+ "print(result)\n",
833
+ "<output>: import pandas as pd\n",
834
+ "import numpy as np\n",
835
+ "\n",
836
+ "\n",
837
+ "index = range(14)\n",
838
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
839
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
840
+ "def g(df):\n",
841
+ " df['A'] = df['A'].replace(0, np.nan)\n",
842
+ " df['A'] = df['A'].fillna(method='ffill')\n",
843
+ " df['A'] = df['A'].fillna(method='bfill')\n",
844
+ " return df\n",
845
+ "\n",
846
+ "df = g(df.copy())\n",
847
+ "result = df\n",
848
+ "print(result)\n",
849
+ "<output>: import pandas as pd\n",
850
+ "import numpy as np\n",
851
+ "\n",
852
+ "\n",
853
+ "index = range(14)\n",
854
+ "data = [1, 0, 0, 2, 0, 4\n"
855
+ ]
856
+ }
857
+ ],
858
+ "source": [
859
+ "prompt = \"\"\"Problem:\n",
860
+ "I have the following dataframe:\n",
861
+ "index = range(14)\n",
862
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
863
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
864
+ "\n",
865
+ "\n",
866
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
867
+ "The output should look like:\n",
868
+ " A\n",
869
+ "0 1\n",
870
+ "1 2\n",
871
+ "2 2\n",
872
+ "3 2\n",
873
+ "4 4\n",
874
+ "5 4\n",
875
+ "6 6\n",
876
+ "7 8\n",
877
+ "8 8\n",
878
+ "9 8\n",
879
+ "10 8\n",
880
+ "11 8\n",
881
+ "12 2\n",
882
+ "13 1\n",
883
+ "\"\"\"\n",
884
+ "\n",
885
+ "print(generate_response(prompt))"
886
+ ]
887
+ },
888
+ {
889
+ "cell_type": "code",
890
+ "execution_count": 34,
891
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
892
+ "metadata": {},
893
+ "outputs": [
894
+ {
895
+ "name": "stdout",
896
+ "output_type": "stream",
897
+ "text": [
898
+ "Problem:\n",
899
+ "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n",
900
+ "import pandas as pd\n",
901
+ "import numpy as np\n",
902
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
903
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
904
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
905
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
906
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
907
+ "\n",
908
+ "\n",
909
+ " users keywords_0 keywords_1 keywords_2 keywords_3\n",
910
+ "0 Hu Tao a d NaN f\n",
911
+ "1 Zhongli NaN e NaN NaN\n",
912
+ "2 Xingqiu c NaN b g\n",
913
+ "\n",
914
+ "\n",
915
+ "Want to accomplish the following:\n",
916
+ " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n",
917
+ "0 Hu Tao a d NaN f a-d-f\n",
918
+ "1 Zhongli NaN e NaN NaN e\n",
919
+ "2 Xingqiu c NaN b g c-b-g\n",
920
+ "\n",
921
+ "\n",
922
+ "Pseudo code:\n",
923
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
924
+ "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n",
925
+ "\n",
926
+ "\n",
927
+ "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n"
928
+ ]
929
+ }
930
+ ],
931
+ "source": [
932
+ "print(data[5]['question'])"
933
+ ]
934
+ },
935
+ {
936
+ "cell_type": "code",
937
+ "execution_count": 35,
938
+ "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1",
939
+ "metadata": {},
940
+ "outputs": [
941
+ {
942
+ "name": "stdout",
943
+ "output_type": "stream",
944
+ "text": [
945
+ "import pandas as pd\n",
946
+ "import numpy as np\n",
947
+ "\n",
948
+ "\n",
949
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
950
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
951
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
952
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
953
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
954
+ "import numpy as np\n",
955
+ "def g(df):\n",
956
+ " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n",
957
+ " return df\n",
958
+ "\n",
959
+ "df = g(df.copy())\n",
960
+ "result = df\n",
961
+ "print(result)\n"
962
+ ]
963
+ }
964
+ ],
965
+ "source": [
966
+ "print(data[5]['answer'])"
967
+ ]
968
+ },
969
+ {
970
+ "cell_type": "code",
971
+ "execution_count": 36,
972
+ "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0",
973
+ "metadata": {},
974
+ "outputs": [
975
+ {
976
+ "name": "stdout",
977
+ "output_type": "stream",
978
+ "text": [
979
+ "import pandas as pd\n",
980
+ "import numpy as np\n",
981
+ "\n",
982
+ "\n",
983
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
984
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
985
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
986
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
987
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
988
+ "\n",
989
+ "\n",
990
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
991
+ "def f(cols):\n",
992
+ " return \"-\".join(cols)\n",
993
+ "\n",
994
+ "\n",
995
+ "df[\"keywords_all\"] = df.apply(lambda row: f(row[cols]), axis=1)\n",
996
+ "\n",
997
+ "\n",
998
+ "print(df)\n",
999
+ "<output>: import pandas as pd\n",
1000
+ "import numpy as np\n",
1001
+ "\n",
1002
+ "\n",
1003
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
1004
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
1005
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
1006
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
1007
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
1008
+ "\n",
1009
+ "\n",
1010
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
1011
+ "def f(cols):\n",
1012
+ " return \"-\".join(cols)\n",
1013
+ "\n",
1014
+ "\n",
1015
+ "df[\"keywords_all\"] = df.apply(lambda\n"
1016
+ ]
1017
+ }
1018
+ ],
1019
+ "source": [
1020
+ "prompt = data[5]['question']\n",
1021
+ "print(generate_response(prompt))"
1022
+ ]
1023
+ },
1024
+ {
1025
+ "cell_type": "code",
1026
+ "execution_count": 37,
1027
+ "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545",
1028
+ "metadata": {},
1029
+ "outputs": [
1030
+ {
1031
+ "name": "stdout",
1032
+ "output_type": "stream",
1033
+ "text": [
1034
+ "import pandas as pd\n",
1035
+ "\n",
1036
+ "\n",
1037
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1038
+ " 'B': [None, 2, 3, 4, 5],\n",
1039
+ " 'C': [1, 2, 3, 4, 5]})\n",
1040
+ "df = df.dropna()\n",
1041
+ "print(df)\n",
1042
+ "<output>: import pandas as pd\n",
1043
+ "\n",
1044
+ "\n",
1045
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1046
+ " 'B': [None, 2, 3, 4, 5],\n",
1047
+ " 'C': [1, 2, 3, 4, 5]})\n",
1048
+ "df = df.dropna()\n",
1049
+ "print(df)\n",
1050
+ "<output>: import pandas as pd\n",
1051
+ "\n",
1052
+ "\n",
1053
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1054
+ " 'B': [None, 2, 3, 4, 5],\n",
1055
+ " 'C': [1, 2, 3, 4, 5]})\n",
1056
+ "df = df.dropna()\n",
1057
+ "print(df)\n",
1058
+ "<output>: import pandas as pd\n",
1059
+ "\n",
1060
+ "\n",
1061
+ "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1062
+ " 'B': [None, 2, 3, 4, 5],\n",
1063
+ " 'C': [1, 2, 3, 4, 5]})\n",
1064
+ "df = df.dropna()\n",
1065
+ "print(df)\n",
1066
+ "<output>: import pandas as pd\n",
1067
+ "\n",
1068
+ "\n",
1069
+ "df = pd.DataFrame({'A': [1, 2, None,\n"
1070
+ ]
1071
+ }
1072
+ ],
1073
+ "source": [
1074
+ "prompt = \"How to remove null valued rows?\"\n",
1075
+ "print(generate_response(prompt))"
1076
+ ]
1077
+ },
1078
+ {
1079
+ "cell_type": "code",
1080
+ "execution_count": 39,
1081
+ "id": "5ca085f6-30fc-4e50-a436-673f3baa75af",
1082
+ "metadata": {},
1083
+ "outputs": [
1084
+ {
1085
+ "name": "stdout",
1086
+ "output_type": "stream",
1087
+ "text": [
1088
+ "import numpy as np\n",
1089
+ "import pandas as pd\n",
1090
+ "import matplotlib.pyplot as plt\n",
1091
+ "import seaborn as sns\n",
1092
+ "import sklearn\n",
1093
+ "from sklearn.linear_model import LogisticRegression\n",
1094
+ "from sklearn.model_selection import train_test_split\n",
1095
+ "\n",
1096
+ "\n",
1097
+ "X, y = load_data()\n",
1098
+ "\n",
1099
+ "# Split the data into training and test sets\n",
1100
+ "# Split the data into training and test sets\n",
1101
+ "# Split the data into training and test sets\n",
1102
+ "# Train a Logistic Regression model on the training data\n",
1103
+ "# Print the accuracy of the model on the test data\n",
1104
+ "# SOLUTION START\n",
1105
+ "\n",
1106
+ "<output>: import numpy as np\n",
1107
+ "import pandas as pd\n",
1108
+ "import matplotlib.pyplot as plt\n",
1109
+ "import seaborn as sns\n",
1110
+ "import sklearn\n",
1111
+ "from sklearn.linear_model import LogisticRegression\n",
1112
+ "from sklearn.model_selection import train_test_split\n",
1113
+ "\n",
1114
+ "\n",
1115
+ "X, y = load_data()\n",
1116
+ "\n",
1117
+ "# Split the data into training and test sets\n",
1118
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
1119
+ "# Train a Logistic Regression model on the training data\n",
1120
+ "model = LogisticRegression()\n",
1121
+ "model.fit(X_train, y_train)\n",
1122
+ "# Print the accuracy of the model on the test data\n",
1123
+ "print(model.score(X_test, y_test))\n",
1124
+ "<output>: import numpy as np\n",
1125
+ "import pandas as pd\n",
1126
+ "import matplotlib.pyplot as plt\n",
1127
+ "import seaborn as sns\n",
1128
+ "import sklearn\n",
1129
+ "from sklearn.linear_model import LogisticRegression\n",
1130
+ "from sklearn.model_selection import train_test_split\n"
1131
+ ]
1132
+ }
1133
+ ],
1134
+ "source": [
1135
+ "prompt = \"How to train a Logistic Regression model?\"\n",
1136
+ "print(generate_response(prompt))"
1137
+ ]
1138
+ },
1139
+ {
1140
+ "cell_type": "code",
1141
+ "execution_count": null,
1142
+ "id": "146527ff-5d37-42c7-b06b-45c1aa224d17",
1143
+ "metadata": {},
1144
+ "outputs": [],
1145
+ "source": []
1146
+ },
1147
+ {
1148
+ "cell_type": "code",
1149
+ "execution_count": null,
1150
+ "id": "84f671f3-7bd6-4a7c-81e9-758052b424cf",
1151
+ "metadata": {},
1152
+ "outputs": [],
1153
+ "source": []
1154
+ }
1155
+ ],
1156
+ "metadata": {
1157
+ "kernelspec": {
1158
+ "display_name": "Python 3 (ipykernel)",
1159
+ "language": "python",
1160
+ "name": "python3"
1161
+ },
1162
+ "language_info": {
1163
+ "codemirror_mode": {
1164
+ "name": "ipython",
1165
+ "version": 3
1166
+ },
1167
+ "file_extension": ".py",
1168
+ "mimetype": "text/x-python",
1169
+ "name": "python",
1170
+ "nbconvert_exporter": "python",
1171
+ "pygments_lexer": "ipython3",
1172
+ "version": "3.10.13"
1173
+ }
1174
+ },
1175
+ "nbformat": 4,
1176
+ "nbformat_minor": 5
1177
+ }
Test-mgc-f.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Test.ipynb ADDED
@@ -0,0 +1,725 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "import os\n",
12
+ "from pprint import pprint\n",
13
+ "\n",
14
+ "import bitsandbytes as bnb\n",
15
+ "import pandas as pd\n",
16
+ "import torch\n",
17
+ "import torch.nn as nn\n",
18
+ "\n",
19
+ "import transformers\n",
20
+ "from datasets import load_dataset\n",
21
+ "from huggingface_hub import notebook_login\n",
22
+ "from peft import (\n",
23
+ " LoraConfig,\n",
24
+ " PeftConfig,\n",
25
+ " PeftModel,\n",
26
+ " get_peft_model,\n",
27
+ " prepare_model_for_kbit_training,\n",
28
+ ")\n",
29
+ "from transformers import (\n",
30
+ " AutoConfig,\n",
31
+ " AutoModelForCausalLM,\n",
32
+ " AutoTokenizer,\n",
33
+ " BitsAndBytesConfig,\n",
34
+ ")\n",
35
+ "import warnings\n",
36
+ "warnings.filterwarnings(\"ignore\")\n",
37
+ "\n",
38
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 2,
44
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "data": {
49
+ "application/vnd.jupyter.widget-view+json": {
50
+ "model_id": "b92bb6f7a2784be8bf5cab2ee87292ff",
51
+ "version_major": 2,
52
+ "version_minor": 0
53
+ },
54
+ "text/plain": [
55
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
56
+ ]
57
+ },
58
+ "metadata": {},
59
+ "output_type": "display_data"
60
+ }
61
+ ],
62
+ "source": [
63
+ "notebook_login()"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 3,
69
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
74
+ " data = json.load(json_file)"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 5,
80
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stdout",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "{'answer': 'import pandas as pd\\n'\n",
88
+ " '\\n'\n",
89
+ " '\\n'\n",
90
+ " 'index = range(14)\\n'\n",
91
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
92
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
93
+ " 'def g(df):\\n'\n",
94
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
95
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
96
+ " ' for i in range(len(df)):\\n'\n",
97
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
98
+ " ' return df\\n'\n",
99
+ " '\\n'\n",
100
+ " 'df = g(df.copy())\\n'\n",
101
+ " 'result = df\\n'\n",
102
+ " 'print(result)',\n",
103
+ " 'question': 'Problem:\\n'\n",
104
+ " 'I have the following dataframe:\\n'\n",
105
+ " 'index = range(14)\\n'\n",
106
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
107
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
108
+ " '\\n'\n",
109
+ " '\\n'\n",
110
+ " 'How can I fill the zeros with the maximun between previous and '\n",
111
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
112
+ " 'not just for \"NaN\"?. \\n'\n",
113
+ " 'The output should look like:\\n'\n",
114
+ " ' A\\n'\n",
115
+ " '0 1\\n'\n",
116
+ " '1 2\\n'\n",
117
+ " '2 2\\n'\n",
118
+ " '3 2\\n'\n",
119
+ " '4 4\\n'\n",
120
+ " '5 4\\n'\n",
121
+ " '6 6\\n'\n",
122
+ " '7 8\\n'\n",
123
+ " '8 8\\n'\n",
124
+ " '9 8\\n'\n",
125
+ " '10 8\\n'\n",
126
+ " '11 8\\n'\n",
127
+ " '12 2\\n'\n",
128
+ " '13 1'}\n"
129
+ ]
130
+ }
131
+ ],
132
+ "source": [
133
+ "pprint(data[0])"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 6,
139
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
144
+ " json.dump(data, f)"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 7,
150
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
151
+ "metadata": {},
152
+ "outputs": [
153
+ {
154
+ "data": {
155
+ "text/html": [
156
+ "<div>\n",
157
+ "<style scoped>\n",
158
+ " .dataframe tbody tr th:only-of-type {\n",
159
+ " vertical-align: middle;\n",
160
+ " }\n",
161
+ "\n",
162
+ " .dataframe tbody tr th {\n",
163
+ " vertical-align: top;\n",
164
+ " }\n",
165
+ "\n",
166
+ " .dataframe thead th {\n",
167
+ " text-align: right;\n",
168
+ " }\n",
169
+ "</style>\n",
170
+ "<table border=\"1\" class=\"dataframe\">\n",
171
+ " <thead>\n",
172
+ " <tr style=\"text-align: right;\">\n",
173
+ " <th></th>\n",
174
+ " <th>question</th>\n",
175
+ " <th>answer</th>\n",
176
+ " </tr>\n",
177
+ " </thead>\n",
178
+ " <tbody>\n",
179
+ " <tr>\n",
180
+ " <th>0</th>\n",
181
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
182
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
183
+ " </tr>\n",
184
+ " <tr>\n",
185
+ " <th>1</th>\n",
186
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
187
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>2</th>\n",
191
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
192
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
193
+ " </tr>\n",
194
+ " <tr>\n",
195
+ " <th>3</th>\n",
196
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
197
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
198
+ " </tr>\n",
199
+ " <tr>\n",
200
+ " <th>4</th>\n",
201
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
202
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
203
+ " </tr>\n",
204
+ " </tbody>\n",
205
+ "</table>\n",
206
+ "</div>"
207
+ ],
208
+ "text/plain": [
209
+ " question \\\n",
210
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
211
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
212
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
213
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
214
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
215
+ "\n",
216
+ " answer \n",
217
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
218
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
219
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
220
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
221
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
222
+ ]
223
+ },
224
+ "execution_count": 7,
225
+ "metadata": {},
226
+ "output_type": "execute_result"
227
+ }
228
+ ],
229
+ "source": [
230
+ "pd.DataFrame(data).head()"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 9,
236
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": [
240
+ "bnb_config = BitsAndBytesConfig(\n",
241
+ " load_in_4bit=True,\n",
242
+ " bnb_4bit_use_double_quant=True,\n",
243
+ " bnb_4bit_quant_type=\"nf4\",\n",
244
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
245
+ ")"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 12,
251
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "data": {
256
+ "application/vnd.jupyter.widget-view+json": {
257
+ "model_id": "3d6c5533e9ea48e295b7fdfd96da6d47",
258
+ "version_major": 2,
259
+ "version_minor": 0
260
+ },
261
+ "text/plain": [
262
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
263
+ ]
264
+ },
265
+ "metadata": {},
266
+ "output_type": "display_data"
267
+ },
268
+ {
269
+ "name": "stderr",
270
+ "output_type": "stream",
271
+ "text": [
272
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at deepseek-ai/deepseek-coder-6.7b-instruct and are newly initialized: ['model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq']\n",
273
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
274
+ ]
275
+ }
276
+ ],
277
+ "source": [
278
+ "PEFT_MODEL = \"shanjay/ds-dsc\"\n",
279
+ "\n",
280
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
281
+ "model = AutoModelForCausalLM.from_pretrained(\n",
282
+ " config.base_model_name_or_path,\n",
283
+ " return_dict=True,\n",
284
+ " quantization_config=bnb_config,\n",
285
+ " device_map=\"auto\",\n",
286
+ " trust_remote_code=True,\n",
287
+ ")\n",
288
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
289
+ "tokenizer.pad_token = tokenizer.eos_token\n",
290
+ "\n",
291
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 13,
297
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "generation_config = model.generation_config\n",
302
+ "generation_config.max_new_tokens = 200\n",
303
+ "generation_config.temperature = 0.7\n",
304
+ "generation_config.top_p = 0.7\n",
305
+ "generation_config.num_return_sequences = 1\n",
306
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
307
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 14,
313
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "DEVICE = \"cuda:0\""
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 15,
323
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
324
+ "metadata": {},
325
+ "outputs": [
326
+ {
327
+ "name": "stdout",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "<instruction>: How can I create a dataframe?\n",
331
+ "<output>:\n",
332
+ "import pandas as pd\n",
333
+ "import numpy as np\n",
334
+ "\n",
335
+ "df = pd.DataFrame(\n",
336
+ " {\n",
337
+ " \"A\": np.array([1, 2, 3]),\n",
338
+ " \"B\": np.array([4, 5, 6]),\n",
339
+ " \"C\": np.array([7, 8, 9]),\n",
340
+ " }\n",
341
+ ")\n",
342
+ "</output>\n",
343
+ "BEGIN SOLUTION\n",
344
+ "<output>\n",
345
+ "[1]\n",
346
+ "<code>\n",
347
+ "[python]\n",
348
+ "# Your code here\n",
349
+ "</code>\n",
350
+ "</output>\n",
351
+ "END SOLUTION\n",
352
+ "<output>\n",
353
+ "[1]\n",
354
+ "<code>\n",
355
+ "[python]\n",
356
+ "print(df)\n",
357
+ "</code>\n",
358
+ "</output>\n",
359
+ "\n",
360
+ "<assistant>: df = pd.DataFrame(\n",
361
+ " {\n",
362
+ " \"A\": np.array([1, 2, 3]),\n",
363
+ "CPU times: user 27.4 s, sys: 372 ms, total: 27.8 s\n",
364
+ "Wall time: 27.9 s\n"
365
+ ]
366
+ }
367
+ ],
368
+ "source": [
369
+ "%%time\n",
370
+ "prompt = f\"\"\"\n",
371
+ "<instruction>: How can I create a dataframe?\n",
372
+ "<output>:\n",
373
+ "\"\"\".strip()\n",
374
+ "\n",
375
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
376
+ "with torch.inference_mode():\n",
377
+ " outputs = model.generate(\n",
378
+ " input_ids=encoding.input_ids,\n",
379
+ " attention_mask=encoding.attention_mask,\n",
380
+ " generation_config=generation_config,\n",
381
+ " )\n",
382
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 16,
388
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
389
+ "metadata": {},
390
+ "outputs": [],
391
+ "source": [
392
+ "def generate_response(question: str) -> str:\n",
393
+ " prompt = f\"\"\"\n",
394
+ "<instruction>: {question}\n",
395
+ "<output>:\n",
396
+ "\"\"\".strip()\n",
397
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
398
+ " with torch.inference_mode():\n",
399
+ " outputs = model.generate(\n",
400
+ " input_ids=encoding.input_ids,\n",
401
+ " attention_mask=encoding.attention_mask,\n",
402
+ " generation_config=generation_config,\n",
403
+ " )\n",
404
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
405
+ "\n",
406
+ " assistant_start = \"<output>:\"\n",
407
+ " response_start = response.find(assistant_start)\n",
408
+ " return response[response_start + len(assistant_start) :].strip()"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 17,
414
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
415
+ "metadata": {},
416
+ "outputs": [
417
+ {
418
+ "name": "stdout",
419
+ "output_type": "stream",
420
+ "text": [
421
+ "import pandas as pd\n",
422
+ "import numpy as np\n",
423
+ "\n",
424
+ "df = pd.DataFrame(\n",
425
+ " {\n",
426
+ " \"A\": np.array([1, 2, 3]),\n",
427
+ " \"B\": np.array([4, 5, 6]),\n",
428
+ " \"C\": np.array([7, 8, 9]),\n",
429
+ " }\n",
430
+ ")\n",
431
+ "</output>\n",
432
+ "BEGIN SOLUTION\n",
433
+ "<output>\n",
434
+ "[1]\n",
435
+ "<code>\n",
436
+ "[python]\n",
437
+ "# Your code here\n",
438
+ "</code>\n",
439
+ "</output>\n",
440
+ "END SOLUTION\n",
441
+ "<output>\n",
442
+ "[1]\n",
443
+ "<code>\n",
444
+ "[python]\n",
445
+ "print(df)\n",
446
+ "</code>\n",
447
+ "</output>\n",
448
+ "\n",
449
+ "<assistant>: df = pd.DataFrame(\n",
450
+ " {\n",
451
+ " \"A\": np.array([1, 2, 3]),\n"
452
+ ]
453
+ }
454
+ ],
455
+ "source": [
456
+ "prompt = \"How can I create a dataframe?\"\n",
457
+ "print(generate_response(prompt))"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": 18,
463
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
464
+ "metadata": {},
465
+ "outputs": [
466
+ {
467
+ "name": "stdout",
468
+ "output_type": "stream",
469
+ "text": [
470
+ "import pandas as pd\n",
471
+ "import numpy as np\n",
472
+ "\n",
473
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
474
+ "df2 = pd.DataFrame({'A': [4, 5, 6], 'B': [7, 8, 9]})\n",
475
+ "</output>\n",
476
+ "<assistant>: df = pd.concat([df1, df2])\n",
477
+ "</assistant>\n",
478
+ "<output>: df\n",
479
+ "</output>\n",
480
+ "<code>\n",
481
+ "import pandas as pd\n",
482
+ "import numpy as np\n",
483
+ "\n",
484
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
485
+ "df2 = pd.DataFrame({'A': [4, 5, 6],\n"
486
+ ]
487
+ }
488
+ ],
489
+ "source": [
490
+ "prompt = \"How to merge two dataframes?\"\n",
491
+ "print(generate_response(prompt))"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": 19,
497
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
498
+ "metadata": {},
499
+ "outputs": [
500
+ {
501
+ "name": "stdout",
502
+ "output_type": "stream",
503
+ "text": [
504
+ "<code>\n",
505
+ "import pandas as pd\n",
506
+ "import numpy as np\n",
507
+ "\n",
508
+ "name=np.array(['joy','shan'])\n",
509
+ "roll_no=np.array([1,2])\n",
510
+ "</code>\n",
511
+ "BEGIN SOLUTION\n",
512
+ "<code>\n",
513
+ "[insert]\n",
514
+ "</code>\n",
515
+ "END SOLUTION\n",
516
+ "<code>\n",
517
+ "print(df)\n",
518
+ "</code>\n",
519
+ "<assistant>: df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
520
+ "</assistant>\n",
521
+ "<code>\n",
522
+ "print(df)\n",
523
+ "</code>\n",
524
+ "\n",
525
+ "<assistant>: df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
526
+ "print(df)\n",
527
+ "</assistant>\n",
528
+ "<code>\n",
529
+ "print(df)\n",
530
+ "</code>\n",
531
+ "<assistant>: df = pd.\n"
532
+ ]
533
+ }
534
+ ],
535
+ "source": [
536
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
537
+ "print(generate_response(prompt))"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "code",
542
+ "execution_count": 20,
543
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
544
+ "metadata": {},
545
+ "outputs": [
546
+ {
547
+ "name": "stdout",
548
+ "output_type": "stream",
549
+ "text": [
550
+ "[ ]: import matplotlib.pyplot as plt\n",
551
+ "import numpy as np\n",
552
+ "\n",
553
+ "x = np.linspace(0, 10, 100)\n",
554
+ "y = np.sin(x)\n",
555
+ "\n",
556
+ "# your code here\n",
557
+ "</output>\n",
558
+ "<assistant>: plt.plot(x, y)\n",
559
+ "plt.show()\n",
560
+ "</assistant>\n",
561
+ "<output>: [ ]: plt.plot(x, y)\n",
562
+ "plt.show()\n",
563
+ "</output>\n",
564
+ "<assistant>: plt.plot(x, y)\n",
565
+ "plt.show()\n",
566
+ "</assistant>\n",
567
+ "<output>: [ ]: plt.plot(x, y)\n",
568
+ "plt.show()\n",
569
+ "</output>\n",
570
+ "<assistant>: plt.plot(x, y)\n",
571
+ "plt.show()\n",
572
+ "</assistant>\n",
573
+ "<output>\n"
574
+ ]
575
+ }
576
+ ],
577
+ "source": [
578
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
579
+ "print(generate_response(prompt))"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": 21,
585
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
586
+ "metadata": {},
587
+ "outputs": [
588
+ {
589
+ "name": "stdout",
590
+ "output_type": "stream",
591
+ "text": [
592
+ "import pandas as pd\n",
593
+ "import numpy as np\n",
594
+ "\n",
595
+ "data = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
596
+ "'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
597
+ "\n",
598
+ "data['TIME'] = pd.to_datetime(data['TIME'])\n",
599
+ "\n",
600
+ "</output>\n",
601
+ "BEGIN SOLUTION\n",
602
+ "<output>\n",
603
+ "[insert]\n",
604
+ "</output>\n"
605
+ ]
606
+ }
607
+ ],
608
+ "source": [
609
+ "prompt = \"\"\"Problem:\n",
610
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
611
+ "ID TIME\n",
612
+ "01 2018-07-11 11:12:20\n",
613
+ "01 2018-07-12 12:00:23\n",
614
+ "01 2018-07-13 12:00:00\n",
615
+ "02 2019-09-11 11:00:00\n",
616
+ "02 2019-09-12 12:00:00\n",
617
+ "\n",
618
+ "\n",
619
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
620
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
621
+ "\n",
622
+ "\n",
623
+ "but get an error:\n",
624
+ "'NoneType' object is not callable\n",
625
+ "\n",
626
+ "\n",
627
+ "If i replace datetime to numbers, it works.... any solutions?\n",
628
+ "\"\"\"\n",
629
+ "print(generate_response(prompt))"
630
+ ]
631
+ },
632
+ {
633
+ "cell_type": "code",
634
+ "execution_count": 22,
635
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
636
+ "metadata": {},
637
+ "outputs": [
638
+ {
639
+ "name": "stdout",
640
+ "output_type": "stream",
641
+ "text": [
642
+ "<code>\n",
643
+ "import pandas as pd\n",
644
+ "import numpy as np\n",
645
+ "\n",
646
+ "index = range(14)\n",
647
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
648
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
649
+ "</code>\n",
650
+ "BEGIN SOLUTION\n",
651
+ "<code>\n",
652
+ "[insert]\n",
653
+ "</code>\n",
654
+ "END SOLUTION\n",
655
+ "<code>\n",
656
+ "print(df)\n",
657
+ "</code>\n",
658
+ "\n",
659
+ "<assistant>: df['A'] = df['A'].replace(0, np.nan)\n",
660
+ "df['A'] = df['A'].fillna(method='ffill')\n",
661
+ "df['A'] = df['A'].fillna(method='bfill')\n"
662
+ ]
663
+ }
664
+ ],
665
+ "source": [
666
+ "prompt = \"\"\"Problem:\n",
667
+ "I have the following dataframe:\n",
668
+ "index = range(14)\n",
669
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
670
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
671
+ "\n",
672
+ "\n",
673
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
674
+ "The output should look like:\n",
675
+ " A\n",
676
+ "0 1\n",
677
+ "1 2\n",
678
+ "2 2\n",
679
+ "3 2\n",
680
+ "4 4\n",
681
+ "5 4\n",
682
+ "6 6\n",
683
+ "7 8\n",
684
+ "8 8\n",
685
+ "9 8\n",
686
+ "10 8\n",
687
+ "11 8\n",
688
+ "12 2\n",
689
+ "13 1\n",
690
+ "\"\"\"\n",
691
+ "\n",
692
+ "print(generate_response(prompt))"
693
+ ]
694
+ },
695
+ {
696
+ "cell_type": "code",
697
+ "execution_count": null,
698
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
699
+ "metadata": {},
700
+ "outputs": [],
701
+ "source": []
702
+ }
703
+ ],
704
+ "metadata": {
705
+ "kernelspec": {
706
+ "display_name": "Python 3 (ipykernel)",
707
+ "language": "python",
708
+ "name": "python3"
709
+ },
710
+ "language_info": {
711
+ "codemirror_mode": {
712
+ "name": "ipython",
713
+ "version": 3
714
+ },
715
+ "file_extension": ".py",
716
+ "mimetype": "text/x-python",
717
+ "name": "python",
718
+ "nbconvert_exporter": "python",
719
+ "pygments_lexer": "ipython3",
720
+ "version": "3.10.13"
721
+ }
722
+ },
723
+ "nbformat": 4,
724
+ "nbformat_minor": 5
725
+ }
Testv3.ipynb ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "\n",
14
+ "===================================BUG REPORT===================================\n",
15
+ "Welcome to bitsandbytes. For bug reports, please run\n",
16
+ "\n",
17
+ "python -m bitsandbytes\n",
18
+ "\n",
19
+ " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
20
+ "================================================================================\n",
21
+ "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n",
22
+ "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
23
+ "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
24
+ "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
25
+ "CUDA SETUP: Detected CUDA version 121\n",
26
+ "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib'), PosixPath('/usr/local/nvidia/lib64')}\n",
34
+ " warn(msg)\n",
35
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
36
+ " warn(msg)\n",
37
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n",
38
+ " warn(msg)\n",
39
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('https'), PosixPath('//g.notebooksg.jarvislabs.net')}\n",
40
+ " warn(msg)\n",
41
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
42
+ " warn(msg)\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "import json\n",
48
+ "import os\n",
49
+ "from pprint import pprint\n",
50
+ "\n",
51
+ "import bitsandbytes as bnb\n",
52
+ "import pandas as pd\n",
53
+ "import torch\n",
54
+ "import torch.nn as nn\n",
55
+ "\n",
56
+ "import transformers\n",
57
+ "from datasets import load_dataset\n",
58
+ "from huggingface_hub import notebook_login\n",
59
+ "from peft import (\n",
60
+ " LoraConfig,\n",
61
+ " PeftConfig,\n",
62
+ " PeftModel,\n",
63
+ " get_peft_model,\n",
64
+ " prepare_model_for_kbit_training,\n",
65
+ ")\n",
66
+ "from transformers import (\n",
67
+ " AutoConfig,\n",
68
+ " AutoModelForCausalLM,\n",
69
+ " AutoTokenizer,\n",
70
+ " BitsAndBytesConfig,\n",
71
+ ")\n",
72
+ "import warnings\n",
73
+ "warnings.filterwarnings(\"ignore\")\n",
74
+ "\n",
75
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 2,
81
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "2316598db91e443f8c4e1cc615a9507f",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ }
98
+ ],
99
+ "source": [
100
+ "notebook_login()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 3,
106
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
111
+ " data = json.load(json_file)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 4,
117
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "{'answer': 'import pandas as pd\\n'\n",
125
+ " '\\n'\n",
126
+ " '\\n'\n",
127
+ " 'index = range(14)\\n'\n",
128
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
129
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
130
+ " 'def g(df):\\n'\n",
131
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
132
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
133
+ " ' for i in range(len(df)):\\n'\n",
134
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
135
+ " ' return df\\n'\n",
136
+ " '\\n'\n",
137
+ " 'df = g(df.copy())\\n'\n",
138
+ " 'result = df\\n'\n",
139
+ " 'print(result)',\n",
140
+ " 'question': 'Problem:\\n'\n",
141
+ " 'I have the following dataframe:\\n'\n",
142
+ " 'index = range(14)\\n'\n",
143
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
144
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
145
+ " '\\n'\n",
146
+ " '\\n'\n",
147
+ " 'How can I fill the zeros with the maximun between previous and '\n",
148
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
149
+ " 'not just for \"NaN\"?. \\n'\n",
150
+ " 'The output should look like:\\n'\n",
151
+ " ' A\\n'\n",
152
+ " '0 1\\n'\n",
153
+ " '1 2\\n'\n",
154
+ " '2 2\\n'\n",
155
+ " '3 2\\n'\n",
156
+ " '4 4\\n'\n",
157
+ " '5 4\\n'\n",
158
+ " '6 6\\n'\n",
159
+ " '7 8\\n'\n",
160
+ " '8 8\\n'\n",
161
+ " '9 8\\n'\n",
162
+ " '10 8\\n'\n",
163
+ " '11 8\\n'\n",
164
+ " '12 2\\n'\n",
165
+ " '13 1'}\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "pprint(data[0])"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 5,
176
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
181
+ " json.dump(data, f)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>question</th>\n",
212
+ " <th>answer</th>\n",
213
+ " </tr>\n",
214
+ " </thead>\n",
215
+ " <tbody>\n",
216
+ " <tr>\n",
217
+ " <th>0</th>\n",
218
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
219
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
220
+ " </tr>\n",
221
+ " <tr>\n",
222
+ " <th>1</th>\n",
223
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
224
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>2</th>\n",
228
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
229
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>3</th>\n",
233
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
234
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
239
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
240
+ " </tr>\n",
241
+ " </tbody>\n",
242
+ "</table>\n",
243
+ "</div>"
244
+ ],
245
+ "text/plain": [
246
+ " question \\\n",
247
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
248
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
249
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
250
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
251
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
252
+ "\n",
253
+ " answer \n",
254
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
255
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
256
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
257
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
258
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
259
+ ]
260
+ },
261
+ "execution_count": 6,
262
+ "metadata": {},
263
+ "output_type": "execute_result"
264
+ }
265
+ ],
266
+ "source": [
267
+ "pd.DataFrame(data).head()"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 7,
273
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "bnb_config = BitsAndBytesConfig(\n",
278
+ " load_in_4bit=True,\n",
279
+ " bnb_4bit_use_double_quant=True,\n",
280
+ " bnb_4bit_quant_type=\"nf4\",\n",
281
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
282
+ ")"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 8,
288
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "data": {
293
+ "application/vnd.jupyter.widget-view+json": {
294
+ "model_id": "7e1406ca2f5f4c0dbf0a581edebc9a6b",
295
+ "version_major": 2,
296
+ "version_minor": 0
297
+ },
298
+ "text/plain": [
299
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
300
+ ]
301
+ },
302
+ "metadata": {},
303
+ "output_type": "display_data"
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at deepseek-ai/deepseek-coder-6.7b-instruct and are newly initialized: ['model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq']\n",
310
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
311
+ ]
312
+ },
313
+ {
314
+ "data": {
315
+ "application/vnd.jupyter.widget-view+json": {
316
+ "model_id": "2b9911223be04aeabbc44875a5929afa",
317
+ "version_major": 2,
318
+ "version_minor": 0
319
+ },
320
+ "text/plain": [
321
+ "adapter_model.bin: 0%| | 0.00/33.6M [00:00<?, ?B/s]"
322
+ ]
323
+ },
324
+ "metadata": {},
325
+ "output_type": "display_data"
326
+ }
327
+ ],
328
+ "source": [
329
+ "PEFT_MODEL = \"shanjay/ds-dsc-v4\"\n",
330
+ "\n",
331
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
332
+ "model = AutoModelForCausalLM.from_pretrained(\n",
333
+ " config.base_model_name_or_path,\n",
334
+ " return_dict=True,\n",
335
+ " quantization_config=bnb_config,\n",
336
+ " device_map=\"auto\",\n",
337
+ " trust_remote_code=True,\n",
338
+ ")\n",
339
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
340
+ "tokenizer.pad_token = tokenizer.eos_token\n",
341
+ "\n",
342
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 9,
348
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "generation_config = model.generation_config\n",
353
+ "generation_config.max_new_tokens = 200\n",
354
+ "generation_config.temperature = 0.7\n",
355
+ "generation_config.top_p = 0.7\n",
356
+ "generation_config.num_return_sequences = 1\n",
357
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
358
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 10,
364
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "DEVICE = \"cuda:0\""
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 11,
374
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
375
+ "metadata": {},
376
+ "outputs": [
377
+ {
378
+ "name": "stdout",
379
+ "output_type": "stream",
380
+ "text": [
381
+ "<instruction>: How can I create a dataframe?\n",
382
+ "<output>: import pandas as pd\n",
383
+ "\n",
384
+ "\n",
385
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
386
+ "print(df)\n",
387
+ " A B\n",
388
+ "0 1 4\n",
389
+ "1 2 5\n",
390
+ "2 3 6\n",
391
+ "<output>: import pandas as pd\n",
392
+ "\n",
393
+ "\n",
394
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
395
+ "print(df)\n",
396
+ " A B\n",
397
+ "0 1 4\n",
398
+ "1 2 5\n",
399
+ "2 3 6\n",
400
+ "<output>: import pandas as pd\n",
401
+ "\n",
402
+ "\n",
403
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
404
+ "print(df)\n",
405
+ " A\n",
406
+ "CPU times: user 26.8 s, sys: 346 ms, total: 27.1 s\n",
407
+ "Wall time: 27.2 s\n"
408
+ ]
409
+ }
410
+ ],
411
+ "source": [
412
+ "%%time\n",
413
+ "prompt = f\"\"\"\n",
414
+ "<instruction>: How can I create a dataframe?\n",
415
+ "<output>:\n",
416
+ "\"\"\".strip()\n",
417
+ "\n",
418
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
419
+ "with torch.inference_mode():\n",
420
+ " outputs = model.generate(\n",
421
+ " input_ids=encoding.input_ids,\n",
422
+ " attention_mask=encoding.attention_mask,\n",
423
+ " generation_config=generation_config,\n",
424
+ " )\n",
425
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 12,
431
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "def generate_response(question: str) -> str:\n",
436
+ " prompt = f\"\"\"\n",
437
+ "<instruction>: {question}\n",
438
+ "<output>:\n",
439
+ "\"\"\".strip()\n",
440
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
441
+ " with torch.inference_mode():\n",
442
+ " outputs = model.generate(\n",
443
+ " input_ids=encoding.input_ids,\n",
444
+ " attention_mask=encoding.attention_mask,\n",
445
+ " generation_config=generation_config,\n",
446
+ " )\n",
447
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
448
+ "\n",
449
+ " assistant_start = \"<output>:\"\n",
450
+ " response_start = response.find(assistant_start)\n",
451
+ " return response[response_start + len(assistant_start) :].strip()"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 13,
457
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
458
+ "metadata": {},
459
+ "outputs": [
460
+ {
461
+ "name": "stdout",
462
+ "output_type": "stream",
463
+ "text": [
464
+ "import pandas as pd\n",
465
+ "\n",
466
+ "\n",
467
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
468
+ "print(df)\n",
469
+ " A B\n",
470
+ "0 1 4\n",
471
+ "1 2 5\n",
472
+ "2 3 6\n",
473
+ "<output>: import pandas as pd\n",
474
+ "\n",
475
+ "\n",
476
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
477
+ "print(df)\n",
478
+ " A B\n",
479
+ "0 1 4\n",
480
+ "1 2 5\n",
481
+ "2 3 6\n",
482
+ "<output>: import pandas as pd\n",
483
+ "\n",
484
+ "\n",
485
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
486
+ "print(df)\n",
487
+ " A\n"
488
+ ]
489
+ }
490
+ ],
491
+ "source": [
492
+ "prompt = \"How can I create a dataframe?\"\n",
493
+ "print(generate_response(prompt))"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 14,
499
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
500
+ "metadata": {},
501
+ "outputs": [
502
+ {
503
+ "name": "stdout",
504
+ "output_type": "stream",
505
+ "text": [
506
+ "import pandas as pd\n",
507
+ "\n",
508
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
509
+ "df2 = pd.DataFrame({'C': [7, 8, 9], 'D': [10, 11, 12]})\n",
510
+ "\n",
511
+ "# merge df1 and df2\n",
512
+ "result = ...\n",
513
+ "\n",
514
+ "print(result)\n",
515
+ "\n",
516
+ "# Expected output\n",
517
+ "# A B C D\n",
518
+ "# 0 1 4 7 10\n",
519
+ "# 1 2 5 8 11\n",
520
+ "# 2 3 6 9 12\n",
521
+ "<output>: import pandas as pd\n",
522
+ "\n",
523
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]\n"
524
+ ]
525
+ }
526
+ ],
527
+ "source": [
528
+ "prompt = \"How to merge two dataframes?\"\n",
529
+ "print(generate_response(prompt))"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 15,
535
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
536
+ "metadata": {},
537
+ "outputs": [
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "import pandas as pd\n",
543
+ "\n",
544
+ "\n",
545
+ "name=['joy','shan']\n",
546
+ "roll_no=[1,2]\n",
547
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
548
+ "print(df)\n",
549
+ "\n",
550
+ "\n",
551
+ " name roll_no\n",
552
+ "0 joy 1\n",
553
+ "1 shan 2\n",
554
+ "<output>: import pandas as pd\n",
555
+ "\n",
556
+ "\n",
557
+ "name=['joy','shan']\n",
558
+ "roll_no=[1,2]\n",
559
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
560
+ "print(df)\n",
561
+ "\n",
562
+ "\n",
563
+ " name roll_no\n",
564
+ "0 joy 1\n",
565
+ "1 shan 2\n",
566
+ "<output>: import pandas as pd\n",
567
+ "\n",
568
+ "\n",
569
+ "name=['joy','shan']\n",
570
+ "roll_no=[1,2]\n",
571
+ "df = pd.DataFrame({\n"
572
+ ]
573
+ }
574
+ ],
575
+ "source": [
576
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
577
+ "print(generate_response(prompt))"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 16,
583
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
584
+ "metadata": {},
585
+ "outputs": [
586
+ {
587
+ "name": "stdout",
588
+ "output_type": "stream",
589
+ "text": [
590
+ "import matplotlib.pyplot as plt\n",
591
+ "\n",
592
+ "x = range(10)\n",
593
+ "y = range(10)\n",
594
+ "\n",
595
+ "plt.plot(x, y, label='line')\n",
596
+ "plt.scatter(x, y, label='scatter')\n",
597
+ "plt.bar(x, y, label='bar')\n",
598
+ "plt.hist(x, y, label='hist')\n",
599
+ "plt.legend()\n",
600
+ "plt.show()\n",
601
+ "<output>: import matplotlib.pyplot as plt\n",
602
+ "\n",
603
+ "x = range(10)\n",
604
+ "y = range(10)\n",
605
+ "\n",
606
+ "plt.plot(x, y, label='line')\n",
607
+ "plt.scatter(x, y, label='scatter')\n",
608
+ "plt.bar(x, y, label='bar')\n",
609
+ "plt.hist(x, y, label='hist')\n",
610
+ "pl\n"
611
+ ]
612
+ }
613
+ ],
614
+ "source": [
615
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
616
+ "print(generate_response(prompt))"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 19,
622
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "name": "stdout",
627
+ "output_type": "stream",
628
+ "text": [
629
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
630
+ ]
631
+ }
632
+ ],
633
+ "source": [
634
+ "prompt = \"\"\"Problem:\n",
635
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
636
+ "ID TIME\n",
637
+ "01 2018-07-11 11:12:20\n",
638
+ "01 2018-07-12 12:00:23\n",
639
+ "01 2018-07-13 12:00:00\n",
640
+ "02 2019-09-11 11:00:00\n",
641
+ "02 2019-09-12 12:00:00\n",
642
+ "\n",
643
+ "\n",
644
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
645
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
646
+ "\n",
647
+ "\n",
648
+ "but get an error:\n",
649
+ "'NoneType' object is not callable\n",
650
+ "\n",
651
+ "\n",
652
+ "If i replace datetime to numbers, it works.... any solutions?\n",
653
+ "\"\"\"\n",
654
+ "print(generate_response(prompt))"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": 20,
660
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
661
+ "metadata": {},
662
+ "outputs": [
663
+ {
664
+ "name": "stdout",
665
+ "output_type": "stream",
666
+ "text": [
667
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
668
+ ]
669
+ }
670
+ ],
671
+ "source": [
672
+ "prompt = \"\"\"Problem:\n",
673
+ "I have the following dataframe:\n",
674
+ "index = range(14)\n",
675
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
676
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
677
+ "\n",
678
+ "\n",
679
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
680
+ "The output should look like:\n",
681
+ " A\n",
682
+ "0 1\n",
683
+ "1 2\n",
684
+ "2 2\n",
685
+ "3 2\n",
686
+ "4 4\n",
687
+ "5 4\n",
688
+ "6 6\n",
689
+ "7 8\n",
690
+ "8 8\n",
691
+ "9 8\n",
692
+ "10 8\n",
693
+ "11 8\n",
694
+ "12 2\n",
695
+ "13 1\n",
696
+ "\"\"\"\n",
697
+ "\n",
698
+ "print(generate_response(prompt))"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": 27,
704
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
705
+ "metadata": {},
706
+ "outputs": [
707
+ {
708
+ "name": "stdout",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "Problem:\n",
712
+ "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n",
713
+ "import pandas as pd\n",
714
+ "import numpy as np\n",
715
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
716
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
717
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
718
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
719
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
720
+ "\n",
721
+ "\n",
722
+ " users keywords_0 keywords_1 keywords_2 keywords_3\n",
723
+ "0 Hu Tao a d NaN f\n",
724
+ "1 Zhongli NaN e NaN NaN\n",
725
+ "2 Xingqiu c NaN b g\n",
726
+ "\n",
727
+ "\n",
728
+ "Want to accomplish the following:\n",
729
+ " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n",
730
+ "0 Hu Tao a d NaN f a-d-f\n",
731
+ "1 Zhongli NaN e NaN NaN e\n",
732
+ "2 Xingqiu c NaN b g c-b-g\n",
733
+ "\n",
734
+ "\n",
735
+ "Pseudo code:\n",
736
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
737
+ "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n",
738
+ "\n",
739
+ "\n",
740
+ "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n"
741
+ ]
742
+ }
743
+ ],
744
+ "source": [
745
+ "print(data[5]['question'])"
746
+ ]
747
+ },
748
+ {
749
+ "cell_type": "code",
750
+ "execution_count": 28,
751
+ "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1",
752
+ "metadata": {},
753
+ "outputs": [
754
+ {
755
+ "name": "stdout",
756
+ "output_type": "stream",
757
+ "text": [
758
+ "import pandas as pd\n",
759
+ "import numpy as np\n",
760
+ "\n",
761
+ "\n",
762
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
763
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
764
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
765
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
766
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
767
+ "import numpy as np\n",
768
+ "def g(df):\n",
769
+ " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n",
770
+ " return df\n",
771
+ "\n",
772
+ "df = g(df.copy())\n",
773
+ "result = df\n",
774
+ "print(result)\n"
775
+ ]
776
+ }
777
+ ],
778
+ "source": [
779
+ "print(data[5]['answer'])"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": 29,
785
+ "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0",
786
+ "metadata": {},
787
+ "outputs": [
788
+ {
789
+ "name": "stdout",
790
+ "output_type": "stream",
791
+ "text": [
792
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
793
+ ]
794
+ }
795
+ ],
796
+ "source": [
797
+ "prompt = data[5]['question']\n",
798
+ "print(generate_response(prompt))"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": null,
804
+ "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545",
805
+ "metadata": {},
806
+ "outputs": [],
807
+ "source": []
808
+ }
809
+ ],
810
+ "metadata": {
811
+ "kernelspec": {
812
+ "display_name": "Python 3 (ipykernel)",
813
+ "language": "python",
814
+ "name": "python3"
815
+ },
816
+ "language_info": {
817
+ "codemirror_mode": {
818
+ "name": "ipython",
819
+ "version": 3
820
+ },
821
+ "file_extension": ".py",
822
+ "mimetype": "text/x-python",
823
+ "name": "python",
824
+ "nbconvert_exporter": "python",
825
+ "pygments_lexer": "ipython3",
826
+ "version": "3.10.13"
827
+ }
828
+ },
829
+ "nbformat": 4,
830
+ "nbformat_minor": 5
831
+ }
Testv4.ipynb ADDED
@@ -0,0 +1,866 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "\n",
14
+ "===================================BUG REPORT===================================\n",
15
+ "Welcome to bitsandbytes. For bug reports, please run\n",
16
+ "\n",
17
+ "python -m bitsandbytes\n",
18
+ "\n",
19
+ " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
20
+ "================================================================================\n",
21
+ "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n",
22
+ "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
23
+ "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
24
+ "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
25
+ "CUDA SETUP: Detected CUDA version 121\n",
26
+ "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib'), PosixPath('/usr/local/nvidia/lib64')}\n",
34
+ " warn(msg)\n",
35
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
36
+ " warn(msg)\n",
37
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n",
38
+ " warn(msg)\n",
39
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('https'), PosixPath('//g.notebooksg.jarvislabs.net')}\n",
40
+ " warn(msg)\n",
41
+ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
42
+ " warn(msg)\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "import json\n",
48
+ "import os\n",
49
+ "from pprint import pprint\n",
50
+ "\n",
51
+ "import bitsandbytes as bnb\n",
52
+ "import pandas as pd\n",
53
+ "import torch\n",
54
+ "import torch.nn as nn\n",
55
+ "\n",
56
+ "import transformers\n",
57
+ "from datasets import load_dataset\n",
58
+ "from huggingface_hub import notebook_login\n",
59
+ "from peft import (\n",
60
+ " LoraConfig,\n",
61
+ " PeftConfig,\n",
62
+ " PeftModel,\n",
63
+ " get_peft_model,\n",
64
+ " prepare_model_for_kbit_training,\n",
65
+ ")\n",
66
+ "from transformers import (\n",
67
+ " AutoConfig,\n",
68
+ " AutoModelForCausalLM,\n",
69
+ " AutoTokenizer,\n",
70
+ " BitsAndBytesConfig,\n",
71
+ ")\n",
72
+ "import warnings\n",
73
+ "warnings.filterwarnings(\"ignore\")\n",
74
+ "\n",
75
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 2,
81
+ "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "2316598db91e443f8c4e1cc615a9507f",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ }
98
+ ],
99
+ "source": [
100
+ "notebook_login()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 3,
106
+ "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
111
+ " data = json.load(json_file)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 4,
117
+ "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "{'answer': 'import pandas as pd\\n'\n",
125
+ " '\\n'\n",
126
+ " '\\n'\n",
127
+ " 'index = range(14)\\n'\n",
128
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
129
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
130
+ " 'def g(df):\\n'\n",
131
+ " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
132
+ " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
133
+ " ' for i in range(len(df)):\\n'\n",
134
+ " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
135
+ " ' return df\\n'\n",
136
+ " '\\n'\n",
137
+ " 'df = g(df.copy())\\n'\n",
138
+ " 'result = df\\n'\n",
139
+ " 'print(result)',\n",
140
+ " 'question': 'Problem:\\n'\n",
141
+ " 'I have the following dataframe:\\n'\n",
142
+ " 'index = range(14)\\n'\n",
143
+ " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
144
+ " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
145
+ " '\\n'\n",
146
+ " '\\n'\n",
147
+ " 'How can I fill the zeros with the maximun between previous and '\n",
148
+ " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
149
+ " 'not just for \"NaN\"?. \\n'\n",
150
+ " 'The output should look like:\\n'\n",
151
+ " ' A\\n'\n",
152
+ " '0 1\\n'\n",
153
+ " '1 2\\n'\n",
154
+ " '2 2\\n'\n",
155
+ " '3 2\\n'\n",
156
+ " '4 4\\n'\n",
157
+ " '5 4\\n'\n",
158
+ " '6 6\\n'\n",
159
+ " '7 8\\n'\n",
160
+ " '8 8\\n'\n",
161
+ " '9 8\\n'\n",
162
+ " '10 8\\n'\n",
163
+ " '11 8\\n'\n",
164
+ " '12 2\\n'\n",
165
+ " '13 1'}\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "pprint(data[0])"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 5,
176
+ "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
181
+ " json.dump(data, f)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>question</th>\n",
212
+ " <th>answer</th>\n",
213
+ " </tr>\n",
214
+ " </thead>\n",
215
+ " <tbody>\n",
216
+ " <tr>\n",
217
+ " <th>0</th>\n",
218
+ " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
219
+ " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
220
+ " </tr>\n",
221
+ " <tr>\n",
222
+ " <th>1</th>\n",
223
+ " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
224
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>2</th>\n",
228
+ " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
229
+ " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>3</th>\n",
233
+ " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
234
+ " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
239
+ " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
240
+ " </tr>\n",
241
+ " </tbody>\n",
242
+ "</table>\n",
243
+ "</div>"
244
+ ],
245
+ "text/plain": [
246
+ " question \\\n",
247
+ "0 Problem:\\nI have the following dataframe:\\nind... \n",
248
+ "1 Problem:\\ni got an issue over ranking of date ... \n",
249
+ "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
250
+ "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
251
+ "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
252
+ "\n",
253
+ " answer \n",
254
+ "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
255
+ "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
256
+ "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
257
+ "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
258
+ "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
259
+ ]
260
+ },
261
+ "execution_count": 6,
262
+ "metadata": {},
263
+ "output_type": "execute_result"
264
+ }
265
+ ],
266
+ "source": [
267
+ "pd.DataFrame(data).head()"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 7,
273
+ "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "bnb_config = BitsAndBytesConfig(\n",
278
+ " load_in_4bit=True,\n",
279
+ " bnb_4bit_use_double_quant=True,\n",
280
+ " bnb_4bit_quant_type=\"nf4\",\n",
281
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
282
+ ")"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 8,
288
+ "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "data": {
293
+ "application/vnd.jupyter.widget-view+json": {
294
+ "model_id": "7e1406ca2f5f4c0dbf0a581edebc9a6b",
295
+ "version_major": 2,
296
+ "version_minor": 0
297
+ },
298
+ "text/plain": [
299
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
300
+ ]
301
+ },
302
+ "metadata": {},
303
+ "output_type": "display_data"
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at deepseek-ai/deepseek-coder-6.7b-instruct and are newly initialized: ['model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq']\n",
310
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
311
+ ]
312
+ },
313
+ {
314
+ "data": {
315
+ "application/vnd.jupyter.widget-view+json": {
316
+ "model_id": "2b9911223be04aeabbc44875a5929afa",
317
+ "version_major": 2,
318
+ "version_minor": 0
319
+ },
320
+ "text/plain": [
321
+ "adapter_model.bin: 0%| | 0.00/33.6M [00:00<?, ?B/s]"
322
+ ]
323
+ },
324
+ "metadata": {},
325
+ "output_type": "display_data"
326
+ }
327
+ ],
328
+ "source": [
329
+ "PEFT_MODEL = \"shanjay/ds-dsc-v4\"\n",
330
+ "\n",
331
+ "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
332
+ "model = AutoModelForCausalLM.from_pretrained(\n",
333
+ " config.base_model_name_or_path,\n",
334
+ " return_dict=True,\n",
335
+ " quantization_config=bnb_config,\n",
336
+ " device_map=\"auto\",\n",
337
+ " trust_remote_code=True,\n",
338
+ ")\n",
339
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
340
+ "tokenizer.pad_token = tokenizer.eos_token\n",
341
+ "\n",
342
+ "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 9,
348
+ "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "generation_config = model.generation_config\n",
353
+ "generation_config.max_new_tokens = 200\n",
354
+ "generation_config.temperature = 0.7\n",
355
+ "generation_config.top_p = 0.7\n",
356
+ "generation_config.num_return_sequences = 1\n",
357
+ "generation_config.pad_token_id = tokenizer.eos_token_id\n",
358
+ "generation_config.eos_token_id = tokenizer.eos_token_id"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 10,
364
+ "id": "aee4385b-d855-4225-9532-4e9002322579",
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "DEVICE = \"cuda:0\""
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 11,
374
+ "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
375
+ "metadata": {},
376
+ "outputs": [
377
+ {
378
+ "name": "stdout",
379
+ "output_type": "stream",
380
+ "text": [
381
+ "<instruction>: How can I create a dataframe?\n",
382
+ "<output>: import pandas as pd\n",
383
+ "\n",
384
+ "\n",
385
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
386
+ "print(df)\n",
387
+ " A B\n",
388
+ "0 1 4\n",
389
+ "1 2 5\n",
390
+ "2 3 6\n",
391
+ "<output>: import pandas as pd\n",
392
+ "\n",
393
+ "\n",
394
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
395
+ "print(df)\n",
396
+ " A B\n",
397
+ "0 1 4\n",
398
+ "1 2 5\n",
399
+ "2 3 6\n",
400
+ "<output>: import pandas as pd\n",
401
+ "\n",
402
+ "\n",
403
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
404
+ "print(df)\n",
405
+ " A\n",
406
+ "CPU times: user 26.8 s, sys: 346 ms, total: 27.1 s\n",
407
+ "Wall time: 27.2 s\n"
408
+ ]
409
+ }
410
+ ],
411
+ "source": [
412
+ "%%time\n",
413
+ "prompt = f\"\"\"\n",
414
+ "<instruction>: How can I create a dataframe?\n",
415
+ "<output>:\n",
416
+ "\"\"\".strip()\n",
417
+ "\n",
418
+ "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
419
+ "with torch.inference_mode():\n",
420
+ " outputs = model.generate(\n",
421
+ " input_ids=encoding.input_ids,\n",
422
+ " attention_mask=encoding.attention_mask,\n",
423
+ " generation_config=generation_config,\n",
424
+ " )\n",
425
+ "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 12,
431
+ "id": "93c95988-c563-4871-974d-004bf73fbce8",
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "def generate_response(question: str) -> str:\n",
436
+ " prompt = f\"\"\"\n",
437
+ "<instruction>: {question}\n",
438
+ "<output>:\n",
439
+ "\"\"\".strip()\n",
440
+ " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
441
+ " with torch.inference_mode():\n",
442
+ " outputs = model.generate(\n",
443
+ " input_ids=encoding.input_ids,\n",
444
+ " attention_mask=encoding.attention_mask,\n",
445
+ " generation_config=generation_config,\n",
446
+ " )\n",
447
+ " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
448
+ "\n",
449
+ " assistant_start = \"<output>:\"\n",
450
+ " response_start = response.find(assistant_start)\n",
451
+ " return response[response_start + len(assistant_start) :].strip()"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 13,
457
+ "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
458
+ "metadata": {},
459
+ "outputs": [
460
+ {
461
+ "name": "stdout",
462
+ "output_type": "stream",
463
+ "text": [
464
+ "import pandas as pd\n",
465
+ "\n",
466
+ "\n",
467
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
468
+ "print(df)\n",
469
+ " A B\n",
470
+ "0 1 4\n",
471
+ "1 2 5\n",
472
+ "2 3 6\n",
473
+ "<output>: import pandas as pd\n",
474
+ "\n",
475
+ "\n",
476
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
477
+ "print(df)\n",
478
+ " A B\n",
479
+ "0 1 4\n",
480
+ "1 2 5\n",
481
+ "2 3 6\n",
482
+ "<output>: import pandas as pd\n",
483
+ "\n",
484
+ "\n",
485
+ "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
486
+ "print(df)\n",
487
+ " A\n"
488
+ ]
489
+ }
490
+ ],
491
+ "source": [
492
+ "prompt = \"How can I create a dataframe?\"\n",
493
+ "print(generate_response(prompt))"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 14,
499
+ "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
500
+ "metadata": {},
501
+ "outputs": [
502
+ {
503
+ "name": "stdout",
504
+ "output_type": "stream",
505
+ "text": [
506
+ "import pandas as pd\n",
507
+ "\n",
508
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n",
509
+ "df2 = pd.DataFrame({'C': [7, 8, 9], 'D': [10, 11, 12]})\n",
510
+ "\n",
511
+ "# merge df1 and df2\n",
512
+ "result = ...\n",
513
+ "\n",
514
+ "print(result)\n",
515
+ "\n",
516
+ "# Expected output\n",
517
+ "# A B C D\n",
518
+ "# 0 1 4 7 10\n",
519
+ "# 1 2 5 8 11\n",
520
+ "# 2 3 6 9 12\n",
521
+ "<output>: import pandas as pd\n",
522
+ "\n",
523
+ "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]\n"
524
+ ]
525
+ }
526
+ ],
527
+ "source": [
528
+ "prompt = \"How to merge two dataframes?\"\n",
529
+ "print(generate_response(prompt))"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 15,
535
+ "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
536
+ "metadata": {},
537
+ "outputs": [
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "import pandas as pd\n",
543
+ "\n",
544
+ "\n",
545
+ "name=['joy','shan']\n",
546
+ "roll_no=[1,2]\n",
547
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
548
+ "print(df)\n",
549
+ "\n",
550
+ "\n",
551
+ " name roll_no\n",
552
+ "0 joy 1\n",
553
+ "1 shan 2\n",
554
+ "<output>: import pandas as pd\n",
555
+ "\n",
556
+ "\n",
557
+ "name=['joy','shan']\n",
558
+ "roll_no=[1,2]\n",
559
+ "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
560
+ "print(df)\n",
561
+ "\n",
562
+ "\n",
563
+ " name roll_no\n",
564
+ "0 joy 1\n",
565
+ "1 shan 2\n",
566
+ "<output>: import pandas as pd\n",
567
+ "\n",
568
+ "\n",
569
+ "name=['joy','shan']\n",
570
+ "roll_no=[1,2]\n",
571
+ "df = pd.DataFrame({\n"
572
+ ]
573
+ }
574
+ ],
575
+ "source": [
576
+ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
577
+ "print(generate_response(prompt))"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 16,
583
+ "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
584
+ "metadata": {},
585
+ "outputs": [
586
+ {
587
+ "name": "stdout",
588
+ "output_type": "stream",
589
+ "text": [
590
+ "import matplotlib.pyplot as plt\n",
591
+ "\n",
592
+ "x = range(10)\n",
593
+ "y = range(10)\n",
594
+ "\n",
595
+ "plt.plot(x, y, label='line')\n",
596
+ "plt.scatter(x, y, label='scatter')\n",
597
+ "plt.bar(x, y, label='bar')\n",
598
+ "plt.hist(x, y, label='hist')\n",
599
+ "plt.legend()\n",
600
+ "plt.show()\n",
601
+ "<output>: import matplotlib.pyplot as plt\n",
602
+ "\n",
603
+ "x = range(10)\n",
604
+ "y = range(10)\n",
605
+ "\n",
606
+ "plt.plot(x, y, label='line')\n",
607
+ "plt.scatter(x, y, label='scatter')\n",
608
+ "plt.bar(x, y, label='bar')\n",
609
+ "plt.hist(x, y, label='hist')\n",
610
+ "pl\n"
611
+ ]
612
+ }
613
+ ],
614
+ "source": [
615
+ "prompt = \"can you plot all types of plots in matplotlib?\"\n",
616
+ "print(generate_response(prompt))"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 19,
622
+ "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "name": "stdout",
627
+ "output_type": "stream",
628
+ "text": [
629
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
630
+ ]
631
+ }
632
+ ],
633
+ "source": [
634
+ "prompt = \"\"\"Problem:\n",
635
+ "i got an issue over ranking of date times. Lets say i have following table.\n",
636
+ "ID TIME\n",
637
+ "01 2018-07-11 11:12:20\n",
638
+ "01 2018-07-12 12:00:23\n",
639
+ "01 2018-07-13 12:00:00\n",
640
+ "02 2019-09-11 11:00:00\n",
641
+ "02 2019-09-12 12:00:00\n",
642
+ "\n",
643
+ "\n",
644
+ "and i want to add another column to rank the table by time for each id and group. I used \n",
645
+ "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
646
+ "\n",
647
+ "\n",
648
+ "but get an error:\n",
649
+ "'NoneType' object is not callable\n",
650
+ "\n",
651
+ "\n",
652
+ "If i replace datetime to numbers, it works.... any solutions?\n",
653
+ "\"\"\"\n",
654
+ "print(generate_response(prompt))"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": 20,
660
+ "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
661
+ "metadata": {},
662
+ "outputs": [
663
+ {
664
+ "name": "stdout",
665
+ "output_type": "stream",
666
+ "text": [
667
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
668
+ ]
669
+ }
670
+ ],
671
+ "source": [
672
+ "prompt = \"\"\"Problem:\n",
673
+ "I have the following dataframe:\n",
674
+ "index = range(14)\n",
675
+ "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
676
+ "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
677
+ "\n",
678
+ "\n",
679
+ "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
680
+ "The output should look like:\n",
681
+ " A\n",
682
+ "0 1\n",
683
+ "1 2\n",
684
+ "2 2\n",
685
+ "3 2\n",
686
+ "4 4\n",
687
+ "5 4\n",
688
+ "6 6\n",
689
+ "7 8\n",
690
+ "8 8\n",
691
+ "9 8\n",
692
+ "10 8\n",
693
+ "11 8\n",
694
+ "12 2\n",
695
+ "13 1\n",
696
+ "\"\"\"\n",
697
+ "\n",
698
+ "print(generate_response(prompt))"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": 27,
704
+ "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
705
+ "metadata": {},
706
+ "outputs": [
707
+ {
708
+ "name": "stdout",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "Problem:\n",
712
+ "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n",
713
+ "import pandas as pd\n",
714
+ "import numpy as np\n",
715
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
716
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
717
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
718
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
719
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
720
+ "\n",
721
+ "\n",
722
+ " users keywords_0 keywords_1 keywords_2 keywords_3\n",
723
+ "0 Hu Tao a d NaN f\n",
724
+ "1 Zhongli NaN e NaN NaN\n",
725
+ "2 Xingqiu c NaN b g\n",
726
+ "\n",
727
+ "\n",
728
+ "Want to accomplish the following:\n",
729
+ " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n",
730
+ "0 Hu Tao a d NaN f a-d-f\n",
731
+ "1 Zhongli NaN e NaN NaN e\n",
732
+ "2 Xingqiu c NaN b g c-b-g\n",
733
+ "\n",
734
+ "\n",
735
+ "Pseudo code:\n",
736
+ "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
737
+ "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n",
738
+ "\n",
739
+ "\n",
740
+ "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n"
741
+ ]
742
+ }
743
+ ],
744
+ "source": [
745
+ "print(data[5]['question'])"
746
+ ]
747
+ },
748
+ {
749
+ "cell_type": "code",
750
+ "execution_count": 28,
751
+ "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1",
752
+ "metadata": {},
753
+ "outputs": [
754
+ {
755
+ "name": "stdout",
756
+ "output_type": "stream",
757
+ "text": [
758
+ "import pandas as pd\n",
759
+ "import numpy as np\n",
760
+ "\n",
761
+ "\n",
762
+ "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
763
+ " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
764
+ " 'keywords_1': [\"d\", \"e\", np.nan],\n",
765
+ " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
766
+ " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
767
+ "import numpy as np\n",
768
+ "def g(df):\n",
769
+ " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n",
770
+ " return df\n",
771
+ "\n",
772
+ "df = g(df.copy())\n",
773
+ "result = df\n",
774
+ "print(result)\n"
775
+ ]
776
+ }
777
+ ],
778
+ "source": [
779
+ "print(data[5]['answer'])"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": 29,
785
+ "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0",
786
+ "metadata": {},
787
+ "outputs": [
788
+ {
789
+ "name": "stdout",
790
+ "output_type": "stream",
791
+ "text": [
792
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
793
+ ]
794
+ }
795
+ ],
796
+ "source": [
797
+ "prompt = data[5]['question']\n",
798
+ "print(generate_response(prompt))"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": 30,
804
+ "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545",
805
+ "metadata": {},
806
+ "outputs": [
807
+ {
808
+ "name": "stdout",
809
+ "output_type": "stream",
810
+ "text": [
811
+ "import pandas as pd\n",
812
+ "\n",
813
+ "\n",
814
+ "df = pd.DataFrame({'A': [1, 2, 3, None, 5],\n",
815
+ " 'B': [1, 2, 3, None, 5],\n",
816
+ " 'C': [1, 2, 3, None, 5],\n",
817
+ " 'D': [1, 2, 3, None, 5],\n",
818
+ " 'E': [1, 2, 3, None, 5]})\n",
819
+ "\n",
820
+ "df = df.dropna(how='all')\n",
821
+ "print(df)\n",
822
+ "<output>: A B C D E\n",
823
+ "0 1 1 1 1 1\n",
824
+ "1 2 2 2 2 2\n",
825
+ "2 3 3 3 3 3\n",
826
+ "4 5 5 5 5 5\n",
827
+ "<output>: import pand\n"
828
+ ]
829
+ }
830
+ ],
831
+ "source": [
832
+ "prompt = \"How to remove null valued rows?\"\n",
833
+ "print(generate_response(prompt))"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": null,
839
+ "id": "5ca085f6-30fc-4e50-a436-673f3baa75af",
840
+ "metadata": {},
841
+ "outputs": [],
842
+ "source": []
843
+ }
844
+ ],
845
+ "metadata": {
846
+ "kernelspec": {
847
+ "display_name": "Python 3 (ipykernel)",
848
+ "language": "python",
849
+ "name": "python3"
850
+ },
851
+ "language_info": {
852
+ "codemirror_mode": {
853
+ "name": "ipython",
854
+ "version": 3
855
+ },
856
+ "file_extension": ".py",
857
+ "mimetype": "text/x-python",
858
+ "name": "python",
859
+ "nbconvert_exporter": "python",
860
+ "pygments_lexer": "ipython3",
861
+ "version": "3.10.13"
862
+ }
863
+ },
864
+ "nbformat": 4,
865
+ "nbformat_minor": 5
866
+ }
ds1000-test-cleaned.json ADDED
The diff for this file is too large to render. See raw diff
 
ds1000-train-cleaned.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/runs/Dec11_06-38-12_114d9a2e28a3/1702276693.5141723/events.out.tfevents.1702276693.114d9a2e28a3.6724.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f25e1807afb8c8257bbc7fff824e498df2917d3fbdfabce9311e29dbfd64ee4c
3
+ size 5900
experiments/runs/Dec11_06-38-12_114d9a2e28a3/events.out.tfevents.1702276693.114d9a2e28a3.6724.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eef523b5bf07e40277c221485f9b694b435700bf700f4ca2b72b91f038ac6441
3
+ size 4470
experiments/runs/Dec11_06-44-21_114d9a2e28a3/1702277061.7964196/events.out.tfevents.1702277061.114d9a2e28a3.9175.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c5cd2abb0dfdfd8ae2ef03e0800fb3b0cab4628d98832ac59083f3cca2cdaab
3
+ size 5900
experiments/runs/Dec11_06-44-21_114d9a2e28a3/events.out.tfevents.1702277061.114d9a2e28a3.9175.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b100927e05f3d0ef5550a60df85903496fa0bebf24ec51116d6dea4ccc9b78b
3
+ size 6625
experiments/runs/Dec11_06-47-03_114d9a2e28a3/1702277223.9722266/events.out.tfevents.1702277223.114d9a2e28a3.10257.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3d68e699976be729ef5e3439a979beeb843b9ca534f2669b7e7b10e84bdc8fe
3
+ size 5900
experiments/runs/Dec11_06-47-03_114d9a2e28a3/events.out.tfevents.1702277223.114d9a2e28a3.10257.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d63259dbe437e29c5dce266f395061f0037d1046552b861267b8208fb6bf426
3
+ size 74778
trained-model/adapter_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "ise-uiuc/Magicoder-S-DS-6.7B",
3
+ "bias": "none",
4
+ "fan_in_fan_out": false,
5
+ "inference_mode": true,
6
+ "init_lora_weights": true,
7
+ "layers_pattern": null,
8
+ "layers_to_transform": null,
9
+ "lora_alpha": 32,
10
+ "lora_dropout": 0.05,
11
+ "modules_to_save": null,
12
+ "peft_type": "LORA",
13
+ "r": 16,
14
+ "revision": null,
15
+ "target_modules": [
16
+ "q_proj",
17
+ "v_proj"
18
+ ],
19
+ "task_type": "CAUSAL_LM"
20
+ }
trained-model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c57b8002f6685aa7fcf0ea30c8cf2d1c37454e7dede5b034baf2ebd984c72a
3
+ size 33600906