{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "===================================BUG REPORT===================================\n", "Welcome to bitsandbytes. For bug reports, please run\n", "\n", "python -m bitsandbytes\n", "\n", " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", "================================================================================\n", "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n", "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n", "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n", "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n", "CUDA SETUP: Detected CUDA version 121\n", "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib'), PosixPath('/usr/local/nvidia/lib64')}\n", " warn(msg)\n", "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n", " warn(msg)\n", "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n", " warn(msg)\n", "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('https'), PosixPath('//g.notebooksg.jarvislabs.net')}\n", " warn(msg)\n", "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n", " warn(msg)\n" ] } ], "source": [ "import json\n", "import os\n", "from pprint import pprint\n", "\n", "import bitsandbytes as bnb\n", "import pandas as pd\n", "import torch\n", "import torch.nn as nn\n", "\n", "import transformers\n", "from datasets import load_dataset\n", "from huggingface_hub import notebook_login\n", "from peft import (\n", " LoraConfig,\n", " PeftConfig,\n", " PeftModel,\n", " get_peft_model,\n", " prepare_model_for_kbit_training,\n", ")\n", "from transformers import (\n", " AutoConfig,\n", " AutoModelForCausalLM,\n", " AutoTokenizer,\n", " BitsAndBytesConfig,\n", ")\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" ] }, { "cell_type": "code", "execution_count": 2, "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2316598db91e443f8c4e1cc615a9507f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionanswer
0Problem:\\nI have the following dataframe:\\nind...import pandas as pd\\n\\n\\nindex = range(14)\\nda...
1Problem:\\ni got an issue over ranking of date ...import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...
2Problem:\\nI have a DataFrame like :\\n 0 ...import pandas as pd\\nimport numpy as np\\n\\ndf ...
3Problem:\\nI have this Pandas dataframe (df):\\n...import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...
4Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...import pandas as pd\\n\\ndf = pd.DataFrame.from_...
\n", "" ], "text/plain": [ " question \\\n", "0 Problem:\\nI have the following dataframe:\\nind... \n", "1 Problem:\\ni got an issue over ranking of date ... \n", "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n", "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n", "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n", "\n", " answer \n", "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n", "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n", "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n", "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n", "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(data).head()" ] }, { "cell_type": "code", "execution_count": 7, "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5", "metadata": {}, "outputs": [], "source": [ "bnb_config = BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_use_double_quant=True,\n", " bnb_4bit_quant_type=\"nf4\",\n", " bnb_4bit_compute_dtype=torch.bfloat16,\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7e1406ca2f5f4c0dbf0a581edebc9a6b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00: How can I create a dataframe?\n", ": import pandas as pd\n", "\n", "\n", "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n", "print(df)\n", " A B\n", "0 1 4\n", "1 2 5\n", "2 3 6\n", ": import pandas as pd\n", "\n", "\n", "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n", "print(df)\n", " A B\n", "0 1 4\n", "1 2 5\n", "2 3 6\n", ": import pandas as pd\n", "\n", "\n", "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n", "print(df)\n", " A\n", "CPU times: user 26.8 s, sys: 346 ms, total: 27.1 s\n", "Wall time: 27.2 s\n" ] } ], "source": [ "%%time\n", "prompt = f\"\"\"\n", ": How can I create a dataframe?\n", ":\n", "\"\"\".strip()\n", "\n", "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n", "with torch.inference_mode():\n", " outputs = model.generate(\n", " input_ids=encoding.input_ids,\n", " attention_mask=encoding.attention_mask,\n", " generation_config=generation_config,\n", " )\n", "print(tokenizer.decode(outputs[0], skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": 12, "id": "93c95988-c563-4871-974d-004bf73fbce8", "metadata": {}, "outputs": [], "source": [ "def generate_response(question: str) -> str:\n", " prompt = f\"\"\"\n", ": {question}\n", ":\n", "\"\"\".strip()\n", " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n", " with torch.inference_mode():\n", " outputs = model.generate(\n", " input_ids=encoding.input_ids,\n", " attention_mask=encoding.attention_mask,\n", " generation_config=generation_config,\n", " )\n", " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", "\n", " assistant_start = \":\"\n", " response_start = response.find(assistant_start)\n", " return response[response_start + len(assistant_start) :].strip()" ] }, { "cell_type": "code", "execution_count": 13, "id": "8a9a9b87-193b-4bed-8ef1-57944d931958", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "import pandas as pd\n", "\n", "\n", "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n", "print(df)\n", " A B\n", "0 1 4\n", "1 2 5\n", "2 3 6\n", ": import pandas as pd\n", "\n", "\n", "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n", "print(df)\n", " A B\n", "0 1 4\n", "1 2 5\n", "2 3 6\n", ": import pandas as pd\n", "\n", "\n", "df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n", "print(df)\n", " A\n" ] } ], "source": [ "prompt = \"How can I create a dataframe?\"\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": 14, "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "import pandas as pd\n", "\n", "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n", "df2 = pd.DataFrame({'C': [7, 8, 9], 'D': [10, 11, 12]})\n", "\n", "# merge df1 and df2\n", "result = ...\n", "\n", "print(result)\n", "\n", "# Expected output\n", "# A B C D\n", "# 0 1 4 7 10\n", "# 1 2 5 8 11\n", "# 2 3 6 9 12\n", ": import pandas as pd\n", "\n", "df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]\n" ] } ], "source": [ "prompt = \"How to merge two dataframes?\"\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": 15, "id": "0e9ed231-4a62-4331-94df-f3bcd601f138", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "import pandas as pd\n", "\n", "\n", "name=['joy','shan']\n", "roll_no=[1,2]\n", "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n", "print(df)\n", "\n", "\n", " name roll_no\n", "0 joy 1\n", "1 shan 2\n", ": import pandas as pd\n", "\n", "\n", "name=['joy','shan']\n", "roll_no=[1,2]\n", "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n", "print(df)\n", "\n", "\n", " name roll_no\n", "0 joy 1\n", "1 shan 2\n", ": import pandas as pd\n", "\n", "\n", "name=['joy','shan']\n", "roll_no=[1,2]\n", "df = pd.DataFrame({\n" ] } ], "source": [ "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": 16, "id": "381ba5c0-276d-411e-a8d5-9f010528433d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "import matplotlib.pyplot as plt\n", "\n", "x = range(10)\n", "y = range(10)\n", "\n", "plt.plot(x, y, label='line')\n", "plt.scatter(x, y, label='scatter')\n", "plt.bar(x, y, label='bar')\n", "plt.hist(x, y, label='hist')\n", "plt.legend()\n", "plt.show()\n", ": import matplotlib.pyplot as plt\n", "\n", "x = range(10)\n", "y = range(10)\n", "\n", "plt.plot(x, y, label='line')\n", "plt.scatter(x, y, label='scatter')\n", "plt.bar(x, y, label='bar')\n", "plt.hist(x, y, label='hist')\n", "pl\n" ] } ], "source": [ "prompt = \"can you plot all types of plots in matplotlib?\"\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": 19, "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" ] } ], "source": [ "prompt = \"\"\"Problem:\n", "i got an issue over ranking of date times. Lets say i have following table.\n", "ID TIME\n", "01 2018-07-11 11:12:20\n", "01 2018-07-12 12:00:23\n", "01 2018-07-13 12:00:00\n", "02 2019-09-11 11:00:00\n", "02 2019-09-12 12:00:00\n", "\n", "\n", "and i want to add another column to rank the table by time for each id and group. I used \n", "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n", "\n", "\n", "but get an error:\n", "'NoneType' object is not callable\n", "\n", "\n", "If i replace datetime to numbers, it works.... any solutions?\n", "\"\"\"\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": 20, "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" ] } ], "source": [ "prompt = \"\"\"Problem:\n", "I have the following dataframe:\n", "index = range(14)\n", "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n", "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n", "\n", "\n", "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n", "The output should look like:\n", " A\n", "0 1\n", "1 2\n", "2 2\n", "3 2\n", "4 4\n", "5 4\n", "6 6\n", "7 8\n", "8 8\n", "9 8\n", "10 8\n", "11 8\n", "12 2\n", "13 1\n", "\"\"\"\n", "\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": 27, "id": "255cc021-5f5e-46af-a75e-a435b9629cdf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Problem:\n", "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n", "import pandas as pd\n", "import numpy as np\n", "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n", " 'keywords_0': [\"a\", np.nan, \"c\"],\n", " 'keywords_1': [\"d\", \"e\", np.nan],\n", " 'keywords_2': [np.nan, np.nan, \"b\"],\n", " 'keywords_3': [\"f\", np.nan, \"g\"]})\n", "\n", "\n", " users keywords_0 keywords_1 keywords_2 keywords_3\n", "0 Hu Tao a d NaN f\n", "1 Zhongli NaN e NaN NaN\n", "2 Xingqiu c NaN b g\n", "\n", "\n", "Want to accomplish the following:\n", " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n", "0 Hu Tao a d NaN f a-d-f\n", "1 Zhongli NaN e NaN NaN e\n", "2 Xingqiu c NaN b g c-b-g\n", "\n", "\n", "Pseudo code:\n", "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n", "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n", "\n", "\n", "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n" ] } ], "source": [ "print(data[5]['question'])" ] }, { "cell_type": "code", "execution_count": 28, "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "import pandas as pd\n", "import numpy as np\n", "\n", "\n", "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n", " 'keywords_0': [\"a\", np.nan, \"c\"],\n", " 'keywords_1': [\"d\", \"e\", np.nan],\n", " 'keywords_2': [np.nan, np.nan, \"b\"],\n", " 'keywords_3': [\"f\", np.nan, \"g\"]})\n", "import numpy as np\n", "def g(df):\n", " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n", " return df\n", "\n", "df = g(df.copy())\n", "result = df\n", "print(result)\n" ] } ], "source": [ "print(data[5]['answer'])" ] }, { "cell_type": "code", "execution_count": 29, "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" ] } ], "source": [ "prompt = data[5]['question']\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": 30, "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "import pandas as pd\n", "\n", "\n", "df = pd.DataFrame({'A': [1, 2, 3, None, 5],\n", " 'B': [1, 2, 3, None, 5],\n", " 'C': [1, 2, 3, None, 5],\n", " 'D': [1, 2, 3, None, 5],\n", " 'E': [1, 2, 3, None, 5]})\n", "\n", "df = df.dropna(how='all')\n", "print(df)\n", ": A B C D E\n", "0 1 1 1 1 1\n", "1 2 2 2 2 2\n", "2 3 3 3 3 3\n", "4 5 5 5 5 5\n", ": import pand\n" ] } ], "source": [ "prompt = \"How to remove null valued rows?\"\n", "print(generate_response(prompt))" ] }, { "cell_type": "code", "execution_count": null, "id": "5ca085f6-30fc-4e50-a436-673f3baa75af", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }