{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Asus\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [
 "# Third-party imports.\n",
 "import pandas as pd \n",
 "import numpy as np \n",
 "import seaborn as sns \n",
 "# Bind `plt` to the pyplot interface: the top-level matplotlib package does not\n",
 "# expose the plotting functions (plt.subplots, plt.show, ...) that `plt` conventionally provides.\n",
 "import matplotlib.pyplot as plt\n",
 "import gradio as gr \n",
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score\n",
 "from sklearn.ensemble import RandomForestClassifier\n",
 "\n",
 "# Load the dataset; the path is relative to the notebook's working directory.\n",
 "data = pd.read_csv(\"../career-recommendation-system/Dataset/mldata.csv\")"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(6901, 20)\n" ] } ], "source": [
 "# (rows, columns) of the loaded frame.\n",
 "print(data.shape)"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6901 entries, 0 to 6900\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Logical quotient rating 6901 non-null int64 \n", " 1 hackathons 6901 non-null int64 \n", " 2 coding skills rating 6901 non-null int64 \n", " 3 public speaking points 6901 non-null int64 \n", " 4 self-learning capability? 
6901 non-null object\n", " 5 Extra-courses did 6901 non-null object\n", " 6 certifications 6901 non-null object\n", " 7 workshops 6901 non-null object\n", " 8 reading and writing skills 6901 non-null object\n", " 9 memory capability score 6901 non-null object\n", " 10 Interested subjects 6901 non-null object\n", " 11 interested career area 6901 non-null object\n", " 12 Type of company want to settle in? 6901 non-null object\n", " 13 Taken inputs from seniors or elders 6901 non-null object\n", " 14 Interested Type of Books 6901 non-null object\n", " 15 Management or Technical 6901 non-null object\n", " 16 hard/smart worker 6901 non-null object\n", " 17 worked in teams ever? 6901 non-null object\n", " 18 Introvert 6901 non-null object\n", " 19 Suggested Job Role 6901 non-null object\n", "dtypes: int64(4), object(16)\n", "memory usage: 1.1+ MB\n" ] } ], "source": [
 "# Column dtypes and non-null counts of the raw frame.\n",
 "data.info()"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "List of numerical features: \n", " ['Logical quotient rating', 'hackathons', 'coding skills rating', 'public speaking points']\n", "List of categorical features: \n", " ['self-learning capability?', 'Extra-courses did', 'certifications', 'workshops', 'reading and writing skills', 'memory capability score', 'Interested subjects', 'interested career area ', 'Type of company want to settle in?', 'Taken inputs from seniors or elders', 'Interested Type of Books', 'Management or Technical', 'hard/smart worker', 'worked in teams ever?', 'Introvert', 'Suggested Job Role']\n" ] } ], "source": [
 "# Group the column names by dtype: integer columns, then object (string) columns.\n",
 "numeric_feature_names = data.select_dtypes(include=\"int\").columns.tolist()\n",
 "categorical_feature_names = data.select_dtypes(include=\"object\").columns.tolist()\n",
 "print(\"List of numerical features: \\n\", numeric_feature_names)\n",
 "print(\"List of categorical features: \\n\", categorical_feature_names)"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Logical quotient rating 0\n", "hackathons 0\n", "coding skills 
rating 0\n", "public speaking points 0\n", "self-learning capability? 0\n", "Extra-courses did 0\n", "certifications 0\n", "workshops 0\n", "reading and writing skills 0\n", "memory capability score 0\n", "Interested subjects 0\n", "interested career area 0\n", "Type of company want to settle in? 0\n", "Taken inputs from seniors or elders 0\n", "Interested Type of Books 0\n", "Management or Technical 0\n", "hard/smart worker 0\n", "worked in teams ever? 0\n", "Introvert 0\n", "Suggested Job Role 0\n", "dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Number of missing values in each column.\n",
 "data.isna().sum()"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "self-learning capability?\n", "yes 3496\n", "no 3405\n", "Name: count, dtype: int64\n", "\n", "Extra-courses did\n", "no 3529\n", "yes 3372\n", "Name: count, dtype: int64\n", "\n", "certifications\n", "r programming 803\n", "information security 785\n", "shell programming 783\n", "machine learning 783\n", "full stack 768\n", "hadoop 764\n", "python 756\n", "distro making 740\n", "app development 719\n", "Name: count, dtype: int64\n", "\n", "workshops\n", "database security 897\n", "system designing 891\n", "web technologies 891\n", "hacking 867\n", "testing 852\n", "data science 842\n", "game development 831\n", "cloud computing 830\n", "Name: count, dtype: int64\n", "\n", "reading and writing skills\n", "excellent 2328\n", "medium 2315\n", "poor 2258\n", "Name: count, dtype: int64\n", "\n", "memory capability score\n", "medium 2317\n", "excellent 2303\n", "poor 2281\n", "Name: count, dtype: int64\n", "\n", "Interested subjects\n", "Software Engineering 731\n", "IOT 722\n", "cloud computing 721\n", "programming 716\n", "networks 713\n", "Computer Architecture 703\n", "data engineering 672\n", "hacking 663\n", "Management 644\n", "parallel computing 616\n", "Name: count, dtype: int64\n", "\n", "interested 
career area \n", "system developer 1178\n", "security 1177\n", "Business process analyst 1154\n", "developer 1145\n", "testing 1128\n", "cloud computing 1119\n", "Name: count, dtype: int64\n", "\n", "Type of company want to settle in?\n", "Service Based 725\n", "Web Services 719\n", "BPA 711\n", "Testing and Maintainance Services 698\n", "Product based 695\n", "Finance 694\n", "Cloud Services 692\n", "product development 669\n", "Sales and Marketing 658\n", "SAaS services 640\n", "Name: count, dtype: int64\n", "\n", "Taken inputs from seniors or elders\n", "yes 3501\n", "no 3400\n", "Name: count, dtype: int64\n", "\n", "Interested Type of Books\n", "Guide 405\n", "Health 401\n", "Self help 377\n", "Horror 377\n", "Biographies 219\n", "Science fiction 218\n", "Satire 212\n", "Childrens 212\n", "Autobiographies 210\n", "Prayer books 207\n", "Fantasy 205\n", "Journals 203\n", "Trilogy 203\n", "Anthology 202\n", "Encyclopedias 201\n", "Drama 201\n", "Mystery 200\n", "History 199\n", "Science 198\n", "Dictionaries 198\n", "Diaries 197\n", "Religion-Spirituality 197\n", "Action and Adventure 193\n", "Poetry 193\n", "Cookbooks 186\n", "Comics 186\n", "Art 186\n", "Travel 186\n", "Series 180\n", "Math 176\n", "Romance 173\n", "Name: count, dtype: int64\n", "\n", "Management or Technical\n", "Management 3461\n", "Technical 3440\n", "Name: count, dtype: int64\n", "\n", "hard/smart worker\n", "smart worker 3523\n", "hard worker 3378\n", "Name: count, dtype: int64\n", "\n", "worked in teams ever?\n", "no 3470\n", "yes 3431\n", "Name: count, dtype: int64\n", "\n", "Introvert\n", "yes 3544\n", "no 3357\n", "Name: count, dtype: int64\n", "\n", "Suggested Job Role\n", "Network Security Engineer 630\n", "Software Engineer 590\n", "UX Designer 589\n", "Software Developer 587\n", "Database Developer 581\n", "Software Quality Assurance (QA) / Testing 571\n", "Web Developer 570\n", "CRM Technical Developer 567\n", "Technical Support 565\n", "Systems Security Administrator 562\n", 
"Applications Developer 551\n", "Mobile Applications Developer 538\n", "Name: count, dtype: int64\n", "\n" ] } ], "source": [
 "# Sub-frames holding the text (object) columns and the integer columns respectively.\n",
 "categorical_cols = data[['self-learning capability?', 'Extra-courses did', 'certifications', 'workshops', 'reading and writing skills', 'memory capability score', 'Interested subjects', 'interested career area ', 'Type of company want to settle in?', 'Taken inputs from seniors or elders', 'Interested Type of Books', 'Management or Technical', 'hard/smart worker', 'worked in teams ever?', 'Introvert', 'Suggested Job Role']]\n",
 "numerical_cols = data[['Logical quotient rating', 'hackathons', 'coding skills rating', 'public speaking points']]\n",
 "\n",
 "# Frequency of every distinct value, one categorical column at a time.\n",
 "for column_name in categorical_cols.columns:\n",
 "    print(data[column_name].value_counts(), end=\"\\n\\n\")"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Logical quotient rating\n", "6 799\n", "9 784\n", "2 782\n", "5 773\n", "3 772\n", "4 759\n", "1 756\n", "7 752\n", "8 724\n", "Name: count, dtype: int64\n", "\n", "hackathons\n", "5 1033\n", "2 1026\n", "0 1010\n", "6 989\n", "3 966\n", "1 952\n", "4 925\n", "Name: count, dtype: int64\n", "\n", "coding skills rating\n", "4 787\n", "5 777\n", "2 776\n", "6 774\n", "8 767\n", "7 766\n", "9 761\n", "3 755\n", "1 738\n", "Name: count, dtype: int64\n", "\n", "public speaking points\n", "7 807\n", "1 799\n", "8 777\n", "2 770\n", "3 766\n", "4 760\n", "9 758\n", "6 740\n", "5 724\n", "Name: count, dtype: int64\n", "\n" ] } ], "source": [
 "# Same frequency listing for the numeric rating columns.\n",
 "for column_name in numerical_cols.columns:\n",
 "    print(data[column_name].value_counts(), end=\"\\n\\n\")"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "[(0.6313725490196078, 0.788235294117647, 0.9568627450980393),\n", " (1.0, 0.7058823529411765, 0.5098039215686274),\n", " (0.5529411764705883, 0.8980392156862745, 0.6313725490196078),\n", " (1.0, 0.6235294117647059, 0.6078431372549019),\n", " 
(0.8156862745098039, 0.7333333333333333, 1.0),\n", " (0.8705882352941177, 0.7333333333333333, 0.6078431372549019),\n", " (0.9803921568627451, 0.6901960784313725, 0.8941176470588236),\n", " (0.8117647058823529, 0.8117647058823529, 0.8117647058823529),\n", " (1.0, 0.996078431372549, 0.6392156862745098),\n", " (0.7254901960784313, 0.9490196078431372, 0.9411764705882353)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Global seaborn theme for every figure below, plus the pastel palette (displayed as swatches).\n",
 "sns.set_theme(style=\"darkgrid\")\n",
 "Palette = sns.color_palette(\"pastel\")\n",
 "Palette\n"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Asus\\AppData\\Local\\Temp\\ipykernel_21412\\4210140923.py:3: FutureWarning: \n", "\n", "Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.\n", "\n", " sns.countplot( data[\"Suggested Job Role\"], palette=sns.color_palette(\"pastel\"))\n", "C:\\Users\\Asus\\AppData\\Local\\Temp\\ipykernel_21412\\4210140923.py:3: UserWarning: \n", "The palette list has fewer values (10) than needed (12) and will cycle, which may produce an uninterpretable plot.\n", " sns.countplot( data[\"Suggested Job Role\"], palette=sns.color_palette(\"pastel\"))\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Check how balanced the target label is before training: this recommender is a\n",
 "# classification model, and a heavily skewed label would bias it.\n",
 "# The label is assigned to both `y` and `hue` (legend hidden), which is the\n",
 "# non-deprecated way to colour the bars, and the palette is sized to the number\n",
 "# of job roles so that no colour has to be reused.\n",
 "job_role_count = data[\"Suggested Job Role\"].nunique()\n",
 "sns.countplot(\n",
 "    data=data,\n",
 "    y=\"Suggested Job Role\",\n",
 "    hue=\"Suggested Job Role\",\n",
 "    palette=sns.color_palette(\"husl\", n_colors=job_role_count),\n",
 "    legend=False,\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Pairwise correlation between the four numeric rating columns.\n",
 "corr = data[['Logical quotient rating', 'hackathons', 'coding skills rating', 'public speaking points']].corr()\n",
 "# Correlations lie in [-1, 1], so use a diverging colormap centred on 0 with fixed limits;\n",
 "# a qualitative palette centred on 2 cannot encode the sign or strength of a correlation.\n",
 "sns.heatmap(corr, square=True, annot=True, linewidths=.2, vmin=-1, vmax=1, center=0, cmap=\"coolwarm\")"
 ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Logical quotient rating hackathons coding skills rating \\\n", "0 5 0 6 \n", "1 7 6 4 \n", "2 2 3 9 \n", "3 2 6 3 \n", "4 2 0 3 \n", "... ... ... ... \n", "6896 7 5 6 \n", "6897 6 5 1 \n", "6898 5 1 6 \n", "6899 1 6 4 \n", "6900 5 6 2 \n", "\n", " public speaking points \n", "0 2 \n", "1 3 \n", "2 1 \n", "3 5 \n", "4 4 \n", "... ... \n", "6896 2 \n", "6897 8 \n", "6898 7 \n", "6899 6 \n", "6900 5 \n", "\n", "[6901 rows x 4 columns]\n" ] } ], "source": [
 "# Preview the first four columns (the numeric ratings) by position, showing only the first rows.\n",
 "data.iloc[:, :4].head()"
 ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['self-learning capability?', 'Extra-courses did', 'certifications', 'workshops', 'reading and writing skills', 'memory capability score', 'Interested subjects', 'interested career area ', 'Type of company want to settle in?', 'Taken inputs from seniors or elders', 'Interested Type of Books', 'Management or Technical', 'hard/smart worker', 'worked in teams ever?', 'Introvert', 'Suggested Job Role']\n", " self-learning capability? Extra-courses did certifications \\\n", "0 yes no information security \n", "1 no yes shell programming \n", "2 no yes information security \n", "3 no yes r programming \n", "4 yes no distro making \n", "... ... ... ... 
\n", "6896 yes no shell programming \n", "6897 no no machine learning \n", "6898 yes no distro making \n", "6899 no no app development \n", "6900 no yes information security \n", "\n", " workshops reading and writing skills memory capability score \\\n", "0 testing poor poor \n", "1 testing excellent medium \n", "2 testing excellent poor \n", "3 database security excellent poor \n", "4 game development excellent medium \n", "... ... ... ... \n", "6896 hacking poor poor \n", "6897 hacking excellent excellent \n", "6898 data science poor poor \n", "6899 game development poor excellent \n", "6900 database security excellent medium \n", "\n", " Interested subjects interested career area \\\n", "0 programming testing \n", "1 Management system developer \n", "2 data engineering Business process analyst \n", "3 networks testing \n", "4 Software Engineering system developer \n", "... ... ... \n", "6896 Software Engineering testing \n", "6897 programming testing \n", "6898 IOT system developer \n", "6899 data engineering developer \n", "6900 Computer Architecture security \n", "\n", " Type of company want to settle in? Taken inputs from seniors or elders \\\n", "0 BPA no \n", "1 Cloud Services yes \n", "2 product development yes \n", "3 Testing and Maintainance Services yes \n", "4 BPA no \n", "... ... ... \n", "6896 Testing and Maintainance Services yes \n", "6897 Testing and Maintainance Services no \n", "6898 Cloud Services yes \n", "6899 SAaS services no \n", "6900 Sales and Marketing yes \n", "\n", " Interested Type of Books Management or Technical hard/smart worker \\\n", "0 Series Management smart worker \n", "1 Autobiographies Technical hard worker \n", "2 Travel Technical smart worker \n", "3 Guide Management smart worker \n", "4 Health Technical hard worker \n", "... ... ... ... 
\n", "6896 Trilogy Management smart worker \n", "6897 Science Management hard worker \n", "6898 Self help Technical hard worker \n", "6899 Drama Technical smart worker \n", "6900 Drama Management smart worker \n", "\n", " worked in teams ever? Introvert Suggested Job Role \n", "0 yes no Applications Developer \n", "1 no yes Applications Developer \n", "2 no no Applications Developer \n", "3 yes yes Applications Developer \n", "4 yes no Applications Developer \n", "... ... ... ... \n", "6896 no yes Web Developer \n", "6897 no no Web Developer \n", "6898 yes no Web Developer \n", "6899 no yes Web Developer \n", "6900 yes no Web Developer \n", "\n", "[6901 rows x 16 columns]\n" ] } ], "source": [
 "# The data has 4 numeric columns and 16 text columns (nominal, or binary yes/no).\n",
 "# The text columns need different encodings because their answer sets differ:\n",
 "# the suggested job role has about 12 options (codes 0-11), whereas a yes/no question only needs 0/1.\n",
 "# So the binary columns are encoded first, then the nominal ones (ordinal / label encoding).\n",
 "# Print the categorical column names (handy for copy and paste) and the categorical frame itself:\n",
 "print(list(categorical_cols.columns))\n",
 "print(categorical_cols)"
 ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
 "# Encode the five yes/no columns as 1/0.\n",
 "bicols = data[['self-learning capability?', 'Extra-courses did', 'Taken inputs from seniors or elders','worked in teams ever?', 'Introvert']]\n",
 "yes_no_codes = {\"yes\": 1, \"no\": 0}\n",
 "# Nested mapping {column: {old value: new value}} so that only these columns are recoded.\n",
 "replace_nums = {column_name: yes_no_codes for column_name in bicols.columns}\n",
 "data = data.replace(replace_nums)\n"
 ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6901 entries, 0 to 6900\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Logical quotient rating 6901 non-null int64 
\n", " 1 hackathons 6901 non-null int64 \n", " 2 coding skills rating 6901 non-null int64 \n", " 3 public speaking points 6901 non-null int64 \n", " 4 self-learning capability? 6901 non-null int64 \n", " 5 Extra-courses did 6901 non-null int64 \n", " 6 certifications 6901 non-null object\n", " 7 workshops 6901 non-null object\n", " 8 reading and writing skills 6901 non-null int64 \n", " 9 memory capability score 6901 non-null int64 \n", " 10 Interested subjects 6901 non-null object\n", " 11 interested career area 6901 non-null object\n", " 12 Type of company want to settle in? 6901 non-null object\n", " 13 Taken inputs from seniors or elders 6901 non-null int64 \n", " 14 Interested Type of Books 6901 non-null object\n", " 15 Management or Technical 6901 non-null object\n", " 16 hard/smart worker 6901 non-null object\n", " 17 worked in teams ever? 6901 non-null int64 \n", " 18 Introvert 6901 non-null int64 \n", " 19 Suggested Job Role 6901 non-null object\n", "dtypes: int64(11), object(9)\n", "memory usage: 1.1+ MB\n", "None\n" ] } ], "source": [
 "# Ordinal columns: poor < medium < excellent is encoded as 0 < 1 < 2.\n",
 "orcols = data[[\"reading and writing skills\", \"memory capability score\"]]\n",
 "skill_level_codes = {\"poor\": 0, \"medium\": 1, \"excellent\": 2}\n",
 "# Nested mapping {column: {old value: new value}} recodes both columns in one replace call.\n",
 "replace_nums = {column_name: skill_level_codes for column_name in orcols.columns}\n",
 "data = data.replace(replace_nums)\n",
 "\n",
 "# DataFrame.info() prints its report itself and returns None, so wrapping it in\n",
 "# print() only appended a stray \"None\" line to the output.\n",
 "data.info()\n"
 ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Logical quotient ratinghackathonscoding skills ratingpublic speaking pointsself-learning capability?Extra-courses didcertificationsworkshopsreading and writing skillsmemory capability score...Type of company want to settle in?Taken inputs from seniors or eldersInterested Type of Booksworked in teams ever?IntrovertSuggested Job RoleManagement or Technical_ManagementManagement or Technical_Technicalhard/smart worker_hard workerhard/smart worker_smart worker
05062104600...002810Applications DeveloperTrueFalseFalseTrue
17643018621...11301Applications DeveloperFalseTrueTrueFalse
22391014620...912900Applications DeveloperFalseTrueFalseTrue
32635017220...711311Applications DeveloperTrueFalseFalseTrue
42034101321...001410Applications DeveloperFalseTrueTrueFalse
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " Logical quotient rating hackathons coding skills rating \\\n", "0 5 0 6 \n", "1 7 6 4 \n", "2 2 3 9 \n", "3 2 6 3 \n", "4 2 0 3 \n", "\n", " public speaking points self-learning capability? Extra-courses did \\\n", "0 2 1 0 \n", "1 3 0 1 \n", "2 1 0 1 \n", "3 5 0 1 \n", "4 4 1 0 \n", "\n", " certifications workshops reading and writing skills \\\n", "0 4 6 0 \n", "1 8 6 2 \n", "2 4 6 2 \n", "3 7 2 2 \n", "4 1 3 2 \n", "\n", " memory capability score ... Type of company want to settle in? \\\n", "0 0 ... 0 \n", "1 1 ... 1 \n", "2 0 ... 9 \n", "3 0 ... 7 \n", "4 1 ... 0 \n", "\n", " Taken inputs from seniors or elders Interested Type of Books \\\n", "0 0 28 \n", "1 1 3 \n", "2 1 29 \n", "3 1 13 \n", "4 0 14 \n", "\n", " worked in teams ever? Introvert Suggested Job Role \\\n", "0 1 0 Applications Developer \n", "1 0 1 Applications Developer \n", "2 0 0 Applications Developer \n", "3 1 1 Applications Developer \n", "4 1 0 Applications Developer \n", "\n", " Management or Technical_Management Management or Technical_Technical \\\n", "0 True False \n", "1 False True \n", "2 False True \n", "3 True False \n", "4 False True \n", "\n", " hard/smart worker_hard worker hard/smart worker_smart worker \n", "0 False True \n", "1 True False \n", "2 False True \n", "3 False True \n", "4 True False \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# The remaining multi-valued (nominal) text columns are label-encoded:\n",
 "# each distinct value is replaced by its integer category code.\n",
 "nominalcols = ['certifications',\n",
 "               'workshops',\n",
 "               'Interested subjects',\n",
 "               'interested career area ',\n",
 "               'Type of company want to settle in?',\n",
 "               'Interested Type of Books']\n",
 "for column_name in nominalcols:\n",
 "    data[column_name] = data[column_name].astype('category').cat.codes\n",
 "\n",
 "# The two remaining two-valued text columns are one-hot encoded.\n",
 "data = pd.get_dummies(data, columns=['Management or Technical', 'hard/smart worker'])\n",
 "data.head()\n"
 ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
 "# Every column is numeric now except the label, so a model can be built.\n",
 "dataset = data\n",
 "# Separate the independent variables from the target column.\n",
 "df_train_x = dataset.drop('Suggested Job Role', axis=1)\n",
 "df_train_y = dataset['Suggested Job Role']\n",
 "# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.\n",
 "x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=.20, random_state=42)\n"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 8 14 10 6 15 4 11 16 11 9 13 15]\n", " [ 9 14 13 9 10 8 10 6 4 5 9 8]\n", " [ 8 11 14 9 13 9 10 8 11 12 8 10]\n", " [ 7 5 7 9 11 7 9 8 10 6 10 11]\n", " [12 13 6 7 18 11 8 11 9 8 9 9]\n", " [ 7 13 9 6 14 12 9 13 9 8 4 9]\n", " [ 4 10 14 9 15 13 10 9 6 8 11 7]\n", " [ 7 11 15 10 12 13 6 10 10 11 7 6]\n", " [ 5 11 11 5 23 12 9 3 7 9 12 7]\n", " [ 5 8 8 7 14 15 8 10 13 6 6 11]\n", " [10 16 11 4 14 16 10 5 14 8 5 6]\n", " [ 7 8 10 8 10 16 12 12 6 8 5 7]]\n", "\n", "0.8689355539464156\n" ] } ], "source": [
 "# Fit a random forest on the training split and evaluate it on the held-out split.\n",
 "rf = RandomForestClassifier(random_state=10)\n",
 "rf.fit(x_train, y_train)\n",
 "rf_predict_y = rf.predict(x_test)\n",
 "rfc_cm = confusion_matrix(y_test, rf_predict_y)\n",
 "rfc_acc = accuracy_score(y_test, rf_predict_y)\n",
 "print(rfc_cm, end=\"\\n\\n\")\n",
 "# accuracy_score returns a fraction in [0, 1]; scale by 100 (not 10) to report a percentage.\n",
 "print(f\"Accuracy: {rfc_acc * 100:.2f}%\")\n"
 ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
 "# One hand-written sample, given in the column order the model was trained on.\n",
 "# Numeric values labelled with the training column names replace the original list of\n",
 "# digit strings, so predict() receives the same feature names it was fitted with.\n",
 "# NOTE(review): several values fall outside the codes produced by the encoding cells\n",
 "# (e.g. 3 and 5 for the first two yes/no columns, 5/6/8/8 for the one-hot columns) -\n",
 "# confirm the intended inputs.\n",
 "userdata = pd.DataFrame(\n",
 "    [[7, 6, 6, 8, 3, 5, 4, 4, 7, 3, 3, 6, 8, 7, 5, 7, 4, 5, 6, 8, 8]],\n",
 "    columns=x_train.columns,\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Software Engineer']\n", "Probabilities of all 
classes: [[0.07 0.11 0.07 0.05 0.05 0.06 0.12 0.11 0.11 0.09 0.06 0.1 ]]\n", "Probability of Predicted class : 0.12\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Asus\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\base.py:465: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n", " warnings.warn(\n", "c:\\Users\\Asus\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\base.py:465: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n", " warnings.warn(\n" ] } ], "source": [
 "# Predicted job role for the sample, with the per-class probabilities behind it.\n",
 "ynewclass = rf.predict(userdata)\n",
 "ynew = rf.predict_proba(userdata)\n",
 "print(ynewclass)\n",
 "print(\"Probabilities of all classes: \", ynew)\n",
 "# The largest probability is the one that belongs to the predicted class.\n",
 "print(\"Probability of Predicted class : \", np.max(ynew))"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import pickle \n",
 "\n",
 "# Persist the fitted model. The file must be opened for binary *writing* ('wb'):\n",
 "# the previous 'rb' mode opens it read-only, so the dump could never succeed.\n",
 "# The with-block also closes the file handle once the dump is done.\n",
 "with open('rfweights.pkl', 'wb') as model_file:\n",
 "    pickle.dump(rf, model_file)"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 2 }