{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyPpkUAEEYS6RkzEdfRuVCBt", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "" ] }, { "cell_type": "code", "source": [ "# Import the libraries used throughout the notebook\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.cluster import KMeans\n", "from sklearn.manifold import TSNE\n", "from sklearn.metrics import silhouette_samples, silhouette_score\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "from matplotlib.patches import Patch\n", "from matplotlib.lines import Line2D" ], "metadata": { "id": "uniSsoyDnJiK" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "# Load and clean the dataset (semicolon-delimited CSV)\n", "url = 'https://raw.githubusercontent.com/LabSWPP12023S2G2/TPInicial/main/datasetUNC.csv'\n", "data = pd.read_csv(url, delimiter=';')\n", "\n", "# Remove the 'SUB PERIODS' and 'SEX' columns, then every row with a missing value\n", "columns_to_drop = ['SUB PERIODS', 'SEX']\n", "data = data.drop(columns=columns_to_drop).dropna(axis=0)\n", "\n", "# Discard rows whose PROVINCE is 'Otro'/'other' or whose EDUCATION is 'Otro'.\n", "# One boolean mask replaces three separate in-place drops; .copy() yields an\n", "# independent frame so later column assignments do not write to a slice.\n", "is_other = data['PROVINCE'].isin(['Otro', 'other']) | (data['EDUCATION'] == 'Otro')\n", "data = data[~is_other].copy()" ], "metadata": { "id": "uFRd2OE3nMUB" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "# List the distinct provinces that remain after cleaning\n", "unicos = data['PROVINCE'].unique()\n", "print(unicos)" ], "metadata": { "id": "6NyVlHY-2FeZ", "outputId": "ed3be1da-c51d-40ce-b9ee-69d56024140c", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['CABA (Buenos Aires capital)' 'Tierra del Fuego' 'Jujuy' 'Córdoba'\n", " 'Misiones' 'Buenos Aires provincia' 'Santa Fe' 'Salta' 'Mendoza'\n", " 'Tucumán' 
'Neuquén' 'San Luis' 'Entre Ríos' 'Santiago del Estero' 'Chaco'\n", " 'San Juan' 'Corrientes' 'Río Negro' 'La Pampa' 'La Rioja' 'Chubut'\n", " 'Catamarca' 'Santa Cruz' 'Formosa']\n" ] } ] }, { "cell_type": "code", "source": [ "# Numeric codes assigned to each non-numeric column\n", "assignment_mapping = {\n", "    'MENTAL DISORDER HISTORY': {'no': 0, 'yes': 50},\n", "    'EDUCATION': {\n", "        'Completed postgraduate': 30,\n", "        'Incomplete tertiary or university': 60,\n", "        'Completed high school': 70,\n", "        'Incomplete postgraduate': 40,\n", "        'Completed tertiary or university': 50,\n", "        'Incomplete high school': 80,\n", "        'Incomplete elementary school': 100,\n", "        'Completed elementary school': 90\n", "    },\n", "    'SUIC ATTEMPT HISTORY': {'ideation': 50, 'no': 0, 'yes': 100},\n", "    'LIVING WITH SOMEBODY': {'no': 20, 'yes': 0},\n", "    'ECONOMIC INCOME': {'yes': 0, 'no': 50}\n", "}\n", "\n", "# Apply the encodings column by column.\n", "# Series.map() turns every label missing from the dict into NaN, so an\n", "# unexpected category (or re-running this cell on already-encoded data)\n", "# would silently blank the column; raise instead of corrupting the frame.\n", "for column, mapping in assignment_mapping.items():\n", "    encoded = data[column].map(mapping)\n", "    unmapped = data.loc[encoded.isna(), column].unique()\n", "    if len(unmapped) > 0:\n", "        raise ValueError(f'Unmapped values in {column!r}: {list(unmapped)}')\n", "    data[column] = encoded" ], "metadata": { "id": "pVasDzLJnQkT" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "# Count the cases per province so that provinces with only a few cases\n", "# can be grouped together into a region\n", "ocurrences = data['PROVINCE'].value_counts()\n", "print(ocurrences)" ], "metadata": { "id": "Exomxoy6nVCG", "outputId": "330e7a20-b2ee-480d-9162-c7aaa5f33dd1", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Santa Fe 292\n", "Córdoba 262\n", "Buenos Aires provincia 225\n", "CABA (Buenos Aires capital) 98\n", "Jujuy 35\n", "Tucumán 25\n", "Tierra del Fuego 19\n", "Mendoza 17\n", "Entre Ríos 15\n", "Misiones 14\n", "Santiago del Estero 14\n", "Salta 12\n", "Neuquén 8\n", "La Pampa 8\n", "Chaco 7\n", "Corrientes 5\n", "San Juan 4\n", "Río Negro 4\n", "Chubut 4\n", "Formosa 3\n", "Catamarca 2\n", "Santa Cruz 2\n", "San Luis 1\n", 
"La Rioja 1\n", "Name: PROVINCE, dtype: int64\n" ] } ] }, { "cell_type": "code", "source": [ "# Map each province to its region\n", "def assign_region(province):\n", "    \"\"\"Return the region label for a province name.\n", "\n", "    A province listed in one of the groups below gets that group's label;\n", "    every other value (e.g. 'CABA (Buenos Aires capital)') yields 'CABA'.\n", "    \"\"\"\n", "    grouped_regions = (\n", "        ('Nordeste-Litoral', ('Corrientes', 'Chaco', 'Misiones', 'Formosa', 'Entre Ríos')),\n", "        ('Noroeste', ('Tucumán', 'Jujuy', 'Salta', 'Catamarca', 'Santiago del Estero')),\n", "        ('Cuyo', ('San Luis', 'San Juan', 'Mendoza', 'La Rioja')),\n", "        ('Patagonia Centro-Norte', ('Neuquén', 'Río Negro', 'La Pampa')),\n", "        ('Patagonia Centro-Sur', ('Tierra del Fuego', 'Santa Cruz', 'Chubut')),\n", "        ('Santa Fe', ('Santa Fe',)),\n", "        ('Buenos Aires', ('Buenos Aires provincia',)),\n", "        ('Córdoba', ('Córdoba',)),\n", "    )\n", "    for region, provinces in grouped_regions:\n", "        if province in provinces:\n", "            return region\n", "    # NOTE(review): unrecognised values also land here, not only CABA\n", "    return 'CABA'\n", "\n", "# Derive the new 'REGION' column from the 'PROVINCE' column\n", "data['REGION'] = data['PROVINCE'].apply(assign_region)" ], "metadata": { "id": "_y8_kb19f5NK" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "# Save the refined dataset\n", "data.to_csv('ref_dataset.csv', index=False)" ], "metadata": { "id": "HY6UXW7AnRs7" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "# Set aside the columns that are not used for now\n", "columns_to_drop = ['REGION', 'PROVINCE', 'SUIC RISK']\n", "data_ref = data.drop(columns=columns_to_drop)\n", "data_ref" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 441 }, "id": "kZV1T1Y_f6lB", "outputId": "20facde9-cc52-4703-bb0e-1ac4d30e00d0" }, "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " EDUCATION AGE MENTAL DISORDER HISTORY SUIC ATTEMPT HISTORY \\\n", "0 30 30 0 50 \n", "1 60 30 0 50 \n", "2 70 39 50 0 \n", "3 60 36 0 0 \n", 
"5 30 35 0 0 \n", "... ... ... ... ... \n", "1095 30 28 0 100 \n", "1096 50 39 0 0 \n", "1097 60 22 0 0 \n", "1098 40 31 0 0 \n", "1099 80 29 0 50 \n", "\n", " LIVING WITH SOMEBODY ECONOMIC INCOME DEPRESSION ANXIETY STATE \\\n", "0 20 0 21 54 \n", "1 0 0 26 34 \n", "2 20 0 8 33 \n", "3 0 50 27 42 \n", "5 0 0 9 25 \n", "... ... ... ... ... \n", "1095 0 0 41 51 \n", "1096 0 0 10 31 \n", "1097 0 0 7 27 \n", "1098 0 0 6 14 \n", "1099 0 0 42 44 \n", "\n", " ANXIETY TRAIT \n", "0 40 \n", "1 36 \n", "2 29 \n", "3 48 \n", "5 12 \n", "... ... \n", "1095 47 \n", "1096 25 \n", "1097 24 \n", "1098 20 \n", "1099 37 \n", "\n", "[1077 rows x 9 columns]" ], "text/html": [ "\n", "
\n", " | EDUCATION | \n", "AGE | \n", "MENTAL DISORDER HISTORY | \n", "SUIC ATTEMPT HISTORY | \n", "LIVING WITH SOMEBODY | \n", "ECONOMIC INCOME | \n", "DEPRESSION | \n", "ANXIETY STATE | \n", "ANXIETY TRAIT | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "30 | \n", "30 | \n", "0 | \n", "50 | \n", "20 | \n", "0 | \n", "21 | \n", "54 | \n", "40 | \n", "
1 | \n", "60 | \n", "30 | \n", "0 | \n", "50 | \n", "0 | \n", "0 | \n", "26 | \n", "34 | \n", "36 | \n", "
2 | \n", "70 | \n", "39 | \n", "50 | \n", "0 | \n", "20 | \n", "0 | \n", "8 | \n", "33 | \n", "29 | \n", "
3 | \n", "60 | \n", "36 | \n", "0 | \n", "0 | \n", "0 | \n", "50 | \n", "27 | \n", "42 | \n", "48 | \n", "
5 | \n", "30 | \n", "35 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "9 | \n", "25 | \n", "12 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1095 | \n", "30 | \n", "28 | \n", "0 | \n", "100 | \n", "0 | \n", "0 | \n", "41 | \n", "51 | \n", "47 | \n", "
1096 | \n", "50 | \n", "39 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "10 | \n", "31 | \n", "25 | \n", "
1097 | \n", "60 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "7 | \n", "27 | \n", "24 | \n", "
1098 | \n", "40 | \n", "31 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "6 | \n", "14 | \n", "20 | \n", "
1099 | \n", "80 | \n", "29 | \n", "0 | \n", "50 | \n", "0 | \n", "0 | \n", "42 | \n", "44 | \n", "37 | \n", "
1077 rows × 9 columns
\n", "