Spaces:
Running
Running
File size: 3,474 Bytes
fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c 0b212ec fb1a11c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
{
"notebook_title": "Exploratory data analysis (EDA)",
"notebook_type": "eda",
"dataset_type": "numeric",
"notebook_template": [
{
"cell_type": "markdown",
"source": "---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---"
},
{
"cell_type": "markdown",
"source": "## 1. Set up necessary libraries and load the dataset"
},
{
"cell_type": "code",
"source": "# Install necessary libraries.\n!pip install pandas matplotlib seaborn"
},
{
"cell_type": "code",
"source": "import matplotlib.pyplot as plt\nimport seaborn as sns"
},
{
"cell_type": "code",
"source": "# Load the dataset as a DataFrame\n{first_code}"
},
{
"cell_type": "markdown",
"source": "## 2. Understanding the Dataset"
},
{
"cell_type": "code",
"source": "# First rows of the dataset and info\nprint(df.head())\nprint(df.info())"
},
{
"cell_type": "code",
"source": "# Check for missing values\nprint(df.isnull().sum())"
},
{
"cell_type": "code",
"source": "# Identify data types of each column\nprint(df.dtypes)"
},
{
"cell_type": "code",
"source": "# Detect duplicated rows\nprint(df.duplicated().sum())"
},
{
"cell_type": "code",
"source": "# Generate descriptive statistics\nprint(df.describe())"
},
{
"type": "categoric",
"cell_type": "code",
"source": "# Number of unique values in each categorical column\ndf.select_dtypes(include=['object']).nunique()"
},
{
"cell_type": "markdown",
"source": "## 3. Data Visualization"
},
{
"type": "numeric",
"cell_type": "code",
"source": "# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()"
},
{
"type": "numeric",
"cell_type": "code",
"source": "# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n plt.figure(figsize=(8, 4))\n sns.histplot(df[column], kde=True)\n plt.title(f'Distribution of {column}')\n plt.xlabel(column)\n plt.ylabel('Frequency')\n plt.show()"
},
{
"type": "categoric",
"cell_type": "code",
"source": "# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n plt.figure(figsize=(8, 4))\n sns.countplot(x=column, data=df)\n plt.title(f'Count Plot of {column}')\n plt.xlabel(column)\n plt.ylabel('Count')\n plt.show()"
},
{
"type": "numeric",
"cell_type": "code",
"source": "# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n plt.figure(figsize=(8, 4))\n sns.boxplot(df[column])\n plt.title(f'Box Plot of {column}')\n plt.xlabel(column)\n plt.show()"
}
]
} |