File size: 8,525 Bytes
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
4395fda
 
4f6d176
4395fda
 
 
 
 
 
 
4f6d176
 
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
 
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
 
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
4395fda
 
 
 
 
 
 
 
 
 
 
 
4f6d176
 
4395fda
 
 
 
 
4f6d176
4395fda
 
4f6d176
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
4395fda
 
 
 
4f6d176
4395fda
4f6d176
4395fda
4f6d176
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import os
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(REPORT_DIR, exist_ok=True)

def save_plot(fig, filename):
    fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
    plt.close(fig)

def plot_heatmap(df, title):
    # Calculate the percentage of null values for each column
    null_percentages = df.isnull().mean() * 100

    plt.figure(figsize=(12, 8))
    sns.heatmap(null_percentages.to_frame().T, cbar=True, cmap='Reds', annot=True, fmt='.1f')
    plt.title(title)
    plt.ylabel('Percentage of Missing Values')
    plt.tight_layout()
    save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')

def plot_column_schemas(df):
    # Get the data types of all columns
    data_types = df.dtypes.astype(str).tolist()
    data_types = [dtype.capitalize() for dtype in data_types]

    # Count the occurrences of each data type
    type_counts = Counter(data_types)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Generate a color palette with as many colors as there are bars
    colors = plt.cm.tab20(np.linspace(0, 1, len(type_counts)))

    # Plot the bars
    bars = ax.bar(type_counts.keys(), type_counts.values(), color=colors)

    ax.set_title('Column Data Types')
    ax.set_xlabel('Data Type')
    ax.set_ylabel('Count')

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f'{int(height)}',
                ha='center', va='bottom')

    plt.xticks(rotation=45)
    plt.tight_layout()
    save_plot(fig, 'column_schemas.png')

def plot_nonconforming_cells(nonconforming_cells):
    # Ensure that nonconforming_cells is a dictionary
    if isinstance(nonconforming_cells, dict):
        # Proceed with plotting if it's a dictionary
        fig, ax = plt.subplots(figsize=(12, 6))

        # Generate a color palette with as many colors as there are bars
        colors = plt.cm.rainbow(np.linspace(0, 1, len(nonconforming_cells)))

        # Plot the bars
        bars = ax.bar(list(nonconforming_cells.keys()), list(nonconforming_cells.values()), color=colors)

        ax.set_title('Nonconforming Cells by Column')
        ax.set_xlabel('Columns')
        ax.set_ylabel('Number of Nonconforming Cells')
        plt.xticks(rotation=90)

        # Add value labels on top of each bar
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{height:,}',
                    ha='center', va='bottom')

        save_plot(fig, 'nonconforming_cells.png')
    else:
        print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")

def plot_column_distributions(cleaned_df, primary_key_column):
    print("Plotting distribution charts for numeric columns in the cleaned DataFrame...")

    numeric_columns = cleaned_df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col != primary_key_column]
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found in the cleaned DataFrame for distribution plots.")
        return

    # Create subplots for distributions
    ncols = 3
    nrows = (num_columns + ncols - 1) // ncols  # Ceiling division
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5 * nrows))
    axes = axes.flatten() if num_columns > 1 else [axes]

    for i, column in enumerate(numeric_columns):
        cleaned_data = cleaned_df[column].dropna()

        sns.histplot(cleaned_data, ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.7)
        axes[i].set_title(f'{column} - Distribution After Cleaning')
        axes[i].legend()

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'distributions_after_cleaning.png')

def plot_boxplot_with_outliers(original_df, primary_key_column):
    print("Plotting boxplots for numeric columns in the original DataFrame...")

    numeric_columns = original_df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col != primary_key_column]
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found in the original DataFrame for boxplots.")
        return

    # Create subplots based on the number of numeric columns
    ncols = 3
    nrows = (num_columns + ncols - 1) // ncols  # Ceiling division
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5 * nrows))
    axes = axes.flatten() if num_columns > 1 else [axes]

    for i, column in enumerate(numeric_columns):
        data = original_df[column].dropna()

        sns.boxplot(x=data, ax=axes[i], color='blue', orient='h')
        axes[i].set_title(f'Boxplot of {column} (Before Cleaning)')

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'boxplots_before_cleaning.png')

def plot_correlation_heatmap(df, primary_key_column):
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col != primary_key_column]

    if not numeric_columns:
        print("No numeric columns found for correlation heatmap.")
        return

    corr_matrix = df[numeric_columns].corr()

    plt.figure(figsize=(15, 10))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar_kws={'label': 'Correlation'})
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    save_plot(plt.gcf(), 'correlation_heatmap.png')

def plot_process_times(process_times):
    # Convert seconds to minutes
    process_times_minutes = {k: v / 60 for k, v in process_times.items()}

    # Separate main processes and column cleaning processes
    main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
    column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}

    # Create the plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # Plot main processes
    bars1 = ax1.bar(main_processes.keys(), main_processes.values())
    ax1.set_title('Main Process Times')
    ax1.set_ylabel('Time (minutes)')
    ax1.tick_params(axis='x', rotation=45)

    # Plot column cleaning processes
    bars2 = ax2.bar(column_processes.keys(), column_processes.values())
    ax2.set_title('Column Cleaning Times')
    ax2.set_ylabel('Time (minutes)')
    ax2.tick_params(axis='x', rotation=90)

    # Add value labels on top of each bar with 3 decimal places
    for ax, bars in zip([ax1, ax2], [bars1, bars2]):
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{height:.4f}', ha='center', va='bottom')

    # Add total time to the plot with 3 decimal places
    total_time = sum(process_times_minutes.values())
    fig.suptitle(f'Process Times (Total: {total_time:.3f} minutes)', fontsize=16)

    plt.tight_layout()
    save_plot(fig, 'process_times.png')

def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns,
                       removed_rows, primary_key_column):
    os.makedirs(REPORT_DIR, exist_ok=True)

    sns.set_style("whitegrid")
    plt.rcParams['figure.dpi'] = 400

    print("Plotting nonconforming cells before cleaning...")
    plot_nonconforming_cells(nonconforming_cells_before)

    print("Plotting column distributions...")
    plot_column_distributions(cleaned_df, primary_key_column)

    print("Plotting boxplots for original data...")
    plot_boxplot_with_outliers(original_df, primary_key_column)

    print("Plotting process times...")
    plot_process_times(process_times)

    print("Plotting heatmaps...")
    plot_heatmap(original_df, "Missing Values Before Cleaning")

    print("Plotting correlation heatmap...")
    plot_correlation_heatmap(cleaned_df, primary_key_column)

    print("Plotting column schemas...")
    plot_column_schemas(cleaned_df)

    print(f"All visualization reports saved in directory: {REPORT_DIR}")