AI-Data-Cleaner / report.py
reab5555's picture
Upload 3 files
ec38d9f verified
raw
history blame
8.28 kB
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(REPORT_DIR, exist_ok=True)
def save_plot(fig, filename):
fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
plt.close(fig)
def plot_heatmap(df, title):
plt.figure(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar=False, cmap='Reds')
plt.title(title)
plt.tight_layout()
save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')
def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
columns_to_plot = [col for col in original_df.columns if col != primary_key_column]
original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
combined_data = pd.concat([original_valid, cleaned_valid], axis=1, keys=['Original', 'Cleaned']).fillna(0)
plt.figure(figsize=(15, 8))
x = range(len(combined_data))
width = 0.35
plt.bar(x, combined_data['Original'], width, label='Before Cleaning', alpha=0.8)
plt.bar([i + width for i in x], combined_data['Cleaned'], width, label='After Cleaning', alpha=0.8)
plt.xlabel('Columns')
plt.ylabel('Percentage of Valid Data')
plt.title('Percentage of Valid Data Before and After Cleaning')
plt.xticks([i + width / 2 for i in x], combined_data.index, rotation=90)
plt.legend()
for i, v in enumerate(combined_data['Original']):
plt.text(i, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
for i, v in enumerate(combined_data['Cleaned']):
plt.text(i + width, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
plt.tight_layout()
plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
plt.close()
def plot_column_schemas(df):
schemas = df.dtypes.astype(str).value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.rainbow(np.linspace(0, 1, len(schemas)))
bars = ax.bar(schemas.index, schemas.values, color=colors)
ax.set_title('Column Data Types')
ax.set_xlabel('Data Type')
ax.set_ylabel('Count')
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2., height,
f'{height}',
ha='center', va='bottom')
save_plot(fig, 'column_schemas.png')
def plot_nonconforming_cells(nonconforming_cells):
if isinstance(nonconforming_cells, dict):
fig, ax = plt.subplots(figsize=(12, 6))
colors = plt.cm.rainbow(np.linspace(0, 1, len(nonconforming_cells)))
bars = ax.bar(list(nonconforming_cells.keys()), list(nonconforming_cells.values()), color=colors)
ax.set_title('Nonconforming Cells by Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Nonconforming Cells')
plt.xticks(rotation=90)
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2., height,
f'{height:,}',
ha='center', va='bottom')
save_plot(fig, 'nonconforming_cells.png')
else:
print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")
def plot_column_distributions(original_df, cleaned_df, primary_key_column):
numeric_columns = [col for col in original_df.select_dtypes(include=[np.number]).columns if
col != primary_key_column]
num_columns = len(numeric_columns)
if num_columns == 0:
print("No numeric columns found for distribution plots.")
return
fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
axes = axes.flatten() if num_columns > 1 else [axes]
for i, column in enumerate(numeric_columns):
if column in cleaned_df.columns:
sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning',
alpha=0.5)
sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning',
alpha=0.5)
axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
axes[i].legend()
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
save_plot(fig, 'distributions_before_after_cleaning.png')
def plot_boxplot_with_outliers(df, primary_key_column):
numeric_columns = [col for col in df.select_dtypes(include=[np.number]).columns if col != primary_key_column]
num_columns = len(numeric_columns)
if num_columns == 0:
print("No numeric columns found for boxplot.")
return
fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
axes = axes.flatten() if num_columns > 1 else [axes]
for i, column in enumerate(numeric_columns):
sns.boxplot(x=df[column], ax=axes[i])
axes[i].set_title(f'Boxplot of {column} with Outliers')
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
save_plot(fig, 'boxplots_with_outliers.png')
def plot_correlation_heatmap(df, primary_key_column):
numeric_df = df.select_dtypes(include=[np.number])
numeric_df = numeric_df.drop(columns=[primary_key_column], errors='ignore')
correlation_matrix = numeric_df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
ax.set_title('Correlation Heatmap')
save_plot(fig, 'correlation_heatmap.png')
def plot_process_times(process_times):
process_times_minutes = {k: v / 60 for k, v in process_times.items()}
main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
bars1 = ax1.bar(main_processes.keys(), main_processes.values())
ax1.set_title('Main Process Times')
ax1.set_ylabel('Time (minutes)')
ax1.tick_params(axis='x', rotation=45)
bars2 = ax2.bar(column_processes.keys(), column_processes.values())
ax2.set_title('Column Cleaning Times')
ax2.set_ylabel('Time (minutes)')
ax2.tick_params(axis='x', rotation=90)
for ax, bars in zip([ax1, ax2], [bars1, bars2]):
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2., height,
f'{height:.4f}', ha='center', va='bottom')
total_time = sum(process_times_minutes.values())
fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)
plt.tight_layout()
save_plot(fig, 'process_times.png')
def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns,
removed_rows, primary_key_column):
os.makedirs(REPORT_DIR, exist_ok=True)
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 400
print("Plotting valid data percentages...")
plot_valid_data_percentage(original_df, cleaned_df, primary_key_column)
print("Plotting column schemas...")
plot_column_schemas(cleaned_df)
print("Plotting nonconforming cells before cleaning...")
plot_nonconforming_cells(nonconforming_cells_before)
print("Plotting column distributions...")
plot_column_distributions(original_df, cleaned_df, primary_key_column)
print("Plotting process times...")
plot_process_times(process_times)
print("Plotting heatmaps...")
plot_heatmap(original_df, "Missing Values Before Cleaning")
print("Plotting correlation heatmap...")
plot_correlation_heatmap(cleaned_df, primary_key_column)
print("Plotting boxplots with outliers...")
plot_boxplot_with_outliers(cleaned_df, primary_key_column)
print(f"All visualization reports saved in directory: {REPORT_DIR}")