reab5555 committed on
Commit
deab6bd
1 Parent(s): 536f053

Update report.py

Browse files
Files changed (1) hide show
  1. report.py +205 -207
report.py CHANGED
@@ -1,208 +1,206 @@
1
- import os
2
- import numpy as np
3
- import pandas as pd
4
- import seaborn as sns
5
- import matplotlib.pyplot as plt
6
- from datetime import datetime
7
-
8
- REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
9
- os.makedirs(REPORT_DIR, exist_ok=True)
10
-
11
- def save_plot(fig, filename):
12
- fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
13
- plt.close(fig)
14
-
15
- def plot_heatmap(df, title):
16
- plt.figure(figsize=(12, 8))
17
- sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
18
- plt.title(title)
19
- plt.tight_layout()
20
- save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')
21
-
22
-
23
- def plot_valid_data_percentage(original_df, cleaned_df):
24
- original_valid = (original_df.notna().sum() / len(original_df)) * 100
25
- cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100
26
-
27
- # Create a DataFrame with both original and cleaned percentages
28
- combined_df = pd.DataFrame({
29
- 'Original': original_valid,
30
- 'Cleaned': cleaned_valid
31
- }).fillna(0) # Fill NaN with 0 for columns that were removed
32
-
33
- plt.figure(figsize=(15, 8))
34
- combined_df.plot(kind='bar', width=0.8, alpha=0.8)
35
-
36
- plt.xlabel('Columns')
37
- plt.ylabel('Percentage of Valid Data')
38
- plt.title('Percentage of Valid Data Before and After Cleaning')
39
- plt.xticks(rotation=90)
40
- plt.legend(['Before Cleaning', 'After Cleaning'])
41
-
42
- # Add percentage labels on the bars
43
- for i, (index, row) in enumerate(combined_df.iterrows()):
44
- plt.text(i, row['Original'], f'{row["Original"]:.1f}%', ha='center', va='bottom')
45
- if row['Cleaned'] > 0: # Only add label if column exists in cleaned data
46
- plt.text(i, row['Cleaned'], f'{row["Cleaned"]:.1f}%', ha='center', va='bottom')
47
-
48
- plt.tight_layout()
49
- plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
50
- plt.close()
51
-
52
- def plot_column_schemas(df):
53
- schemas = df.dtypes.astype(str).value_counts()
54
- fig, ax = plt.subplots(figsize=(10, 6))
55
- sns.barplot(x=schemas.index, y=schemas.values, ax=ax)
56
- ax.set_title('Column Data Types')
57
- ax.set_xlabel('Data Type')
58
- ax.set_ylabel('Count')
59
- save_plot(fig, 'column_schemas.png')
60
-
61
- def plot_nonconforming_cells(nonconforming_cells):
62
- # Ensure that nonconforming_cells is a dictionary
63
- if isinstance(nonconforming_cells, dict):
64
- # Proceed with plotting if it's a dictionary
65
- fig, ax = plt.subplots(figsize=(12, 6))
66
- sns.barplot(x=list(nonconforming_cells.keys()), y=list(nonconforming_cells.values()), ax=ax)
67
- ax.set_title('Nonconforming Cells by Column')
68
- ax.set_xlabel('Columns')
69
- ax.set_ylabel('Number of Nonconforming Cells')
70
- plt.xticks(rotation=90)
71
- save_plot(fig, 'nonconforming_cells.png')
72
- else:
73
- print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")
74
-
75
-
76
- def plot_column_distributions(original_df, cleaned_df):
77
- numeric_columns = original_df.select_dtypes(include=[np.number]).columns
78
- num_columns = len(numeric_columns)
79
-
80
- if num_columns == 0:
81
- print("No numeric columns found for distribution plots.")
82
- return
83
-
84
- # Create subplots for distributions
85
- fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
86
- axes = axes.flatten() if num_columns > 1 else [axes]
87
-
88
- for i, column in enumerate(numeric_columns):
89
- if column in cleaned_df.columns:
90
- sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning', alpha=0.5)
91
- sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.5)
92
- axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
93
- axes[i].legend()
94
-
95
- # Remove any unused subplots
96
- for j in range(i + 1, len(axes)):
97
- fig.delaxes(axes[j])
98
-
99
- plt.tight_layout()
100
- save_plot(fig, 'distributions_before_after_cleaning.png')
101
-
102
-
103
- def plot_boxplot_with_outliers(df):
104
- print("Plotting boxplots with outliers...")
105
- numeric_columns = df.select_dtypes(include=[np.number]).columns
106
- num_columns = len(numeric_columns)
107
-
108
- if num_columns == 0:
109
- print("No numeric columns found for boxplot.")
110
- return
111
-
112
- # Create subplots based on the number of numeric columns
113
- fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
114
- axes = axes.flatten() if num_columns > 1 else [axes]
115
-
116
- for i, column in enumerate(numeric_columns):
117
- sns.boxplot(x=df[column], ax=axes[i])
118
- axes[i].set_title(f'Boxplot of {column} with Outliers')
119
-
120
- # Remove any unused subplots
121
- for j in range(i + 1, len(axes)):
122
- fig.delaxes(axes[j])
123
-
124
- plt.tight_layout()
125
- save_plot(fig, 'boxplots_with_outliers.png')
126
-
127
-
128
- def plot_correlation_heatmap(df):
129
- # Select only numeric, float, and integer columns
130
- numeric_df = df.select_dtypes(include=[np.number])
131
-
132
- # Compute the correlation matrix
133
- correlation_matrix = numeric_df.corr()
134
-
135
- # Plot the heatmap
136
- fig, ax = plt.subplots(figsize=(15, 10))
137
- sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
138
- ax.set_title('Correlation Heatmap')
139
- save_plot(fig, 'correlation_heatmap.png')
140
-
141
-
142
-
143
- def plot_process_times(process_times):
144
- # Convert seconds to minutes
145
- process_times_minutes = {k: v / 60 for k, v in process_times.items()}
146
-
147
- # Separate main processes and column cleaning processes
148
- main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
149
- column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}
150
-
151
- # Create the plot
152
- fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
153
-
154
- # Plot main processes
155
- bars1 = ax1.bar(main_processes.keys(), main_processes.values())
156
- ax1.set_title('Main Process Times')
157
- ax1.set_ylabel('Time (minutes)')
158
- ax1.tick_params(axis='x', rotation=45)
159
-
160
- # Plot column cleaning processes
161
- bars2 = ax2.bar(column_processes.keys(), column_processes.values())
162
- ax2.set_title('Column Cleaning Times')
163
- ax2.set_ylabel('Time (minutes)')
164
- ax2.tick_params(axis='x', rotation=90)
165
-
166
- # Add value labels on top of each bar
167
- for ax, bars in zip([ax1, ax2], [bars1, bars2]):
168
- for bar in bars:
169
- height = bar.get_height()
170
- ax.text(bar.get_x() + bar.get_width() / 2., height,
171
- f'{height:.2f}', ha='center', va='bottom')
172
-
173
- # Add total time to the plot
174
- total_time = sum(process_times_minutes.values())
175
- fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)
176
-
177
- plt.tight_layout()
178
- save_plot(fig, 'process_times.png')
179
-
180
-
181
- def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
182
- os.makedirs(REPORT_DIR, exist_ok=True)
183
-
184
- sns.set_style("whitegrid")
185
- plt.rcParams['figure.dpi'] = 400
186
-
187
- print("Plotting valid data percentages...")
188
- plot_valid_data_percentage(original_df, cleaned_df)
189
-
190
- print("Plotting column schemas...")
191
- plot_column_schemas(cleaned_df)
192
-
193
- print("Plotting nonconforming cells before cleaning...")
194
- plot_nonconforming_cells(nonconforming_cells_before)
195
-
196
- print("Plotting column distributions...")
197
- plot_column_distributions(original_df, cleaned_df)
198
-
199
- print("Plotting process times...")
200
- plot_process_times(process_times)
201
-
202
- print("Plotting heatmaps...")
203
- plot_heatmap(original_df, "Missing Values Before Cleaning")
204
-
205
- print("Plotting correlation heatmap...")
206
- plot_correlation_heatmap(cleaned_df)
207
-
208
  print(f"All visualization reports saved in directory: {REPORT_DIR}")
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ from datetime import datetime
7
+
8
+ REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
9
+ os.makedirs(REPORT_DIR, exist_ok=True)
10
+
11
+ def save_plot(fig, filename):
12
+ fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
13
+ plt.close(fig)
14
+
15
+ def plot_heatmap(df, title):
16
+ plt.figure(figsize=(12, 8))
17
+ sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
18
+ plt.title(title)
19
+ plt.tight_layout()
20
+ save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')
21
+
22
+
23
+ def plot_valid_data_percentage(original_df, cleaned_df):
24
+ original_valid = (original_df.notna().sum() / len(original_df)) * 100
25
+ cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100
26
+
27
+ plt.figure(figsize=(15, 8))
28
+ x = range(len(original_valid))
29
+ width = 0.35
30
+
31
+ plt.bar(x, original_valid, width, label='Before Cleaning', alpha=0.8)
32
+ plt.bar([i + width for i in x], cleaned_valid, width, label='After Cleaning', alpha=0.8)
33
+
34
+ plt.xlabel('Columns')
35
+ plt.ylabel('Percentage of Valid Data')
36
+ plt.title('Percentage of Valid Data Before and After Cleaning')
37
+ plt.xticks([i + width/2 for i in x], original_valid.index, rotation=90)
38
+ plt.legend()
39
+
40
+ # Add percentage labels on the bars with smaller font size
41
+ for i, v in enumerate(original_valid):
42
+ plt.text(i, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
43
+ for i, v in enumerate(cleaned_valid):
44
+ plt.text(i + width, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
45
+
46
+ plt.tight_layout()
47
+ plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
48
+ plt.close()
49
+
50
+ def plot_column_schemas(df):
51
+ schemas = df.dtypes.astype(str).value_counts()
52
+ fig, ax = plt.subplots(figsize=(10, 6))
53
+ sns.barplot(x=schemas.index, y=schemas.values, ax=ax)
54
+ ax.set_title('Column Data Types')
55
+ ax.set_xlabel('Data Type')
56
+ ax.set_ylabel('Count')
57
+ save_plot(fig, 'column_schemas.png')
58
+
59
+ def plot_nonconforming_cells(nonconforming_cells):
60
+ # Ensure that nonconforming_cells is a dictionary
61
+ if isinstance(nonconforming_cells, dict):
62
+ # Proceed with plotting if it's a dictionary
63
+ fig, ax = plt.subplots(figsize=(12, 6))
64
+ sns.barplot(x=list(nonconforming_cells.keys()), y=list(nonconforming_cells.values()), ax=ax)
65
+ ax.set_title('Nonconforming Cells by Column')
66
+ ax.set_xlabel('Columns')
67
+ ax.set_ylabel('Number of Nonconforming Cells')
68
+ plt.xticks(rotation=90)
69
+ save_plot(fig, 'nonconforming_cells.png')
70
+ else:
71
+ print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")
72
+
73
+
74
+ def plot_column_distributions(original_df, cleaned_df):
75
+ numeric_columns = original_df.select_dtypes(include=[np.number]).columns
76
+ num_columns = len(numeric_columns)
77
+
78
+ if num_columns == 0:
79
+ print("No numeric columns found for distribution plots.")
80
+ return
81
+
82
+ # Create subplots for distributions
83
+ fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
84
+ axes = axes.flatten() if num_columns > 1 else [axes]
85
+
86
+ for i, column in enumerate(numeric_columns):
87
+ if column in cleaned_df.columns:
88
+ sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning', alpha=0.5)
89
+ sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.5)
90
+ axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
91
+ axes[i].legend()
92
+
93
+ # Remove any unused subplots
94
+ for j in range(i + 1, len(axes)):
95
+ fig.delaxes(axes[j])
96
+
97
+ plt.tight_layout()
98
+ save_plot(fig, 'distributions_before_after_cleaning.png')
99
+
100
+
101
+ def plot_boxplot_with_outliers(df):
102
+ print("Plotting boxplots with outliers...")
103
+ numeric_columns = df.select_dtypes(include=[np.number]).columns
104
+ num_columns = len(numeric_columns)
105
+
106
+ if num_columns == 0:
107
+ print("No numeric columns found for boxplot.")
108
+ return
109
+
110
+ # Create subplots based on the number of numeric columns
111
+ fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
112
+ axes = axes.flatten() if num_columns > 1 else [axes]
113
+
114
+ for i, column in enumerate(numeric_columns):
115
+ sns.boxplot(x=df[column], ax=axes[i])
116
+ axes[i].set_title(f'Boxplot of {column} with Outliers')
117
+
118
+ # Remove any unused subplots
119
+ for j in range(i + 1, len(axes)):
120
+ fig.delaxes(axes[j])
121
+
122
+ plt.tight_layout()
123
+ save_plot(fig, 'boxplots_with_outliers.png')
124
+
125
+
126
+ def plot_correlation_heatmap(df):
127
+ # Select only numeric, float, and integer columns
128
+ numeric_df = df.select_dtypes(include=[np.number])
129
+
130
+ # Compute the correlation matrix
131
+ correlation_matrix = numeric_df.corr()
132
+
133
+ # Plot the heatmap
134
+ fig, ax = plt.subplots(figsize=(15, 10))
135
+ sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
136
+ ax.set_title('Correlation Heatmap')
137
+ save_plot(fig, 'correlation_heatmap.png')
138
+
139
+
140
+
141
+ def plot_process_times(process_times):
142
+ # Convert seconds to minutes
143
+ process_times_minutes = {k: v / 60 for k, v in process_times.items()}
144
+
145
+ # Separate main processes and column cleaning processes
146
+ main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
147
+ column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}
148
+
149
+ # Create the plot
150
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
151
+
152
+ # Plot main processes
153
+ bars1 = ax1.bar(main_processes.keys(), main_processes.values())
154
+ ax1.set_title('Main Process Times')
155
+ ax1.set_ylabel('Time (minutes)')
156
+ ax1.tick_params(axis='x', rotation=45)
157
+
158
+ # Plot column cleaning processes
159
+ bars2 = ax2.bar(column_processes.keys(), column_processes.values())
160
+ ax2.set_title('Column Cleaning Times')
161
+ ax2.set_ylabel('Time (minutes)')
162
+ ax2.tick_params(axis='x', rotation=90)
163
+
164
+ # Add value labels on top of each bar
165
+ for ax, bars in zip([ax1, ax2], [bars1, bars2]):
166
+ for bar in bars:
167
+ height = bar.get_height()
168
+ ax.text(bar.get_x() + bar.get_width() / 2., height,
169
+ f'{height:.2f}', ha='center', va='bottom')
170
+
171
+ # Add total time to the plot
172
+ total_time = sum(process_times_minutes.values())
173
+ fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)
174
+
175
+ plt.tight_layout()
176
+ save_plot(fig, 'process_times.png')
177
+
178
+
179
+ def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
180
+ os.makedirs(REPORT_DIR, exist_ok=True)
181
+
182
+ sns.set_style("whitegrid")
183
+ plt.rcParams['figure.dpi'] = 400
184
+
185
+ print("Plotting valid data percentages...")
186
+ plot_valid_data_percentage(original_df, cleaned_df)
187
+
188
+ print("Plotting column schemas...")
189
+ plot_column_schemas(cleaned_df)
190
+
191
+ print("Plotting nonconforming cells before cleaning...")
192
+ plot_nonconforming_cells(nonconforming_cells_before)
193
+
194
+ print("Plotting column distributions...")
195
+ plot_column_distributions(original_df, cleaned_df)
196
+
197
+ print("Plotting process times...")
198
+ plot_process_times(process_times)
199
+
200
+ print("Plotting heatmaps...")
201
+ plot_heatmap(original_df, "Missing Values Before Cleaning")
202
+
203
+ print("Plotting correlation heatmap...")
204
+ plot_correlation_heatmap(cleaned_df)
205
+
 
 
206
  print(f"All visualization reports saved in directory: {REPORT_DIR}")