reab5555 committed
Commit ec38d9f
1 Parent(s): 6d52bd3

Upload 3 files

Files changed (3):
  1. app.py +48 -27
  2. clean.py +284 -280
  3. report.py +48 -62
app.py CHANGED
@@ -5,24 +5,28 @@ from report import create_full_report, REPORT_DIR
 import os
 import tempfile
 
-def clean_and_visualize(file, progress=gr.Progress()):
+
+def clean_and_visualize(file, primary_key_column, progress=gr.Progress()):
     # Load the data
     df = pd.read_csv(file.name)
 
+    # Remove duplicates from the primary key column
+    df = df.drop_duplicates(subset=[primary_key_column], keep='first')
+
     # Clean the data
     cleaned_df = None
     nonconforming_cells_before = None
     process_times = None
     removed_columns = None
     removed_rows = None
 
-    for progress_value, status_text in clean_data(df):
+    for progress_value, status_text in clean_data(df, primary_key_column):
         if isinstance(status_text, tuple):
             cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows = status_text
             progress(progress_value, desc="Cleaning completed")
         else:
             progress(progress_value, desc=status_text)
 
     # Generate full visualization report
     create_full_report(
         df,
@@ -30,61 +34,78 @@ def clean_and_visualize(file, progress=gr.Progress()):
         nonconforming_cells_before,
         process_times,
         removed_columns,
-        removed_rows
+        removed_rows,
+        primary_key_column
     )
 
     # Save cleaned DataFrame to a temporary CSV file
     with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
         cleaned_df.to_csv(tmp_file.name, index=False)
         cleaned_csv_path = tmp_file.name
 
     # Collect all generated images
     image_files = [os.path.join(REPORT_DIR, f) for f in os.listdir(REPORT_DIR) if f.endswith('.png')]
 
     return cleaned_csv_path, image_files
 
+
 def launch_app():
     with gr.Blocks() as app:
         gr.Markdown("# AI Data Cleaner")
 
         with gr.Row():
             file_input = gr.File(label="Upload CSV File", file_count="single", file_types=[".csv"])
 
+        with gr.Row():
+            primary_key_dropdown = gr.Dropdown(label="Select Primary Key Column", choices=[], interactive=True)
+
         with gr.Row():
             clean_button = gr.Button("Start Cleaning")
 
         with gr.Row():
             progress_bar = gr.Progress()
 
         with gr.Row():
             cleaned_file_output = gr.File(label="Cleaned CSV", visible=True)
 
         with gr.Row():
             output_gallery = gr.Gallery(
                 label="Visualization Results",
                 show_label=True,
                 elem_id="gallery",
                 columns=[3],
                 rows=[3],
                 object_fit="contain",
                 height="auto",
-                visible=False  # Initially set to invisible
+                visible=False
             )
 
-        def process_and_show_results(file):
-            cleaned_csv_path, image_files = clean_and_visualize(file, progress=progress_bar)
+        def update_primary_key_options(file):
+            if file is not None:
+                df = pd.read_csv(file.name)
+                return gr.Dropdown(choices=df.columns.tolist())
+
+        def process_and_show_results(file, primary_key_column):
+            cleaned_csv_path, image_files = clean_and_visualize(file, primary_key_column, progress=progress_bar)
             return (
                 cleaned_csv_path,
-                gr.Gallery(visible=True, value=image_files)  # Make gallery visible and update its content
+                gr.Gallery(visible=True, value=image_files)
             )
 
+        file_input.change(
+            fn=update_primary_key_options,
+            inputs=file_input,
+            outputs=primary_key_dropdown
+        )
+
         clean_button.click(
             fn=process_and_show_results,
-            inputs=file_input,
+            inputs=[file_input, primary_key_dropdown],
             outputs=[cleaned_file_output, output_gallery]
         )
 
     app.launch()
 
+
 if __name__ == "__main__":
     launch_app()
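
Note on the new wiring: in Gradio, an event handler can return a component to update it in place, which is how this commit fills the primary-key dropdown from the uploaded CSV's header via file_input.change. Below is a minimal, self-contained sketch of that same pattern; the component and function names are illustrative, not from the commit, and unlike the committed handler it returns an explicitly empty dropdown when the file is cleared instead of falling through and returning None.

import gradio as gr
import pandas as pd

def list_columns(file):
    # No file selected (e.g. the upload was cleared): empty the dropdown
    if file is None:
        return gr.Dropdown(choices=[])
    # Read only the header row; that is enough to discover column names
    header = pd.read_csv(file.name, nrows=0)
    return gr.Dropdown(choices=header.columns.tolist())

with gr.Blocks() as demo:
    csv_input = gr.File(label="CSV", file_types=[".csv"])
    key_dropdown = gr.Dropdown(label="Primary key", choices=[], interactive=True)
    # Returning a gr.Dropdown from the handler updates key_dropdown in place
    csv_input.change(fn=list_columns, inputs=csv_input, outputs=key_dropdown)

demo.launch()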
clean.py CHANGED
@@ -1,280 +1,284 @@
-import pandas as pd
-import numpy as np
-import json
-import time
-from tqdm import tqdm
-from llm_config import generate_llm_response
-from llm_prompts import (
-    CHECK_HEADERS_PROMPT,
-    NORMALIZE_HEADERS_PROMPT,
-    CHECK_COLUMN_CONTENT_PROMPT,
-    CHECK_TYPOS_PROMPT,
-    TRANSFORM_STRING_PROMPT,
-    CHECK_LOW_COUNT_VALUES_PROMPT
-)
-
-BATCH_SIZE = 50
-EMPTY_THRESHOLD = 0.5
-
-
-def print_dataframe_info(df, step=""):
-    num_columns = df.shape[1]
-    num_rows = df.shape[0]
-    num_cells = num_columns * num_rows
-    print(f"{step}Dataframe info:")
-    print(f"  Number of columns: {num_columns}")
-    print(f"  Number of rows: {num_rows}")
-    print(f"  Total number of cells: {num_cells}")
-
-
-def check_and_normalize_column_headers(df):
-    print("Checking and normalizing column headers...")
-
-    check_prompt = CHECK_HEADERS_PROMPT.format(columns=df.columns.tolist())
-    check_response = generate_llm_response(check_prompt)
-    try:
-        invalid_columns = json.loads(check_response)
-        if invalid_columns:
-            print(f"Columns with invalid names (indices): {invalid_columns}")
-            for idx in invalid_columns:
-                new_name = f"column_{idx}"
-                print(f"Renaming column at index {idx} to '{new_name}'")
-                df.rename(columns={df.columns[idx]: new_name}, inplace=True)
-        else:
-            print("All column headers are valid or no invalid headers detected.")
-    except json.JSONDecodeError:
-        print("Error parsing LLM response for column headers check.")
-
-    normalize_prompt = NORMALIZE_HEADERS_PROMPT.format(columns=df.columns.tolist())
-    normalize_response = generate_llm_response(normalize_prompt)
-    try:
-        normalized_names = json.loads(normalize_response)
-        if normalized_names:
-            df.rename(columns=normalized_names, inplace=True)
-            print("Column names have been normalized.")
-        else:
-            print("No column names were normalized. Proceeding with current names.")
-    except json.JSONDecodeError:
-        print("Error parsing LLM response for column name normalization.")
-
-    # Fallback normalization
-    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
-    print("Applied fallback normalization to ensure valid column names.")
-
-    return df
-
-
-def process_column_batch(column_data, column_name):
-    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
-    prompt = CHECK_COLUMN_CONTENT_PROMPT.format(column_name=column_name, sample_values=str(sample))
-    response = generate_llm_response(prompt)
-    try:
-        result = json.loads(response)
-        if not all(key in result for key in ['data_type', 'empty_indices', 'invalid_indices']):
-            raise ValueError("Missing required keys in LLM response")
-        return result
-    except (json.JSONDecodeError, ValueError) as e:
-        print(f"Error parsing LLM response for column {column_name}: {str(e)}")
-        print(f"LLM Response: {response}")
-        return {'data_type': 'string', 'empty_indices': [], 'invalid_indices': []}
-
-
-def check_typos(column_data, column_name):
-    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
-    prompt = CHECK_TYPOS_PROMPT.format(column_name=column_name, sample_values=str(sample))
-    response = generate_llm_response(prompt)
-    try:
-        return json.loads(response)
-    except json.JSONDecodeError:
-        print(f"Error parsing LLM response for typo check in column {column_name}")
-        return {"typos": {}}
-
-
-def transform_string_column(column_data, column_name):
-    unique_values = column_data.unique().tolist()
-    prompt = TRANSFORM_STRING_PROMPT.format(column_name=column_name, unique_values=unique_values)
-    response = generate_llm_response(prompt)
-    try:
-        result = json.loads(response)
-        return result
-    except json.JSONDecodeError:
-        print(f"Error parsing LLM response for string transformation in column {column_name}")
-        return {}
-
-
-def check_low_count_values(column_data, column_name):
-    value_counts = column_data.value_counts().to_dict()
-    prompt = CHECK_LOW_COUNT_VALUES_PROMPT.format(column_name=column_name, value_counts=value_counts)
-    response = generate_llm_response(prompt)
-    try:
-        result = json.loads(response)
-        return result
-    except json.JSONDecodeError:
-        print(f"Error parsing LLM response for low count values in column {column_name}")
-        return []
-
-
-def remove_empty_columns(df, threshold=EMPTY_THRESHOLD):
-    print(f"Removing columns with less than {threshold * 100}% valid data...")
-    valid_threshold = int(df.shape[0] * threshold)
-    df = df.dropna(axis=1, thresh=valid_threshold)
-    return df
-
-
-def remove_empty_rows(df, threshold=EMPTY_THRESHOLD):
-    print(f"Removing rows with less than {threshold * 100}% valid data...")
-    valid_threshold = int(df.shape[1] * threshold)
-    df = df.dropna(axis=0, thresh=valid_threshold)
-    return df
-
-
-def clean_column(df, column_name):
-    print(f"Cleaning column: {column_name}")
-    column_data = df[column_name]
-    total_rows = len(column_data)
-    empty_indices = []
-    invalid_indices = []
-    data_type = "string"
-    nonconforming_cells = 0
-
-    for i in range(0, total_rows, BATCH_SIZE):
-        batch = column_data.iloc[i:i + BATCH_SIZE]
-        result = process_column_batch(batch, column_name)
-
-        valid_empty_indices = [idx for idx in result["empty_indices"] if idx + i < total_rows]
-        valid_invalid_indices = [idx for idx in result["invalid_indices"] if idx + i < total_rows]
-
-        empty_indices.extend([idx + i for idx in valid_empty_indices])
-        invalid_indices.extend([idx + i for idx in valid_invalid_indices])
-
-        if i == 0:  # Use the data type from the first batch
-            data_type = result["data_type"]
-
-    print(f"  Data type determined: {data_type}")
-    print(f"  Empty cells: {len(empty_indices)}")
-    print(f"  Invalid cells: {len(invalid_indices)}")
-
-    # Convert column to determined data type
-    if data_type == "float":
-        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
-    elif data_type == "integer":
-        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
-    elif data_type == "date":
-        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
-    elif data_type == "string" or data_type == "object":
-        # Transform string values
-        transform_result = transform_string_column(column_data, column_name)
-        df[column_name] = df[column_name].map(transform_result).fillna(df[column_name])
-
-        # Handle "nan" strings
-        df[column_name] = df[column_name].replace({"nan": np.nan, "NaN": np.nan, "NAN": np.nan})
-
-        # Check for low count values
-        low_count_values = check_low_count_values(df[column_name], column_name)
-        df.loc[df[column_name].isin(low_count_values), column_name] = np.nan
-
-        # Check for typos
-        typo_result = check_typos(df[column_name], column_name)
-        if typo_result["typos"]:
-            print(f"  Potential typos found: {typo_result['typos']}")
-
-    # Set empty and invalid cells to NaN
-    df.loc[empty_indices + invalid_indices, column_name] = np.nan
-    nonconforming_cells = len(empty_indices) + len(invalid_indices)
-
-    return df, nonconforming_cells
-
-
-def remove_outliers(df):
-    print("Removing rows with outliers from numeric/integer/float columns...")
-    rows_to_remove = set()
-    for column in df.select_dtypes(include=[np.number]).columns:
-        q1 = df[column].quantile(0.25)
-        q3 = df[column].quantile(0.75)
-        iqr = q3 - q1
-        lower_bound = q1 - 1.5 * iqr
-        upper_bound = q3 + 1.5 * iqr
-        outlier_rows = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
-        rows_to_remove.update(outlier_rows)
-
-    initial_rows = len(df)
-    df = df.drop(index=list(rows_to_remove))
-    removed_rows = initial_rows - len(df)
-    print(f"Removed {removed_rows} rows containing outliers.")
-    return df, removed_rows
-
-
-def calculate_nonconforming_cells(df):
-    nonconforming_cells = {}
-    for column in df.columns:
-        # Count NaN values
-        nan_count = df[column].isna().sum()
-
-        # For numeric columns, count infinite values
-        if np.issubdtype(df[column].dtype, np.number):
-            inf_count = np.isinf(df[column]).sum()
-        else:
-            inf_count = 0
-
-        # For object columns, count empty strings
-        if df[column].dtype == 'object':
-            empty_string_count = (df[column] == '').sum()
-        else:
-            empty_string_count = 0
-
-        nonconforming_cells[column] = nan_count + inf_count + empty_string_count
-
-    return nonconforming_cells
-
-
-def clean_data(df):
-    start_time = time.time()
-    process_times = {}
-    removed_rows = 0
-    removed_columns = 0
-
-    print("Starting data validation and cleaning...")
-    print_dataframe_info(df, "Initial - ")
-
-    # Calculate nonconforming cells before cleaning
-    nonconforming_cells_before = calculate_nonconforming_cells(df)
-
-    steps = ['Normalize headers', 'Remove empty columns', 'Remove empty rows', 'Remove low count strings', 'Clean columns', 'Remove outliers']
-    total_steps = len(steps) + len(df.columns)  # Add column count for individual column cleaning
-
-    # Step 1: Normalize column headers
-    step_start_time = time.time()
-    df = check_and_normalize_column_headers(df)
-    process_times['Normalize headers'] = time.time() - step_start_time
-    yield 1 / total_steps, "Normalized headers"
-
-    # Step 2: Remove empty columns (less than 60% valid data)
-    step_start_time = time.time()
-    df = remove_empty_columns(df)
-    process_times['Remove empty columns'] = time.time() - step_start_time
-    yield 2 / total_steps, "Removed empty columns"
-
-    # Step 3: Remove empty rows (less than 60% valid data)
-    step_start_time = time.time()
-    df = remove_empty_rows(df)
-    process_times['Remove empty rows'] = time.time() - step_start_time
-    yield 3 / total_steps, "Removed empty rows"
-
-    # Step 4: Clean columns (in batches)
-    column_cleaning_times = {}
-    for i, column in enumerate(df.columns):
-        column_start_time = time.time()
-        df, nonconforming = clean_column(df, column)
-        column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
-        yield (5 + i) / total_steps, f"Cleaning column: {column}"
-    process_times.update(column_cleaning_times)
-
-    # Step 5: Remove outliers from numeric columns
-    step_start_time = time.time()
-    df, outlier_rows_removed = remove_outliers(df)
-    removed_rows += outlier_rows_removed
-    process_times['Remove outliers'] = time.time() - step_start_time
-    yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
-
-    print("Cleaning process completed.")
-    print_dataframe_info(df, "Final - ")
+import pandas as pd
+import numpy as np
+import json
+import time
+from tqdm import tqdm
+from llm_config import generate_llm_response
+from llm_prompts import (
+    CHECK_HEADERS_PROMPT,
+    NORMALIZE_HEADERS_PROMPT,
+    CHECK_COLUMN_CONTENT_PROMPT,
+    CHECK_TYPOS_PROMPT,
+    TRANSFORM_STRING_PROMPT,
+    CHECK_LOW_COUNT_VALUES_PROMPT
+)
+
+BATCH_SIZE = 50
+EMPTY_THRESHOLD = 0.5
+
+def print_dataframe_info(df, step=""):
+    num_columns = df.shape[1]
+    num_rows = df.shape[0]
+    num_cells = num_columns * num_rows
+    print(f"{step}Dataframe info:")
+    print(f"  Number of columns: {num_columns}")
+    print(f"  Number of rows: {num_rows}")
+    print(f"  Total number of cells: {num_cells}")
+
+def check_and_normalize_column_headers(df):
+    print("Checking and normalizing column headers...")
+
+    check_prompt = CHECK_HEADERS_PROMPT.format(columns=df.columns.tolist())
+    check_response = generate_llm_response(check_prompt)
+    try:
+        invalid_columns = json.loads(check_response)
+        if invalid_columns:
+            print(f"Columns with invalid names (indices): {invalid_columns}")
+            for idx in invalid_columns:
+                new_name = f"column_{idx}"
+                print(f"Renaming column at index {idx} to '{new_name}'")
+                df.rename(columns={df.columns[idx]: new_name}, inplace=True)
+        else:
+            print("All column headers are valid or no invalid headers detected.")
+    except json.JSONDecodeError:
+        print("Error parsing LLM response for column headers check.")
+
+    normalize_prompt = NORMALIZE_HEADERS_PROMPT.format(columns=df.columns.tolist())
+    normalize_response = generate_llm_response(normalize_prompt)
+    try:
+        normalized_names = json.loads(normalize_response)
+        if normalized_names:
+            df.rename(columns=normalized_names, inplace=True)
+            print("Column names have been normalized.")
+        else:
+            print("No column names were normalized. Proceeding with current names.")
+    except json.JSONDecodeError:
+        print("Error parsing LLM response for column name normalization.")
+
+    # Fallback normalization
+    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
+    print("Applied fallback normalization to ensure valid column names.")
+
+    return df
+
+def process_column_batch(column_data, column_name):
+    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
+    prompt = CHECK_COLUMN_CONTENT_PROMPT.format(column_name=column_name, sample_values=str(sample))
+    response = generate_llm_response(prompt)
+    try:
+        result = json.loads(response)
+        if not all(key in result for key in ['data_type', 'empty_indices', 'invalid_indices']):
+            raise ValueError("Missing required keys in LLM response")
+        return result
+    except (json.JSONDecodeError, ValueError) as e:
+        print(f"Error parsing LLM response for column {column_name}: {str(e)}")
+        print(f"LLM Response: {response}")
+        return {'data_type': 'string', 'empty_indices': [], 'invalid_indices': []}
+
+def check_typos(column_data, column_name):
+    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
+    prompt = CHECK_TYPOS_PROMPT.format(column_name=column_name, sample_values=str(sample))
+    response = generate_llm_response(prompt)
+    try:
+        return json.loads(response)
+    except json.JSONDecodeError:
+        print(f"Error parsing LLM response for typo check in column {column_name}")
+        return {"typos": {}}
+
+def transform_string_column(column_data, column_name):
+    unique_values = column_data.unique().tolist()
+    prompt = TRANSFORM_STRING_PROMPT.format(column_name=column_name, unique_values=unique_values)
+    response = generate_llm_response(prompt)
+    try:
+        result = json.loads(response)
+        return result
+    except json.JSONDecodeError:
+        print(f"Error parsing LLM response for string transformation in column {column_name}")
+        return {}
+
+def check_low_count_values(column_data, column_name):
+    value_counts = column_data.value_counts().to_dict()
+    prompt = CHECK_LOW_COUNT_VALUES_PROMPT.format(column_name=column_name, value_counts=value_counts)
+    response = generate_llm_response(prompt)
+    try:
+        result = json.loads(response)
+        return result
+    except json.JSONDecodeError:
+        print(f"Error parsing LLM response for low count values in column {column_name}")
+        return []
+
+def remove_empty_columns(df, threshold=EMPTY_THRESHOLD):
+    print(f"Removing columns with less than {threshold * 100}% valid data...")
+    valid_threshold = int(df.shape[0] * threshold)
+    df = df.dropna(axis=1, thresh=valid_threshold)
+    return df
+
+def remove_empty_rows(df, threshold=EMPTY_THRESHOLD):
+    print(f"Removing rows with less than {threshold * 100}% valid data...")
+    valid_threshold = int(df.shape[1] * threshold)
+    df = df.dropna(axis=0, thresh=valid_threshold)
+    return df
+
+def remove_low_count_categories(df):
+    print("Removing strings with count below 2...")
+    for col in df.select_dtypes(include=['object']).columns:
+        value_counts = df[col].value_counts()
+        to_remove = value_counts[value_counts < 2].index
+        df[col] = df[col].replace(to_remove, np.nan)
+    return df
+
+def clean_column(df, column_name):
+    print(f"Cleaning column: {column_name}")
+    column_data = df[column_name]
+    total_rows = len(column_data)
+    empty_indices = []
+    invalid_indices = []
+    data_type = "string"
+    nonconforming_cells = 0
+
+    for i in range(0, total_rows, BATCH_SIZE):
+        batch = column_data.iloc[i:i + BATCH_SIZE]
+        result = process_column_batch(batch, column_name)
+
+        valid_empty_indices = [idx for idx in result["empty_indices"] if idx + i < total_rows]
+        valid_invalid_indices = [idx for idx in result["invalid_indices"] if idx + i < total_rows]
+
+        empty_indices.extend([idx + i for idx in valid_empty_indices])
+        invalid_indices.extend([idx + i for idx in valid_invalid_indices])
+
+        if i == 0:  # Use the data type from the first batch
+            data_type = result["data_type"]
+
+    print(f"  Data type determined: {data_type}")
+    print(f"  Empty cells: {len(empty_indices)}")
+    print(f"  Invalid cells: {len(invalid_indices)}")
+
+    # Convert column to determined data type
+    if data_type == "float":
+        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
+    elif data_type == "integer":
+        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
+    elif data_type == "date":
+        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
+    elif data_type == "string" or data_type == "object":
+        # Transform string values
+        transform_result = transform_string_column(column_data, column_name)
+        df[column_name] = df[column_name].map(transform_result).fillna(df[column_name])
+
+        # Handle "nan" strings
+        df[column_name] = df[column_name].replace({"nan": np.nan, "NaN": np.nan, "NAN": np.nan})
+
+        # Check for low count values
+        low_count_values = check_low_count_values(df[column_name], column_name)
+        df.loc[df[column_name].isin(low_count_values), column_name] = np.nan
+
+        # Check for typos
+        typo_result = check_typos(df[column_name], column_name)
+        if typo_result["typos"]:
+            print(f"  Potential typos found: {typo_result['typos']}")
+
+    # Set empty and invalid cells to NaN
+    df.loc[empty_indices + invalid_indices, column_name] = np.nan
+    nonconforming_cells = len(empty_indices) + len(invalid_indices)
+
+    return df, nonconforming_cells
+
+def remove_outliers(df, primary_key_column):
+    print("Removing rows with outliers from numeric/integer/float columns...")
+    rows_to_remove = set()
+    for column in df.select_dtypes(include=[np.number]).columns:
+        if column != primary_key_column:
+            q1 = df[column].quantile(0.25)
+            q3 = df[column].quantile(0.75)
+            iqr = q3 - q1
+            lower_bound = q1 - 1.5 * iqr
+            upper_bound = q3 + 1.5 * iqr
+            outlier_rows = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
+            rows_to_remove.update(outlier_rows)
+
+    initial_rows = len(df)
+    df = df.drop(index=list(rows_to_remove))
+    removed_rows = initial_rows - len(df)
+    print(f"Removed {removed_rows} rows containing outliers.")
+    return df, removed_rows
+
+def calculate_nonconforming_cells(df):
+    nonconforming_cells = {}
+    for column in df.columns:
+        # Count NaN values
+        nan_count = df[column].isna().sum()
+
+        # For numeric columns, count infinite values
+        if np.issubdtype(df[column].dtype, np.number):
+            inf_count = np.isinf(df[column]).sum()
+        else:
+            inf_count = 0
+
+        # For object columns, count empty strings
+        if df[column].dtype == 'object':
+            empty_string_count = (df[column] == '').sum()
+        else:
+            empty_string_count = 0
+
+        nonconforming_cells[column] = nan_count + inf_count + empty_string_count
+
+    return nonconforming_cells
+
+
+def clean_data(df, primary_key_column):
+    start_time = time.time()
+    process_times = {}
+    removed_rows = 0
+    removed_columns = 0
+
+    print("Starting data validation and cleaning...")
+    print_dataframe_info(df, "Initial - ")
+
+    # Calculate nonconforming cells before cleaning
+    nonconforming_cells_before = calculate_nonconforming_cells(df)
+
+    steps = ['Normalize headers', 'Remove empty columns', 'Remove empty rows', 'Remove low count strings', 'Clean columns', 'Remove outliers']
+    total_steps = len(steps) + len(df.columns)  # Add column count for individual column cleaning
+
+    # Step 1: Normalize column headers
+    step_start_time = time.time()
+    df = check_and_normalize_column_headers(df)
+    process_times['Normalize headers'] = time.time() - step_start_time
+    yield 1 / total_steps, "Normalized headers"
+
+    # Step 2: Remove empty columns (less than 50% valid data)
+    step_start_time = time.time()
+    df = remove_empty_columns(df)
+    process_times['Remove empty columns'] = time.time() - step_start_time
+    yield 2 / total_steps, "Removed empty columns"
+
+    # Step 3: Remove empty rows (less than 50% valid data)
+    step_start_time = time.time()
+    df = remove_empty_rows(df)
+    process_times['Remove empty rows'] = time.time() - step_start_time
+    yield 3 / total_steps, "Removed empty rows"
+
+    # Step 4: Remove low count categories
+    step_start_time = time.time()
+    df = remove_low_count_categories(df)
+    process_times['Remove low count strings'] = time.time() - step_start_time
+    yield 4 / total_steps, "Removed low count strings"
+
+    # Step 5: Clean columns (in batches)
+    column_cleaning_times = {}
+    for i, column in enumerate(df.columns):
+        column_start_time = time.time()
+        df, nonconforming = clean_column(df, column)
+        column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
+        yield (5 + i) / total_steps, f"Cleaning column: {column}"
+    process_times.update(column_cleaning_times)
+
+    # Step 6: Remove outliers from numeric columns
+    step_start_time = time.time()
+    df, outlier_rows_removed = remove_outliers(df, primary_key_column)
+    removed_rows += outlier_rows_removed
+    process_times['Remove outliers'] = time.time() - step_start_time
+    yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
+
+    print("Cleaning process completed.")
+    print_dataframe_info(df, "Final - ")
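
Note how app.py and clean.py communicate: clean_data is a generator that yields (fraction, status_string) pairs while it works and ends with (1.0, results_tuple), which the caller detects with isinstance. A minimal sketch of that protocol follows, with a stub pipeline standing in for the real cleaning steps; the step names and placeholder result values are illustrative.

import pandas as pd

def stub_clean_data(df):
    steps = ["Normalize headers", "Remove empty rows"]
    for i, step in enumerate(steps, start=1):
        # ... real cleaning work on df would happen here ...
        yield i / (len(steps) + 1), step   # progress update: (fraction, str)
    yield 1.0, (df, {}, {}, 0, 0)          # terminal yield: (1.0, results tuple)

df = pd.DataFrame({"a": [1, 2, 3]})
for fraction, payload in stub_clean_data(df):
    if isinstance(payload, tuple):   # the results arrive only in the last yield
        cleaned_df = payload[0]
    else:                            # every other yield drives the progress bar
        print(f"{fraction:.0%} - {payload}")

One consequence of this design is that every consumer must type-check each payload; the alternative would be to yield only status updates and return the results, retrieving them from StopIteration.value.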
report.py CHANGED
@@ -8,10 +8,12 @@ from datetime import datetime
 REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
 os.makedirs(REPORT_DIR, exist_ok=True)
 
+
 def save_plot(fig, filename):
     fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
     plt.close(fig)
 
+
 def plot_heatmap(df, title):
     plt.figure(figsize=(12, 8))
     sns.heatmap(df.isnull(), cbar=False, cmap='Reds')
@@ -20,110 +22,103 @@ def plot_heatmap(df, title):
     save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')
 
 
-def plot_valid_data_percentage(original_df, cleaned_df):
-    original_valid = (original_df.notna().sum() / len(original_df)) * 100
-    cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100
-
-    # Combine the data and fill missing values with 0
+def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
+    columns_to_plot = [col for col in original_df.columns if col != primary_key_column]
+    original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
+    cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
+
     combined_data = pd.concat([original_valid, cleaned_valid], axis=1, keys=['Original', 'Cleaned']).fillna(0)
 
     plt.figure(figsize=(15, 8))
 
     x = range(len(combined_data))
     width = 0.35
 
     plt.bar(x, combined_data['Original'], width, label='Before Cleaning', alpha=0.8)
     plt.bar([i + width for i in x], combined_data['Cleaned'], width, label='After Cleaning', alpha=0.8)
 
     plt.xlabel('Columns')
     plt.ylabel('Percentage of Valid Data')
     plt.title('Percentage of Valid Data Before and After Cleaning')
-    plt.xticks([i + width/2 for i in x], combined_data.index, rotation=90)
+    plt.xticks([i + width / 2 for i in x], combined_data.index, rotation=90)
     plt.legend()
 
-    # Add percentage labels on the bars with smaller font size
     for i, v in enumerate(combined_data['Original']):
         plt.text(i, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
     for i, v in enumerate(combined_data['Cleaned']):
         plt.text(i + width, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
 
     plt.tight_layout()
     plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
     plt.close()
 
+
 def plot_column_schemas(df):
     schemas = df.dtypes.astype(str).value_counts()
     fig, ax = plt.subplots(figsize=(10, 6))
 
-    # Generate a color palette with as many colors as there are bars
     colors = plt.cm.rainbow(np.linspace(0, 1, len(schemas)))
 
-    # Plot the bars
     bars = ax.bar(schemas.index, schemas.values, color=colors)
 
     ax.set_title('Column Data Types')
     ax.set_xlabel('Data Type')
     ax.set_ylabel('Count')
 
-    # Add value labels on top of each bar
     for bar in bars:
         height = bar.get_height()
-        ax.text(bar.get_x() + bar.get_width()/2., height,
+        ax.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{height}',
                 ha='center', va='bottom')
 
     save_plot(fig, 'column_schemas.png')
 
+
 def plot_nonconforming_cells(nonconforming_cells):
-    # Ensure that nonconforming_cells is a dictionary
     if isinstance(nonconforming_cells, dict):
-        # Proceed with plotting if it's a dictionary
         fig, ax = plt.subplots(figsize=(12, 6))
 
-        # Generate a color palette with as many colors as there are bars
         colors = plt.cm.rainbow(np.linspace(0, 1, len(nonconforming_cells)))
 
-        # Plot the bars
         bars = ax.bar(list(nonconforming_cells.keys()), list(nonconforming_cells.values()), color=colors)
 
         ax.set_title('Nonconforming Cells by Column')
         ax.set_xlabel('Columns')
         ax.set_ylabel('Number of Nonconforming Cells')
         plt.xticks(rotation=90)
 
-        # Add value labels on top of each bar
         for bar in bars:
             height = bar.get_height()
-            ax.text(bar.get_x() + bar.get_width()/2., height,
+            ax.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{height:,}',
                     ha='center', va='bottom')
 
         save_plot(fig, 'nonconforming_cells.png')
     else:
         print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")
 
 
-
-def plot_column_distributions(original_df, cleaned_df):
-    numeric_columns = original_df.select_dtypes(include=[np.number]).columns
+def plot_column_distributions(original_df, cleaned_df, primary_key_column):
+    numeric_columns = [col for col in original_df.select_dtypes(include=[np.number]).columns if
+                       col != primary_key_column]
     num_columns = len(numeric_columns)
 
     if num_columns == 0:
         print("No numeric columns found for distribution plots.")
         return
 
-    # Create subplots for distributions
     fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
     axes = axes.flatten() if num_columns > 1 else [axes]
 
     for i, column in enumerate(numeric_columns):
         if column in cleaned_df.columns:
-            sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning', alpha=0.5)
-            sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.5)
+            sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning',
+                         alpha=0.5)
+            sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning',
+                         alpha=0.5)
             axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
            axes[i].legend()
 
-    # Remove any unused subplots
     for j in range(i + 1, len(axes)):
         fig.delaxes(axes[j])
 
@@ -131,16 +126,14 @@ def plot_column_distributions(original_df, cleaned_df):
     save_plot(fig, 'distributions_before_after_cleaning.png')
 
 
-def plot_boxplot_with_outliers(df):
-    print("Plotting boxplots with outliers...")
-    numeric_columns = df.select_dtypes(include=[np.number]).columns
+def plot_boxplot_with_outliers(df, primary_key_column):
+    numeric_columns = [col for col in df.select_dtypes(include=[np.number]).columns if col != primary_key_column]
     num_columns = len(numeric_columns)
 
     if num_columns == 0:
         print("No numeric columns found for boxplot.")
         return
 
-    # Create subplots based on the number of numeric columns
     fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
     axes = axes.flatten() if num_columns > 1 else [axes]
 
@@ -148,7 +141,6 @@ def plot_boxplot_with_outliers(df):
         sns.boxplot(x=df[column], ax=axes[i])
         axes[i].set_title(f'Boxplot of {column} with Outliers')
 
-    # Remove any unused subplots
     for j in range(i + 1, len(axes)):
         fig.delaxes(axes[j])
 
@@ -156,52 +148,42 @@ def plot_boxplot_with_outliers(df):
     save_plot(fig, 'boxplots_with_outliers.png')
 
 
-def plot_correlation_heatmap(df):
-    # Select only numeric, float, and integer columns
+def plot_correlation_heatmap(df, primary_key_column):
     numeric_df = df.select_dtypes(include=[np.number])
+    numeric_df = numeric_df.drop(columns=[primary_key_column], errors='ignore')
 
-    # Compute the correlation matrix
     correlation_matrix = numeric_df.corr()
 
-    # Plot the heatmap
     fig, ax = plt.subplots(figsize=(15, 10))
     sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
     ax.set_title('Correlation Heatmap')
     save_plot(fig, 'correlation_heatmap.png')
 
 
-
 def plot_process_times(process_times):
-    # Convert seconds to minutes
     process_times_minutes = {k: v / 60 for k, v in process_times.items()}
 
-    # Separate main processes and column cleaning processes
     main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
     column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}
 
-    # Create the plot
     fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
 
-    # Plot main processes
     bars1 = ax1.bar(main_processes.keys(), main_processes.values())
     ax1.set_title('Main Process Times')
     ax1.set_ylabel('Time (minutes)')
     ax1.tick_params(axis='x', rotation=45)
 
-    # Plot column cleaning processes
     bars2 = ax2.bar(column_processes.keys(), column_processes.values())
     ax2.set_title('Column Cleaning Times')
     ax2.set_ylabel('Time (minutes)')
     ax2.tick_params(axis='x', rotation=90)
 
-    # Add value labels on top of each bar
     for ax, bars in zip([ax1, ax2], [bars1, bars2]):
         for bar in bars:
             height = bar.get_height()
             ax.text(bar.get_x() + bar.get_width() / 2., height,
-                    f'{height:.2f}', ha='center', va='bottom')
+                    f'{height:.4f}', ha='center', va='bottom')
 
-    # Add total time to the plot
     total_time = sum(process_times_minutes.values())
     fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)
 
@@ -209,14 +191,15 @@ def plot_process_times(process_times):
     save_plot(fig, 'process_times.png')
 
 
-def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
+def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns,
+                       removed_rows, primary_key_column):
     os.makedirs(REPORT_DIR, exist_ok=True)
 
     sns.set_style("whitegrid")
     plt.rcParams['figure.dpi'] = 400
 
     print("Plotting valid data percentages...")
-    plot_valid_data_percentage(original_df, cleaned_df)
+    plot_valid_data_percentage(original_df, cleaned_df, primary_key_column)
 
     print("Plotting column schemas...")
     plot_column_schemas(cleaned_df)
@@ -225,7 +208,7 @@ def create_full_report(original_df, cleaned_df, nonconforming_cells_before, proc
     plot_nonconforming_cells(nonconforming_cells_before)
 
     print("Plotting column distributions...")
-    plot_column_distributions(original_df, cleaned_df)
+    plot_column_distributions(original_df, cleaned_df, primary_key_column)
 
     print("Plotting process times...")
     plot_process_times(process_times)
@@ -234,6 +217,9 @@ def create_full_report(original_df, cleaned_df, nonconforming_cells_before, proc
     plot_heatmap(original_df, "Missing Values Before Cleaning")
 
     print("Plotting correlation heatmap...")
-    plot_correlation_heatmap(cleaned_df)
+    plot_correlation_heatmap(cleaned_df, primary_key_column)
+
+    print("Plotting boxplots with outliers...")
+    plot_boxplot_with_outliers(cleaned_df, primary_key_column)
 
     print(f"All visualization reports saved in directory: {REPORT_DIR}")