reab5555 committed
Commit 1853d90 (parent: 654bb84)

Upload 5 files

Files changed (5):
  1. app.py +93 -0
  2. clean.py +295 -0
  3. llm_prompts.py +123 -0
  4. manage_schema.py +51 -0
  5. report.py +208 -0
app.py ADDED
@@ -0,0 +1,93 @@
import os
import tempfile

import gradio as gr
import pandas as pd

from clean import clean_data
from report import create_full_report, REPORT_DIR


def clean_and_visualize(file, progress=gr.Progress()):
    # Load the data
    df = pd.read_csv(file.name)

    # Results filled in by the final yield of the clean_data generator
    cleaned_df = None
    nonconforming_cells_before = None
    process_times = None
    removed_columns = None
    removed_rows = None

    # clean_data yields (progress_value, status_text) pairs; the last yield
    # carries a tuple with the cleaned DataFrame and the collected statistics.
    for progress_value, status_text in clean_data(df):
        if isinstance(status_text, tuple):
            cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows = status_text
            progress(progress_value, desc="Cleaning completed")
        else:
            progress(progress_value, desc=status_text)

    # Generate the full visualization report
    create_full_report(
        df,
        cleaned_df,
        nonconforming_cells_before,
        process_times,
        removed_columns,
        removed_rows
    )

    # Save the cleaned DataFrame to a temporary CSV file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        cleaned_df.to_csv(tmp_file.name, index=False)
        cleaned_csv_path = tmp_file.name

    # Collect all generated images
    image_files = [os.path.join(REPORT_DIR, f) for f in os.listdir(REPORT_DIR) if f.endswith('.png')]

    return cleaned_csv_path, image_files


def launch_app():
    with gr.Blocks() as app:
        gr.Markdown("# Data Cleaning and Visualization App")

        with gr.Row():
            file_input = gr.File(label="Upload CSV File")

        with gr.Row():
            clean_button = gr.Button("Start Cleaning")

        with gr.Row():
            download_button = gr.Button("Download Cleaned CSV", visible=False)
            cleaned_file_output = gr.File(label="Cleaned CSV", visible=False)

        with gr.Row():
            output_gallery = gr.Gallery(label="Visualization Results", show_label=True, elem_id="gallery",
                                        columns=[2], rows=[2], object_fit="contain", height="auto")

        # gr.Progress is not a layout component; it is injected by Gradio when
        # declared as a default argument of the event handler.
        def process_and_show_download(file, progress=gr.Progress()):
            cleaned_csv_path, image_files = clean_and_visualize(file, progress=progress)
            return (
                gr.Button.update(visible=True),                        # Gradio 3.x update API
                gr.File.update(value=cleaned_csv_path, visible=True),  # (gr.update(...) in Gradio 4+)
                image_files
            )

        clean_button.click(
            fn=process_and_show_download,
            inputs=file_input,
            outputs=[download_button, cleaned_file_output, output_gallery]
        )

        def trigger_download():
            return gr.File.update(visible=True)

        download_button.click(
            fn=trigger_download,
            inputs=[],
            outputs=[cleaned_file_output]
        )

    app.launch()


if __name__ == "__main__":
    launch_app()
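
The upload does not include a dependency file. A minimal requirements.txt sketch for running app.py, assuming Gradio 3.x (matching the gr.Button.update / gr.File.update calls above) plus whatever client library llm_config.py relies on, could look like the following; the names and pins are illustrative, not part of this commit:

# requirements.txt (hypothetical)
gradio>=3.40,<4
pandas
numpy
seaborn
matplotlib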
clean.py ADDED
@@ -0,0 +1,295 @@
import pandas as pd
import numpy as np
import json
import time
from llm_config import generate_llm_response
from llm_prompts import (
    CHECK_HEADERS_PROMPT,
    NORMALIZE_HEADERS_PROMPT,
    CHECK_COLUMN_CONTENT_PROMPT,
    CHECK_TYPOS_PROMPT,
    TRANSFORM_STRING_PROMPT,
    CHECK_LOW_COUNT_VALUES_PROMPT
)

BATCH_SIZE = 50
EMPTY_THRESHOLD = 0.5


def print_dataframe_info(df, step=""):
    num_columns = df.shape[1]
    num_rows = df.shape[0]
    num_cells = num_columns * num_rows
    print(f"{step}Dataframe info:")
    print(f"  Number of columns: {num_columns}")
    print(f"  Number of rows: {num_rows}")
    print(f"  Total number of cells: {num_cells}")


def check_and_normalize_column_headers(df):
    print("Checking and normalizing column headers...")

    check_prompt = CHECK_HEADERS_PROMPT.format(columns=df.columns.tolist())
    check_response = generate_llm_response(check_prompt)
    try:
        invalid_columns = json.loads(check_response)
        if invalid_columns:
            print(f"Columns with invalid names (indices): {invalid_columns}")
            for idx in invalid_columns:
                new_name = f"column_{idx}"
                print(f"Renaming column at index {idx} to '{new_name}'")
                df.rename(columns={df.columns[idx]: new_name}, inplace=True)
        else:
            print("All column headers are valid or no invalid headers detected.")
    except json.JSONDecodeError:
        print("Error parsing LLM response for column headers check.")

    normalize_prompt = NORMALIZE_HEADERS_PROMPT.format(columns=df.columns.tolist())
    normalize_response = generate_llm_response(normalize_prompt)
    try:
        normalized_names = json.loads(normalize_response)
        if normalized_names:
            df.rename(columns=normalized_names, inplace=True)
            print("Column names have been normalized.")
        else:
            print("No column names were normalized. Proceeding with current names.")
    except json.JSONDecodeError:
        print("Error parsing LLM response for column name normalization.")

    # Fallback normalization: lowercase and underscores regardless of the LLM result
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
    print("Applied fallback normalization to ensure valid column names.")

    return df


def process_column_batch(column_data, column_name):
    # Keep the batch in its original order so that the indices returned by the
    # LLM correspond to positions within this batch.
    sample = column_data.iloc[:min(BATCH_SIZE, len(column_data))].tolist()
    prompt = CHECK_COLUMN_CONTENT_PROMPT.format(column_name=column_name, sample_values=str(sample))
    response = generate_llm_response(prompt)
    try:
        result = json.loads(response)
        if not all(key in result for key in ['data_type', 'empty_indices', 'invalid_indices']):
            raise ValueError("Missing required keys in LLM response")
        return result
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Error parsing LLM response for column {column_name}: {str(e)}")
        print(f"LLM Response: {response}")
        return {'data_type': 'string', 'empty_indices': [], 'invalid_indices': []}


def check_typos(column_data, column_name):
    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
    prompt = CHECK_TYPOS_PROMPT.format(column_name=column_name, sample_values=str(sample))
    response = generate_llm_response(prompt)
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print(f"Error parsing LLM response for typo check in column {column_name}")
        return {"typos": {}}


def transform_string_column(column_data, column_name):
    unique_values = column_data.unique().tolist()
    prompt = TRANSFORM_STRING_PROMPT.format(column_name=column_name, unique_values=unique_values)
    response = generate_llm_response(prompt)
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print(f"Error parsing LLM response for string transformation in column {column_name}")
        return {}


def check_low_count_values(column_data, column_name):
    value_counts = column_data.value_counts().to_dict()
    prompt = CHECK_LOW_COUNT_VALUES_PROMPT.format(column_name=column_name, value_counts=value_counts)
    response = generate_llm_response(prompt)
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print(f"Error parsing LLM response for low count values in column {column_name}")
        return []


def remove_empty_columns(df, threshold=EMPTY_THRESHOLD):
    print(f"Removing columns with less than {threshold * 100}% valid data...")
    valid_threshold = int(df.shape[0] * threshold)
    return df.dropna(axis=1, thresh=valid_threshold)


def remove_empty_rows(df, threshold=EMPTY_THRESHOLD):
    print(f"Removing rows with less than {threshold * 100}% valid data...")
    valid_threshold = int(df.shape[1] * threshold)
    return df.dropna(axis=0, thresh=valid_threshold)


def remove_low_count_categories(df):
    print("Removing strings with count below 2...")
    for col in df.select_dtypes(include=['object']).columns:
        value_counts = df[col].value_counts()
        to_remove = value_counts[value_counts < 2].index
        df[col] = df[col].replace(to_remove, np.nan)
    return df


def clean_column(df, column_name):
    print(f"Cleaning column: {column_name}")
    column_data = df[column_name]
    total_rows = len(column_data)
    empty_indices = []
    invalid_indices = []
    data_type = "string"

    for i in range(0, total_rows, BATCH_SIZE):
        batch = column_data.iloc[i:i + BATCH_SIZE]
        result = process_column_batch(batch, column_name)

        # The LLM returns positions within the batch; offset them by i and drop
        # anything that would fall outside the column.
        valid_empty_indices = [idx for idx in result["empty_indices"] if idx + i < total_rows]
        valid_invalid_indices = [idx for idx in result["invalid_indices"] if idx + i < total_rows]

        empty_indices.extend([idx + i for idx in valid_empty_indices])
        invalid_indices.extend([idx + i for idx in valid_invalid_indices])

        if i == 0:  # Use the data type from the first batch
            data_type = result["data_type"]

    print(f"  Data type determined: {data_type}")
    print(f"  Empty cells: {len(empty_indices)}")
    print(f"  Invalid cells: {len(invalid_indices)}")

    # Convert the column to the determined data type
    if data_type == "float":
        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
    elif data_type == "integer":
        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
    elif data_type == "date":
        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
    elif data_type in ("string", "object"):
        # Transform string values (lowercasing, "nan" variants)
        transform_result = transform_string_column(column_data, column_name)
        df[column_name] = df[column_name].map(transform_result).fillna(df[column_name])

        # Handle "nan" strings
        df[column_name] = df[column_name].replace({"nan": np.nan, "NaN": np.nan, "NAN": np.nan})

        # Check for low count values
        low_count_values = check_low_count_values(df[column_name], column_name)
        df.loc[df[column_name].isin(low_count_values), column_name] = np.nan

        # Check for typos
        typo_result = check_typos(df[column_name], column_name)
        if typo_result["typos"]:
            print(f"  Potential typos found: {typo_result['typos']}")

    # Set empty and invalid cells to NaN; the collected indices are positional,
    # so map them to index labels before using .loc
    positions = empty_indices + invalid_indices
    if positions:
        df.loc[df.index[positions], column_name] = np.nan
    nonconforming_cells = len(empty_indices) + len(invalid_indices)

    return df, nonconforming_cells


def remove_outliers(df):
    print("Removing rows with outliers from numeric/integer/float columns...")
    rows_to_remove = set()
    for column in df.select_dtypes(include=[np.number]).columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_rows = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
        rows_to_remove.update(outlier_rows)

    initial_rows = len(df)
    df = df.drop(index=list(rows_to_remove))
    removed_rows = initial_rows - len(df)
    print(f"Removed {removed_rows} rows containing outliers.")
    return df, removed_rows


def calculate_nonconforming_cells(df):
    nonconforming_cells = {}
    for column in df.columns:
        # Count NaN values
        nan_count = df[column].isna().sum()

        # For numeric columns, count infinite values
        if np.issubdtype(df[column].dtype, np.number):
            inf_count = np.isinf(df[column]).sum()
        else:
            inf_count = 0

        # For object columns, count empty strings
        if df[column].dtype == 'object':
            empty_string_count = (df[column] == '').sum()
        else:
            empty_string_count = 0

        nonconforming_cells[column] = nan_count + inf_count + empty_string_count

    return nonconforming_cells


def clean_data(df):
    start_time = time.time()
    process_times = {}
    removed_rows = 0
    removed_columns = 0

    print("Starting data validation and cleaning...")
    print_dataframe_info(df, "Initial - ")

    # Calculate nonconforming cells before cleaning
    nonconforming_cells_before = calculate_nonconforming_cells(df)

    steps = ['Normalize headers', 'Remove empty columns', 'Remove empty rows',
             'Remove low count strings', 'Clean columns', 'Remove outliers']
    total_steps = len(steps) + len(df.columns)  # Add column count for individual column cleaning

    # Step 1: Normalize column headers
    step_start_time = time.time()
    df = check_and_normalize_column_headers(df)
    process_times['Normalize headers'] = time.time() - step_start_time
    yield 1 / total_steps, "Normalized headers"

    # Step 2: Remove columns with less than EMPTY_THRESHOLD (50%) valid data
    step_start_time = time.time()
    columns_before = df.shape[1]
    df = remove_empty_columns(df)
    removed_columns += columns_before - df.shape[1]
    process_times['Remove empty columns'] = time.time() - step_start_time
    yield 2 / total_steps, "Removed empty columns"

    # Step 3: Remove rows with less than EMPTY_THRESHOLD (50%) valid data
    step_start_time = time.time()
    rows_before = df.shape[0]
    df = remove_empty_rows(df)
    removed_rows += rows_before - df.shape[0]
    process_times['Remove empty rows'] = time.time() - step_start_time
    yield 3 / total_steps, "Removed empty rows"

    # Step 4: Remove low count categories
    step_start_time = time.time()
    df = remove_low_count_categories(df)
    process_times['Remove low count strings'] = time.time() - step_start_time
    yield 4 / total_steps, "Removed low count strings"

    # Step 5: Clean columns one by one (in batches of BATCH_SIZE rows)
    column_cleaning_times = {}
    for i, column in enumerate(df.columns):
        column_start_time = time.time()
        df, nonconforming = clean_column(df, column)
        column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
        yield (5 + i) / total_steps, f"Cleaning column: {column}"
    process_times.update(column_cleaning_times)

    # Step 6: Remove outliers from numeric columns
    step_start_time = time.time()
    df, outlier_rows_removed = remove_outliers(df)
    removed_rows += outlier_rows_removed
    process_times['Remove outliers'] = time.time() - step_start_time

    # Final yield carries the cleaned DataFrame and the collected statistics
    yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)

    print("Cleaning process completed.")
    print_dataframe_info(df, "Final - ")
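
clean.py and manage_schema.py both import generate_llm_response from llm_config, which is not part of this upload. The sketch below shows one plausible shape for that module, assuming an OpenAI-compatible chat backend; the actual llm_config.py may use a different provider, model, or signature:

# llm_config.py (hypothetical sketch, not part of this commit)
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def generate_llm_response(prompt: str, model: str = "gpt-4o-mini") -> str:
    # Callers pass the formatted templates from llm_prompts.py and parse the
    # reply with json.loads, so this returns the raw reply text.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content.strip()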
llm_prompts.py ADDED
@@ -0,0 +1,123 @@
CHECK_HEADERS_PROMPT = """
Analyze the following DataFrame columns and identify any columns without names or with invalid names.
Return only a JSON list of column indices (0-based) that need attention, without any explanation.
Columns: {columns}
"""

NORMALIZE_HEADERS_PROMPT = """
Analyze the following DataFrame column names and normalize them according to these rules:
1. Convert to lowercase
2. Replace empty strings or spaces with underscores
3. Remove any invalid characters (keep only letters, numbers, and underscores)

Return only a JSON object where keys are the original column names and values are the normalized names, without any explanation.
Column names: {columns}
"""

CHECK_COLUMN_CONTENT_PROMPT = """
Analyze the following sample of values from the column '{column_name}' and determine:
1. The most appropriate data type (float, integer, string, or date)
2. Indices of empty or blank values
3. Indices of values that don't conform to the determined data type

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "data_type": "detected_type",
    "empty_indices": [list of indices of empty or blank values],
    "invalid_indices": [list of indices of values that don't conform to the detected type]
}}
"""

CHECK_TYPOS_PROMPT = """
Analyze the following sample of values from the column '{column_name}' and identify any potential typos or misspellings.
For each identified typo, suggest a correction.

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "typos": {{
        "original_value1": "corrected_value1",
        "original_value2": "corrected_value2",
        ...
    }}
}}

If no typos are found, return an empty object for "typos".
"""

ENCODE_STRING_PROMPT = """
Analyze the following unique values from the column '{column_name}' and create an encoding scheme.
Assign a unique integer to each unique string value, starting from 0.

Unique values:
{unique_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "string_value1": 0,
    "string_value2": 1,
    "string_value3": 2,
    ...
}}

Ensure that each unique string value is assigned a unique integer.
"""

DETERMINE_DTYPE_PROMPT = """
Analyze the following sample values from a column and determine the most appropriate data type.
Possible types are: float, integer, string, or date.
If more than 80% of the values conform to a specific type, choose that type.
Otherwise, default to string.

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "column_type": "detected_type",
    "invalid_indices": [list of indices that do not conform to the detected type]
}}
"""

TRANSFORM_STRING_PROMPT = """
Transform the following unique string values from the column '{column_name}' to lowercase.
If a value is a variation of "nan" (case-insensitive), map it to "nan".

Unique values:
{unique_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "original_value1": "transformed_value1",
    "original_value2": "transformed_value2",
    ...
}}
"""

CHECK_LOW_COUNT_VALUES_PROMPT = """
Analyze the following value counts from the column '{column_name}' and identify values with a count lower than 2.

Value counts:
{value_counts}

Return only a JSON list of values that have a count lower than 2, without any explanation.
"""

CHECK_SCHEMA_CONFORMITY_PROMPT = """
Analyze the following sample of values from the column '{column_name}' and check if they conform to the determined data type '{data_type}'.

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "conforming_indices": [list of indices of values that conform to the data type],
    "nonconforming_indices": [list of indices of values that do not conform to the data type]
}}
"""
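
Every template above is filled with str.format and the reply is parsed with json.loads, which is why each prompt insists on JSON-only output. A small round-trip illustration for CHECK_COLUMN_CONTENT_PROMPT (the reply string below is a made-up example of the expected shape, not real model output):

import json
from llm_prompts import CHECK_COLUMN_CONTENT_PROMPT

sample_values = ["23.5", "", "41.0", "abc", "17.2"]
prompt = CHECK_COLUMN_CONTENT_PROMPT.format(column_name="temperature",
                                            sample_values=str(sample_values))

reply = '{"data_type": "float", "empty_indices": [1], "invalid_indices": [3]}'
result = json.loads(reply)
assert set(result) == {"data_type", "empty_indices", "invalid_indices"}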
manage_schema.py ADDED
@@ -0,0 +1,51 @@
import pandas as pd
import numpy as np
import json
from llm_config import generate_llm_response
from llm_prompts import DETERMINE_DTYPE_PROMPT

SAMPLE_SIZE = 200


def determine_column_type(df, column):
    # Keep the sampled rows so the indices returned by the LLM (positions
    # within the sample) can be mapped back to DataFrame index labels.
    sample = df[column].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    prompt = DETERMINE_DTYPE_PROMPT.format(sample_values=str(sample.tolist()))
    response = generate_llm_response(prompt)

    try:
        result = json.loads(response)
        invalid_labels = [sample.index[i] for i in result['invalid_indices'] if i < len(sample)]
        return result['column_type'], invalid_labels
    except (json.JSONDecodeError, KeyError):
        print(f"Error parsing LLM response for column {column}")
        return 'string', []


def enforce_column_type(df, column, column_type, invalid_indices):
    if column_type == 'float':
        df[column] = pd.to_numeric(df[column], errors='coerce')
    elif column_type == 'integer':
        df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
    elif column_type == 'date':
        df[column] = pd.to_datetime(df[column], errors='coerce')

    # Set invalid values (index labels returned by determine_column_type) to NaN
    df.loc[invalid_indices, column] = np.nan

    return df


def process_dataframe(df):
    print("Determining and enforcing column data types...")

    for column in df.columns:
        print(f"\nProcessing column: {column}")
        column_type, invalid_indices = determine_column_type(df, column)
        print(f"  Detected type: {column_type}")
        print(f"  Number of invalid values: {len(invalid_indices)}")

        df = enforce_column_type(df, column, column_type, invalid_indices)

        valid_percentage = (df[column].count() / len(df)) * 100
        print(f"  Percentage of valid values after type enforcement: {valid_percentage:.2f}%")

    return df
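
manage_schema.py is not wired into app.py in this commit, but process_dataframe can be exercised on its own. A minimal usage sketch with made-up data (it still needs a working llm_config backend):

import pandas as pd
from manage_schema import process_dataframe

df = pd.DataFrame({
    "age": ["34", "41", "not available", "29"],
    "signup_date": ["2021-01-05", "2021-02-11", "bad date", "2021-03-02"],
})
typed_df = process_dataframe(df)
print(typed_df.dtypes)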
report.py ADDED
@@ -0,0 +1,208 @@
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# Each run writes its plots into a fresh timestamped directory
REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(REPORT_DIR, exist_ok=True)


def save_plot(fig, filename):
    fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
    plt.close(fig)


def plot_heatmap(df, title):
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title(title)
    plt.tight_layout()
    save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')


def plot_valid_data_percentage(original_df, cleaned_df):
    original_valid = (original_df.notna().sum() / len(original_df)) * 100
    cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100

    # Create a DataFrame with both original and cleaned percentages
    combined_df = pd.DataFrame({
        'Original': original_valid,
        'Cleaned': cleaned_valid
    }).fillna(0)  # Fill NaN with 0 for columns that were removed

    # Plot onto an explicit axis so the bars land in the figure that gets saved
    fig, ax = plt.subplots(figsize=(15, 8))
    combined_df.plot(kind='bar', width=0.8, alpha=0.8, ax=ax)

    ax.set_xlabel('Columns')
    ax.set_ylabel('Percentage of Valid Data')
    ax.set_title('Percentage of Valid Data Before and After Cleaning')
    ax.tick_params(axis='x', rotation=90)
    ax.legend(['Before Cleaning', 'After Cleaning'])

    # Add percentage labels on the bars
    for i, (index, row) in enumerate(combined_df.iterrows()):
        ax.text(i, row['Original'], f'{row["Original"]:.1f}%', ha='center', va='bottom')
        if row['Cleaned'] > 0:  # Only label columns that still exist after cleaning
            ax.text(i, row['Cleaned'], f'{row["Cleaned"]:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    save_plot(fig, 'valid_data_percentage.png')


def plot_column_schemas(df):
    schemas = df.dtypes.astype(str).value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=schemas.index, y=schemas.values, ax=ax)
    ax.set_title('Column Data Types')
    ax.set_xlabel('Data Type')
    ax.set_ylabel('Count')
    save_plot(fig, 'column_schemas.png')


def plot_nonconforming_cells(nonconforming_cells):
    # Ensure that nonconforming_cells is a dictionary before plotting
    if isinstance(nonconforming_cells, dict):
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x=list(nonconforming_cells.keys()), y=list(nonconforming_cells.values()), ax=ax)
        ax.set_title('Nonconforming Cells by Column')
        ax.set_xlabel('Columns')
        ax.set_ylabel('Number of Nonconforming Cells')
        plt.xticks(rotation=90)
        save_plot(fig, 'nonconforming_cells.png')
    else:
        print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")


def plot_column_distributions(original_df, cleaned_df):
    numeric_columns = original_df.select_dtypes(include=[np.number]).columns
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found for distribution plots.")
        return

    # Create subplots for distributions (3 per row); flatten works for any grid shape
    fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
    axes = np.array(axes).flatten()

    for i, column in enumerate(numeric_columns):
        if column in cleaned_df.columns:
            sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning', alpha=0.5)
            sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.5)
            axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
            axes[i].legend()

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'distributions_before_after_cleaning.png')


def plot_boxplot_with_outliers(df):
    print("Plotting boxplots with outliers...")
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found for boxplot.")
        return

    # Create subplots based on the number of numeric columns
    fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
    axes = np.array(axes).flatten()

    for i, column in enumerate(numeric_columns):
        sns.boxplot(x=df[column], ax=axes[i])
        axes[i].set_title(f'Boxplot of {column} with Outliers')

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'boxplots_with_outliers.png')


def plot_correlation_heatmap(df):
    # Select only numeric (float/integer) columns
    numeric_df = df.select_dtypes(include=[np.number])

    # Compute the correlation matrix
    correlation_matrix = numeric_df.corr()

    # Plot the heatmap
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
    ax.set_title('Correlation Heatmap')
    save_plot(fig, 'correlation_heatmap.png')


def plot_process_times(process_times):
    # Convert seconds to minutes
    process_times_minutes = {k: v / 60 for k, v in process_times.items()}

    # Separate main processes from per-column cleaning processes
    main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
    column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}

    # Create the plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # Plot main processes
    bars1 = ax1.bar(main_processes.keys(), main_processes.values())
    ax1.set_title('Main Process Times')
    ax1.set_ylabel('Time (minutes)')
    ax1.tick_params(axis='x', rotation=45)

    # Plot column cleaning processes
    bars2 = ax2.bar(column_processes.keys(), column_processes.values())
    ax2.set_title('Column Cleaning Times')
    ax2.set_ylabel('Time (minutes)')
    ax2.tick_params(axis='x', rotation=90)

    # Add value labels on top of each bar
    for ax, bars in zip([ax1, ax2], [bars1, bars2]):
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{height:.2f}', ha='center', va='bottom')

    # Add total time to the plot
    total_time = sum(process_times_minutes.values())
    fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)

    plt.tight_layout()
    save_plot(fig, 'process_times.png')


def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
    # removed_columns and removed_rows are accepted for completeness but are not
    # visualized by the current set of plots.
    os.makedirs(REPORT_DIR, exist_ok=True)

    sns.set_style("whitegrid")
    plt.rcParams['figure.dpi'] = 400

    print("Plotting valid data percentages...")
    plot_valid_data_percentage(original_df, cleaned_df)

    print("Plotting column schemas...")
    plot_column_schemas(cleaned_df)

    print("Plotting nonconforming cells before cleaning...")
    plot_nonconforming_cells(nonconforming_cells_before)

    print("Plotting column distributions...")
    plot_column_distributions(original_df, cleaned_df)

    print("Plotting process times...")
    plot_process_times(process_times)

    print("Plotting heatmaps...")
    plot_heatmap(original_df, "Missing Values Before Cleaning")

    print("Plotting correlation heatmap...")
    plot_correlation_heatmap(cleaned_df)

    print(f"All visualization reports saved in directory: {REPORT_DIR}")
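
REPORT_DIR is created at import time, so every process gets its own timestamped output directory. A minimal smoke test for the report module with toy data (illustrative, not part of the commit):

import pandas as pd
from report import create_full_report, REPORT_DIR

original = pd.DataFrame({"a": [1.0, 2.0, None, 4.0], "b": ["x", "y", "", "z"]})
cleaned = original.dropna()
create_full_report(
    original_df=original,
    cleaned_df=cleaned,
    nonconforming_cells_before={"a": 1, "b": 1},
    process_times={"Normalize headers": 0.5, "Clean column: a": 1.2},
    removed_columns=0,
    removed_rows=1,
)
print("Plots written to", REPORT_DIR)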