Spaces:
Sleeping
Sleeping
Upload clean.py
Browse files
clean.py
CHANGED
@@ -225,7 +225,7 @@ def calculate_nonconforming_cells(df):
|
|
225 |
return nonconforming_cells
|
226 |
|
227 |
|
228 |
-
def clean_data(df):
|
229 |
start_time = time.time()
|
230 |
process_times = {}
|
231 |
removed_rows = 0
|
@@ -267,18 +267,19 @@ def clean_data(df):
|
|
267 |
# Step 5: Clean columns (in batches)
|
268 |
column_cleaning_times = {}
|
269 |
for i, column in enumerate(df.columns):
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
|
|
274 |
process_times.update(column_cleaning_times)
|
275 |
|
276 |
# Step 6: Remove outliers from numeric columns
|
277 |
step_start_time = time.time()
|
278 |
-
df, outlier_rows_removed = remove_outliers(df)
|
279 |
removed_rows += outlier_rows_removed
|
280 |
process_times['Remove outliers'] = time.time() - step_start_time
|
281 |
yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
|
282 |
|
283 |
print("Cleaning process completed.")
|
284 |
-
print_dataframe_info(df, "Final - ")
|
|
|
225 |
return nonconforming_cells
|
226 |
|
227 |
|
228 |
+
def clean_data(df, primary_key_column):
|
229 |
start_time = time.time()
|
230 |
process_times = {}
|
231 |
removed_rows = 0
|
|
|
267 |
# Step 5: Clean columns (in batches)
|
268 |
column_cleaning_times = {}
|
269 |
for i, column in enumerate(df.columns):
|
270 |
+
if column != primary_key_column:
|
271 |
+
column_start_time = time.time()
|
272 |
+
df, nonconforming = clean_column(df, column)
|
273 |
+
column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
|
274 |
+
yield (5 + i) / total_steps, f"Cleaning column: {column}"
|
275 |
process_times.update(column_cleaning_times)
|
276 |
|
277 |
# Step 6: Remove outliers from numeric columns
|
278 |
step_start_time = time.time()
|
279 |
+
df, outlier_rows_removed = remove_outliers(df, primary_key_column)
|
280 |
removed_rows += outlier_rows_removed
|
281 |
process_times['Remove outliers'] = time.time() - step_start_time
|
282 |
yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
|
283 |
|
284 |
print("Cleaning process completed.")
|
285 |
+
print_dataframe_info(df, "Final - ")
|