reab5555 committed on
Commit
9ac48a5
·
verified ·
1 Parent(s): 231c3ea

Upload clean.py

Browse files
Files changed (1) hide show
  1. clean.py +8 -7
clean.py CHANGED
@@ -225,7 +225,7 @@ def calculate_nonconforming_cells(df):
225
  return nonconforming_cells
226
 
227
 
228
- def clean_data(df):
229
  start_time = time.time()
230
  process_times = {}
231
  removed_rows = 0
@@ -267,18 +267,19 @@ def clean_data(df):
267
  # Step 5: Clean columns (in batches)
268
  column_cleaning_times = {}
269
  for i, column in enumerate(df.columns):
270
- column_start_time = time.time()
271
- df, nonconforming = clean_column(df, column)
272
- column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
273
- yield (5 + i) / total_steps, f"Cleaning column: {column}"
 
274
  process_times.update(column_cleaning_times)
275
 
276
  # Step 6: Remove outliers from numeric columns
277
  step_start_time = time.time()
278
- df, outlier_rows_removed = remove_outliers(df)
279
  removed_rows += outlier_rows_removed
280
  process_times['Remove outliers'] = time.time() - step_start_time
281
  yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
282
 
283
  print("Cleaning process completed.")
284
- print_dataframe_info(df, "Final - ")
 
225
  return nonconforming_cells
226
 
227
 
228
+ def clean_data(df, primary_key_column):
229
  start_time = time.time()
230
  process_times = {}
231
  removed_rows = 0
 
267
  # Step 5: Clean columns (in batches)
268
  column_cleaning_times = {}
269
  for i, column in enumerate(df.columns):
270
+ if column != primary_key_column:
271
+ column_start_time = time.time()
272
+ df, nonconforming = clean_column(df, column)
273
+ column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
274
+ yield (5 + i) / total_steps, f"Cleaning column: {column}"
275
  process_times.update(column_cleaning_times)
276
 
277
  # Step 6: Remove outliers from numeric columns
278
  step_start_time = time.time()
279
+ df, outlier_rows_removed = remove_outliers(df, primary_key_column)
280
  removed_rows += outlier_rows_removed
281
  process_times['Remove outliers'] = time.time() - step_start_time
282
  yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
283
 
284
  print("Cleaning process completed.")
285
+ print_dataframe_info(df, "Final - ")