reab5555 committed on
Commit
e9e721e
1 Parent(s): 9ac48a5

Upload clean.py

Browse files
Files changed (1) hide show
  1. clean.py +7 -5
clean.py CHANGED
@@ -155,11 +155,11 @@ def clean_column(df, column_name):
155
 
156
  # Convert column to determined data type
157
  if data_type == "float":
158
- df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
159
  elif data_type == "integer":
160
- df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
161
  elif data_type == "date":
162
- df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
163
  elif data_type == "string" or data_type == "object":
164
  # Transform string values
165
  transform_result = transform_string_column(column_data, column_name)
@@ -178,8 +178,10 @@ def clean_column(df, column_name):
178
  print(f" Potential typos found: {typo_result['typos']}")
179
 
180
  # Set empty and invalid cells to NaN
181
- df.loc[empty_indices + invalid_indices, column_name] = np.nan
182
- nonconforming_cells = len(empty_indices) + len(invalid_indices)
 
 
183
 
184
  return df, nonconforming_cells
185
 
 
155
 
156
  # Convert column to determined data type
157
  if data_type == "float":
158
+ df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
159
  elif data_type == "integer":
160
+ df[column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
161
  elif data_type == "date":
162
+ df[column_name] = pd.to_datetime(df[column_name], errors='coerce', dayfirst=True)
163
  elif data_type == "string" or data_type == "object":
164
  # Transform string values
165
  transform_result = transform_string_column(column_data, column_name)
 
178
  print(f" Potential typos found: {typo_result['typos']}")
179
 
180
  # Set empty and invalid cells to NaN
181
+ indices_to_set_nan = set(empty_indices + invalid_indices)
182
+ existing_indices = df.index.intersection(indices_to_set_nan)
183
+ df.loc[existing_indices, column_name] = np.nan
184
+ nonconforming_cells = len(existing_indices)
185
 
186
  return df, nonconforming_cells
187