reab5555 committed on
Commit
c1015ec
1 Parent(s): e9e721e

Upload report.py

Browse files
Files changed (1) hide show
  1. report.py +10 -1
report.py CHANGED
@@ -23,7 +23,9 @@ def plot_heatmap(df, title):
23
 
24
 
25
  def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
26
- columns_to_plot = [col for col in original_df.columns if col != primary_key_column]
 
 
27
  original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
28
  cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
29
 
@@ -52,6 +54,13 @@ def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
52
  plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
53
  plt.close()
54
 
 
 
 
 
 
 
 
55
 
56
  def plot_column_schemas(df):
57
  schemas = df.dtypes.astype(str).value_counts()
 
23
 
24
 
25
  def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
26
+ # Get columns present in both DataFrames, excluding the primary key
27
+ columns_to_plot = [col for col in original_df.columns if col in cleaned_df.columns and col != primary_key_column]
28
+
29
  original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
30
  cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
31
 
 
54
  plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
55
  plt.close()
56
 
57
+ # Print information about removed columns
58
+ removed_columns = set(original_df.columns) - set(cleaned_df.columns)
59
+ if removed_columns:
60
+ print(f"The following columns were removed during the cleaning process: {', '.join(removed_columns)}")
61
+ else:
62
+ print("No columns were removed during the cleaning process.")
63
+
64
 
65
  def plot_column_schemas(df):
66
  schemas = df.dtypes.astype(str).value_counts()