Spaces:
Running
Running
Upload report.py
Browse files
report.py
CHANGED
@@ -23,7 +23,9 @@ def plot_heatmap(df, title):
|
|
23 |
|
24 |
|
25 |
def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
|
26 |
-
|
|
|
|
|
27 |
original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
|
28 |
cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
|
29 |
|
@@ -52,6 +54,13 @@ def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
|
|
52 |
plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
|
53 |
plt.close()
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
def plot_column_schemas(df):
|
57 |
schemas = df.dtypes.astype(str).value_counts()
|
|
|
23 |
|
24 |
|
25 |
def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
|
26 |
+
# Get columns present in both DataFrames, excluding the primary key
|
27 |
+
columns_to_plot = [col for col in original_df.columns if col in cleaned_df.columns and col != primary_key_column]
|
28 |
+
|
29 |
original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
|
30 |
cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
|
31 |
|
|
|
54 |
plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
|
55 |
plt.close()
|
56 |
|
57 |
+
# Print information about removed columns
|
58 |
+
removed_columns = set(original_df.columns) - set(cleaned_df.columns)
|
59 |
+
if removed_columns:
|
60 |
+
print(f"The following columns were removed during the cleaning process: {', '.join(removed_columns)}")
|
61 |
+
else:
|
62 |
+
print("No columns were removed during the cleaning process.")
|
63 |
+
|
64 |
|
65 |
def plot_column_schemas(df):
|
66 |
schemas = df.dtypes.astype(str).value_counts()
|