reab5555 committed
Commit ec38d9f
1 Parent(s): 6d52bd3

Upload 3 files

Files changed (3):
  1. app.py +48 -27
  2. clean.py +284 -280
  3. report.py +48 -62
app.py CHANGED
@@ -5,24 +5,28 @@ from report import create_full_report, REPORT_DIR
 import os
 import tempfile
 
-def clean_and_visualize(file, progress=gr.Progress()):
+
+def clean_and_visualize(file, primary_key_column, progress=gr.Progress()):
     # Load the data
     df = pd.read_csv(file.name)
 
+    # Remove duplicates from the primary key column
+    df = df.drop_duplicates(subset=[primary_key_column], keep='first')
+
     # Clean the data
     cleaned_df = None
     nonconforming_cells_before = None
     process_times = None
     removed_columns = None
     removed_rows = None
 
-    for progress_value, status_text in clean_data(df):
+    for progress_value, status_text in clean_data(df, primary_key_column):
         if isinstance(status_text, tuple):
             cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows = status_text
             progress(progress_value, desc="Cleaning completed")
         else:
             progress(progress_value, desc=status_text)
 
     # Generate full visualization report
     create_full_report(
         df,
@@ -30,61 +34,78 @@ def clean_and_visualize(file, progress=gr.Progress()):
         nonconforming_cells_before,
         process_times,
         removed_columns,
-        removed_rows
+        removed_rows,
+        primary_key_column
     )
 
     # Save cleaned DataFrame to a temporary CSV file
     with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
         cleaned_df.to_csv(tmp_file.name, index=False)
         cleaned_csv_path = tmp_file.name
 
     # Collect all generated images
     image_files = [os.path.join(REPORT_DIR, f) for f in os.listdir(REPORT_DIR) if f.endswith('.png')]
 
     return cleaned_csv_path, image_files
 
+
 def launch_app():
     with gr.Blocks() as app:
         gr.Markdown("# AI Data Cleaner")
 
         with gr.Row():
             file_input = gr.File(label="Upload CSV File", file_count="single", file_types=[".csv"])
 
+        with gr.Row():
+            primary_key_dropdown = gr.Dropdown(label="Select Primary Key Column", choices=[], interactive=True)
+
         with gr.Row():
             clean_button = gr.Button("Start Cleaning")
 
         with gr.Row():
             progress_bar = gr.Progress()
 
         with gr.Row():
             cleaned_file_output = gr.File(label="Cleaned CSV", visible=True)
 
         with gr.Row():
             output_gallery = gr.Gallery(
                 label="Visualization Results",
                 show_label=True,
                 elem_id="gallery",
                 columns=[3],
                 rows=[3],
                 object_fit="contain",
                 height="auto",
-                visible=False  # Initially set to invisible
+                visible=False
             )
 
-        def process_and_show_results(file):
-            cleaned_csv_path, image_files = clean_and_visualize(file, progress=progress_bar)
+        def update_primary_key_options(file):
+            if file is not None:
+                df = pd.read_csv(file.name)
+                return gr.Dropdown(choices=df.columns.tolist())
+
+        def process_and_show_results(file, primary_key_column):
+            cleaned_csv_path, image_files = clean_and_visualize(file, primary_key_column, progress=progress_bar)
             return (
                 cleaned_csv_path,
-                gr.Gallery(visible=True, value=image_files)  # Make gallery visible and update its content
+                gr.Gallery(visible=True, value=image_files)
             )
 
+        file_input.change(
+            fn=update_primary_key_options,
+            inputs=file_input,
+            outputs=primary_key_dropdown
+        )
+
         clean_button.click(
             fn=process_and_show_results,
-            inputs=file_input,
+            inputs=[file_input, primary_key_dropdown],
             outputs=[cleaned_file_output, output_gallery]
         )
 
     app.launch()
 
+
 if __name__ == "__main__":
     launch_app()
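
Note on the new wiring: in Gradio, an event handler can return a component to update it in place, which is how this commit fills the primary-key dropdown from the uploaded CSV's header via file_input.change. Below is a minimal, self-contained sketch of that same pattern; the component and function names are illustrative, not from the commit, and unlike the committed handler it returns an explicitly empty dropdown when the file is cleared instead of falling through and returning None.

import gradio as gr
import pandas as pd

def list_columns(file):
    # No file selected (e.g. the upload was cleared): empty the dropdown
    if file is None:
        return gr.Dropdown(choices=[])
    # Read only the header row; that is enough to discover column names
    header = pd.read_csv(file.name, nrows=0)
    return gr.Dropdown(choices=header.columns.tolist())

with gr.Blocks() as demo:
    csv_input = gr.File(label="CSV", file_types=[".csv"])
    key_dropdown = gr.Dropdown(label="Primary key", choices=[], interactive=True)
    # Returning a gr.Dropdown from the handler updates key_dropdown in place
    csv_input.change(fn=list_columns, inputs=csv_input, outputs=key_dropdown)

demo.launch()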
clean.py CHANGED
@@ -1,280 +1,284 @@
-import pandas as pd
-import numpy as np
-import json
-import time
-from tqdm import tqdm
-from llm_config import generate_llm_response
-from llm_prompts import (
-    CHECK_HEADERS_PROMPT,
-    NORMALIZE_HEADERS_PROMPT,
-    CHECK_COLUMN_CONTENT_PROMPT,
-    CHECK_TYPOS_PROMPT,
-    TRANSFORM_STRING_PROMPT,
-    CHECK_LOW_COUNT_VALUES_PROMPT
-)
-
-BATCH_SIZE = 50
-EMPTY_THRESHOLD = 0.5
-
-
-def print_dataframe_info(df, step=""):
-    num_columns = df.shape[1]
-    num_rows = df.shape[0]
-    num_cells = num_columns * num_rows
-    print(f"{step}Dataframe info:")
-    print(f"  Number of columns: {num_columns}")
-    print(f"  Number of rows: {num_rows}")
-    print(f"  Total number of cells: {num_cells}")
-
-
-def check_and_normalize_column_headers(df):
-    print("Checking and normalizing column headers...")
-
-    check_prompt = CHECK_HEADERS_PROMPT.format(columns=df.columns.tolist())
-    check_response = generate_llm_response(check_prompt)
-    try:
-        invalid_columns = json.loads(check_response)
-        if invalid_columns:
-            print(f"Columns with invalid names (indices): {invalid_columns}")
-            for idx in invalid_columns:
-                new_name = f"column_{idx}"
-                print(f"Renaming column at index {idx} to '{new_name}'")
-                df.rename(columns={df.columns[idx]: new_name}, inplace=True)
-        else:
-            print("All column headers are valid or no invalid headers detected.")
-    except json.JSONDecodeError:
-        print("Error parsing LLM response for column headers check.")
-
-    normalize_prompt = NORMALIZE_HEADERS_PROMPT.format(columns=df.columns.tolist())
-    normalize_response = generate_llm_response(normalize_prompt)
-    try:
-        normalized_names = json.loads(normalize_response)
-        if normalized_names:
-            df.rename(columns=normalized_names, inplace=True)
-            print("Column names have been normalized.")
-        else:
-            print("No column names were normalized. Proceeding with current names.")
-    except json.JSONDecodeError:
-        print("Error parsing LLM response for column name normalization.")
-
-    # Fallback normalization
-    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
-    print("Applied fallback normalization to ensure valid column names.")
-
-    return df
-
-
-def process_column_batch(column_data, column_name):
-    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
-    prompt = CHECK_COLUMN_CONTENT_PROMPT.format(column_name=column_name, sample_values=str(sample))
-    response = generate_llm_response(prompt)
-    try:
-        result = json.loads(response)
-        if not all(key in result for key in ['data_type', 'empty_indices', 'invalid_indices']):
-            raise ValueError("Missing required keys in LLM response")
-        return result
-    except (json.JSONDecodeError, ValueError) as e:
-        print(f"Error parsing LLM response for column {column_name}: {str(e)}")
-        print(f"LLM Response: {response}")
-        return {'data_type': 'string', 'empty_indices': [], 'invalid_indices': []}
-
-
-def check_typos(column_data, column_name):
-    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
-    prompt = CHECK_TYPOS_PROMPT.format(column_name=column_name, sample_values=str(sample))
-    response = generate_llm_response(prompt)
-    try:
-        return json.loads(response)
-    except json.JSONDecodeError:
-        print(f"Error parsing LLM response for typo check in column {column_name}")
-        return {"typos": {}}
-
-
-def transform_string_column(column_data, column_name):
-    unique_values = column_data.unique().tolist()
-    prompt = TRANSFORM_STRING_PROMPT.format(column_name=column_name, unique_values=unique_values)
-    response = generate_llm_response(prompt)
-    try:
-        result = json.loads(response)
-        return result
-    except json.JSONDecodeError:
-        print(f"Error parsing LLM response for string transformation in column {column_name}")
-        return {}
-
-
-def check_low_count_values(column_data, column_name):
-    value_counts = column_data.value_counts().to_dict()
-    prompt = CHECK_LOW_COUNT_VALUES_PROMPT.format(column_name=column_name, value_counts=value_counts)
-    response = generate_llm_response(prompt)
-    try:
-        result = json.loads(response)
-        return result
-    except json.JSONDecodeError:
-        print(f"Error parsing LLM response for low count values in column {column_name}")
-        return []
-
-
-def remove_empty_columns(df, threshold=EMPTY_THRESHOLD):
-    print(f"Removing columns with less than {threshold * 100}% valid data...")
-    valid_threshold = int(df.shape[0] * threshold)
-    df = df.dropna(axis=1, thresh=valid_threshold)
-    return df
-
-
-def remove_empty_rows(df, threshold=EMPTY_THRESHOLD):
-    print(f"Removing rows with less than {threshold * 100}% valid data...")
-    valid_threshold = int(df.shape[1] * threshold)
-    df = df.dropna(axis=0, thresh=valid_threshold)
-    return df
-
-
-def clean_column(df, column_name):
-    print(f"Cleaning column: {column_name}")
-    column_data = df[column_name]
-    total_rows = len(column_data)
-    empty_indices = []
-    invalid_indices = []
-    data_type = "string"
-    nonconforming_cells = 0
-
-    for i in range(0, total_rows, BATCH_SIZE):
-        batch = column_data.iloc[i:i + BATCH_SIZE]
-        result = process_column_batch(batch, column_name)
-
-        valid_empty_indices = [idx for idx in result["empty_indices"] if idx + i < total_rows]
-        valid_invalid_indices = [idx for idx in result["invalid_indices"] if idx + i < total_rows]
-
-        empty_indices.extend([idx + i for idx in valid_empty_indices])
-        invalid_indices.extend([idx + i for idx in valid_invalid_indices])
-
-        if i == 0:  # Use the data type from the first batch
-            data_type = result["data_type"]
-
-    print(f"  Data type determined: {data_type}")
-    print(f"  Empty cells: {len(empty_indices)}")
-    print(f"  Invalid cells: {len(invalid_indices)}")
-
-    # Convert column to determined data type
-    if data_type == "float":
-        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
-    elif data_type == "integer":
-        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
-    elif data_type == "date":
-        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
-    elif data_type == "string" or data_type == "object":
-        # Transform string values
-        transform_result = transform_string_column(column_data, column_name)
-        df[column_name] = df[column_name].map(transform_result).fillna(df[column_name])
-
-        # Handle "nan" strings
-        df[column_name] = df[column_name].replace({"nan": np.nan, "NaN": np.nan, "NAN": np.nan})
-
-        # Check for low count values
-        low_count_values = check_low_count_values(df[column_name], column_name)
-        df.loc[df[column_name].isin(low_count_values), column_name] = np.nan
-
-        # Check for typos
-        typo_result = check_typos(df[column_name], column_name)
-        if typo_result["typos"]:
-            print(f"  Potential typos found: {typo_result['typos']}")
-
-    # Set empty and invalid cells to NaN
-    df.loc[empty_indices + invalid_indices, column_name] = np.nan
-    nonconforming_cells = len(empty_indices) + len(invalid_indices)
-
-    return df, nonconforming_cells
-
-
-def remove_outliers(df):
-    print("Removing rows with outliers from numeric/integer/float columns...")
-    rows_to_remove = set()
-    for column in df.select_dtypes(include=[np.number]).columns:
-        q1 = df[column].quantile(0.25)
-        q3 = df[column].quantile(0.75)
-        iqr = q3 - q1
-        lower_bound = q1 - 1.5 * iqr
-        upper_bound = q3 + 1.5 * iqr
-        outlier_rows = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
-        rows_to_remove.update(outlier_rows)
-
-    initial_rows = len(df)
-    df = df.drop(index=list(rows_to_remove))
-    removed_rows = initial_rows - len(df)
-    print(f"Removed {removed_rows} rows containing outliers.")
-    return df, removed_rows
-
-
-def calculate_nonconforming_cells(df):
-    nonconforming_cells = {}
-    for column in df.columns:
-        # Count NaN values
-        nan_count = df[column].isna().sum()
-
-        # For numeric columns, count infinite values
-        if np.issubdtype(df[column].dtype, np.number):
-            inf_count = np.isinf(df[column]).sum()
-        else:
-            inf_count = 0
-
-        # For object columns, count empty strings
-        if df[column].dtype == 'object':
-            empty_string_count = (df[column] == '').sum()
-        else:
-            empty_string_count = 0
-
-        nonconforming_cells[column] = nan_count + inf_count + empty_string_count
-
-    return nonconforming_cells
-
-
-def clean_data(df):
-    start_time = time.time()
-    process_times = {}
-    removed_rows = 0
-    removed_columns = 0
-
-    print("Starting data validation and cleaning...")
-    print_dataframe_info(df, "Initial - ")
-
-    # Calculate nonconforming cells before cleaning
-    nonconforming_cells_before = calculate_nonconforming_cells(df)
-
-    steps = ['Normalize headers', 'Remove empty columns', 'Remove empty rows', 'Remove low count strings', 'Clean columns', 'Remove outliers']
-    total_steps = len(steps) + len(df.columns)  # Add column count for individual column cleaning
-
-    # Step 1: Normalize column headers
-    step_start_time = time.time()
-    df = check_and_normalize_column_headers(df)
-    process_times['Normalize headers'] = time.time() - step_start_time
-    yield 1 / total_steps, "Normalized headers"
-
-    # Step 2: Remove empty columns (less than 60% valid data)
-    step_start_time = time.time()
-    df = remove_empty_columns(df)
-    process_times['Remove empty columns'] = time.time() - step_start_time
-    yield 2 / total_steps, "Removed empty columns"
-
-    # Step 3: Remove empty rows (less than 60% valid data)
-    step_start_time = time.time()
-    df = remove_empty_rows(df)
-    process_times['Remove empty rows'] = time.time() - step_start_time
-    yield 3 / total_steps, "Removed empty rows"
-
-    # Step 4: Clean columns (in batches)
-    column_cleaning_times = {}
-    for i, column in enumerate(df.columns):
-        column_start_time = time.time()
-        df, nonconforming = clean_column(df, column)
-        column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
-        yield (5 + i) / total_steps, f"Cleaning column: {column}"
-    process_times.update(column_cleaning_times)
-
-    # Step 5: Remove outliers from numeric columns
-    step_start_time = time.time()
-    df, outlier_rows_removed = remove_outliers(df)
-    removed_rows += outlier_rows_removed
-    process_times['Remove outliers'] = time.time() - step_start_time
-    yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
-
-    print("Cleaning process completed.")
-    print_dataframe_info(df, "Final - ")
+import pandas as pd
+import numpy as np
+import json
+import time
+from tqdm import tqdm
+from llm_config import generate_llm_response
+from llm_prompts import (
+    CHECK_HEADERS_PROMPT,
+    NORMALIZE_HEADERS_PROMPT,
+    CHECK_COLUMN_CONTENT_PROMPT,
+    CHECK_TYPOS_PROMPT,
+    TRANSFORM_STRING_PROMPT,
+    CHECK_LOW_COUNT_VALUES_PROMPT
+)
+
+BATCH_SIZE = 50
+EMPTY_THRESHOLD = 0.5
+
+def print_dataframe_info(df, step=""):
+    num_columns = df.shape[1]
+    num_rows = df.shape[0]
+    num_cells = num_columns * num_rows
+    print(f"{step}Dataframe info:")
+    print(f"  Number of columns: {num_columns}")
+    print(f"  Number of rows: {num_rows}")
+    print(f"  Total number of cells: {num_cells}")
+
+def check_and_normalize_column_headers(df):
+    print("Checking and normalizing column headers...")
+
+    check_prompt = CHECK_HEADERS_PROMPT.format(columns=df.columns.tolist())
+    check_response = generate_llm_response(check_prompt)
+    try:
+        invalid_columns = json.loads(check_response)
+        if invalid_columns:
+            print(f"Columns with invalid names (indices): {invalid_columns}")
+            for idx in invalid_columns:
+                new_name = f"column_{idx}"
+                print(f"Renaming column at index {idx} to '{new_name}'")
+                df.rename(columns={df.columns[idx]: new_name}, inplace=True)
+        else:
+            print("All column headers are valid or no invalid headers detected.")
+    except json.JSONDecodeError:
+        print("Error parsing LLM response for column headers check.")
+
+    normalize_prompt = NORMALIZE_HEADERS_PROMPT.format(columns=df.columns.tolist())
+    normalize_response = generate_llm_response(normalize_prompt)
+    try:
+        normalized_names = json.loads(normalize_response)
+        if normalized_names:
+            df.rename(columns=normalized_names, inplace=True)
+            print("Column names have been normalized.")
+        else:
+            print("No column names were normalized. Proceeding with current names.")
+    except json.JSONDecodeError:
+        print("Error parsing LLM response for column name normalization.")
+
+    # Fallback normalization
+    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
+    print("Applied fallback normalization to ensure valid column names.")
+
+    return df
+
+def process_column_batch(column_data, column_name):
+    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
+    prompt = CHECK_COLUMN_CONTENT_PROMPT.format(column_name=column_name, sample_values=str(sample))
+    response = generate_llm_response(prompt)
+    try:
+        result = json.loads(response)
+        if not all(key in result for key in ['data_type', 'empty_indices', 'invalid_indices']):
+            raise ValueError("Missing required keys in LLM response")
+        return result
+    except (json.JSONDecodeError, ValueError) as e:
+        print(f"Error parsing LLM response for column {column_name}: {str(e)}")
+        print(f"LLM Response: {response}")
+        return {'data_type': 'string', 'empty_indices': [], 'invalid_indices': []}
+
+def check_typos(column_data, column_name):
+    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
+    prompt = CHECK_TYPOS_PROMPT.format(column_name=column_name, sample_values=str(sample))
+    response = generate_llm_response(prompt)
+    try:
+        return json.loads(response)
+    except json.JSONDecodeError:
+        print(f"Error parsing LLM response for typo check in column {column_name}")
+        return {"typos": {}}
+
+def transform_string_column(column_data, column_name):
+    unique_values = column_data.unique().tolist()
+    prompt = TRANSFORM_STRING_PROMPT.format(column_name=column_name, unique_values=unique_values)
+    response = generate_llm_response(prompt)
+    try:
+        result = json.loads(response)
+        return result
+    except json.JSONDecodeError:
+        print(f"Error parsing LLM response for string transformation in column {column_name}")
+        return {}
+
+def check_low_count_values(column_data, column_name):
+    value_counts = column_data.value_counts().to_dict()
+    prompt = CHECK_LOW_COUNT_VALUES_PROMPT.format(column_name=column_name, value_counts=value_counts)
+    response = generate_llm_response(prompt)
+    try:
+        result = json.loads(response)
+        return result
+    except json.JSONDecodeError:
+        print(f"Error parsing LLM response for low count values in column {column_name}")
+        return []
+
+def remove_empty_columns(df, threshold=EMPTY_THRESHOLD):
+    print(f"Removing columns with less than {threshold * 100}% valid data...")
+    valid_threshold = int(df.shape[0] * threshold)
+    df = df.dropna(axis=1, thresh=valid_threshold)
+    return df
+
+def remove_empty_rows(df, threshold=EMPTY_THRESHOLD):
+    print(f"Removing rows with less than {threshold * 100}% valid data...")
+    valid_threshold = int(df.shape[1] * threshold)
+    df = df.dropna(axis=0, thresh=valid_threshold)
+    return df
+
+def remove_low_count_categories(df):
+    print("Removing strings with count below 2...")
+    for col in df.select_dtypes(include=['object']).columns:
+        value_counts = df[col].value_counts()
+        to_remove = value_counts[value_counts < 2].index
+        df[col] = df[col].replace(to_remove, np.nan)
+    return df
+
+def clean_column(df, column_name):
+    print(f"Cleaning column: {column_name}")
+    column_data = df[column_name]
+    total_rows = len(column_data)
+    empty_indices = []
+    invalid_indices = []
+    data_type = "string"
+    nonconforming_cells = 0
+
+    for i in range(0, total_rows, BATCH_SIZE):
+        batch = column_data.iloc[i:i + BATCH_SIZE]
+        result = process_column_batch(batch, column_name)
+
+        valid_empty_indices = [idx for idx in result["empty_indices"] if idx + i < total_rows]
+        valid_invalid_indices = [idx for idx in result["invalid_indices"] if idx + i < total_rows]
+
+        empty_indices.extend([idx + i for idx in valid_empty_indices])
+        invalid_indices.extend([idx + i for idx in valid_invalid_indices])
+
+        if i == 0:  # Use the data type from the first batch
+            data_type = result["data_type"]
+
+    print(f"  Data type determined: {data_type}")
+    print(f"  Empty cells: {len(empty_indices)}")
+    print(f"  Invalid cells: {len(invalid_indices)}")
+
+    # Convert column to determined data type
+    if data_type == "float":
+        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
+    elif data_type == "integer":
+        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
+    elif data_type == "date":
+        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
+    elif data_type == "string" or data_type == "object":
+        # Transform string values
+        transform_result = transform_string_column(column_data, column_name)
+        df[column_name] = df[column_name].map(transform_result).fillna(df[column_name])
+
+        # Handle "nan" strings
+        df[column_name] = df[column_name].replace({"nan": np.nan, "NaN": np.nan, "NAN": np.nan})
+
+        # Check for low count values
+        low_count_values = check_low_count_values(df[column_name], column_name)
+        df.loc[df[column_name].isin(low_count_values), column_name] = np.nan
+
+        # Check for typos
+        typo_result = check_typos(df[column_name], column_name)
+        if typo_result["typos"]:
+            print(f"  Potential typos found: {typo_result['typos']}")
+
+    # Set empty and invalid cells to NaN
+    df.loc[empty_indices + invalid_indices, column_name] = np.nan
+    nonconforming_cells = len(empty_indices) + len(invalid_indices)
+
+    return df, nonconforming_cells
+
+def remove_outliers(df, primary_key_column):
+    print("Removing rows with outliers from numeric/integer/float columns...")
+    rows_to_remove = set()
+    for column in df.select_dtypes(include=[np.number]).columns:
+        if column != primary_key_column:
+            q1 = df[column].quantile(0.25)
+            q3 = df[column].quantile(0.75)
+            iqr = q3 - q1
+            lower_bound = q1 - 1.5 * iqr
+            upper_bound = q3 + 1.5 * iqr
+            outlier_rows = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
+            rows_to_remove.update(outlier_rows)
+
+    initial_rows = len(df)
+    df = df.drop(index=list(rows_to_remove))
+    removed_rows = initial_rows - len(df)
+    print(f"Removed {removed_rows} rows containing outliers.")
+    return df, removed_rows
+
+def calculate_nonconforming_cells(df):
+    nonconforming_cells = {}
+    for column in df.columns:
+        # Count NaN values
+        nan_count = df[column].isna().sum()
+
+        # For numeric columns, count infinite values
+        if np.issubdtype(df[column].dtype, np.number):
+            inf_count = np.isinf(df[column]).sum()
+        else:
+            inf_count = 0
+
+        # For object columns, count empty strings
+        if df[column].dtype == 'object':
+            empty_string_count = (df[column] == '').sum()
+        else:
+            empty_string_count = 0
+
+        nonconforming_cells[column] = nan_count + inf_count + empty_string_count
+
+    return nonconforming_cells
+
+
+def clean_data(df, primary_key_column):
+    start_time = time.time()
+    process_times = {}
+    removed_rows = 0
+    removed_columns = 0
+
+    print("Starting data validation and cleaning...")
+    print_dataframe_info(df, "Initial - ")
+
+    # Calculate nonconforming cells before cleaning
+    nonconforming_cells_before = calculate_nonconforming_cells(df)
+
+    steps = ['Normalize headers', 'Remove empty columns', 'Remove empty rows', 'Remove low count strings', 'Clean columns', 'Remove outliers']
+    total_steps = len(steps) + len(df.columns)  # Add column count for individual column cleaning
+
+    # Step 1: Normalize column headers
+    step_start_time = time.time()
+    df = check_and_normalize_column_headers(df)
+    process_times['Normalize headers'] = time.time() - step_start_time
+    yield 1 / total_steps, "Normalized headers"
+
+    # Step 2: Remove empty columns (less than 50% valid data)
+    step_start_time = time.time()
+    df = remove_empty_columns(df)
+    process_times['Remove empty columns'] = time.time() - step_start_time
+    yield 2 / total_steps, "Removed empty columns"
+
+    # Step 3: Remove empty rows (less than 50% valid data)
+    step_start_time = time.time()
+    df = remove_empty_rows(df)
+    process_times['Remove empty rows'] = time.time() - step_start_time
+    yield 3 / total_steps, "Removed empty rows"
+
+    # Step 4: Remove low count categories
+    step_start_time = time.time()
+    df = remove_low_count_categories(df)
+    process_times['Remove low count strings'] = time.time() - step_start_time
+    yield 4 / total_steps, "Removed low count strings"
+
+    # Step 5: Clean columns (in batches)
+    column_cleaning_times = {}
+    for i, column in enumerate(df.columns):
+        column_start_time = time.time()
+        df, nonconforming = clean_column(df, column)
+        column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
+        yield (5 + i) / total_steps, f"Cleaning column: {column}"
+    process_times.update(column_cleaning_times)
+
+    # Step 6: Remove outliers from numeric columns
+    step_start_time = time.time()
+    df, outlier_rows_removed = remove_outliers(df, primary_key_column)
+    removed_rows += outlier_rows_removed
+    process_times['Remove outliers'] = time.time() - step_start_time
+    yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)
+
+    print("Cleaning process completed.")
+    print_dataframe_info(df, "Final - ")
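
Note how app.py and clean.py communicate: clean_data is a generator that yields (fraction, status_string) pairs while it works and ends with (1.0, results_tuple), which the caller detects with isinstance. A minimal sketch of that protocol follows, with a stub pipeline standing in for the real cleaning steps; the step names and placeholder result values are illustrative.

import pandas as pd

def stub_clean_data(df):
    steps = ["Normalize headers", "Remove empty rows"]
    for i, step in enumerate(steps, start=1):
        # ... real cleaning work on df would happen here ...
        yield i / (len(steps) + 1), step   # progress update: (fraction, str)
    yield 1.0, (df, {}, {}, 0, 0)          # terminal yield: (1.0, results tuple)

df = pd.DataFrame({"a": [1, 2, 3]})
for fraction, payload in stub_clean_data(df):
    if isinstance(payload, tuple):   # the results arrive only in the last yield
        cleaned_df = payload[0]
    else:                            # every other yield drives the progress bar
        print(f"{fraction:.0%} - {payload}")

One consequence of this design is that every consumer must type-check each payload; the alternative would be to yield only status updates and return the results, retrieving them from StopIteration.value.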
report.py CHANGED
@@ -8,10 +8,12 @@ from datetime import datetime
 REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
 os.makedirs(REPORT_DIR, exist_ok=True)
 
+
 def save_plot(fig, filename):
     fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
     plt.close(fig)
 
+
 def plot_heatmap(df, title):
     plt.figure(figsize=(12, 8))
     sns.heatmap(df.isnull(), cbar=False, cmap='Reds')
@@ -20,110 +22,103 @@ def plot_heatmap(df, title):
     save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')
 
 
-def plot_valid_data_percentage(original_df, cleaned_df):
-    original_valid = (original_df.notna().sum() / len(original_df)) * 100
-    cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100
-
-    # Combine the data and fill missing values with 0
+def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
+    columns_to_plot = [col for col in original_df.columns if col != primary_key_column]
+    original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
+    cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
+
     combined_data = pd.concat([original_valid, cleaned_valid], axis=1, keys=['Original', 'Cleaned']).fillna(0)
 
     plt.figure(figsize=(15, 8))
 
     x = range(len(combined_data))
     width = 0.35
 
     plt.bar(x, combined_data['Original'], width, label='Before Cleaning', alpha=0.8)
     plt.bar([i + width for i in x], combined_data['Cleaned'], width, label='After Cleaning', alpha=0.8)
 
     plt.xlabel('Columns')
     plt.ylabel('Percentage of Valid Data')
     plt.title('Percentage of Valid Data Before and After Cleaning')
-    plt.xticks([i + width/2 for i in x], combined_data.index, rotation=90)
+    plt.xticks([i + width / 2 for i in x], combined_data.index, rotation=90)
     plt.legend()
 
-    # Add percentage labels on the bars with smaller font size
     for i, v in enumerate(combined_data['Original']):
         plt.text(i, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
     for i, v in enumerate(combined_data['Cleaned']):
         plt.text(i + width, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
 
     plt.tight_layout()
     plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
     plt.close()
 
+
 def plot_column_schemas(df):
     schemas = df.dtypes.astype(str).value_counts()
     fig, ax = plt.subplots(figsize=(10, 6))
 
-    # Generate a color palette with as many colors as there are bars
     colors = plt.cm.rainbow(np.linspace(0, 1, len(schemas)))
 
-    # Plot the bars
     bars = ax.bar(schemas.index, schemas.values, color=colors)
 
     ax.set_title('Column Data Types')
     ax.set_xlabel('Data Type')
     ax.set_ylabel('Count')
 
-    # Add value labels on top of each bar
     for bar in bars:
         height = bar.get_height()
-        ax.text(bar.get_x() + bar.get_width()/2., height,
+        ax.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{height}',
                 ha='center', va='bottom')
 
     save_plot(fig, 'column_schemas.png')
 
+
 def plot_nonconforming_cells(nonconforming_cells):
-    # Ensure that nonconforming_cells is a dictionary
     if isinstance(nonconforming_cells, dict):
-        # Proceed with plotting if it's a dictionary
         fig, ax = plt.subplots(figsize=(12, 6))
 
-        # Generate a color palette with as many colors as there are bars
         colors = plt.cm.rainbow(np.linspace(0, 1, len(nonconforming_cells)))
 
-        # Plot the bars
         bars = ax.bar(list(nonconforming_cells.keys()), list(nonconforming_cells.values()), color=colors)
 
         ax.set_title('Nonconforming Cells by Column')
         ax.set_xlabel('Columns')
         ax.set_ylabel('Number of Nonconforming Cells')
         plt.xticks(rotation=90)
 
-        # Add value labels on top of each bar
         for bar in bars:
             height = bar.get_height()
-            ax.text(bar.get_x() + bar.get_width()/2., height,
+            ax.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{height:,}',
                     ha='center', va='bottom')
 
         save_plot(fig, 'nonconforming_cells.png')
     else:
         print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")
 
 
-
-def plot_column_distributions(original_df, cleaned_df):
-    numeric_columns = original_df.select_dtypes(include=[np.number]).columns
+def plot_column_distributions(original_df, cleaned_df, primary_key_column):
+    numeric_columns = [col for col in original_df.select_dtypes(include=[np.number]).columns if
+                       col != primary_key_column]
     num_columns = len(numeric_columns)
 
     if num_columns == 0:
         print("No numeric columns found for distribution plots.")
         return
 
-    # Create subplots for distributions
     fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
     axes = axes.flatten() if num_columns > 1 else [axes]
 
     for i, column in enumerate(numeric_columns):
         if column in cleaned_df.columns:
-            sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning', alpha=0.5)
-            sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.5)
+            sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning',
+                         alpha=0.5)
+            sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning',
+                         alpha=0.5)
             axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
            axes[i].legend()
 
-    # Remove any unused subplots
     for j in range(i + 1, len(axes)):
         fig.delaxes(axes[j])
 
@@ -131,16 +126,14 @@ def plot_column_distributions(original_df, cleaned_df):
     save_plot(fig, 'distributions_before_after_cleaning.png')
 
 
-def plot_boxplot_with_outliers(df):
-    print("Plotting boxplots with outliers...")
-    numeric_columns = df.select_dtypes(include=[np.number]).columns
+def plot_boxplot_with_outliers(df, primary_key_column):
+    numeric_columns = [col for col in df.select_dtypes(include=[np.number]).columns if col != primary_key_column]
     num_columns = len(numeric_columns)
 
     if num_columns == 0:
         print("No numeric columns found for boxplot.")
         return
 
-    # Create subplots based on the number of numeric columns
     fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
     axes = axes.flatten() if num_columns > 1 else [axes]
 
@@ -148,7 +141,6 @@ def plot_boxplot_with_outliers(df):
         sns.boxplot(x=df[column], ax=axes[i])
         axes[i].set_title(f'Boxplot of {column} with Outliers')
 
-    # Remove any unused subplots
     for j in range(i + 1, len(axes)):
         fig.delaxes(axes[j])
 
@@ -156,52 +148,42 @@ def plot_boxplot_with_outliers(df):
     save_plot(fig, 'boxplots_with_outliers.png')
 
 
-def plot_correlation_heatmap(df):
-    # Select only numeric, float, and integer columns
+def plot_correlation_heatmap(df, primary_key_column):
     numeric_df = df.select_dtypes(include=[np.number])
+    numeric_df = numeric_df.drop(columns=[primary_key_column], errors='ignore')
 
-    # Compute the correlation matrix
     correlation_matrix = numeric_df.corr()
 
-    # Plot the heatmap
     fig, ax = plt.subplots(figsize=(15, 10))
     sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
     ax.set_title('Correlation Heatmap')
     save_plot(fig, 'correlation_heatmap.png')
 
 
-
 def plot_process_times(process_times):
-    # Convert seconds to minutes
     process_times_minutes = {k: v / 60 for k, v in process_times.items()}
 
-    # Separate main processes and column cleaning processes
     main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
     column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}
 
-    # Create the plot
     fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
 
-    # Plot main processes
     bars1 = ax1.bar(main_processes.keys(), main_processes.values())
     ax1.set_title('Main Process Times')
     ax1.set_ylabel('Time (minutes)')
     ax1.tick_params(axis='x', rotation=45)
 
-    # Plot column cleaning processes
     bars2 = ax2.bar(column_processes.keys(), column_processes.values())
     ax2.set_title('Column Cleaning Times')
     ax2.set_ylabel('Time (minutes)')
     ax2.tick_params(axis='x', rotation=90)
 
-    # Add value labels on top of each bar
     for ax, bars in zip([ax1, ax2], [bars1, bars2]):
         for bar in bars:
             height = bar.get_height()
             ax.text(bar.get_x() + bar.get_width() / 2., height,
-                    f'{height:.2f}', ha='center', va='bottom')
+                    f'{height:.4f}', ha='center', va='bottom')
 
-    # Add total time to the plot
     total_time = sum(process_times_minutes.values())
     fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)
 
@@ -209,14 +191,15 @@ def plot_process_times(process_times):
     save_plot(fig, 'process_times.png')
 
 
-def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
+def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns,
+                       removed_rows, primary_key_column):
     os.makedirs(REPORT_DIR, exist_ok=True)
 
     sns.set_style("whitegrid")
     plt.rcParams['figure.dpi'] = 400
 
     print("Plotting valid data percentages...")
-    plot_valid_data_percentage(original_df, cleaned_df)
+    plot_valid_data_percentage(original_df, cleaned_df, primary_key_column)
 
     print("Plotting column schemas...")
     plot_column_schemas(cleaned_df)
@@ -225,7 +208,7 @@ def create_full_report(original_df, cleaned_df, nonconforming_cells_before, proc
     plot_nonconforming_cells(nonconforming_cells_before)
 
     print("Plotting column distributions...")
-    plot_column_distributions(original_df, cleaned_df)
+    plot_column_distributions(original_df, cleaned_df, primary_key_column)
 
     print("Plotting process times...")
     plot_process_times(process_times)
@@ -234,6 +217,9 @@ def create_full_report(original_df, cleaned_df, nonconforming_cells_before, proc
     plot_heatmap(original_df, "Missing Values Before Cleaning")
 
     print("Plotting correlation heatmap...")
-    plot_correlation_heatmap(cleaned_df)
+    plot_correlation_heatmap(cleaned_df, primary_key_column)
+
+    print("Plotting boxplots with outliers...")
+    plot_boxplot_with_outliers(cleaned_df, primary_key_column)
 
     print(f"All visualization reports saved in directory: {REPORT_DIR}")