File size: 3,435 Bytes
4395fda
 
4f6d176
4395fda
 
 
 
 
 
 
4f6d176
4395fda
 
 
4f6d176
4395fda
 
 
 
4f6d176
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
4395fda
 
 
4f6d176
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f6d176
4395fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import gradio as gr
import pandas as pd
import os
from datetime import datetime
from clean import clean_data, get_numeric_columns
from report import create_full_report, REPORT_DIR

def clean_and_visualize(file, primary_key_column, progress=gr.Progress()):
    """Clean an uploaded CSV, generate the report and save the cleaned data.

    Args:
        file: The uploaded file. Either an object exposing the on-disk path
            through a ``name`` attribute or a plain path string.
        primary_key_column: Column name forwarded unchanged to ``clean_data``
            and ``create_full_report``.
        progress: Callable invoked as ``progress(fraction, desc=...)`` to
            report how far the pipeline has advanced; it is also handed to
            ``clean_data``.

    Returns:
        A ``(cleaned_csv_path, image_files)`` tuple: the path of the CSV
        written by this call and the sorted list of ``.png`` paths found in
        ``REPORT_DIR`` after the report was generated.

    Raises:
        gr.Error: If no file was provided.
    """
    if file is None:
        # Without this guard the user only sees an opaque AttributeError.
        raise gr.Error("Please upload a CSV file before starting the cleaning.")

    # Accept both file-like objects carrying `.name` and bare path strings.
    csv_path = getattr(file, "name", file)

    # Read the CSV file
    progress(0.05, desc="Reading CSV file")
    df = pd.read_csv(csv_path)

    # Clean the data
    progress(0.1, desc="Starting data cleaning")
    cleaned_df, nonconforming_cells_before, process_times = clean_data(df, primary_key_column, progress)
    progress(0.8, desc="Data cleaning completed")

    # Calculate removed columns and rows
    removed_columns = len(df.columns) - len(cleaned_df.columns)
    removed_rows = len(df) - len(cleaned_df)

    # Generate full visualization report
    progress(0.9, desc="Generating report")
    create_full_report(
        df,
        cleaned_df,
        nonconforming_cells_before,
        process_times,
        removed_columns,
        removed_rows,
        primary_key_column,
    )

    # Save cleaned data as CSV (timestamped name, relative to the working directory)
    progress(0.95, desc="Saving cleaned data")
    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    cleaned_csv_path = f"cleaned_data_{current_time}.csv"
    cleaned_df.to_csv(cleaned_csv_path, index=False)

    # Collect all generated images; sorted so the gallery order is stable,
    # since os.listdir returns entries in arbitrary order.
    # NOTE(review): every PNG in REPORT_DIR is picked up — confirm that
    # create_full_report does not leave stale images from earlier runs.
    image_files = sorted(
        os.path.join(REPORT_DIR, entry)
        for entry in os.listdir(REPORT_DIR)
        if entry.endswith('.png')
    )

    progress(1.0, desc="Process completed")
    return cleaned_csv_path, image_files

def launch_app():
    """Build the "Data Cleaner" Gradio interface and launch it.

    The layout consists of a CSV upload, a primary-key dropdown, a start
    button, a file output for the cleaned CSV and an initially hidden gallery
    for the report images. Uploading a file fills the dropdown with the
    columns returned by ``get_numeric_columns``; clicking the button runs
    ``clean_and_visualize`` and reveals the gallery with the results.
    """
    with gr.Blocks() as app:
        gr.Markdown("# Data Cleaner")

        with gr.Row():
            file_input = gr.File(label="Upload CSV File", file_count="single", file_types=[".csv"])

        with gr.Row():
            primary_key_dropdown = gr.Dropdown(label="Select Primary Key Column", choices=[], interactive=True)

        with gr.Row():
            clean_button = gr.Button("Start Cleaning")

        with gr.Row():
            cleaned_file_output = gr.File(label="Cleaned CSV", visible=True)

        with gr.Row():
            output_gallery = gr.Gallery(
                label="Visualization Results",
                show_label=True,
                elem_id="gallery",
                columns=[3],
                rows=[3],
                object_fit="contain",
                height="auto",
                visible=False
            )

        def update_primary_key_options(file):
            """Refresh the dropdown choices from the uploaded CSV."""
            if file is None:
                # Upload was cleared: nothing to choose from.
                return gr.Dropdown(choices=[])

            # Accept both file-like objects carrying `.name` and bare path strings.
            df = pd.read_csv(getattr(file, "name", file))
            numeric_columns = get_numeric_columns(df)

            return gr.Dropdown(choices=numeric_columns)

        # gr.Progress is not a layout component. Gradio's documented usage is
        # to declare it as a default argument of the event handler so the
        # tracker is tied to the running event.
        def process_and_show_results(file, primary_key_column, progress=gr.Progress()):
            """Run the cleaning pipeline and reveal the gallery with its images."""
            cleaned_csv_path, image_files = clean_and_visualize(file, primary_key_column, progress=progress)
            return (
                cleaned_csv_path,
                gr.Gallery(visible=True, value=image_files)
            )

        file_input.change(
            fn=update_primary_key_options,
            inputs=file_input,
            outputs=primary_key_dropdown
        )

        clean_button.click(
            fn=process_and_show_results,
            inputs=[file_input, primary_key_dropdown],
            outputs=[cleaned_file_output, output_gallery]
        )

    app.launch()

# Start the Gradio UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch_app()