import pandas as pd
import ydata_profiling
import gradio as gr
from pydantic_settings import BaseSettings
from tempfile import NamedTemporaryFile
import sweetviz as sv
#from datatile.summary.df import DataFrameSummary
from autoviz.AutoViz_Class import AutoViz_Class
from traceml.summary.df import DataFrameSummary
def variable_table(df):
"""
Analyzes a DataFrame and categorizes variables with colorful HTML formatting.
Args:
df (pandas.DataFrame): The DataFrame to analyze.
Returns:
str: HTML code representing the analysis results with colorful highlights.
"""
# Analyze variable types
numerical_vars=[]
text_vars=[]
categorical_vars=[]
# Identify categorical, numerical, and text columns
# Iterate over columns
for col in df.columns:
print(col )
unique_values = df[col].nunique()
total_values = df[col].count()
dtype_kind = df[col].dtype.kind
print( 'unique value-',unique_values,' total_values ',total_values,dtype_kind)
# Check criteria for numerical, text, and categorical variables
if (dtype_kind == 'f' or dtype_kind == 'i') and unique_values >= total_values / 10:
numerical_vars.append(col)
print('Numerical')
elif (dtype_kind == 'O' or dtype_kind == 'b') and unique_values > total_values / 10:
text_vars.append(col)
print('Text')
else:
categorical_vars.append(col)
print('categorical')
print(categorical_vars,numerical_vars,text_vars)
# Build HTML table with styles
table_style = 'border: 1px solid #ddd; border-collapse: collapse; text-align: left; font-size: 14px;'
header_style = 'background-color: #f2f2f2; padding: 5px 10px;'
data_style = 'padding: 5px 10px; border-bottom: 1px solid #ddd;'
category_color = '#90ee90' # Light green for categorical
numerical_color = '#add8e6' # Light blue for numerical
text_color = '#ffd9b3' # Light yellow for text
html = f"
"
html += f"Variable Type | Columns |
"
# Add rows for each variable type with coloring
if categorical_vars:
html += f"Categorical | {', '.join(categorical_vars)} |
"
if numerical_vars:
html += f"Numerical | {', '.join(numerical_vars)} |
"
if text_vars:
html += f"Text | {', '.join(text_vars)} |
"
# Handle cases where no variables are found
if not (categorical_vars or numerical_vars or text_vars):
html += "No variables found! |
"
html += "
"
return html
def generate_report(file, type):
df = pd.read_csv(file) if file.name.endswith(".csv") else pd.read_excel(file)
pandas_html_report =ydata_profiling.ProfileReport(df).to_html()
temp_file1 = NamedTemporaryFile(delete=False, suffix=".html")
temp_file1.write(pandas_html_report.encode('utf-8'))
temp_file1.close()
print('Pandas completed')
# dataprep_report = create_report(df)
# temp_file2 = NamedTemporaryFile(delete=False, suffix=".html")
# temp_file2.write(dataprep_report.encode('utf-8'))
# temp_file2.close()
def convert_mixed_to_str(df):
mixed_cols = df.select_dtypes(include=['object']).columns # Detect object dtype (usually mixed)
for col in mixed_cols:
df[col] = df[col].astype(str) # Convert mixed columns to strings
return df
df = convert_mixed_to_str(df.copy())
df.columns = df.columns.str.upper()
sweetviz_report = sv.analyze(df)
#sweetviz_report=sweetviz_report.show_html()
#print(type(sweetviz_report))
report=sweetviz_report.show_html( filepath='SWEETVIZ_REPORT.html',
open_browser=False,
layout='widescreen',
scale=None)
HTMLFileToBeOpened = open('SWEETVIZ_REPORT.html', "r")
# Reading the file and storing in a variable
contents = HTMLFileToBeOpened.read()
print('Pandas completed')
temp_file3 = NamedTemporaryFile(delete=False, suffix=".html")
temp_file3.write(contents.encode('utf-8'))
temp_file3.close()
#AUTOVIZ
AV = AutoViz_Class()
dfviz = AV.AutoViz(
"",
sep=",",
depVar="",
dfte=df,
header=0,
verbose=2,
lowess=False,
chart_format="html",
max_rows_analyzed=500,
max_cols_analyzed=20,
save_plot_dir="plots"
)
dfs = DataFrameSummary(df)
sd=dfs.summary()
sd.index.name = 'PARAMETERS'
sd1= sd.reset_index(drop=False)
def highlight_cols(x):
df = x.copy()
df.loc[:, :] = 'color: purple'
df[['PARAMETERS']] = 'color: red'
return df
# Applying the style function
sd1 = sd1.style.apply(highlight_cols, axis = None)
sd1.columns = sd1.columns.str.upper()
return temp_file1.name ,temp_file3.name ,variable_table(df),dfviz,sd1
with gr.Blocks() as cluster:
with gr.Column():
with gr.Row():
file=gr.File(file_types=['.csv', '.xlsx'], label="Upload a CSV or Excel file")
btn=gr.Button(value="Download Report")
with gr.Row():
gr.HTML(value="""PANDAS REPORT
""")
out1=gr.File(label="Download CSV")
gr.HTML(value="""DATAPREP REPORT
""")
out2=gr.File(label="Download CSV")
gr.HTML(value="""SWEETVIZ REPORT
""")
out3=gr.File(label="Download CSV")
with gr.Column():
gr.Markdown("***Uploaded File***")
var=gr.HTML()
with gr.Column():
gr.Markdown("#Uploaded File")
dataframe1=gr.Dataframe()
with gr.Column():
gr.Markdown("#Columns Analysis")
dataframe2=gr.Dataframe()
btn.click(generate_report,inputs=[file],outputs=[out1,out3,var,dataframe1,dataframe2])
cluster.launch()