DataVerse

Running

App Files Files Community

evijit HF Staff committed on Jul 11

Commit

813c7cf

verified ·

1 Parent(s): 9bf5a46

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -72

app.py CHANGED Viewed

@@ -1,5 +1,3 @@
-# --- app.py (Dataverse Explorer - Corrected with drill-down) ---
 import gradio as gr
 import pandas as pd
 import plotly.express as px
@@ -29,7 +27,6 @@ def load_datasets_data():
         print(err_msg)
         return pd.DataFrame(), False, err_msg
-# --- CORRECTED: This function now preserves individual datasets for top orgs ---
 def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
     """
     Filter data and prepare it for a multi-level treemap.
@@ -58,36 +55,24 @@ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
         filtered_df[count_by] = 0.0
     filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
-    # 1. Get total for every organization to determine the top K
     all_org_totals = filtered_df.groupby("organization")[count_by].sum()
     top_org_names = all_org_totals.nlargest(top_k, keep='first').index.tolist()
-    # 2. Get the full data for the individual datasets belonging to the top organizations
     top_orgs_df = filtered_df[filtered_df['organization'].isin(top_org_names)].copy()
-    # 3. Calculate the total for the "Other" category
     other_total = all_org_totals[~all_org_totals.index.isin(top_org_names)].sum()
-    # 4. Create the final DataFrame for the plot
     final_df_for_plot = top_orgs_df
-    # 5. Add the "Other" row as a single entry if its value is greater than zero
     if other_total > 0:
-        other_row = pd.DataFrame([{
-            'organization': 'Other',
-            'id': 'Other',  # The 'id' for the "Other" category must be defined for the path
-            count_by: other_total
-        }])
         final_df_for_plot = pd.concat([final_df_for_plot, other_row], ignore_index=True)
-    # 6. Apply the skip filter to the organization/category level
     if skip_cats and len(skip_cats) > 0:
         final_df_for_plot = final_df_for_plot[~final_df_for_plot['organization'].isin(skip_cats)]
     final_df_for_plot["root"] = "datasets"
     return final_df_for_plot
-# --- CORRECTED: The path is now restored to allow drill-down ---
 def create_treemap(treemap_data, count_by, title=None):
     """Generate the Plotly treemap figure from the prepared data."""
     if treemap_data.empty or treemap_data[count_by].sum() <= 0:
@@ -95,8 +80,6 @@ def create_treemap(treemap_data, count_by, title=None):
         fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
         return fig
-    # The path is restored to `["root", "organization", "id"]` to enable drill-down.
-    # The "Other" row with id='Other' will correctly be displayed as a single block.
     fig = px.treemap(treemap_data, path=["root", "organization", "id"], values=count_by,
                      title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
     fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
@@ -106,7 +89,7 @@ def create_treemap(treemap_data, count_by, title=None):
     )
     return fig
-# --- Gradio UI Blocks (no changes needed here) ---
 with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
     datasets_data_state = gr.State(pd.DataFrame())
     loading_complete_state = gr.State(False)
@@ -116,34 +99,11 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
     with gr.Row():
         with gr.Column(scale=1):
-            count_by_dropdown = gr.Dropdown(
-                label="Metric",
-                choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")],
-                value="downloads"
-            )
-            tag_filter_dropdown = gr.Dropdown(
-                label="Filter by Tag",
-                choices=TAG_FILTER_CHOICES,
-                value="None"
-            )
-            top_k_dropdown = gr.Dropdown(
-                label="Number of Top Organizations",
-                choices=TOP_K_CHOICES,
-                value=25
-            )
-            skip_cats_textbox = gr.Textbox(
-                label="Organizations to Skip from the plot",
-                value="Other"
-            )
-            generate_plot_button = gr.Button(
-                value="Generate Plot",
-                variant="primary",
-                interactive=False
-            )
         with gr.Column(scale=3):
             plot_output = gr.Plot()
@@ -153,40 +113,51 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
     def _update_button_interactivity(is_loaded_flag):
         return gr.update(interactive=is_loaded_flag)
-    def ui_load_data_controller(progress=gr.Progress()):
         progress(0, desc=f"Loading dataset '{HF_DATASET_ID}'...")
         try:
             current_df, load_success_flag, status_msg_from_load = load_datasets_data()
             if load_success_flag:
-                progress(0.9, desc="Processing data...")
                 date_display = "Pre-processed (date unavailable)"
                 if 'data_download_timestamp' in current_df.columns and pd.notna(current_df['data_download_timestamp'].iloc[0]):
                     ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
                     date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
-                data_info_text = (
-                    f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
-                    f"- Status: {status_msg_from_load}\n"
-                    f"- Total datasets loaded: {len(current_df):,}\n"
-                    f"- Data as of: {date_display}\n"
-                )
-                status_msg_ui = "Data loaded. Ready to generate plot."
             else:
                 data_info_text = f"### Data Load Failed\n- {status_msg_from_load}"
-                status_msg_ui = status_msg_from_load
         except Exception as e:
-            status_msg_ui = f"An unexpected error occurred: {str(e)}"
-            data_info_text = f"### Critical Error\n- {status_msg_ui}"
             load_success_flag = False
-            print(f"Critical error in ui_load_data_controller: {e}")
-        return current_df, load_success_flag, data_info_text, status_msg_ui
-    # --- CORRECTED: Updated stats to reflect the new plot structure ---
     def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
                                    skip_cats_input, df_current_datasets, progress=gr.Progress()):
         if df_current_datasets is None or df_current_datasets.empty:
-            return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded."
         progress(0.1, desc="Aggregating data...")
         cats_to_skip = [cat.strip() for cat in skip_cats_input.split(',') if cat.strip()]
@@ -202,21 +173,20 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
             plot_stats_md = "No data matches the selected filters. Please try different options."
         else:
             total_value_in_plot = treemap_df[metric_choice].sum()
-            # Count datasets, excluding our placeholder "Other" id
             total_datasets_in_plot = treemap_df[treemap_df['id'] != 'Other']['id'].nunique()
-            plot_stats_md = (
-                f"## Plot Statistics\n- **Organizations/Categories Shown**: {treemap_df['organization'].nunique():,}\n"
-                f"- **Individual Datasets Shown**: {total_datasets_in_plot:,}\n"
-                f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}"
-            )
         return plotly_fig, plot_stats_md
-    # --- Event Wiring (no changes needed) ---
     demo.load(
-        fn=ui_load_data_controller,
         inputs=[],
-        outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md]
     )
     loading_complete_state.change(

 import gradio as gr
 import pandas as pd
 import plotly.express as px
         print(err_msg)
         return pd.DataFrame(), False, err_msg
 def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
     """
     Filter data and prepare it for a multi-level treemap.
         filtered_df[count_by] = 0.0
     filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
     all_org_totals = filtered_df.groupby("organization")[count_by].sum()
     top_org_names = all_org_totals.nlargest(top_k, keep='first').index.tolist()
     top_orgs_df = filtered_df[filtered_df['organization'].isin(top_org_names)].copy()
     other_total = all_org_totals[~all_org_totals.index.isin(top_org_names)].sum()
     final_df_for_plot = top_orgs_df
     if other_total > 0:
+        other_row = pd.DataFrame([{'organization': 'Other', 'id': 'Other', count_by: other_total}])
         final_df_for_plot = pd.concat([final_df_for_plot, other_row], ignore_index=True)
     if skip_cats and len(skip_cats) > 0:
         final_df_for_plot = final_df_for_plot[~final_df_for_plot['organization'].isin(skip_cats)]
     final_df_for_plot["root"] = "datasets"
     return final_df_for_plot
 def create_treemap(treemap_data, count_by, title=None):
     """Generate the Plotly treemap figure from the prepared data."""
     if treemap_data.empty or treemap_data[count_by].sum() <= 0:
         fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
         return fig
     fig = px.treemap(treemap_data, path=["root", "organization", "id"], values=count_by,
                      title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
     fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     )
     return fig
+# --- Gradio UI Blocks ---
 with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
     datasets_data_state = gr.State(pd.DataFrame())
     loading_complete_state = gr.State(False)
     with gr.Row():
         with gr.Column(scale=1):
+            count_by_dropdown = gr.Dropdown(label="Metric", choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")], value="downloads")
+            tag_filter_dropdown = gr.Dropdown(label="Filter by Tag", choices=TAG_FILTER_CHOICES, value="None")
+            top_k_dropdown = gr.Dropdown(label="Number of Top Organizations", choices=TOP_K_CHOICES, value=25)
+            skip_cats_textbox = gr.Textbox(label="Organizations to Skip from the plot", value="Other")
+            generate_plot_button = gr.Button(value="Generate Plot", variant="primary", interactive=False)
         with gr.Column(scale=3):
             plot_output = gr.Plot()
     def _update_button_interactivity(is_loaded_flag):
         return gr.update(interactive=is_loaded_flag)
+    ## CHANGE: New combined function to load data and generate the initial plot on startup.
+    def load_and_generate_initial_plot(progress=gr.Progress()):
         progress(0, desc=f"Loading dataset '{HF_DATASET_ID}'...")
+        # --- Part 1: Data Loading ---
         try:
             current_df, load_success_flag, status_msg_from_load = load_datasets_data()
             if load_success_flag:
+                progress(0.5, desc="Processing data...")
                 date_display = "Pre-processed (date unavailable)"
                 if 'data_download_timestamp' in current_df.columns and pd.notna(current_df['data_download_timestamp'].iloc[0]):
                     ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
                     date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
+                data_info_text = (f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
+                                  f"- Status: {status_msg_from_load}\n"
+                                  f"- Total datasets loaded: {len(current_df):,}\n"
+                                  f"- Data as of: {date_display}\n")
             else:
                 data_info_text = f"### Data Load Failed\n- {status_msg_from_load}"
         except Exception as e:
+            status_msg_from_load = f"An unexpected error occurred: {str(e)}"
+            data_info_text = f"### Critical Error\n- {status_msg_from_load}"
             load_success_flag = False
+            current_df = pd.DataFrame() # Ensure df is empty on failure
+            print(f"Critical error in load_and_generate_initial_plot: {e}")
+        # --- Part 2: Generate Initial Plot ---
+        progress(0.6, desc="Generating initial plot...")
+        # Defaults are hard-coded here and must be kept in sync with the UI component definitions
+        default_metric = "downloads"
+        default_tag = "None"
+        default_k = 25
+        default_skip_cats = "Other"
+        # Reuse the existing controller function for plotting
+        initial_plot, initial_status = ui_generate_plot_controller(
+            default_metric, default_tag, default_k, default_skip_cats, current_df, progress
+        )
+        return current_df, load_success_flag, data_info_text, initial_status, initial_plot
     def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
                                    skip_cats_input, df_current_datasets, progress=gr.Progress()):
         if df_current_datasets is None or df_current_datasets.empty:
+            return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded. Cannot generate plot."
         progress(0.1, desc="Aggregating data...")
         cats_to_skip = [cat.strip() for cat in skip_cats_input.split(',') if cat.strip()]
             plot_stats_md = "No data matches the selected filters. Please try different options."
         else:
             total_value_in_plot = treemap_df[metric_choice].sum()
             total_datasets_in_plot = treemap_df[treemap_df['id'] != 'Other']['id'].nunique()
+            plot_stats_md = (f"## Plot Statistics\n- **Organizations/Categories Shown**: {treemap_df['organization'].nunique():,}\n"
+                             f"- **Individual Datasets Shown**: {total_datasets_in_plot:,}\n"
+                             f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}")
         return plotly_fig, plot_stats_md
+    # --- Event Wiring ---
+    ## CHANGE: Updated demo.load to call the new function and to add plot_output to the outputs list.
     demo.load(
+        fn=load_and_generate_initial_plot,
         inputs=[],
+        outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md, plot_output]
     )
     loading_complete_state.change(