Remove unused nested column
app.py
CHANGED
@@ -145,14 +145,24 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
 
 
 @spaces.GPU(duration=60 * 5)
-def generate_topics(dataset, config, split, column, nested_column, plot_type):
+def generate_topics(dataset, config, split, column, plot_type):
     logging.info(
-        f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
+        f"Generating topics for {dataset=} {config=} {split=} {column=} {plot_type=}"
     )
 
     parquet_urls = get_parquet_urls(dataset, config, split)
     split_rows = get_split_rows(dataset, config, split)
-
+    if split_rows is None or split_rows == 0:
+        return (
+            gr.Accordion(open=True),
+            gr.DataFrame(value=[], interactive=False, visible=True),
+            gr.Plot(value=None, visible=True),
+            gr.Label(
+                {"❌ Error: No data found for the selected dataset": 0.0}, visible=True
+            ),
+            "",
+        )
+    logging.info(f"Split number of rows: {split_rows}")
 
     limit = min(split_rows, MAX_ROWS)
     n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
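Note on the new guard: when get_split_rows reports no rows, min(split_rows, MAX_ROWS) would either fail or process nothing, so the function now returns an error label instead of yielding progress. The helper itself is defined elsewhere in app.py and is not part of this diff; a hypothetical sketch of what it could look like, assuming the datasets-server /size endpoint (the response field names below are an assumption, not taken from this commit):

import requests

def get_split_rows(dataset: str, config: str, split: str) -> int | None:
    # Hypothetical sketch, not the app's actual implementation.
    resp = requests.get(
        "https://datasets-server.huggingface.co/size",
        params={"dataset": dataset, "config": config},
        timeout=20,
    )
    if not resp.ok:
        return None
    for item in resp.json().get("size", {}).get("splits", []):
        if item.get("split") == split:
            return item.get("num_rows")
    return None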
@@ -178,6 +188,11 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         if full_processing
         else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
     )
+    sub_title = (
+        f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
+        if full_processing
+        else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
+    )
     yield (
         gr.Accordion(open=False),
         gr.DataFrame(value=[], interactive=False, visible=True),
@@ -185,6 +200,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         gr.Label({message: rows_processed / limit}, visible=True),
         "",
     )
+
     while offset < limit:
         docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
         if not docs:
@@ -199,6 +215,9 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
 
         if base_model is None:
             base_model = new_model
+            logging.info(
+                f"The following topics are newly found: {base_model.topic_labels_}"
+            )
         else:
             updated_model = BERTopic.merge_models([base_model, new_model])
             nr_new_topics = len(set(updated_model.topics_)) - len(
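The chunked fit above relies on BERTopic's model merging: each parquet chunk gets its own model, which is then folded into the running model so topics accumulate without refitting everything. A self-contained sketch of that pattern (simplified; the real loop above also counts how many topics each merge adds):

from bertopic import BERTopic

def fit_incrementally(doc_chunks):
    # doc_chunks: any iterable of lists of documents, standing in for the parquet chunks.
    base_model = None
    for chunk in doc_chunks:
        new_model = BERTopic(min_topic_size=10).fit(chunk)
        if base_model is None:
            base_model = new_model
        else:
            # merge_models keeps the base topics and appends topics only the new model found.
            base_model = BERTopic.merge_models([base_model, new_model])
    return base_model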
@@ -216,11 +235,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
 
         topics_info = base_model.get_topic_info()
         all_topics = base_model.topics_
-        sub_title = (
-            f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
-            if full_processing
-            else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
-        )
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
@@ -271,7 +285,8 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
 
     logging.info("Finished processing all data")
 
-
+    dataset_clear_name = dataset.replace("/", "-")
+    plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
     if plot_type == "DataMapPlot":
         topic_plot.savefig(plot_png, format="png", dpi=300)
     else:
@@ -287,7 +302,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                 for topic in all_topics
             ]
         )
-        dataset_clear_name = dataset.replace("/", "-")
        interactive_plot = datamapplot.create_interactive_plot(
            reduced_embeddings_array,
            topic_names_array,
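datamapplot builds the interactive map from the UMAP-reduced 2-D embeddings plus one topic label per document, as the call above shows. A toy, self-contained version of that call with random data (the save-to-HTML step is an assumption about datamapplot's interactive figure API, not something this diff shows):

import numpy as np
import datamapplot

rng = np.random.default_rng(0)
reduced_embeddings_array = rng.normal(size=(500, 2))  # stand-in for UMAP-reduced embeddings
topic_names_array = np.array(["topic A", "topic B"])[rng.integers(0, 2, size=500)]

interactive_plot = datamapplot.create_interactive_plot(
    reduced_embeddings_array,
    topic_names_array,
)
interactive_plot.save("datamap.html")  # assumed API for writing the standalone HTML file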
@@ -308,7 +322,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         with open(html_file_path, "w", encoding="utf-8") as html_file:
             html_file.write(html_content)
 
-    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{
+    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
 
     space_id = create_space_with_content(
         api=api,
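create_space_with_content is an app helper that is not part of this diff; it pushes the exported HTML to a Hub Space under DATASETS_TOPICS_ORGANIZATION. A plausible sketch using only standard huggingface_hub calls (the helper's parameter names here are illustrative, not the app's real signature):

from huggingface_hub import HfApi

def create_space_with_content(api: HfApi, space_id: str, html_file_path: str) -> str:
    # A static Space serves index.html directly, which suits the exported datamap page.
    api.create_repo(repo_id=space_id, repo_type="space", space_sdk="static", exist_ok=True)
    api.upload_file(
        path_or_fileobj=html_file_path,
        path_in_repo="index.html",
        repo_id=space_id,
        repo_type="space",
    )
    return space_id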
@@ -364,9 +378,6 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         text_column_dropdown = gr.Dropdown(label="Text column name")
-        nested_text_column_dropdown = gr.Dropdown(
-            label="Nested text column name", visible=False
-        )
         plot_type_radio = gr.Radio(
             ["DataMapPlot", "Plotly"],
             value="DataMapPlot",
@@ -388,7 +399,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
             plot_type_radio,
         ],
         outputs=[
@@ -408,7 +418,6 @@ with gr.Blocks() as demo:
             subset_dropdown: gr.Dropdown(visible=False),
             split_dropdown: gr.Dropdown(visible=False),
             text_column_dropdown: gr.Dropdown(label="Text column name"),
-            nested_text_column_dropdown: gr.Dropdown(visible=False),
         }
         try:
             info_resp = get_info(dataset)
@@ -417,7 +426,6 @@ with gr.Blocks() as demo:
             subset_dropdown: gr.Dropdown(visible=False),
             split_dropdown: gr.Dropdown(visible=False),
             text_column_dropdown: gr.Dropdown(label="Text column name"),
-            nested_text_column_dropdown: gr.Dropdown(visible=False),
         }
         subsets: list[str] = list(info_resp)
         subset = default_subset if default_subset in subsets else subsets[0]
@@ -433,20 +441,6 @@ with gr.Blocks() as demo:
             for feature_name, feature in features.items()
             if _is_string_feature(feature)
         ]
-        nested_features = [
-            feature_name
-            for feature_name, feature in features.items()
-            if isinstance(feature, dict)
-            and isinstance(next(iter(feature.values())), dict)
-        ]
-        nested_text_features = [
-            feature_name
-            for feature_name in nested_features
-            if any(
-                _is_string_feature(nested_feature)
-                for nested_feature in features[feature_name].values()
-            )
-        ]
         if not text_feature:
             return {
                 subset_dropdown: gr.Dropdown(
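The removed nested_features and nested_text_features blocks were the only consumers of the nested-column dropdown, so column detection now keeps just the flat string features. _is_string_feature is defined elsewhere in app.py; a plausible sketch, assuming the datasets-server features format in which a string column looks like {"dtype": "string", "_type": "Value"}:

def _is_string_feature(feature) -> bool:
    # Hypothetical sketch of the helper referenced above, not taken from this diff.
    return isinstance(feature, dict) and feature.get("dtype") == "string"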
@@ -456,34 +450,9 @@ with gr.Blocks() as demo:
                     value=split, choices=splits, visible=len(splits) > 1
                 ),
                 text_column_dropdown: gr.Dropdown(
-                    choices=text_features
+                    choices=text_features,
                     label="Text column name",
                 ),
-                nested_text_column_dropdown: gr.Dropdown(visible=False),
-            }
-        if text_feature in nested_text_features:
-            nested_keys = [
-                feature_name
-                for feature_name, feature in features[text_feature].items()
-                if _is_string_feature(feature)
-            ]
-            return {
-                subset_dropdown: gr.Dropdown(
-                    value=subset, choices=subsets, visible=len(subsets) > 1
-                ),
-                split_dropdown: gr.Dropdown(
-                    value=split, choices=splits, visible=len(splits) > 1
-                ),
-                text_column_dropdown: gr.Dropdown(
-                    choices=text_features + nested_text_features,
-                    label="Text column name",
-                ),
-                nested_text_column_dropdown: gr.Dropdown(
-                    value=nested_keys[0],
-                    choices=nested_keys,
-                    label="Nested text column name",
-                    visible=True,
-                ),
             }
         return {
             subset_dropdown: gr.Dropdown(
@@ -493,9 +462,8 @@ with gr.Blocks() as demo:
                 value=split, choices=splits, visible=len(splits) > 1
             ),
             text_column_dropdown: gr.Dropdown(
-                choices=text_features
+                choices=text_features, label="Text column name"
             ),
-            nested_text_column_dropdown: gr.Dropdown(visible=False),
         }
 
     @dataset_name.change(
@@ -504,7 +472,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
     )
     def show_input_from_subset_dropdown(dataset: str) -> dict:
@@ -518,7 +485,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
     )
     def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
@@ -532,7 +498,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
     )
     def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
@@ -546,7 +511,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
    )
    def show_input_from_text_column_dropdown(
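All of the dropdown wiring above follows the same Gradio pattern: a change event on one component returns a dict that maps output components to updated replacements of themselves. A minimal, self-contained example of that pattern (the choices are hard-coded here; the app derives them from get_info):

import gradio as gr

with gr.Blocks() as demo:
    dataset_name = gr.Textbox(label="Hub dataset ID")
    text_column_dropdown = gr.Dropdown(label="Text column name")

    @dataset_name.change(inputs=[dataset_name], outputs=[text_column_dropdown])
    def update_text_columns(dataset: str) -> dict:
        # In app.py the choices come from the dataset's features; hard-coded for the sketch.
        return {
            text_column_dropdown: gr.Dropdown(
                choices=["text", "title"], label="Text column name"
            )
        }

demo.launch()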
|