Spaces:

datasets-topics
/

topics-generator

Runtime error

App Files Community

asoria committed on Oct 21, 2024

Commit

1f396c3

1 Parent(s): 937966f

Adding subtitle

Browse files

Files changed (1) hide show

app.py +17 -7

app.py CHANGED Viewed

@@ -46,7 +46,7 @@ assert (
     EXPORTS_REPOSITORY is not None
 ), "You need to set EXPORTS_REPOSITORY in your environment variables"
-MAX_ROWS = int(os.getenv("MAX_ROWS", "10_000"))
 CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2_000"))
 DATASET_VIEWE_API_URL = "https://datasets-server.huggingface.co/"
 DATASETS_TOPICS_ORGANIZATION = os.getenv(
@@ -311,11 +311,18 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         all_topics, _ = base_model.transform(all_docs)
         all_topics = np.array(all_topics)
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
                 title=dataset,
                 width=800,
                 height=700,
                 arrowprops={
@@ -324,12 +331,11 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                     "linewidth": 0,
                     "fc": "#33333377",
                 },
-                dynamic_label_size=USE_ARROW_STYLE,
-                # label_wrap_width=12,
-                # label_over_points=True,
-                # dynamic_label_size=True,
-                # max_font_size=36,
-                # min_font_size=4,
             )
             if plot_type == "DataMapPlot"
             else base_model.visualize_documents(
@@ -386,6 +392,10 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         topic_names_array,
         hover_text=all_docs,
         title=dataset,
         enable_search=True,
         # TODO: Export data to .arrow and also serve it
         inline_data=True,

     EXPORTS_REPOSITORY is not None
 ), "You need to set EXPORTS_REPOSITORY in your environment variables"
+MAX_ROWS = int(os.getenv("MAX_ROWS", "8_000"))
 CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2_000"))
 DATASET_VIEWE_API_URL = "https://datasets-server.huggingface.co/"
 DATASETS_TOPICS_ORGANIZATION = os.getenv(
         all_topics, _ = base_model.transform(all_docs)
         all_topics = np.array(all_topics)
+        sub_title = (
+            f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
+            if full_processing
+            else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
+        )
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
                 title=dataset,
+                sub_title=sub_title,
                 width=800,
                 height=700,
                 arrowprops={
                     "linewidth": 0,
                     "fc": "#33333377",
                 },
+                dynamic_label_size=True,
+                label_wrap_width=12,
+                label_over_points=True,
+                max_font_size=36,
+                min_font_size=4,
             )
             if plot_type == "DataMapPlot"
             else base_model.visualize_documents(
         topic_names_array,
         hover_text=all_docs,
         title=dataset,
+        sub_title=sub_title.replace(
+            "dataset",
+            f"<a href='https://huggingface.co/datasets/{dataset}/viewer/{config}/{split}' target='_blank'>dataset</a>",
+        ),
         enable_search=True,
         # TODO: Export data to .arrow and also serve it
         inline_data=True,