Spaces:

datasets-topics
/

topics-generator

Sleeping

App Files Files Community

asoria HF staff committed on Oct 15, 2024

Commit

119b257

1 Parent(s): dfa9cba

Export PNG from plot

Browse files

Files changed (2) hide show

app.py +62 -49
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from bertopic import BERTopic
 from bertopic.representation import KeyBERTInspired
 from cuml.manifold import UMAP
 from cuml.cluster import HDBSCAN
 from sklearn.feature_extraction.text import CountVectorizer
 from sentence_transformers import SentenceTransformer
@@ -25,12 +25,10 @@ import gradio as gr
 """
 TODOs:
-- Improve DataMapPlot plot arguments
-- Add export button for final plot
-- Export and serve an interactive HTML plot?
 - Try with more rows
 - Add TextGenerationLayer
 - Make it run on Zero GPU
 """
@@ -38,6 +36,12 @@ load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
@@ -145,6 +149,27 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
     logging.info("Global model updated")
 def generate_topics(dataset, config, split, column, nested_column, plot_type):
     logging.info(
         f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
@@ -159,7 +184,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
     reduce_umap_model = UMAP(
         n_neighbors=n_neighbors,
-        n_components=2,  # For visualization, keeping it at 2 (2D)
         min_dist=0.0,
         metric="cosine",
         random_state=42,
@@ -183,6 +208,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         gr.DataFrame(value=[], interactive=False, visible=True),
         gr.Plot(value=None, visible=True),
         gr.Label({message: rows_processed / limit}, visible=True),
     )
     while offset < limit:
         docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
@@ -216,59 +242,32 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         topics_info = base_model.get_topic_info()
         all_topics, _ = base_model.transform(all_docs)
         all_topics = np.array(all_topics)
-        # topic_plot, _ = datamapplot.create_plot(
-        #     data_map_coords=reduced_embeddings_array,
-        #     labels=all_topics.astype(str),
-        #     use_medoids=True,
-        #     figsize=(12, 12),
-        #     dpi=100,
-        #     title="PubMed - Literature review",
-        #     sub_title="A data map of papers representing artificial intelligence and machine learning in ophthalmology",
-        #     title_keywords={"fontsize": 36, "fontfamily": "Roboto Black"},
-        #     sub_title_keywords={
-        #         "fontsize": 18,
-        #     },
-        #     highlight_label_keywords={
-        #         "fontsize": 12,
-        #         "fontweight": "bold",
-        #         "bbox": {"boxstyle": "round"},
-        #     },
-        #     label_font_size=8,
-        #     label_wrap_width=16,
-        #     label_linespacing=1.25,
-        #     label_direction_bias=1.3,
-        #     label_margin_factor=2.0,
-        #     label_base_radius=15.0,
-        #     point_size=4,
-        #     marker_type="o",
-        #     arrowprops={
-        #         "arrowstyle": "wedge,tail_width=0.5",
-        #         "connectionstyle": "arc3,rad=0.05",
-        #         "linewidth": 0,
-        #         "fc": "#33333377",
-        #     },
-        #     add_glow=True,
-        #     glow_keywords={
-        #         "kernel_bandwidth": 0.75,  # controls how wide the glow spreads.
-        #         "kernel": "cosine",  # controls the kernel type. Default is "gaussian". See https://scikit-learn.org/stable/modules/density.html#kernel-density.
-        #         "n_levels": 32,  # controls how many "levels" there are in the contour plot.
-        #         "max_alpha": 0.9,  # controls the translucency of the glow.
-        #     },
-        #     darkmode=False,
-        # )
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
-                title=f"<b>{dataset}</b>",
             )
             if plot_type == "DataMapPlot"
             else base_model.visualize_documents(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
                 custom_labels=True,
-                title=f"<b>{dataset}</b>",
             )
         )
@@ -286,12 +285,23 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
             topics_info,
             topic_plot,
             gr.Label({message: progress}, visible=True),
         )
         offset += CHUNK_SIZE
     logging.info("Finished processing all data")
     yield (
         gr.Accordion(open=False),
         topics_info,
@@ -299,6 +309,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         gr.Label(
             {f"✅ Done: {rows_processed} rows have been processed": 1.0}, visible=True
         ),
     )
     cuda.empty_cache()
@@ -339,7 +350,7 @@ with gr.Blocks() as demo:
             )
             plot_type_radio = gr.Radio(
                 ["DataMapPlot", "Plotly"],
-                value="Plotly",
                 label="Choose the plot type",
                 interactive=True,
             )
@@ -347,6 +358,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## Data map")
     full_topics_generation_label = gr.Label(visible=False, show_label=False)
     topics_plot = gr.Plot()
     with gr.Accordion("Topics Info", open=False):
         topics_df = gr.DataFrame(interactive=False, visible=True)
@@ -365,6 +377,7 @@ with gr.Blocks() as demo:
             topics_df,
             topics_plot,
             full_topics_generation_label,
         ],
     )

 from bertopic.representation import KeyBERTInspired
 from cuml.manifold import UMAP
 from cuml.cluster import HDBSCAN
+from huggingface_hub import HfApi
 from sklearn.feature_extraction.text import CountVectorizer
 from sentence_transformers import SentenceTransformer
 """
 TODOs:
 - Try with more rows
 - Add TextGenerationLayer
+- Export and serve an interactive HTML plot?
 - Make it run on Zero GPU
 """
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
+EXPORTS_REPOSITORY = os.getenv("EXPORTS_REPOSITORY")
+assert (
+    EXPORTS_REPOSITORY is not None
+), "You need to set EXPORTS_REPOSITORY in your environment variables"
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
     logging.info("Global model updated")
+def _push_to_hub(
+    dataset_id,
+    file_path,
+):
+    logging.info(f"Pushing file to hub: {dataset_id} on file {file_path}")
+    file_name = file_path.split("/")[-1]
+    api = HfApi(token=HF_TOKEN)
+    try:
+        logging.info(f"About to push {file_path} - {dataset_id}")
+        api.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=file_name,
+            repo_id=EXPORTS_REPOSITORY,
+            repo_type="dataset",
+        )
+    except Exception as e:
+        logging.info("Failed to push file", e)
+        raise
 def generate_topics(dataset, config, split, column, nested_column, plot_type):
     logging.info(
         f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
     reduce_umap_model = UMAP(
         n_neighbors=n_neighbors,
+        n_components=2,  # For visualization, keeping it for 2D
         min_dist=0.0,
         metric="cosine",
         random_state=42,
         gr.DataFrame(value=[], interactive=False, visible=True),
         gr.Plot(value=None, visible=True),
         gr.Label({message: rows_processed / limit}, visible=True),
+        "",
     )
     while offset < limit:
         docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
         topics_info = base_model.get_topic_info()
         all_topics, _ = base_model.transform(all_docs)
         all_topics = np.array(all_topics)
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
+                title=dataset,
+                width=800,
+                height=700,
+                # arrowprops={
+                #     "arrowstyle": "wedge,tail_width=0.5",
+                #     "connectionstyle": "arc3,rad=0.05",
+                #     "linewidth": 0,
+                #     "fc": "#33333377",
+                # },
+                label_wrap_width=12,
+                label_over_points=True,
+                dynamic_label_size=True,
+                max_font_size=36,
+                min_font_size=4,
             )
             if plot_type == "DataMapPlot"
             else base_model.visualize_documents(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
                 custom_labels=True,
+                title=dataset,
             )
         )
             topics_info,
             topic_plot,
             gr.Label({message: progress}, visible=True),
+            "",
         )
         offset += CHUNK_SIZE
     logging.info("Finished processing all data")
+    plot_png = f"{dataset.replace('/', '-')}-{plot_type.lower()}.png"
+    if plot_type == "DataMapPlot":
+        topic_plot.savefig(plot_png, format="png", dpi=300)
+    else:
+        topic_plot.write_image(plot_png)
+    _push_to_hub(dataset, plot_png)
+    plot_png_link = (
+        f"https://huggingface.co/datasets/{EXPORTS_REPOSITORY}/blob/main/{plot_png}"
+    )
     yield (
         gr.Accordion(open=False),
         topics_info,
         gr.Label(
             {f"✅ Done: {rows_processed} rows have been processed": 1.0}, visible=True
         ),
+        f"[![Download as PNG](https://img.shields.io/badge/Download_as-PNG-red)]({plot_png_link})",
     )
     cuda.empty_cache()
             )
             plot_type_radio = gr.Radio(
                 ["DataMapPlot", "Plotly"],
+                value="DataMapPlot",
                 label="Choose the plot type",
                 interactive=True,
             )
     gr.Markdown("## Data map")
     full_topics_generation_label = gr.Label(visible=False, show_label=False)
+    open_png_label = gr.Markdown()
     topics_plot = gr.Plot()
     with gr.Accordion("Topics Info", open=False):
         topics_df = gr.DataFrame(interactive=False, visible=True)
             topics_df,
             topics_plot,
             full_topics_generation_label,
+            open_png_label,
         ],
     )

requirements.txt CHANGED Viewed

@@ -12,3 +12,4 @@ pandas
 torch
 numpy
 python-dotenv

 torch
 numpy
 python-dotenv
+kaleido