app.py CHANGED
@@ -1,5 +1,4 @@
-
-# import spaces
+import spaces
 import gradio as gr
 
 import logging
@@ -16,13 +15,10 @@ from bertopic import BERTopic
 from bertopic.representation import KeyBERTInspired
 from bertopic.representation import TextGeneration
 
-# Temporary disabling because of ZeroGPU does not support cuml
 from cuml.manifold import UMAP
 from cuml.cluster import HDBSCAN
 
-
-# from hdbscan import HDBSCAN
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, SpaceCard
 from sklearn.feature_extraction.text import CountVectorizer
 from sentence_transformers import SentenceTransformer
 from prompts import REPRESENTATION_PROMPT
@@ -59,6 +55,7 @@ logging.basicConfig(
 MAX_ROWS = 50_000
 CHUNK_SIZE = 10_000
 
+api = HfApi(token=HF_TOKEN)
 
 session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -186,7 +183,6 @@ def _push_to_hub(
     logging.info(f"Pushing file to hub: {dataset_id} on file {file_path}")
 
     file_name = file_path.split("/")[-1]
-    api = HfApi(token=HF_TOKEN)
     try:
         logging.info(f"About to push {file_path} - {dataset_id}")
         api.upload_file(
@@ -200,6 +196,44 @@ def _push_to_hub(
         raise
 
 
+def create_space_with_content(dataset_id, html_file_path):
+    # TODO: Parameterize organization name
+    repo_id = f"datasets-topics/{dataset_id.replace('/', '-')}"
+    logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
+    api.create_repo(
+        repo_id=repo_id,
+        repo_type="space",
+        private=False,
+        exist_ok=True,
+        token=HF_TOKEN,
+        space_sdk="static",
+    )
+    SPACE_REPO_CARD_CONTENT = """
+---
+title: {dataset_id} topic modeling
+sdk: static
+pinned: false
+datasets:
+- {dataset_id}
+---
+
+"""
+
+    SpaceCard(
+        content=SPACE_REPO_CARD_CONTENT.format(dataset_id=dataset_id)
+    ).push_to_hub(repo_id=repo_id, repo_type="space", token=HF_TOKEN)
+
+    api.upload_file(
+        path_or_fileobj=html_file_path,
+        path_in_repo="index.html",
+        repo_type="space",
+        repo_id=repo_id,
+        token=HF_TOKEN,
+    )
+    logging.info(f"Space created done")
+    return repo_id
+
+
 def generate_topics(dataset, config, split, column, nested_column, plot_type):
     logging.info(
         f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
@@ -239,6 +273,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         gr.Plot(value=None, visible=True),
         gr.Label({message: rows_processed / limit}, visible=True),
         "",
+        "",
     )
     while offset < limit:
         docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
@@ -278,6 +313,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
                 title=dataset,
+                font_family="Montserrat Thin",
                 width=800,
                 height=700,
                 arrowprops={
@@ -286,6 +322,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                     "linewidth": 0,
                     "fc": "#33333377",
                 },
+                # TODO: Make it configurable in UI
                 dynamic_label_size=False,
                 # label_wrap_width=12,
                 # label_over_points=True,
@@ -299,6 +336,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                 reduced_embeddings=reduced_embeddings_array,
                 custom_labels=True,
                 title=dataset,
+                font_family="Montserrat Thin",
             )
         )
 
@@ -317,6 +355,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
             topic_plot,
             gr.Label({message: progress}, visible=True),
             "",
+            "",
         )
 
         offset += CHUNK_SIZE
@@ -330,20 +369,42 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
     topic_plot.write_image(plot_png)
 
     _push_to_hub(dataset, plot_png)
+
+    all_topics, _ = base_model.transform(all_docs)
+    topic_info = base_model.get_topic_info()
+
+    topic_names = {row["Topic"]: row["Name"] for index, row in topic_info.iterrows()}
+    topic_names_array = np.array(
+        [
+            topic_names.get(topic, "No Topic").split("_")[1].strip("-")
+            for topic in all_topics
+        ]
+    )
+    dataset_clear_name = dataset.replace("/", "-")
+    interactive_plot = datamapplot.create_interactive_plot(
+        reduced_embeddings_array,
+        topic_names_array,
+        hover_text=all_docs,
+        title=dataset,
+        enable_search=True,
+        font_family="Montserrat Thin",
+        # TODO: Export data to .arrow and also serve it
+        inline_data=True,
+        # offline_data_prefix=dataset_clear_name,
+        initial_zoom_fraction=0.9,
+    )
+    html_content = str(interactive_plot)
+    html_file_path = f"{dataset_clear_name}.html"
+    with open(html_file_path, "w", encoding="utf-8") as html_file:
+        html_file.write(html_content)
+
+    space_id = create_space_with_content(dataset, html_file_path)
+
     plot_png_link = (
         f"https://huggingface.co/datasets/{EXPORTS_REPOSITORY}/blob/main/{plot_png}"
     )
-
-
-    # *cord19_label_layers,
-    # font_family="Cinzel",
-    # enable_search=True,
-    # inline_data=False,
-    # offline_data_prefix="cord-large-1",
-    # initial_zoom_fraction=0.4,
-    # )
-    # all_topics, _ = base_model.transform(all_topics)
-    # logging.info(f"TAll opics: {all_topics[:5]}")
+
+    space_link = f"https://huggingface.co/spaces/{space_id}"
     yield (
         gr.Accordion(open=False),
         topics_info,
@@ -352,6 +413,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
            {f"✅ Done: {rows_processed} rows have been processed": 1.0}, visible=True
         ),
         f"[![Download as PNG](https://img.shields.io/badge/Download_as-PNG-red)]({plot_png_link})",
+        f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
     )
     cuda.empty_cache()
 
@@ -400,7 +462,9 @@ with gr.Blocks() as demo:
 
         gr.Markdown("## Data map")
         full_topics_generation_label = gr.Label(visible=False, show_label=False)
-
+        with gr.Row():
+            open_png_label = gr.Markdown()
+            open_space_label = gr.Markdown()
         topics_plot = gr.Plot()
         with gr.Accordion("Topics Info", open=False):
            topics_df = gr.DataFrame(interactive=False, visible=True)
@@ -420,6 +484,7 @@ with gr.Blocks() as demo:
            topics_plot,
            full_topics_generation_label,
            open_png_label,
+           open_space_label,
        ],
    )
 
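For context, the new create_space_with_content helper boils down to creating a static Space and uploading the datamapplot HTML as its index.html. The snippet below is a minimal standalone sketch of that flow, not the app's exact code: the Space id "my-org/my-dataset-topics", the file "my-dataset.html", and the HF_TOKEN environment variable are placeholder assumptions.

import os

from huggingface_hub import HfApi

# Hypothetical names; substitute your own Space id and HTML file.
repo_id = "my-org/my-dataset-topics"
html_file = "my-dataset.html"

api = HfApi(token=os.environ["HF_TOKEN"])

# A static Space serves the repository files directly; index.html is the entry point.
api.create_repo(repo_id=repo_id, repo_type="space", space_sdk="static", exist_ok=True)
api.upload_file(
    path_or_fileobj=html_file,
    path_in_repo="index.html",
    repo_id=repo_id,
    repo_type="space",
)
print(f"Interactive plot served at https://huggingface.co/spaces/{repo_id}")

Because the Space is static, no server-side code runs: the uploaded HTML (with the plot data inlined via inline_data=True) is served as-is at that URL.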