Spaces:

datasets-topics
/

topics-generator

Runtime error

App Files Files Community

asoria committed on Oct 29, 2024

Commit

2b40426

verified ·

1 Parent(s): 24bed82

Try to fix plot

Browse files

Files changed (1) hide show

app.py +208 -216

app.py CHANGED Viewed

@@ -37,6 +37,7 @@ DATASETS_TOPICS_ORGANIZATION = os.getenv(
     "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
 )
 USE_CUML = int(os.getenv("USE_CUML", "1"))
 # Use cuml lib only if configured
 if USE_CUML:
@@ -52,17 +53,19 @@ logging.basicConfig(
 )
 api = HfApi(token=HF_TOKEN)
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
-vectorizer_model = CountVectorizer(stop_words="english")
 representation_model = KeyBERTInspired()
 inference_client = InferenceClient(model_id)
 def calculate_embeddings(docs):
-    return embedding_model.encode(docs, show_progress_bar=True, batch_size=32)
 def calculate_n_neighbors_and_components(n_rows):
@@ -92,7 +95,7 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
     new_model = BERTopic(
         language="english",
         # Sub-models
-        embedding_model=embedding_model,  # Step 1 - Extract embeddings
         umap_model=umap_model,  # Step 2 - UMAP model
         hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
         vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
@@ -166,146 +169,44 @@ def generate_topics(dataset, config, split, column, plot_type):
         "",
     )
-    try:
-        while offset < limit:
-            logging.info(f"----> Getting records from {offset=} with {CHUNK_SIZE=}")
-            docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
-            if not docs:
-                break
-            logging.info(f"Got {len(docs)} docs  ✓")
-            embeddings = calculate_embeddings(docs)
-            new_model = fit_model(docs, embeddings, n_neighbors, n_components)
-            if base_model is None:
-                base_model = new_model
-                logging.info(
-                    f"The following topics are newly found: {base_model.topic_labels_}"
-                )
-            else:
-                updated_model = BERTopic.merge_models([base_model, new_model])
-                nr_new_topics = len(set(updated_model.topics_)) - len(
-                    set(base_model.topics_)
-                )
-                new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
-                logging.info(f"The following topics are newly found: {new_topics}")
-                base_model = updated_model
-            logging.info("Reducing embeddings to 2D")
-            reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
-            reduced_embeddings_list.append(reduced_embeddings)
-            all_docs.extend(docs)
-            reduced_embeddings_array = np.vstack(reduced_embeddings_list)
-            logging.info("Reducing embeddings to 2D ✓")
-            topics_info = base_model.get_topic_info()
-            all_topics = base_model.topics_
-            logging.info(f"Preparing topics {plot_type} plot")
-            topic_plot = (
-                base_model.visualize_document_datamap(
-                    docs=all_docs,
-                    topics=all_topics,
-                    reduced_embeddings=reduced_embeddings_array,
-                    title="",
-                    sub_title=sub_title,
-                    width=800,
-                    height=700,
-                    arrowprops={
-                        "arrowstyle": "wedge,tail_width=0.5",
-                        "connectionstyle": "arc3,rad=0.05",
-                        "linewidth": 0,
-                        "fc": "#33333377",
-                    },
-                    dynamic_label_size=True,
-                    # label_wrap_width=12,
-                    label_over_points=True,
-                    max_font_size=36,
-                    min_font_size=4,
-                )
-                if plot_type == "DataMapPlot"
-                else base_model.visualize_documents(
-                    docs=all_docs,
-                    topics=all_topics,
-                    reduced_embeddings=reduced_embeddings_array,
-                    custom_labels=True,
-                    title="",
-                )
-            )
-            logging.info("Plot done ✓")
-            rows_processed += len(docs)
-            progress = min(rows_processed / limit, 1.0)
-            logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
-            message = (
-                f"Processing topics for full dataset: {rows_processed} of {limit}"
-                if full_processing
-                else f"Processing topics for partial dataset: {rows_processed} of {limit} rows"
             )
-            yield (
-                gr.Accordion(open=False),
-                topics_info,
-                topic_plot,
-                gr.Label({"⏳ " + message: progress}, visible=True),
-                "",
             )
-            offset += CHUNK_SIZE
-            del docs, embeddings, new_model, reduced_embeddings
-        logging.info("Finished processing topic modeling data")
-        yield (
-            gr.Accordion(open=False),
-            topics_info,
-            topic_plot,
-            gr.Label(
-                {
-                    "✅ " + message: 1.0,
-                    f"⏳ Generating topic names with {model_id}": 0.0,
-                },
-                visible=True,
-            ),
-            "",
-        )
-        all_topics = base_model.topics_
-        topics_info = base_model.get_topic_info()
-        new_topics_by_text_generation = {}
-        for _, row in topics_info.iterrows():
-            logging.info(
-                f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
-            )
-            prompt = f"{LLAMA_3_8B_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
-            prompt_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful, respectful and honest assistant for labeling topics.",
-                },
-                {"role": "user", "content": prompt},
-            ]
-            output = inference_client.chat_completion(
-                messages=prompt_messages,
-                stream=False,
-                max_tokens=500,
-                top_p=0.8,
-                seed=42,
-            )
-            inference_response = output.choices[0].message.content
-            logging.info("Inference response:")
-            logging.info(inference_response)
-            new_topics_by_text_generation[row["Topic"]] = inference_response.replace(
-                "Topic=", ""
-            ).strip()
-        base_model.set_topic_labels(new_topics_by_text_generation)
         topics_info = base_model.get_topic_info()
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 topics=all_topics,
-                custom_labels=True,
                 reduced_embeddings=reduced_embeddings_array,
                 title="",
                 sub_title=sub_title,
@@ -326,100 +227,191 @@ def generate_topics(dataset, config, split, column, plot_type):
             if plot_type == "DataMapPlot"
             else base_model.visualize_documents(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
-                custom_labels=True,
                 title="",
             )
         )
-        dataset_clear_name = dataset.replace("/", "-")
-        plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
-        if plot_type == "DataMapPlot":
-            topic_plot.savefig(plot_png, format="png", dpi=300)
-        else:
-            topic_plot.write_image(plot_png)
-        custom_labels = base_model.custom_labels_
-        topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
         yield (
             gr.Accordion(open=False),
             topics_info,
             topic_plot,
-            gr.Label(
-                {
-                    "✅ " + message: 1.0,
-                    f"✅ Generating topic names with {model_id}": 1.0,
-                    "⏳ Creating Interactive Space": 0.0,
-                },
-                visible=True,
-            ),
             "",
         )
-        interactive_plot = datamapplot.create_interactive_plot(
-            reduced_embeddings_array,
-            topic_names_array,
-            hover_text=all_docs,
-            title=dataset,
-            sub_title=sub_title.replace(
-                "dataset",
-                f"<a href='https://huggingface.co/datasets/{dataset}/viewer/{config}/{split}' target='_blank'>dataset</a>",
-            ),
-            enable_search=True,
-            # TODO: Export data to .arrow and also serve it
-            inline_data=True,
-            # offline_data_prefix=dataset_clear_name,
-            initial_zoom_fraction=0.8,
-        )
-        html_content = str(interactive_plot)
-        html_file_path = f"{dataset_clear_name}.html"
-        with open(html_file_path, "w", encoding="utf-8") as html_file:
-            html_file.write(html_content)
-        repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
-        space_id = create_space_with_content(
-            api=api,
-            repo_id=repo_id,
-            dataset_id=dataset,
-            html_file_path=html_file_path,
-            plot_file_path=plot_png,
-            space_card=SPACE_REPO_CARD_CONTENT,
-            token=HF_TOKEN,
-        )
-        space_link = f"https://huggingface.co/spaces/{space_id}"
-        yield (
-            gr.Accordion(open=False),
-            topics_info,
-            topic_plot,
-            gr.Label(
-                {
-                    "✅ " + message: 1.0,
-                    f"✅ Generating topic names with {model_id}": 1.0,
-                    "✅ Creating Interactive Space": 1.0,
-                },
-                visible=True,
-            ),
-            f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
         )
-        del reduce_umap_model, all_docs, reduced_embeddings_list
-        del (
-            base_model,
-            all_topics,
-            topics_info,
-            topic_names_array,
-            interactive_plot,
         )
-        cuda.empty_cache()
-    except Exception as error:
-        return (
-            gr.Accordion(open=True),
-            gr.DataFrame(value=[], interactive=False, visible=True),
-            gr.Plot(value=None, visible=True),
-            gr.Label({f"❌ Error: {error}": 0.0}, visible=True),
-            "",
         )
 with gr.Blocks() as demo:
@@ -468,11 +460,11 @@ with gr.Blocks() as demo:
         generate_button = gr.Button("Generate Topics", variant="primary")
     gr.Markdown("## Data map")
-    progress_label = gr.Label(visible=False, show_label=False)
     open_space_label = gr.Markdown()
     topics_plot = gr.Plot()
-    # with gr.Accordion("Topics Info", open=False):
-    topics_df = gr.DataFrame(interactive=False, visible=True)
     gr.HTML(
         f"<p style='text-align: center; color:orange;'>⚠ This space processes datasets in batches of <b>{CHUNK_SIZE}</b>, with a maximum of <b>{MAX_ROWS}</b> rows. If you need further assistance, please open a new issue in the Community tab.</p>"
     )
@@ -494,7 +486,7 @@ with gr.Blocks() as demo:
             data_details_accordion,
             topics_df,
             topics_plot,
-            progress_label,
             open_space_label,
         ],
     )

     "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
 )
 USE_CUML = int(os.getenv("USE_CUML", "1"))
+USE_LLM_TEXT_GENERATION = int(os.getenv("USE_LLM_TEXT_GENERATION", "1"))
 # Use cuml lib only if configured
 if USE_CUML:
 )
 api = HfApi(token=HF_TOKEN)
+sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+# Representation model
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 representation_model = KeyBERTInspired()
+vectorizer_model = CountVectorizer(stop_words="english")
 inference_client = InferenceClient(model_id)
 def calculate_embeddings(docs):
+    return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
 def calculate_n_neighbors_and_components(n_rows):
     new_model = BERTopic(
         language="english",
         # Sub-models
+        embedding_model=sentence_model,  # Step 1 - Extract embeddings
         umap_model=umap_model,  # Step 2 - UMAP model
         hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
         vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
         "",
     )
+    while offset < limit:
+        logging.info(f"----> Getting records from {offset=} with {CHUNK_SIZE=}")
+        docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
+        if not docs:
+            break
+        logging.info(f"Got {len(docs)} docs  ✓")
+        embeddings = calculate_embeddings(docs)
+        new_model = fit_model(docs, embeddings, n_neighbors, n_components)
+        if base_model is None:
+            base_model = new_model
+            logging.info(
+                f"The following topics are newly found: {base_model.topic_labels_}"
             )
+        else:
+            updated_model = BERTopic.merge_models([base_model, new_model])
+            nr_new_topics = len(set(updated_model.topics_)) - len(
+                set(base_model.topics_)
             )
+            new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
+            logging.info(f"The following topics are newly found: {new_topics}")
+            base_model = updated_model
+        logging.info("Reducing embeddings to 2D")
+        reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
+        reduced_embeddings_list.append(reduced_embeddings)
+        logging.info("Reducing embeddings to 2D ✓")
+        all_docs.extend(docs)
+        reduced_embeddings_array = np.vstack(reduced_embeddings_list)
         topics_info = base_model.get_topic_info()
+        all_topics = base_model.topics_
+        logging.info(f"Preparing topics {plot_type} plot")
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 topics=all_topics,
                 reduced_embeddings=reduced_embeddings_array,
                 title="",
                 sub_title=sub_title,
             if plot_type == "DataMapPlot"
             else base_model.visualize_documents(
                 docs=all_docs,
+                topics=all_topics,
                 reduced_embeddings=reduced_embeddings_array,
                 title="",
             )
         )
+        logging.info("Plot done ✓")
+        rows_processed += len(docs)
+        progress = min(rows_processed / limit, 1.0)
+        logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
+        message = (
+            f"Processing topics for full dataset: {rows_processed} of {limit}"
+            if full_processing
+            else f"Processing topics for partial dataset: {rows_processed} of {limit} rows"
+        )
         yield (
             gr.Accordion(open=False),
             topics_info,
             topic_plot,
+            gr.Label({"⏳ " + message: progress}, visible=True),
             "",
         )
+        offset += CHUNK_SIZE
+        del docs, embeddings, new_model, reduced_embeddings
+    logging.info("Finished processing all data")
+    yield (
+        gr.Accordion(open=False),
+        topics_info,
+        topic_plot,
+        gr.Label(
+            {
+                "✅ " + message: 1.0,
+                f"⏳ Generating topic names with {model_id}": 0.0,
+            },
+            visible=True,
+        ),
+        "",
+    )
+    all_topics = base_model.topics_
+    topics_info = base_model.get_topic_info()
+    new_topics_by_text_generation = {}
+    for _, row in topics_info.iterrows():
+        logging.info(
+            f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
         )
+        prompt = f"{LLAMA_3_8B_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
+        prompt_messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful, respectful and honest assistant for labeling topics.",
+            },
+            {"role": "user", "content": prompt},
+        ]
+        output = inference_client.chat_completion(
+            messages=prompt_messages,
+            stream=False,
+            max_tokens=500,
+            top_p=0.8,
+            seed=42,
         )
+        inference_response = output.choices[0].message.content
+        logging.info("Inference response:")
+        logging.info(inference_response)
+        new_topics_by_text_generation[row["Topic"]] = inference_response.replace(
+            "Topic=", ""
+        ).strip()
+    base_model.set_topic_labels(new_topics_by_text_generation)
+    topics_info = base_model.get_topic_info()
+    topic_plot = (
+        base_model.visualize_document_datamap(
+            docs=all_docs,
+            topics=all_topics,
+            custom_labels=True,
+            reduced_embeddings=reduced_embeddings_array,
+            title="",
+            sub_title=sub_title,
+            width=800,
+            height=700,
+            arrowprops={
+                "arrowstyle": "wedge,tail_width=0.5",
+                "connectionstyle": "arc3,rad=0.05",
+                "linewidth": 0,
+                "fc": "#33333377",
+            },
+            dynamic_label_size=True,
+            # label_wrap_width=12,
+            label_over_points=True,
+            max_font_size=36,
+            min_font_size=4,
         )
+        if plot_type == "DataMapPlot"
+        else base_model.visualize_documents(
+            docs=all_docs,
+            reduced_embeddings=reduced_embeddings_array,
+            custom_labels=True,
+            title="",
+        )
+    )
+    dataset_clear_name = dataset.replace("/", "-")
+    plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
+    if plot_type == "DataMapPlot":
+        topic_plot.savefig(plot_png, format="png", dpi=300)
+    else:
+        topic_plot.write_image(plot_png)
+    custom_labels = base_model.custom_labels_
+    topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
+    yield (
+        gr.Accordion(open=False),
+        topics_info,
+        topic_plot,
+        gr.Label(
+            {
+                "✅ " + message: 1.0,
+                f"✅ Generating topic names with {model_id}": 1.0,
+                "⏳ Creating Interactive Space": 0.0,
+            },
+            visible=True,
+        ),
+        "",
+    )
+    interactive_plot = datamapplot.create_interactive_plot(
+        reduced_embeddings_array,
+        topic_names_array,
+        hover_text=all_docs,
+        title=dataset,
+        sub_title=sub_title.replace(
+            "dataset",
+            f"<a href='https://huggingface.co/datasets/{dataset}/viewer/{config}/{split}' target='_blank'>dataset</a>",
+        ),
+        enable_search=True,
+        # TODO: Export data to .arrow and also serve it
+        inline_data=True,
+        # offline_data_prefix=dataset_clear_name,
+        initial_zoom_fraction=0.8,
+    )
+    html_content = str(interactive_plot)
+    html_file_path = f"{dataset_clear_name}.html"
+    with open(html_file_path, "w", encoding="utf-8") as html_file:
+        html_file.write(html_content)
+    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
+    space_id = create_space_with_content(
+        api=api,
+        repo_id=repo_id,
+        dataset_id=dataset,
+        html_file_path=html_file_path,
+        plot_file_path=plot_png,
+        space_card=SPACE_REPO_CARD_CONTENT,
+        token=HF_TOKEN,
+    )
+    space_link = f"https://huggingface.co/spaces/{space_id}"
+    yield (
+        gr.Accordion(open=False),
+        topics_info,
+        topic_plot,
+        gr.Label(
+            {
+                "✅ " + message: 1.0,
+                f"✅ Generating topic names with {model_id}": 1.0,
+                "✅ Creating Interactive Space": 1.0,
+            },
+            visible=True,
+        ),
+        f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
+    )
+    del reduce_umap_model, all_docs, reduced_embeddings_list
+    del (
+        base_model,
+        all_topics,
+        topics_info,
+        topic_plot,
+        topic_names_array,
+        interactive_plot,
+    )
+    cuda.empty_cache()
 with gr.Blocks() as demo:
         generate_button = gr.Button("Generate Topics", variant="primary")
     gr.Markdown("## Data map")
+    full_topics_generation_label = gr.Label(visible=False, show_label=False)
     open_space_label = gr.Markdown()
     topics_plot = gr.Plot()
+    with gr.Accordion("Topics Info", open=False):
+        topics_df = gr.DataFrame(interactive=False, visible=True)
     gr.HTML(
         f"<p style='text-align: center; color:orange;'>⚠ This space processes datasets in batches of <b>{CHUNK_SIZE}</b>, with a maximum of <b>{MAX_ROWS}</b> rows. If you need further assistance, please open a new issue in the Community tab.</p>"
     )
             data_details_accordion,
             topics_df,
             topics_plot,
+            full_topics_generation_label,
             open_space_label,
         ],
     )