Spaces:

datasets-topics
/

topics-generator

Sleeping

App Files Files Community

asoria HF staff committed on Oct 18, 2024

Commit

95d1f22

1 Parent(s): cefd61d

Parameterize behavior

Browse files

Files changed (3) hide show

app.py +54 -56
requirements.txt +2 -2
prompts.py → templates.py +11 -0

app.py CHANGED Viewed

@@ -19,7 +19,7 @@ from bertopic.representation import TextGeneration
 from huggingface_hub import HfApi, SpaceCard
 from sklearn.feature_extraction.text import CountVectorizer
 from sentence_transformers import SentenceTransformer
-from prompts import REPRESENTATION_PROMPT
 from torch import cuda, bfloat16
 from transformers import (
     BitsAndBytesConfig,
@@ -27,11 +27,6 @@ from transformers import (
     AutoModelForCausalLM,
     pipeline,
 )
-# from cuml.manifold import UMAP
-# from cuml.cluster import HDBSCAN
-from umap import UMAP
-from hdbscan import HDBSCAN
 """
 TODOs:
@@ -51,52 +46,68 @@ assert (
     EXPORTS_REPOSITORY is not None
 ), "You need to set EXPORTS_REPOSITORY in your environment variables"
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
-MAX_ROWS = 50_000
-CHUNK_SIZE = 10_000
 api = HfApi(token=HF_TOKEN)
 session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Representation model
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=bfloat16,
-)
-model_id = "meta-llama/Llama-2-7b-chat-hf"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    trust_remote_code=True,
-    quantization_config=bnb_config,
-    device_map="auto",
-)
-model.eval()
-generator = pipeline(
-    model=model,
-    tokenizer=tokenizer,
-    task="text-generation",
-    temperature=0.1,
-    max_new_tokens=500,
-    repetition_penalty=1.1,
-)
-representation_model = TextGeneration(generator, prompt=REPRESENTATION_PROMPT)
-# End of representation model
 vectorizer_model = CountVectorizer(stop_words="english")
 def get_split_rows(dataset, config, split):
     config_size = session.get(
-        f"https://datasets-server.huggingface.co/size?dataset={dataset}&config={config}",
         timeout=20,
     ).json()
     if "error" in config_size:
@@ -112,7 +123,7 @@ def get_split_rows(dataset, config, split):
 def get_parquet_urls(dataset, config, split):
     parquet_files = session.get(
-        f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
         timeout=20,
     ).json()
     if "error" in parquet_files:
@@ -125,7 +136,6 @@ def get_parquet_urls(dataset, config, split):
 def get_docs_from_parquet(parquet_urls, column, offset, limit):
     SQL_QUERY = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
     df = duckdb.sql(SQL_QUERY).to_df()
-    logging.debug(f"Dataframe: {df.head(5)}")
     return df[column].tolist()
@@ -200,8 +210,7 @@ def _push_to_hub(
 def create_space_with_content(dataset_id, html_file_path):
-    # TODO: Parameterize organization name
-    repo_id = f"datasets-topics/{dataset_id.replace('/', '-')}"
     logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
     api.create_repo(
         repo_id=repo_id,
@@ -211,16 +220,6 @@ def create_space_with_content(dataset_id, html_file_path):
         token=HF_TOKEN,
         space_sdk="static",
     )
-    SPACE_REPO_CARD_CONTENT = """
----
-title: {dataset_id} topic modeling
-sdk: static
-pinned: false
-datasets:
-- {dataset_id}
----
-"""
     SpaceCard(
         content=SPACE_REPO_CARD_CONTENT.format(dataset_id=dataset_id)
@@ -233,14 +232,14 @@ datasets:
         repo_id=repo_id,
         token=HF_TOKEN,
     )
-    logging.info(f"Space created done")
     return repo_id
 @spaces.GPU(duration=120)
 def generate_topics(dataset, config, split, column, nested_column, plot_type):
     logging.info(
-        f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
     )
     parquet_urls = get_parquet_urls(dataset, config, split)
@@ -326,8 +325,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                     "linewidth": 0,
                     "fc": "#33333377",
                 },
-                # TODO: Make it configurable in UI
-                dynamic_label_size=False,
                 # label_wrap_width=12,
                 # label_over_points=True,
                 # dynamic_label_size=True,
@@ -395,7 +393,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         # TODO: Export data to .arrow and also serve it
         inline_data=True,
         # offline_data_prefix=dataset_clear_name,
-        initial_zoom_fraction=0.9,
     )
     html_content = str(interactive_plot)
     html_file_path = f"{dataset_clear_name}.html"
@@ -503,7 +501,7 @@ with gr.Blocks() as demo:
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
         info_resp = session.get(
-            f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=20
         ).json()
         if "error" in info_resp:
             return {

 from huggingface_hub import HfApi, SpaceCard
 from sklearn.feature_extraction.text import CountVectorizer
 from sentence_transformers import SentenceTransformer
+from templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
 from torch import cuda, bfloat16
 from transformers import (
     BitsAndBytesConfig,
     AutoModelForCausalLM,
     pipeline,
 )
 """
 TODOs:
     EXPORTS_REPOSITORY is not None
 ), "You need to set EXPORTS_REPOSITORY in your environment variables"
+MAX_ROWS = int(os.getenv("MAX_ROWS", "10_000"))
+CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2_000"))
+DATASET_VIEWE_API_URL = "https://datasets-server.huggingface.co/"
+DATASETS_TOPICS_ORGANIZATION = os.getenv(
+    "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
+)
+USE_ARROW_STYLE = int(os.getenv("USE_ARROW_STYLE", "0"))
+USE_CUML = int(os.getenv("USE_CUML", "0"))
+if USE_CUML:
+    from cuml.manifold import UMAP
+    from cuml.cluster import HDBSCAN
+else:
+    from umap import UMAP
+    from hdbscan import HDBSCAN
+USE_LLM_TEXT_GENERATION = int(os.getenv("USE_LLM_TEXT_GENERATION", "1"))
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 api = HfApi(token=HF_TOKEN)
 session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Representation model
+if USE_LLM_TEXT_GENERATION:
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=bfloat16,
+    )
+    model_id = "meta-llama/Llama-2-7b-chat-hf"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        trust_remote_code=True,
+        quantization_config=bnb_config,
+        device_map="auto",
+    )
+    model.eval()
+    generator = pipeline(
+        model=model,
+        tokenizer=tokenizer,
+        task="text-generation",
+        temperature=0.1,
+        max_new_tokens=500,
+        repetition_penalty=1.1,
+    )
+    representation_model = TextGeneration(generator, prompt=REPRESENTATION_PROMPT)
+else:
+    representation_model = KeyBERTInspired()
 vectorizer_model = CountVectorizer(stop_words="english")
 def get_split_rows(dataset, config, split):
     config_size = session.get(
+        f"{DATASET_VIEWE_API_URL}/size?dataset={dataset}&config={config}",
         timeout=20,
     ).json()
     if "error" in config_size:
 def get_parquet_urls(dataset, config, split):
     parquet_files = session.get(
+        f"{DATASET_VIEWE_API_URL}/parquet?dataset={dataset}&config={config}&split={split}",
         timeout=20,
     ).json()
     if "error" in parquet_files:
 def get_docs_from_parquet(parquet_urls, column, offset, limit):
     """Return one page of values of ``column`` read from parquet files.

     A DuckDB ``read_parquet`` query is built from the arguments, restricted
     with ``LIMIT``/``OFFSET``, executed, and the selected column of the
     resulting dataframe is returned as a plain list.

     Args:
         parquet_urls: Text placed verbatim inside ``read_parquet([...])``;
             presumably a comma-separated list of quoted URLs -- confirm
             against the caller.
         column: Name of the column to select; interpolated into the SQL
             unquoted.
         offset: Number of rows to skip.
         limit: Maximum number of rows to return.
     """
     query = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
     frame = duckdb.sql(query).to_df()
     return frame[column].tolist()
 def create_space_with_content(dataset_id, html_file_path):
+    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_id.replace('/', '-')}"
     logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
     api.create_repo(
         repo_id=repo_id,
         token=HF_TOKEN,
         space_sdk="static",
     )
     SpaceCard(
         content=SPACE_REPO_CARD_CONTENT.format(dataset_id=dataset_id)
         repo_id=repo_id,
         token=HF_TOKEN,
     )
+    logging.info(f"Space creation done")
     return repo_id
 @spaces.GPU(duration=120)
 def generate_topics(dataset, config, split, column, nested_column, plot_type):
     logging.info(
+        f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
     )
     parquet_urls = get_parquet_urls(dataset, config, split)
                     "linewidth": 0,
                     "fc": "#33333377",
                 },
+                dynamic_label_size=USE_ARROW_STYLE,
                 # label_wrap_width=12,
                 # label_over_points=True,
                 # dynamic_label_size=True,
         # TODO: Export data to .arrow and also serve it
         inline_data=True,
         # offline_data_prefix=dataset_clear_name,
+        initial_zoom_fraction=0.8,
     )
     html_content = str(interactive_plot)
     html_file_path = f"{dataset_clear_name}.html"
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
         info_resp = session.get(
+            f"{DATASET_VIEWE_API_URL}/info?dataset={dataset}", timeout=20
         ).json()
         if "error" in info_resp:
             return {

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
-# --extra-index-url https://pypi.nvidia.com
-# cuml-cu11
 spaces
 gradio
 torch

+--extra-index-url https://pypi.nvidia.com
+cuml-cu11
 spaces
 gradio
 torch

prompts.py → templates.py RENAMED Viewed

@@ -29,3 +29,14 @@ Based on the information about the topic above, please create a short label of t
 """
 REPRESENTATION_PROMPT = SYSTEM_PROMPT + EXAMPLE_PROMPT + MAIN_PROMPT

 """
 REPRESENTATION_PROMPT = SYSTEM_PROMPT + EXAMPLE_PROMPT + MAIN_PROMPT
+SPACE_REPO_CARD_CONTENT = """
+---
+title: {dataset_id} topic modeling
+sdk: static
+pinned: false
+datasets:
+- {dataset_id}
+---
+"""