asoria HF staff committed on
Commit
a893b55
1 Parent(s): 38d83a2

Move hub code to another file

Browse files
Files changed (3) hide show
  1. app.py +16 -62
  2. src/hub.py +51 -0
  3. templates.py → src/templates.py +1 -0
app.py CHANGED
@@ -15,11 +15,9 @@ from bertopic import BERTopic
15
  from bertopic.representation import KeyBERTInspired
16
  from bertopic.representation import TextGeneration
17
 
18
-
19
- from huggingface_hub import HfApi, SpaceCard
20
  from sklearn.feature_extraction.text import CountVectorizer
21
  from sentence_transformers import SentenceTransformer
22
- from templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
23
  from torch import cuda, bfloat16
24
  from transformers import (
25
  BitsAndBytesConfig,
@@ -28,13 +26,8 @@ from transformers import (
28
  pipeline,
29
  )
30
 
31
- """
32
- TODOs:
33
- - Improve representation layer (Try with llamacpp or TextGeneration)
34
- - Make it run on Zero GPU
35
- - Try with more rows (Current: 50_000/10_000 -> Minimal Targett: 1_000_000/20_000)
36
- - Export interactive plots and serve their HTML content (It doesn't work with gr.HTML)
37
- """
38
 
39
  load_dotenv()
40
  HF_TOKEN = os.getenv("HF_TOKEN")
@@ -189,54 +182,7 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
189
  return new_model
190
 
191
 
192
- def _push_to_hub(
193
- dataset_id,
194
- file_path,
195
- ):
196
- logging.info(f"Pushing file to hub: {dataset_id} on file {file_path}")
197
-
198
- file_name = file_path.split("/")[-1]
199
- try:
200
- logging.info(f"About to push {file_path} - {dataset_id}")
201
- api.upload_file(
202
- path_or_fileobj=file_path,
203
- path_in_repo=file_name,
204
- repo_id=EXPORTS_REPOSITORY,
205
- repo_type="dataset",
206
- )
207
- except Exception as e:
208
- logging.info("Failed to push file", e)
209
- raise
210
-
211
-
212
- def create_space_with_content(dataset_id, html_file_path):
213
- repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_id.replace('/', '-')}"
214
- logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
215
- api.create_repo(
216
- repo_id=repo_id,
217
- repo_type="space",
218
- private=False,
219
- exist_ok=True,
220
- token=HF_TOKEN,
221
- space_sdk="static",
222
- )
223
-
224
- SpaceCard(
225
- content=SPACE_REPO_CARD_CONTENT.format(dataset_id=dataset_id)
226
- ).push_to_hub(repo_id=repo_id, repo_type="space", token=HF_TOKEN)
227
-
228
- api.upload_file(
229
- path_or_fileobj=html_file_path,
230
- path_in_repo="index.html",
231
- repo_type="space",
232
- repo_id=repo_id,
233
- token=HF_TOKEN,
234
- )
235
- logging.info(f"Space creation done")
236
- return repo_id
237
-
238
-
239
- @spaces.GPU(duration=60*5)
240
  def generate_topics(dataset, config, split, column, nested_column, plot_type):
241
  logging.info(
242
  f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
@@ -374,8 +320,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
374
  else:
375
  topic_plot.write_image(plot_png)
376
 
377
- _push_to_hub(dataset, plot_png)
378
-
379
  all_topics, _ = base_model.transform(all_docs)
380
  topic_info = base_model.get_topic_info()
381
 
@@ -407,10 +351,20 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
407
  with open(html_file_path, "w", encoding="utf-8") as html_file:
408
  html_file.write(html_content)
409
 
410
- space_id = create_space_with_content(dataset, html_file_path)
 
 
 
 
 
 
 
 
 
 
411
 
412
  plot_png_link = (
413
- f"https://huggingface.co/datasets/{EXPORTS_REPOSITORY}/blob/main/{plot_png}"
414
  )
415
 
416
  space_link = f"https://huggingface.co/spaces/{space_id}"
 
15
  from bertopic.representation import KeyBERTInspired
16
  from bertopic.representation import TextGeneration
17
 
18
+ from huggingface_hub import HfApi
 
19
  from sklearn.feature_extraction.text import CountVectorizer
20
  from sentence_transformers import SentenceTransformer
 
21
  from torch import cuda, bfloat16
22
  from transformers import (
23
  BitsAndBytesConfig,
 
26
  pipeline,
27
  )
28
 
29
+ from src.hub import create_space_with_content
30
+ from src.templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
 
 
 
 
 
31
 
32
  load_dotenv()
33
  HF_TOKEN = os.getenv("HF_TOKEN")
 
182
  return new_model
183
 
184
 
185
+ @spaces.GPU(duration=60 * 5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  def generate_topics(dataset, config, split, column, nested_column, plot_type):
187
  logging.info(
188
  f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
 
320
  else:
321
  topic_plot.write_image(plot_png)
322
 
 
 
323
  all_topics, _ = base_model.transform(all_docs)
324
  topic_info = base_model.get_topic_info()
325
 
 
351
  with open(html_file_path, "w", encoding="utf-8") as html_file:
352
  html_file.write(html_content)
353
 
354
+ repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset.replace('/', '-')}"
355
+
356
+ space_id = create_space_with_content(
357
+ api=api,
358
+ repo_id=repo_id,
359
+ dataset_id=dataset,
360
+ html_file_path=html_file_path,
361
+ plot_file_path=plot_png,
362
+ space_card=SPACE_REPO_CARD_CONTENT,
363
+ token=HF_TOKEN,
364
+ )
365
 
366
  plot_png_link = (
367
+ f"https://huggingface.co/spaces/{space_id}/blob/main/static_plot.png"
368
  )
369
 
370
  space_link = f"https://huggingface.co/spaces/{space_id}"
src/hub.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from huggingface_hub import HfApi, SpaceCard
3
+
4
+
5
def create_space_with_content(
    api: "HfApi",
    repo_id: str,
    dataset_id: str,
    html_file_path: str,
    plot_file_path: str,
    space_card: str,
    token: str,
) -> str:
    """Create a public static Space and upload its card, HTML page and plot.

    The Space repository is created with ``exist_ok=True``, so calling this
    again for the same ``repo_id`` reuses the existing repository and
    re-uploads the files.

    Args:
        api: Hub client used to create the repository and upload files.
        repo_id: Identifier of the Space repository to create or reuse.
        dataset_id: Dataset identifier substituted into ``space_card`` via
            ``str.format(dataset_id=...)``.
        html_file_path: Local file uploaded to the Space as ``index.html``.
        plot_file_path: Local file uploaded to the Space as
            ``static_plot.png``.
        space_card: Card template containing a ``{dataset_id}`` placeholder.
        token: Access token passed to every hub call.

    Returns:
        ``repo_id``, unchanged.

    Raises:
        Exception: Any error raised by the hub calls propagates. A failure
            while uploading the plot is logged with its traceback first.
    """
    logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
    api.create_repo(
        repo_id=repo_id,
        repo_type="space",
        private=False,
        exist_ok=True,
        token=token,
        space_sdk="static",
    )

    SpaceCard(content=space_card.format(dataset_id=dataset_id)).push_to_hub(
        repo_id=repo_id, repo_type="space", token=token
    )

    api.upload_file(
        path_or_fileobj=html_file_path,
        path_in_repo="index.html",
        repo_type="space",
        repo_id=repo_id,
        token=token,
    )

    logging.info(f"Pushing file to hub: {dataset_id} on file {plot_file_path}")
    try:
        api.upload_file(
            path_or_fileobj=plot_file_path,
            path_in_repo="static_plot.png",
            repo_id=repo_id,
            repo_type="space",
            # Pass the token explicitly, like every other hub call here.
            token=token,
        )
    except Exception:
        # logging.exception records the traceback; the previous call passed
        # the exception as a format argument without a placeholder, which
        # made logging report a formatting error instead of the failure.
        logging.exception("Failed to push file %s", plot_file_path)
        raise

    logging.info("Space creation done")
    return repo_id
templates.py → src/templates.py RENAMED
@@ -39,4 +39,5 @@ datasets:
39
  - {dataset_id}
40
  ---
41
 
 
42
  """
 
39
  - {dataset_id}
40
  ---
41
 
42
+ ![Static plot](static_plot.png)
43
  """