Spaces:
Sleeping
Sleeping
Move hub code to another file
Browse files- app.py +16 -62
- src/hub.py +51 -0
- templates.py → src/templates.py +1 -0
app.py
CHANGED
@@ -15,11 +15,9 @@ from bertopic import BERTopic
|
|
15 |
from bertopic.representation import KeyBERTInspired
|
16 |
from bertopic.representation import TextGeneration
|
17 |
|
18 |
-
|
19 |
-
from huggingface_hub import HfApi, SpaceCard
|
20 |
from sklearn.feature_extraction.text import CountVectorizer
|
21 |
from sentence_transformers import SentenceTransformer
|
22 |
-
from templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
|
23 |
from torch import cuda, bfloat16
|
24 |
from transformers import (
|
25 |
BitsAndBytesConfig,
|
@@ -28,13 +26,8 @@ from transformers import (
|
|
28 |
pipeline,
|
29 |
)
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
- Improve representation layer (Try with llamacpp or TextGeneration)
|
34 |
-
- Make it run on Zero GPU
|
35 |
-
- Try with more rows (Current: 50_000/10_000 -> Minimal Target: 1_000_000/20_000)
|
36 |
-
- Export interactive plots and serve their HTML content (It doesn't work with gr.HTML)
|
37 |
-
"""
|
38 |
|
39 |
load_dotenv()
|
40 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
@@ -189,54 +182,7 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
|
|
189 |
return new_model
|
190 |
|
191 |
|
192 |
-
|
193 |
-
dataset_id,
|
194 |
-
file_path,
|
195 |
-
):
|
196 |
-
logging.info(f"Pushing file to hub: {dataset_id} on file {file_path}")
|
197 |
-
|
198 |
-
file_name = file_path.split("/")[-1]
|
199 |
-
try:
|
200 |
-
logging.info(f"About to push {file_path} - {dataset_id}")
|
201 |
-
api.upload_file(
|
202 |
-
path_or_fileobj=file_path,
|
203 |
-
path_in_repo=file_name,
|
204 |
-
repo_id=EXPORTS_REPOSITORY,
|
205 |
-
repo_type="dataset",
|
206 |
-
)
|
207 |
-
except Exception as e:
|
208 |
-
logging.info("Failed to push file", e)
|
209 |
-
raise
|
210 |
-
|
211 |
-
|
212 |
-
def create_space_with_content(dataset_id, html_file_path):
|
213 |
-
repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_id.replace('/', '-')}"
|
214 |
-
logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
|
215 |
-
api.create_repo(
|
216 |
-
repo_id=repo_id,
|
217 |
-
repo_type="space",
|
218 |
-
private=False,
|
219 |
-
exist_ok=True,
|
220 |
-
token=HF_TOKEN,
|
221 |
-
space_sdk="static",
|
222 |
-
)
|
223 |
-
|
224 |
-
SpaceCard(
|
225 |
-
content=SPACE_REPO_CARD_CONTENT.format(dataset_id=dataset_id)
|
226 |
-
).push_to_hub(repo_id=repo_id, repo_type="space", token=HF_TOKEN)
|
227 |
-
|
228 |
-
api.upload_file(
|
229 |
-
path_or_fileobj=html_file_path,
|
230 |
-
path_in_repo="index.html",
|
231 |
-
repo_type="space",
|
232 |
-
repo_id=repo_id,
|
233 |
-
token=HF_TOKEN,
|
234 |
-
)
|
235 |
-
logging.info(f"Space creation done")
|
236 |
-
return repo_id
|
237 |
-
|
238 |
-
|
239 |
-
@spaces.GPU(duration=60*5)
|
240 |
def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
241 |
logging.info(
|
242 |
f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
|
@@ -374,8 +320,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
374 |
else:
|
375 |
topic_plot.write_image(plot_png)
|
376 |
|
377 |
-
_push_to_hub(dataset, plot_png)
|
378 |
-
|
379 |
all_topics, _ = base_model.transform(all_docs)
|
380 |
topic_info = base_model.get_topic_info()
|
381 |
|
@@ -407,10 +351,20 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
407 |
with open(html_file_path, "w", encoding="utf-8") as html_file:
|
408 |
html_file.write(html_content)
|
409 |
|
410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
411 |
|
412 |
plot_png_link = (
|
413 |
-
f"https://huggingface.co/
|
414 |
)
|
415 |
|
416 |
space_link = f"https://huggingface.co/spaces/{space_id}"
|
|
|
15 |
from bertopic.representation import KeyBERTInspired
|
16 |
from bertopic.representation import TextGeneration
|
17 |
|
18 |
+
from huggingface_hub import HfApi
|
|
|
19 |
from sklearn.feature_extraction.text import CountVectorizer
|
20 |
from sentence_transformers import SentenceTransformer
|
|
|
21 |
from torch import cuda, bfloat16
|
22 |
from transformers import (
|
23 |
BitsAndBytesConfig,
|
|
|
26 |
pipeline,
|
27 |
)
|
28 |
|
29 |
+
from src.hub import create_space_with_content
|
30 |
+
from src.templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
load_dotenv()
|
33 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
|
|
182 |
return new_model
|
183 |
|
184 |
|
185 |
+
@spaces.GPU(duration=60 * 5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
187 |
logging.info(
|
188 |
f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
|
|
|
320 |
else:
|
321 |
topic_plot.write_image(plot_png)
|
322 |
|
|
|
|
|
323 |
all_topics, _ = base_model.transform(all_docs)
|
324 |
topic_info = base_model.get_topic_info()
|
325 |
|
|
|
351 |
with open(html_file_path, "w", encoding="utf-8") as html_file:
|
352 |
html_file.write(html_content)
|
353 |
|
354 |
+
repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset.replace('/', '-')}"
|
355 |
+
|
356 |
+
space_id = create_space_with_content(
|
357 |
+
api=api,
|
358 |
+
repo_id=repo_id,
|
359 |
+
dataset_id=dataset,
|
360 |
+
html_file_path=html_file_path,
|
361 |
+
plot_file_path=plot_png,
|
362 |
+
space_card=SPACE_REPO_CARD_CONTENT,
|
363 |
+
token=HF_TOKEN,
|
364 |
+
)
|
365 |
|
366 |
plot_png_link = (
|
367 |
+
f"https://huggingface.co/spaces/{space_id}/blob/main/static_plot.png"
|
368 |
)
|
369 |
|
370 |
space_link = f"https://huggingface.co/spaces/{space_id}"
|
src/hub.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from huggingface_hub import HfApi, SpaceCard
|
3 |
+
|
4 |
+
|
5 |
+
def create_space_with_content(
    api: HfApi,
    repo_id: str,
    dataset_id: str,
    html_file_path: str,
    plot_file_path: str,
    space_card: str,
    token: str,
) -> str:
    """Create (or reuse) a static Space and upload its card, HTML page and plot.

    Steps, in order:
      1. ``api.create_repo`` with ``repo_type="space"``, ``space_sdk="static"``
         and ``exist_ok=True``.
      2. Push a ``SpaceCard`` built from ``space_card`` formatted with
         ``dataset_id``.
      3. Upload ``html_file_path`` as ``index.html``.
      4. Upload ``plot_file_path`` as ``static_plot.png``.

    Args:
        api: Hub client used for the repo creation and the file uploads.
        repo_id: Target Space repository id.
        dataset_id: Dataset identifier substituted into ``space_card`` via
            ``str.format(dataset_id=...)`` and used in log messages.
        html_file_path: Local path of the HTML file uploaded as ``index.html``.
        plot_file_path: Local path of the image uploaded as ``static_plot.png``.
        space_card: Card template containing a ``{dataset_id}`` placeholder.
        token: Token forwarded to every Hub call made here.

    Returns:
        ``repo_id``, unchanged.

    Raises:
        Exception: Any error raised by the Hub calls propagates to the caller;
            a failure of the plot upload is logged with its traceback first.
    """
    logging.info(
        "Creating space with content: %s on file %s", repo_id, html_file_path
    )
    api.create_repo(
        repo_id=repo_id,
        repo_type="space",
        private=False,
        exist_ok=True,
        token=token,
        space_sdk="static",
    )

    SpaceCard(content=space_card.format(dataset_id=dataset_id)).push_to_hub(
        repo_id=repo_id, repo_type="space", token=token
    )

    api.upload_file(
        path_or_fileobj=html_file_path,
        path_in_repo="index.html",
        repo_type="space",
        repo_id=repo_id,
        token=token,
    )

    logging.info("Pushing file to hub: %s on file %s", dataset_id, plot_file_path)

    try:
        logging.info("About to push %s - %s", plot_file_path, dataset_id)
        api.upload_file(
            path_or_fileobj=plot_file_path,
            path_in_repo="static_plot.png",
            repo_id=repo_id,
            repo_type="space",
            # Pass the token explicitly, like every other Hub call in this
            # function, instead of relying on how `api` was constructed.
            token=token,
        )
    except Exception:
        # The original passed the exception as a stray %-format argument with
        # no placeholder, which makes logging fail to format the record.
        # logging.exception records the message together with the traceback.
        logging.exception("Failed to push file %s", plot_file_path)
        raise

    logging.info("Space creation done")
    return repo_id
|
templates.py → src/templates.py
RENAMED
@@ -39,4 +39,5 @@ datasets:
|
|
39 |
- {dataset_id}
|
40 |
---
|
41 |
|
|
|
42 |
"""
|
|
|
39 |
- {dataset_id}
|
40 |
---
|
41 |
|
42 |
+
![Static plot](static_plot.png)
|
43 |
"""
|