Remove unused nested column
app.py
CHANGED
@@ -145,14 +145,24 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
 
 
 @spaces.GPU(duration=60 * 5)
-def generate_topics(dataset, config, split, column, nested_column, plot_type):
+def generate_topics(dataset, config, split, column, plot_type):
     logging.info(
-        f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
+        f"Generating topics for {dataset=} {config=} {split=} {column=} {plot_type=}"
     )
 
     parquet_urls = get_parquet_urls(dataset, config, split)
     split_rows = get_split_rows(dataset, config, split)
-
+    if split_rows is None or split_rows == 0:
+        return (
+            gr.Accordion(open=True),
+            gr.DataFrame(value=[], interactive=False, visible=True),
+            gr.Plot(value=None, visible=True),
+            gr.Label(
+                {"❌ Error: No data found for the selected dataset": 0.0}, visible=True
+            ),
+            "",
+        )
+    logging.info(f"Split number of rows: {split_rows}")
 
     limit = min(split_rows, MAX_ROWS)
     n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
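Note on the new guard: when get_split_rows reports no rows, min(split_rows, MAX_ROWS) would either fail or process nothing, so the function now returns an error label instead of yielding progress. The helper itself is defined elsewhere in app.py and is not part of this diff; a hypothetical sketch of what it could look like, assuming the datasets-server /size endpoint (the response field names below are an assumption, not taken from this commit):

import requests

def get_split_rows(dataset: str, config: str, split: str) -> int | None:
    # Hypothetical sketch, not the app's actual implementation.
    resp = requests.get(
        "https://datasets-server.huggingface.co/size",
        params={"dataset": dataset, "config": config},
        timeout=20,
    )
    if not resp.ok:
        return None
    for item in resp.json().get("size", {}).get("splits", []):
        if item.get("split") == split:
            return item.get("num_rows")
    return None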
@@ -178,6 +188,11 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         if full_processing
         else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
     )
+    sub_title = (
+        f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
+        if full_processing
+        else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
+    )
     yield (
         gr.Accordion(open=False),
         gr.DataFrame(value=[], interactive=False, visible=True),
@@ -185,6 +200,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         gr.Label({message: rows_processed / limit}, visible=True),
         "",
     )
+
     while offset < limit:
         docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
         if not docs:
@@ -199,6 +215,9 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
 
         if base_model is None:
             base_model = new_model
+            logging.info(
+                f"The following topics are newly found: {base_model.topic_labels_}"
+            )
         else:
             updated_model = BERTopic.merge_models([base_model, new_model])
             nr_new_topics = len(set(updated_model.topics_)) - len(
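The chunked fit above relies on BERTopic's model merging: each parquet chunk gets its own model, which is then folded into the running model so topics accumulate without refitting everything. A self-contained sketch of that pattern (simplified; the real loop above also counts how many topics each merge adds):

from bertopic import BERTopic

def fit_incrementally(doc_chunks):
    # doc_chunks: any iterable of lists of documents, standing in for the parquet chunks.
    base_model = None
    for chunk in doc_chunks:
        new_model = BERTopic(min_topic_size=10).fit(chunk)
        if base_model is None:
            base_model = new_model
        else:
            # merge_models keeps the base topics and appends topics only the new model found.
            base_model = BERTopic.merge_models([base_model, new_model])
    return base_model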
@@ -216,11 +235,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
 
         topics_info = base_model.get_topic_info()
         all_topics = base_model.topics_
-        sub_title = (
-            f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
-            if full_processing
-            else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
-        )
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
@@ -271,7 +285,8 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
 
     logging.info("Finished processing all data")
 
-
+    dataset_clear_name = dataset.replace("/", "-")
+    plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
     if plot_type == "DataMapPlot":
         topic_plot.savefig(plot_png, format="png", dpi=300)
     else:
@@ -287,7 +302,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                 for topic in all_topics
             ]
         )
-        dataset_clear_name = dataset.replace("/", "-")
        interactive_plot = datamapplot.create_interactive_plot(
            reduced_embeddings_array,
            topic_names_array,
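datamapplot builds the interactive map from the UMAP-reduced 2-D embeddings plus one topic label per document, as the call above shows. A toy, self-contained version of that call with random data (the save-to-HTML step is an assumption about datamapplot's interactive figure API, not something this diff shows):

import numpy as np
import datamapplot

rng = np.random.default_rng(0)
reduced_embeddings_array = rng.normal(size=(500, 2))  # stand-in for UMAP-reduced embeddings
topic_names_array = np.array(["topic A", "topic B"])[rng.integers(0, 2, size=500)]

interactive_plot = datamapplot.create_interactive_plot(
    reduced_embeddings_array,
    topic_names_array,
)
interactive_plot.save("datamap.html")  # assumed API for writing the standalone HTML file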
@@ -308,7 +322,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         with open(html_file_path, "w", encoding="utf-8") as html_file:
             html_file.write(html_content)
 
-    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{
+    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
 
     space_id = create_space_with_content(
         api=api,
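create_space_with_content is an app helper that is not part of this diff; it pushes the exported HTML to a Hub Space under DATASETS_TOPICS_ORGANIZATION. A plausible sketch using only standard huggingface_hub calls (the helper's parameter names here are illustrative, not the app's real signature):

from huggingface_hub import HfApi

def create_space_with_content(api: HfApi, space_id: str, html_file_path: str) -> str:
    # A static Space serves index.html directly, which suits the exported datamap page.
    api.create_repo(repo_id=space_id, repo_type="space", space_sdk="static", exist_ok=True)
    api.upload_file(
        path_or_fileobj=html_file_path,
        path_in_repo="index.html",
        repo_id=space_id,
        repo_type="space",
    )
    return space_id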
@@ -364,9 +378,6 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         text_column_dropdown = gr.Dropdown(label="Text column name")
-        nested_text_column_dropdown = gr.Dropdown(
-            label="Nested text column name", visible=False
-        )
         plot_type_radio = gr.Radio(
             ["DataMapPlot", "Plotly"],
             value="DataMapPlot",
@@ -388,7 +399,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
             plot_type_radio,
         ],
         outputs=[
@@ -408,7 +418,6 @@ with gr.Blocks() as demo:
             subset_dropdown: gr.Dropdown(visible=False),
             split_dropdown: gr.Dropdown(visible=False),
             text_column_dropdown: gr.Dropdown(label="Text column name"),
-            nested_text_column_dropdown: gr.Dropdown(visible=False),
         }
         try:
             info_resp = get_info(dataset)
@@ -417,7 +426,6 @@ with gr.Blocks() as demo:
             subset_dropdown: gr.Dropdown(visible=False),
             split_dropdown: gr.Dropdown(visible=False),
             text_column_dropdown: gr.Dropdown(label="Text column name"),
-            nested_text_column_dropdown: gr.Dropdown(visible=False),
         }
         subsets: list[str] = list(info_resp)
         subset = default_subset if default_subset in subsets else subsets[0]
@@ -433,20 +441,6 @@ with gr.Blocks() as demo:
             for feature_name, feature in features.items()
             if _is_string_feature(feature)
         ]
-        nested_features = [
-            feature_name
-            for feature_name, feature in features.items()
-            if isinstance(feature, dict)
-            and isinstance(next(iter(feature.values())), dict)
-        ]
-        nested_text_features = [
-            feature_name
-            for feature_name in nested_features
-            if any(
-                _is_string_feature(nested_feature)
-                for nested_feature in features[feature_name].values()
-            )
-        ]
         if not text_feature:
             return {
                 subset_dropdown: gr.Dropdown(
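The removed nested_features and nested_text_features blocks were the only consumers of the nested-column dropdown, so column detection now keeps just the flat string features. _is_string_feature is defined elsewhere in app.py; a plausible sketch, assuming the datasets-server features format in which a string column looks like {"dtype": "string", "_type": "Value"}:

def _is_string_feature(feature) -> bool:
    # Hypothetical sketch of the helper referenced above, not taken from this diff.
    return isinstance(feature, dict) and feature.get("dtype") == "string"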
@@ -456,34 +450,9 @@ with gr.Blocks() as demo:
                     value=split, choices=splits, visible=len(splits) > 1
                 ),
                 text_column_dropdown: gr.Dropdown(
-                    choices=text_features
+                    choices=text_features,
                     label="Text column name",
                 ),
-                nested_text_column_dropdown: gr.Dropdown(visible=False),
-            }
-        if text_feature in nested_text_features:
-            nested_keys = [
-                feature_name
-                for feature_name, feature in features[text_feature].items()
-                if _is_string_feature(feature)
-            ]
-            return {
-                subset_dropdown: gr.Dropdown(
-                    value=subset, choices=subsets, visible=len(subsets) > 1
-                ),
-                split_dropdown: gr.Dropdown(
-                    value=split, choices=splits, visible=len(splits) > 1
-                ),
-                text_column_dropdown: gr.Dropdown(
-                    choices=text_features + nested_text_features,
-                    label="Text column name",
-                ),
-                nested_text_column_dropdown: gr.Dropdown(
-                    value=nested_keys[0],
-                    choices=nested_keys,
-                    label="Nested text column name",
-                    visible=True,
-                ),
             }
         return {
             subset_dropdown: gr.Dropdown(
@@ -493,9 +462,8 @@ with gr.Blocks() as demo:
                 value=split, choices=splits, visible=len(splits) > 1
             ),
             text_column_dropdown: gr.Dropdown(
-                choices=text_features
+                choices=text_features, label="Text column name"
             ),
-            nested_text_column_dropdown: gr.Dropdown(visible=False),
         }
 
     @dataset_name.change(
@@ -504,7 +472,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
     )
     def show_input_from_subset_dropdown(dataset: str) -> dict:
@@ -518,7 +485,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
     )
     def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
@@ -532,7 +498,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
     )
     def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
@@ -546,7 +511,6 @@ with gr.Blocks() as demo:
             subset_dropdown,
             split_dropdown,
             text_column_dropdown,
-            nested_text_column_dropdown,
         ],
    )
    def show_input_from_text_column_dropdown(
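All of the dropdown wiring above follows the same Gradio pattern: a change event on one component returns a dict that maps output components to updated replacements of themselves. A minimal, self-contained example of that pattern (the choices are hard-coded here; the app derives them from get_info):

import gradio as gr

with gr.Blocks() as demo:
    dataset_name = gr.Textbox(label="Hub dataset ID")
    text_column_dropdown = gr.Dropdown(label="Text column name")

    @dataset_name.change(inputs=[dataset_name], outputs=[text_column_dropdown])
    def update_text_columns(dataset: str) -> dict:
        # In app.py the choices come from the dataset's features; hard-coded for the sketch.
        return {
            text_column_dropdown: gr.Dropdown(
                choices=["text", "title"], label="Text column name"
            )
        }

demo.launch()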
|