asoria HF staff committed on
Commit
29466a4
1 Parent(s): 80ad604

Remove unused nested column

Browse files
Files changed (1) hide show
  1. app.py +27 -63
app.py CHANGED
@@ -145,14 +145,24 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
145
 
146
 
147
  @spaces.GPU(duration=60 * 5)
148
- def generate_topics(dataset, config, split, column, nested_column, plot_type):
149
  logging.info(
150
- f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
151
  )
152
 
153
  parquet_urls = get_parquet_urls(dataset, config, split)
154
  split_rows = get_split_rows(dataset, config, split)
155
- logging.info(f"Split rows: {split_rows}")
 
 
 
 
 
 
 
 
 
 
156
 
157
  limit = min(split_rows, MAX_ROWS)
158
  n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
@@ -178,6 +188,11 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
178
  if full_processing
179
  else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
180
  )
 
 
 
 
 
181
  yield (
182
  gr.Accordion(open=False),
183
  gr.DataFrame(value=[], interactive=False, visible=True),
@@ -185,6 +200,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
185
  gr.Label({message: rows_processed / limit}, visible=True),
186
  "",
187
  )
 
188
  while offset < limit:
189
  docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
190
  if not docs:
@@ -199,6 +215,9 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
199
 
200
  if base_model is None:
201
  base_model = new_model
 
 
 
202
  else:
203
  updated_model = BERTopic.merge_models([base_model, new_model])
204
  nr_new_topics = len(set(updated_model.topics_)) - len(
@@ -216,11 +235,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
216
 
217
  topics_info = base_model.get_topic_info()
218
  all_topics = base_model.topics_
219
- sub_title = (
220
- f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
221
- if full_processing
222
- else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
223
- )
224
  topic_plot = (
225
  base_model.visualize_document_datamap(
226
  docs=all_docs,
@@ -271,7 +285,8 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
271
 
272
  logging.info("Finished processing all data")
273
 
274
- plot_png = f"{dataset.replace('/', '-')}-{plot_type.lower()}.png"
 
275
  if plot_type == "DataMapPlot":
276
  topic_plot.savefig(plot_png, format="png", dpi=300)
277
  else:
@@ -287,7 +302,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
287
  for topic in all_topics
288
  ]
289
  )
290
- dataset_clear_name = dataset.replace("/", "-")
291
  interactive_plot = datamapplot.create_interactive_plot(
292
  reduced_embeddings_array,
293
  topic_names_array,
@@ -308,7 +322,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
308
  with open(html_file_path, "w", encoding="utf-8") as html_file:
309
  html_file.write(html_content)
310
 
311
- repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset.replace('/', '-')}"
312
 
313
  space_id = create_space_with_content(
314
  api=api,
@@ -364,9 +378,6 @@ with gr.Blocks() as demo:
364
 
365
  with gr.Row():
366
  text_column_dropdown = gr.Dropdown(label="Text column name")
367
- nested_text_column_dropdown = gr.Dropdown(
368
- label="Nested text column name", visible=False
369
- )
370
  plot_type_radio = gr.Radio(
371
  ["DataMapPlot", "Plotly"],
372
  value="DataMapPlot",
@@ -388,7 +399,6 @@ with gr.Blocks() as demo:
388
  subset_dropdown,
389
  split_dropdown,
390
  text_column_dropdown,
391
- nested_text_column_dropdown,
392
  plot_type_radio,
393
  ],
394
  outputs=[
@@ -408,7 +418,6 @@ with gr.Blocks() as demo:
408
  subset_dropdown: gr.Dropdown(visible=False),
409
  split_dropdown: gr.Dropdown(visible=False),
410
  text_column_dropdown: gr.Dropdown(label="Text column name"),
411
- nested_text_column_dropdown: gr.Dropdown(visible=False),
412
  }
413
  try:
414
  info_resp = get_info(dataset)
@@ -417,7 +426,6 @@ with gr.Blocks() as demo:
417
  subset_dropdown: gr.Dropdown(visible=False),
418
  split_dropdown: gr.Dropdown(visible=False),
419
  text_column_dropdown: gr.Dropdown(label="Text column name"),
420
- nested_text_column_dropdown: gr.Dropdown(visible=False),
421
  }
422
  subsets: list[str] = list(info_resp)
423
  subset = default_subset if default_subset in subsets else subsets[0]
@@ -433,20 +441,6 @@ with gr.Blocks() as demo:
433
  for feature_name, feature in features.items()
434
  if _is_string_feature(feature)
435
  ]
436
- nested_features = [
437
- feature_name
438
- for feature_name, feature in features.items()
439
- if isinstance(feature, dict)
440
- and isinstance(next(iter(feature.values())), dict)
441
- ]
442
- nested_text_features = [
443
- feature_name
444
- for feature_name in nested_features
445
- if any(
446
- _is_string_feature(nested_feature)
447
- for nested_feature in features[feature_name].values()
448
- )
449
- ]
450
  if not text_feature:
451
  return {
452
  subset_dropdown: gr.Dropdown(
@@ -456,34 +450,9 @@ with gr.Blocks() as demo:
456
  value=split, choices=splits, visible=len(splits) > 1
457
  ),
458
  text_column_dropdown: gr.Dropdown(
459
- choices=text_features + nested_text_features,
460
  label="Text column name",
461
  ),
462
- nested_text_column_dropdown: gr.Dropdown(visible=False),
463
- }
464
- if text_feature in nested_text_features:
465
- nested_keys = [
466
- feature_name
467
- for feature_name, feature in features[text_feature].items()
468
- if _is_string_feature(feature)
469
- ]
470
- return {
471
- subset_dropdown: gr.Dropdown(
472
- value=subset, choices=subsets, visible=len(subsets) > 1
473
- ),
474
- split_dropdown: gr.Dropdown(
475
- value=split, choices=splits, visible=len(splits) > 1
476
- ),
477
- text_column_dropdown: gr.Dropdown(
478
- choices=text_features + nested_text_features,
479
- label="Text column name",
480
- ),
481
- nested_text_column_dropdown: gr.Dropdown(
482
- value=nested_keys[0],
483
- choices=nested_keys,
484
- label="Nested text column name",
485
- visible=True,
486
- ),
487
  }
488
  return {
489
  subset_dropdown: gr.Dropdown(
@@ -493,9 +462,8 @@ with gr.Blocks() as demo:
493
  value=split, choices=splits, visible=len(splits) > 1
494
  ),
495
  text_column_dropdown: gr.Dropdown(
496
- choices=text_features + nested_text_features, label="Text column name"
497
  ),
498
- nested_text_column_dropdown: gr.Dropdown(visible=False),
499
  }
500
 
501
  @dataset_name.change(
@@ -504,7 +472,6 @@ with gr.Blocks() as demo:
504
  subset_dropdown,
505
  split_dropdown,
506
  text_column_dropdown,
507
- nested_text_column_dropdown,
508
  ],
509
  )
510
  def show_input_from_subset_dropdown(dataset: str) -> dict:
@@ -518,7 +485,6 @@ with gr.Blocks() as demo:
518
  subset_dropdown,
519
  split_dropdown,
520
  text_column_dropdown,
521
- nested_text_column_dropdown,
522
  ],
523
  )
524
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
@@ -532,7 +498,6 @@ with gr.Blocks() as demo:
532
  subset_dropdown,
533
  split_dropdown,
534
  text_column_dropdown,
535
- nested_text_column_dropdown,
536
  ],
537
  )
538
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
@@ -546,7 +511,6 @@ with gr.Blocks() as demo:
546
  subset_dropdown,
547
  split_dropdown,
548
  text_column_dropdown,
549
- nested_text_column_dropdown,
550
  ],
551
  )
552
  def show_input_from_text_column_dropdown(
 
145
 
146
 
147
  @spaces.GPU(duration=60 * 5)
148
+ def generate_topics(dataset, config, split, column, plot_type):
149
  logging.info(
150
+ f"Generating topics for {dataset=} {config=} {split=} {column=} {plot_type=}"
151
  )
152
 
153
  parquet_urls = get_parquet_urls(dataset, config, split)
154
  split_rows = get_split_rows(dataset, config, split)
155
+ if split_rows is None or split_rows == 0:
156
+ return (
157
+ gr.Accordion(open=True),
158
+ gr.DataFrame(value=[], interactive=False, visible=True),
159
+ gr.Plot(value=None, visible=True),
160
+ gr.Label(
161
+ {"❌ Error: No data found for the selected dataset": 0.0}, visible=True
162
+ ),
163
+ "",
164
+ )
165
+ logging.info(f"Split number of rows: {split_rows}")
166
 
167
  limit = min(split_rows, MAX_ROWS)
168
  n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
 
188
  if full_processing
189
  else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
190
  )
191
+ sub_title = (
192
+ f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
193
+ if full_processing
194
+ else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
195
+ )
196
  yield (
197
  gr.Accordion(open=False),
198
  gr.DataFrame(value=[], interactive=False, visible=True),
 
200
  gr.Label({message: rows_processed / limit}, visible=True),
201
  "",
202
  )
203
+
204
  while offset < limit:
205
  docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
206
  if not docs:
 
215
 
216
  if base_model is None:
217
  base_model = new_model
218
+ logging.info(
219
+ f"The following topics are newly found: {base_model.topic_labels_}"
220
+ )
221
  else:
222
  updated_model = BERTopic.merge_models([base_model, new_model])
223
  nr_new_topics = len(set(updated_model.topics_)) - len(
 
235
 
236
  topics_info = base_model.get_topic_info()
237
  all_topics = base_model.topics_
 
 
 
 
 
238
  topic_plot = (
239
  base_model.visualize_document_datamap(
240
  docs=all_docs,
 
285
 
286
  logging.info("Finished processing all data")
287
 
288
+ dataset_clear_name = dataset.replace("/", "-")
289
+ plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
290
  if plot_type == "DataMapPlot":
291
  topic_plot.savefig(plot_png, format="png", dpi=300)
292
  else:
 
302
  for topic in all_topics
303
  ]
304
  )
 
305
  interactive_plot = datamapplot.create_interactive_plot(
306
  reduced_embeddings_array,
307
  topic_names_array,
 
322
  with open(html_file_path, "w", encoding="utf-8") as html_file:
323
  html_file.write(html_content)
324
 
325
+ repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
326
 
327
  space_id = create_space_with_content(
328
  api=api,
 
378
 
379
  with gr.Row():
380
  text_column_dropdown = gr.Dropdown(label="Text column name")
 
 
 
381
  plot_type_radio = gr.Radio(
382
  ["DataMapPlot", "Plotly"],
383
  value="DataMapPlot",
 
399
  subset_dropdown,
400
  split_dropdown,
401
  text_column_dropdown,
 
402
  plot_type_radio,
403
  ],
404
  outputs=[
 
418
  subset_dropdown: gr.Dropdown(visible=False),
419
  split_dropdown: gr.Dropdown(visible=False),
420
  text_column_dropdown: gr.Dropdown(label="Text column name"),
 
421
  }
422
  try:
423
  info_resp = get_info(dataset)
 
426
  subset_dropdown: gr.Dropdown(visible=False),
427
  split_dropdown: gr.Dropdown(visible=False),
428
  text_column_dropdown: gr.Dropdown(label="Text column name"),
 
429
  }
430
  subsets: list[str] = list(info_resp)
431
  subset = default_subset if default_subset in subsets else subsets[0]
 
441
  for feature_name, feature in features.items()
442
  if _is_string_feature(feature)
443
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  if not text_feature:
445
  return {
446
  subset_dropdown: gr.Dropdown(
 
450
  value=split, choices=splits, visible=len(splits) > 1
451
  ),
452
  text_column_dropdown: gr.Dropdown(
453
+ choices=text_features,
454
  label="Text column name",
455
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  }
457
  return {
458
  subset_dropdown: gr.Dropdown(
 
462
  value=split, choices=splits, visible=len(splits) > 1
463
  ),
464
  text_column_dropdown: gr.Dropdown(
465
+ choices=text_features, label="Text column name"
466
  ),
 
467
  }
468
 
469
  @dataset_name.change(
 
472
  subset_dropdown,
473
  split_dropdown,
474
  text_column_dropdown,
 
475
  ],
476
  )
477
  def show_input_from_subset_dropdown(dataset: str) -> dict:
 
485
  subset_dropdown,
486
  split_dropdown,
487
  text_column_dropdown,
 
488
  ],
489
  )
490
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
 
498
  subset_dropdown,
499
  split_dropdown,
500
  text_column_dropdown,
 
501
  ],
502
  )
503
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
 
511
  subset_dropdown,
512
  split_dropdown,
513
  text_column_dropdown,
 
514
  ],
515
  )
516
  def show_input_from_text_column_dropdown(