asoria (HF staff) committed
Commit 7dcda45
1 Parent(s): af9239a

Removing TextGeneration layer temporarily
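For orientation: the layer being dropped wrapped a 4-bit quantized Llama 2 pipeline in BERTopic's TextGeneration representation; what remains is the keyword-based KeyBERTInspired representation. A minimal sketch condensed from the diff below (the names generator and REPRESENTATION_PROMPT appear only in comments; they belong to the removed code and are not defined here):

from bertopic.representation import KeyBERTInspired

# Before this commit (removed below): an LLM-backed label generator,
# e.g. TextGeneration(generator, prompt=REPRESENTATION_PROMPT), was combined
# with KeyBERTInspired in a dict of representation models.

# After this commit: keyword-based topic labels only.
representation_model = KeyBERTInspired()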

Files changed (1)
  1. app.py +40 -94
app.py CHANGED
@@ -7,17 +7,8 @@ from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from bertopic import BERTopic
 from bertopic.representation import (
     KeyBERTInspired,
-    TextGeneration,
 )
 from umap import UMAP
-from torch import cuda, bfloat16
-from transformers import (
-    BitsAndBytesConfig,
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    pipeline,
-)
-from prompts import REPRESENTATION_PROMPT
 from hdbscan import HDBSCAN
 from sklearn.feature_extraction.text import CountVectorizer
 
@@ -26,7 +17,7 @@ from sentence_transformers import SentenceTransformer
 from dotenv import load_dotenv
 import os
 
-import spaces
+# import spaces
 import gradio as gr
 
 
@@ -38,8 +29,8 @@ logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 
-MAX_ROWS = 1_000
-CHUNK_SIZE = 300
+MAX_ROWS = 5_000
+CHUNK_SIZE = 1_000
 
 
 session = requests.Session()
@@ -47,71 +38,7 @@ sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 keybert = KeyBERTInspired()
 vectorizer_model = CountVectorizer(stop_words="english")
 
-model_id = "meta-llama/Llama-2-7b-chat-hf"
-device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"
-logging.info(device)
-
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,  # 4-bit quantization
-    bnb_4bit_quant_type="nf4",  # Normalized float 4
-    bnb_4bit_use_double_quant=True,  # Second quantization after the first
-    bnb_4bit_compute_dtype=bfloat16,  # Computation type
-)
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    trust_remote_code=True,
-    quantization_config=bnb_config,
-    device_map="auto",
-    offload_folder="offload",  # Offloading part of the model to CPU to save GPU memory
-)
-
-# Enable gradient checkpointing for memory efficiency during backprop?
-model.gradient_checkpointing_enable()
-
-generator = pipeline(
-    model=model,
-    tokenizer=tokenizer,
-    task="text-generation",
-    temperature=0.1,
-    max_new_tokens=200,  # Reduced max_new_tokens to limit memory consumption
-    repetition_penalty=1.1,
-)
-
-llama2 = TextGeneration(generator, prompt=REPRESENTATION_PROMPT)
-representation_model = {
-    "KeyBERT": keybert,
-    "Llama2": llama2,
-}
-
-# TODO: It should be proporcional to the number of rows
-# For small datasets (1-200 rows) it worked fine with 2 neighbors
-N_NEIGHBORS = 15
-
-umap_model = UMAP(
-    n_neighbors=N_NEIGHBORS,
-    n_components=5,
-    min_dist=0.0,
-    metric="cosine",
-    random_state=42,
-)
-
-hdbscan_model = HDBSCAN(
-    min_cluster_size=N_NEIGHBORS,
-    metric="euclidean",
-    cluster_selection_method="eom",
-    prediction_data=True,
-)
-
-reduce_umap_model = UMAP(
-    n_neighbors=N_NEIGHBORS,
-    n_components=2,
-    min_dist=0.0,
-    metric="cosine",
-    random_state=42,
-)
+representation_model = KeyBERTInspired()
 
 global_topic_model = None
 
@@ -151,16 +78,30 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
     return df[column].tolist()
 
 
-@spaces.GPU
-# TODO: Modify batch size to reduce memory consumption during embedding calculation, which value is better?
+# @spaces.GPU
 def calculate_embeddings(docs):
     return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
 
 
-@spaces.GPU
-def fit_model(docs, embeddings):
+# @spaces.GPU
+def fit_model(docs, embeddings, n_neighbors):
     global global_topic_model
 
+    umap_model = UMAP(
+        n_neighbors=n_neighbors,
+        n_components=5,
+        min_dist=0.0,
+        metric="cosine",
+        random_state=42,
+    )
+
+    hdbscan_model = HDBSCAN(
+        min_cluster_size=n_neighbors,
+        metric="euclidean",
+        cluster_selection_method="eom",
+        prediction_data=True,
+    )
+
     new_model = BERTopic(
         "english",
         # Sub-models
@@ -172,7 +113,7 @@ def fit_model(docs, embeddings):
         # Hyperparameters
         top_n_words=10,
         verbose=True,
-        min_topic_size=15,  # TODO: Should this value be coherent with N_NEIGHBORS?
+        min_topic_size=n_neighbors,  # TODO: Should this value be coherent with N_NEIGHBORS?
     )
     logging.info("Fitting new model")
     new_model.fit(docs, embeddings)
@@ -183,6 +124,10 @@ def fit_model(docs, embeddings):
     logging.info("Global model updated")
 
 
+def calculate_n_neighbors(n_rows):
+    return max(n_rows // 20, 2)
+
+
 def generate_topics(dataset, config, split, column, nested_column):
     logging.info(
         f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
@@ -193,6 +138,16 @@ def generate_topics(dataset, config, split, column, nested_column):
     logging.info(f"Split rows: {split_rows}")
 
     limit = min(split_rows, MAX_ROWS)
+    n_neighbors = calculate_n_neighbors(limit)
+
+    reduce_umap_model = UMAP(
+        n_neighbors=n_neighbors,
+        n_components=2,
+        min_dist=0.0,
+        metric="cosine",
+        random_state=42,
+    )
+
     offset = 0
     rows_processed = 0
 
@@ -201,8 +156,8 @@ def generate_topics(dataset, config, split, column, nested_column):
     reduced_embeddings_list = []
     topics_info, topic_plot = None, None
     yield (
-        gr.DataFrame(interactive=False, visible=True),
-        gr.Plot(visible=True),
+        gr.DataFrame(value=[], interactive=False, visible=True),
+        gr.Plot(value=None, visible=True),
         gr.Label(
             {f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
         ),
@@ -217,7 +172,7 @@ def generate_topics(dataset, config, split, column, nested_column):
         )
 
         embeddings = calculate_embeddings(docs)
-        fit_model(docs, embeddings)
+        fit_model(docs, embeddings, n_neighbors)
 
         if base_model is None:
             base_model = global_topic_model
@@ -230,13 +185,6 @@ def generate_topics(dataset, config, split, column, nested_column):
         logging.info(f"The following topics are newly found: {new_topics}")
         base_model = updated_model
 
-        repr_model_topics = {
-            key: label[0][0].split("\n")[0]
-            for key, label in base_model.get_topics(full=True)["Llama2"].items()
-        }
-
-        base_model.set_topic_labels(repr_model_topics)
-
         reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
         reduced_embeddings_list.append(reduced_embeddings)
 
@@ -249,8 +197,6 @@ def generate_topics(dataset, config, split, column, nested_column):
             custom_labels=True,
        )
 
-        logging.info(f"Topics: {repr_model_topics}")
-
        rows_processed += len(docs)
        progress = min(rows_processed / limit, 1.0)
        logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
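Taken together, the new helpers make the UMAP/HDBSCAN neighborhood size a function of the dataset size instead of the fixed N_NEIGHBORS = 15. Below is a rough, self-contained sketch of how the refactored pieces compose, using the same sub-model settings as the diff; the docs list is an illustrative stand-in for a real dataset column, which the app actually streams in chunks.

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN


def calculate_n_neighbors(n_rows):
    # Roughly one neighbor per 20 rows, never fewer than 2 (as in the diff).
    return max(n_rows // 20, 2)


# Illustrative stand-in for a dataset column.
docs = [f"sample document about subject {i % 5}, item {i}" for i in range(200)]
n_neighbors = calculate_n_neighbors(len(docs))

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, batch_size=32)

# Sub-models are now built per fit, sized by n_neighbors.
umap_model = UMAP(
    n_neighbors=n_neighbors, n_components=5, min_dist=0.0,
    metric="cosine", random_state=42,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=n_neighbors, metric="euclidean",
    cluster_selection_method="eom", prediction_data=True,
)

topic_model = BERTopic(
    "english",
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=CountVectorizer(stop_words="english"),
    representation_model=KeyBERTInspired(),
    top_n_words=10,
    min_topic_size=n_neighbors,
    verbose=True,
)
topic_model.fit(docs, embeddings)

# 2-D projection for the documents plot, mirroring the per-call reduce_umap_model.
reduce_umap_model = UMAP(
    n_neighbors=n_neighbors, n_components=2, min_dist=0.0,
    metric="cosine", random_state=42,
)
reduced_embeddings = reduce_umap_model.fit_transform(embeddings)

Deriving min_cluster_size and min_topic_size from the same n_neighbors value keeps the clustering granularity tied to the amount of data being fitted, which appears to be the intent behind the TODO about keeping min_topic_size coherent with N_NEIGHBORS.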