jharrison27 committed
Commit 335999f
1 Parent(s): d7135ca

include new similarity metric

Files changed (1)
  1. app.py +24 -33
app.py CHANGED
@@ -1,11 +1,13 @@
 import streamlit as st
 from transformers import pipeline
+from sklearn.metrics.pairwise import cosine_similarity
+from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
 from sklearn.cluster import KMeans
 import numpy as np
 
 # Mock data
 mock_words = [
-    "apple", "banana", "grape", "date", # Fruits
+    "apple", "banana", "cherry", "date", # Fruits
     "car", "truck", "bus", "bicycle", # Vehicles
     "red", "blue", "green", "yellow", # Colors
     "cat", "dog", "rabbit", "hamster" # Pets
@@ -35,42 +37,29 @@ def embed_words(words, model_name):
     embeddings = embedder(words)
     return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
 
-def iterative_clustering(words, model_name):
+def iterative_clustering(words, model_name, method):
     remaining_words = words[:]
     grouped_words = []
-
     while len(remaining_words) >= 4:
         embeddings = embed_words(remaining_words, model_name)
-        kmeans = KMeans(n_clusters=min(4, len(remaining_words) // 4), random_state=0).fit(embeddings)
-        clusters = {i: [] for i in range(kmeans.n_clusters)}
-        for word, label in zip(remaining_words, kmeans.labels_):
-            if len(clusters[label]) < 4:
-                clusters[label].append(word)
-
-        # Select the most cohesive cluster
-        best_cluster, best_idx = select_most_cohesive_cluster(clusters, kmeans, embeddings)
-
-        # Store the best cluster and remove those words
-        grouped_words.append(best_cluster)
-        remaining_words = [word for word in remaining_words if word not in best_cluster]
-
+        if method == 'Cosine Similarity':
+            # Use cosine similarity and hierarchical clustering
+            sim_matrix = cosine_similarity(embeddings)
+            Z = linkage(sim_matrix, 'average', metric='cosine')
+            labels = fcluster(Z, t=4, criterion='maxclust')
+        elif method == 'K-means':
+            # Use K-means clustering
+            kmeans = KMeans(n_clusters=4, random_state=0).fit(embeddings)
+            labels = kmeans.labels_ + 1 # Ensure labels match those used in hierarchical clustering
+        # Find the first complete cluster of exactly four items
+        for i in range(1, 5):
+            cluster = [word for idx, word in enumerate(remaining_words) if labels[idx] == i]
+            if len(cluster) == 4:
+                grouped_words.append(cluster)
+                remaining_words = [word for word in remaining_words if word not in cluster]
+                break
     return grouped_words
 
-def select_most_cohesive_cluster(clusters, kmeans_model, embeddings):
-    min_distance = float('inf')
-    best_cluster = None
-    best_idx = -1
-    for idx, cluster in clusters.items():
-        if len(cluster) == 4:
-            cluster_embeddings = embeddings[[i for i, label in enumerate(kmeans_model.labels_) if label == idx]]
-            centroid = kmeans_model.cluster_centers_[idx]
-            distance = np.mean(np.linalg.norm(cluster_embeddings - centroid, axis=1))
-            if distance < min_distance:
-                min_distance = distance
-                best_cluster = cluster
-                best_idx = idx
-    return best_cluster, best_idx
-
 def display_clusters(clusters):
     for i, words in enumerate(clusters):
         st.markdown(f"### Group {i+1}")
@@ -79,14 +68,16 @@ def display_clusters(clusters):
 def main():
     st.title("NYT Connections Solver")
     st.write("This app demonstrates solving the NYT Connections game using word embeddings and clustering.")
-    st.write("Select an embedding model from the dropdown menu and click 'Generate Clusters' to see the grouped words.")
+    st.write("Select an embedding model and a clustering method from the dropdown menus, then click 'Generate Clusters' to see the grouped words.")
 
     # Dropdown menu for selecting the embedding model
     model_name = st.selectbox("Select Embedding Model", list(models.keys()))
+    # Dropdown menu for selecting the clustering method
+    clustering_method = st.selectbox("Select Clustering Method", ['K-means', 'Cosine Similarity'])
 
     if st.button("Generate Clusters"):
         with st.spinner("Generating clusters..."):
-            clusters = iterative_clustering(mock_words, model_name)
+            clusters = iterative_clustering(mock_words, model_name, clustering_method)
             display_clusters(clusters)
 
 if __name__ == "__main__":
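
Below is a minimal, self-contained sketch of the new 'Cosine Similarity' path on the mock word list. The synthetic theme vectors and noise level are assumptions standing in for the app's mean-pooled transformer embeddings, so the snippet runs without downloading a model; the clustering calls mirror the committed code.

# Standalone sketch of the cosine-similarity path (toy embeddings, no model download).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, fcluster

mock_words = [
    "apple", "banana", "cherry", "date",   # Fruits
    "car", "truck", "bus", "bicycle",      # Vehicles
    "red", "blue", "green", "yellow",      # Colors
    "cat", "dog", "rabbit", "hamster"      # Pets
]

# Synthetic embeddings: one random direction per theme plus small noise,
# standing in for the app's mean-pooled transformer features (assumption).
rng = np.random.default_rng(0)
themes = rng.normal(size=(4, 32))
embeddings = np.vstack([themes[i // 4] + 0.05 * rng.normal(size=32)
                        for i in range(len(mock_words))])

# Same steps as the committed code: similarity matrix -> average linkage -> 4 flat clusters.
sim_matrix = cosine_similarity(embeddings)
Z = linkage(sim_matrix, 'average', metric='cosine')
labels = fcluster(Z, t=4, criterion='maxclust')

# Keep the first label that covers exactly four words, as iterative_clustering does.
for i in range(1, 5):
    cluster = [w for idx, w in enumerate(mock_words) if labels[idx] == i]
    if len(cluster) == 4:
        print("First complete group:", cluster)
        break

Note that linkage receives the full similarity matrix here, so SciPy treats each row (a word's similarity profile against every other word) as the feature vector it clusters under the cosine metric; closely related words have nearly identical rows, which is why the grouping comes out cleanly on this toy data.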