jharrison27 committed
Commit 335999f
1 Parent(s): d7135ca

include new similarity metric

Files changed (1)
  1. app.py +24 -33
app.py CHANGED
@@ -1,11 +1,13 @@
 import streamlit as st
 from transformers import pipeline
+from sklearn.metrics.pairwise import cosine_similarity
+from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
 from sklearn.cluster import KMeans
 import numpy as np
 
 # Mock data
 mock_words = [
-    "apple", "banana", "grape", "date", # Fruits
+    "apple", "banana", "cherry", "date", # Fruits
     "car", "truck", "bus", "bicycle", # Vehicles
     "red", "blue", "green", "yellow", # Colors
     "cat", "dog", "rabbit", "hamster" # Pets
@@ -35,42 +37,29 @@ def embed_words(words, model_name):
     embeddings = embedder(words)
     return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
 
-def iterative_clustering(words, model_name):
+def iterative_clustering(words, model_name, method):
     remaining_words = words[:]
     grouped_words = []
-
     while len(remaining_words) >= 4:
         embeddings = embed_words(remaining_words, model_name)
-        kmeans = KMeans(n_clusters=min(4, len(remaining_words) // 4), random_state=0).fit(embeddings)
-        clusters = {i: [] for i in range(kmeans.n_clusters)}
-        for word, label in zip(remaining_words, kmeans.labels_):
-            if len(clusters[label]) < 4:
-                clusters[label].append(word)
-
-        # Select the most cohesive cluster
-        best_cluster, best_idx = select_most_cohesive_cluster(clusters, kmeans, embeddings)
-
-        # Store the best cluster and remove those words
-        grouped_words.append(best_cluster)
-        remaining_words = [word for word in remaining_words if word not in best_cluster]
-
+        if method == 'Cosine Similarity':
+            # Use cosine similarity and hierarchical clustering
+            sim_matrix = cosine_similarity(embeddings)
+            Z = linkage(sim_matrix, 'average', metric='cosine')
+            labels = fcluster(Z, t=4, criterion='maxclust')
+        elif method == 'K-means':
+            # Use K-means clustering
+            kmeans = KMeans(n_clusters=4, random_state=0).fit(embeddings)
+            labels = kmeans.labels_ + 1 # Ensure labels match those used in hierarchical clustering
+        # Find the first complete cluster of exactly four items
+        for i in range(1, 5):
+            cluster = [word for idx, word in enumerate(remaining_words) if labels[idx] == i]
+            if len(cluster) == 4:
+                grouped_words.append(cluster)
+                remaining_words = [word for word in remaining_words if word not in cluster]
+                break
     return grouped_words
 
-def select_most_cohesive_cluster(clusters, kmeans_model, embeddings):
-    min_distance = float('inf')
-    best_cluster = None
-    best_idx = -1
-    for idx, cluster in clusters.items():
-        if len(cluster) == 4:
-            cluster_embeddings = embeddings[[i for i, label in enumerate(kmeans_model.labels_) if label == idx]]
-            centroid = kmeans_model.cluster_centers_[idx]
-            distance = np.mean(np.linalg.norm(cluster_embeddings - centroid, axis=1))
-            if distance < min_distance:
-                min_distance = distance
-                best_cluster = cluster
-                best_idx = idx
-    return best_cluster, best_idx
-
 def display_clusters(clusters):
     for i, words in enumerate(clusters):
         st.markdown(f"### Group {i+1}")
@@ -79,14 +68,16 @@ def display_clusters(clusters):
 def main():
     st.title("NYT Connections Solver")
     st.write("This app demonstrates solving the NYT Connections game using word embeddings and clustering.")
-    st.write("Select an embedding model from the dropdown menu and click 'Generate Clusters' to see the grouped words.")
+    st.write("Select an embedding model and a clustering method from the dropdown menus, then click 'Generate Clusters' to see the grouped words.")
 
     # Dropdown menu for selecting the embedding model
     model_name = st.selectbox("Select Embedding Model", list(models.keys()))
+    # Dropdown menu for selecting the clustering method
+    clustering_method = st.selectbox("Select Clustering Method", ['K-means', 'Cosine Similarity'])
 
     if st.button("Generate Clusters"):
         with st.spinner("Generating clusters..."):
-            clusters = iterative_clustering(mock_words, model_name)
+            clusters = iterative_clustering(mock_words, model_name, clustering_method)
             display_clusters(clusters)
 
 if __name__ == "__main__":
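
Below is a minimal, self-contained sketch of the new 'Cosine Similarity' path on the mock word list. The synthetic theme vectors and noise level are assumptions standing in for the app's mean-pooled transformer embeddings, so the snippet runs without downloading a model; the clustering calls mirror the committed code.

# Standalone sketch of the cosine-similarity path (toy embeddings, no model download).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, fcluster

mock_words = [
    "apple", "banana", "cherry", "date",   # Fruits
    "car", "truck", "bus", "bicycle",      # Vehicles
    "red", "blue", "green", "yellow",      # Colors
    "cat", "dog", "rabbit", "hamster"      # Pets
]

# Synthetic embeddings: one random direction per theme plus small noise,
# standing in for the app's mean-pooled transformer features (assumption).
rng = np.random.default_rng(0)
themes = rng.normal(size=(4, 32))
embeddings = np.vstack([themes[i // 4] + 0.05 * rng.normal(size=32)
                        for i in range(len(mock_words))])

# Same steps as the committed code: similarity matrix -> average linkage -> 4 flat clusters.
sim_matrix = cosine_similarity(embeddings)
Z = linkage(sim_matrix, 'average', metric='cosine')
labels = fcluster(Z, t=4, criterion='maxclust')

# Keep the first label that covers exactly four words, as iterative_clustering does.
for i in range(1, 5):
    cluster = [w for idx, w in enumerate(mock_words) if labels[idx] == i]
    if len(cluster) == 4:
        print("First complete group:", cluster)
        break

Note that linkage receives the full similarity matrix here, so SciPy treats each row (a word's similarity profile against every other word) as the feature vector it clusters under the cosine metric; closely related words have nearly identical rows, which is why the grouping comes out cleanly on this toy data.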