Spaces:
Runtime error
Runtime error
Commit
·
335999f
1
Parent(s):
d7135ca
include new similarity metric
Browse files
app.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
import streamlit as st
|
2 |
from transformers import pipeline
|
|
|
|
|
3 |
from sklearn.cluster import KMeans
|
4 |
import numpy as np
|
5 |
|
6 |
# Mock data
|
7 |
mock_words = [
|
8 |
-
"apple", "banana", "
|
9 |
"car", "truck", "bus", "bicycle", # Vehicles
|
10 |
"red", "blue", "green", "yellow", # Colors
|
11 |
"cat", "dog", "rabbit", "hamster" # Pets
|
@@ -35,42 +37,29 @@ def embed_words(words, model_name):
|
|
35 |
embeddings = embedder(words)
|
36 |
return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
|
37 |
|
38 |
-
def iterative_clustering(words, model_name):
|
39 |
remaining_words = words[:]
|
40 |
grouped_words = []
|
41 |
-
|
42 |
while len(remaining_words) >= 4:
|
43 |
embeddings = embed_words(remaining_words, model_name)
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
#
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
57 |
return grouped_words
|
58 |
|
59 |
-
def select_most_cohesive_cluster(clusters, kmeans_model, embeddings, group_size=4):
    """Pick the complete cluster whose members lie closest to their centroid.

    Args:
        clusters: Mapping from cluster index to the list of items assigned
            to that cluster.
        kmeans_model: Fitted clustering model exposing ``labels_`` (one label
            per row of ``embeddings``) and ``cluster_centers_`` (indexable by
            cluster index).
        embeddings: Array-like holding one embedding vector per row, in the
            same order as ``kmeans_model.labels_``.
        group_size: Only clusters holding exactly this many items are
            considered. Defaults to 4, the value that used to be hard-coded.

    Returns:
        A ``(best_cluster, best_idx)`` tuple for the qualifying cluster with
        the smallest mean Euclidean distance between its members and its
        centroid, or ``(None, -1)`` when no cluster has ``group_size`` items.
    """
    # Normalise once so boolean-mask indexing works even for plain lists.
    embeddings = np.asarray(embeddings)
    labels = np.asarray(kmeans_model.labels_)

    min_distance = float('inf')
    best_cluster = None
    best_idx = -1
    for idx, cluster in clusters.items():
        if len(cluster) != group_size:
            continue
        centroid = kmeans_model.cluster_centers_[idx]
        member_embeddings = embeddings[labels == idx]
        # Cohesion score: average member-to-centroid distance (lower is tighter).
        distance = np.mean(np.linalg.norm(member_embeddings - centroid, axis=1))
        if distance < min_distance:
            min_distance = distance
            best_cluster = cluster
            best_idx = idx
    return best_cluster, best_idx
|
73 |
-
|
74 |
def display_clusters(clusters):
|
75 |
for i, words in enumerate(clusters):
|
76 |
st.markdown(f"### Group {i+1}")
|
@@ -79,14 +68,16 @@ def display_clusters(clusters):
|
|
79 |
def main():
|
80 |
st.title("NYT Connections Solver")
|
81 |
st.write("This app demonstrates solving the NYT Connections game using word embeddings and clustering.")
|
82 |
-
st.write("Select an embedding model from the dropdown
|
83 |
|
84 |
# Dropdown menu for selecting the embedding model
|
85 |
model_name = st.selectbox("Select Embedding Model", list(models.keys()))
|
|
|
|
|
86 |
|
87 |
if st.button("Generate Clusters"):
|
88 |
with st.spinner("Generating clusters..."):
|
89 |
-
clusters = iterative_clustering(mock_words, model_name)
|
90 |
display_clusters(clusters)
|
91 |
|
92 |
if __name__ == "__main__":
|
|
|
1 |
import streamlit as st
|
2 |
from transformers import pipeline
|
3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
5 |
from sklearn.cluster import KMeans
|
6 |
import numpy as np
|
7 |
|
8 |
# Mock data
|
9 |
mock_words = [
|
10 |
+
"apple", "banana", "cherry", "date", # Fruits
|
11 |
"car", "truck", "bus", "bicycle", # Vehicles
|
12 |
"red", "blue", "green", "yellow", # Colors
|
13 |
"cat", "dog", "rabbit", "hamster" # Pets
|
|
|
37 |
embeddings = embedder(words)
|
38 |
return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
|
39 |
|
40 |
+
def iterative_clustering(words, model_name, method):
    """Greedily peel groups of four words off ``words`` using clustering.

    On each round the remaining words are embedded with ``embed_words`` and
    split into (at most) four clusters. The first cluster that holds exactly
    four words is recorded as a group and removed, and the process repeats on
    what is left.

    Args:
        words: Sequence of words to group. It is copied, not modified.
        model_name: Embedding model identifier forwarded to ``embed_words``.
        method: ``'K-means'`` or ``'Cosine Similarity'`` (average-linkage
            hierarchical clustering on cosine distance).

    Returns:
        A list of four-word lists in the order the groups were found. Words
        that could not be placed in a group of four are left out.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('Cosine Similarity', 'K-means'):
        # Previously an unknown method surfaced as a NameError on ``labels``.
        raise ValueError(f"Unknown clustering method: {method!r}")

    remaining_words = list(words)
    grouped_words = []
    while len(remaining_words) >= 4:
        if len(remaining_words) == 4:
            # Splitting four words into four clusters practically never gives
            # a cluster of four, which used to make this loop spin forever.
            # The last four words simply form the final group.
            grouped_words.append(remaining_words)
            break

        embeddings = embed_words(remaining_words, model_name)
        if method == 'Cosine Similarity':
            # Pass the observation vectors themselves: linkage computes the
            # pairwise cosine distances internally. Feeding it a square
            # similarity matrix would treat each matrix row as an observation.
            Z = linkage(embeddings, 'average', metric='cosine')
            labels = fcluster(Z, t=4, criterion='maxclust')
        else:
            kmeans = KMeans(n_clusters=4, random_state=0).fit(embeddings)
            labels = kmeans.labels_ + 1  # 1-based, like fcluster's labels

        # Find the first complete cluster of exactly four items.
        for i in range(1, 5):
            members = [idx for idx, label in enumerate(labels) if label == i]
            if len(members) == 4:
                grouped_words.append([remaining_words[idx] for idx in members])
                # Drop by position so duplicate words elsewhere are kept.
                taken = set(members)
                remaining_words = [
                    word for idx, word in enumerate(remaining_words)
                    if idx not in taken
                ]
                break
        else:
            # No cluster of four this round: another pass would repeat the
            # same clustering on the same words, so stop instead of looping.
            break
    return grouped_words
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
def display_clusters(clusters):
|
64 |
for i, words in enumerate(clusters):
|
65 |
st.markdown(f"### Group {i+1}")
|
|
|
68 |
def main():
    """Render the Streamlit page and, on request, show the clustered groups."""
    st.title("NYT Connections Solver")
    st.write("This app demonstrates solving the NYT Connections game using word embeddings and clustering.")
    st.write("Select an embedding model and a clustering method from the dropdown menus, then click 'Generate Clusters' to see the grouped words.")

    # Two dropdowns: the embedding model, then the clustering method.
    model_name = st.selectbox("Select Embedding Model", list(models.keys()))
    clustering_method = st.selectbox("Select Clustering Method", ['K-means', 'Cosine Similarity'])

    # Nothing more to draw until the user presses the button.
    if not st.button("Generate Clusters"):
        return

    with st.spinner("Generating clusters..."):
        groups = iterative_clustering(mock_words, model_name, clustering_method)
        display_clusters(groups)
|
82 |
|
83 |
if __name__ == "__main__":
|