|
from sklearn.cluster import AgglomerativeClustering |
|
from sklearn.metrics import pairwise_distances |
|
import numpy as np |
|
|
|
def get_recommendations_from_clustering(embeddings_unlabeled, embeddings_positive, distance_threshold=0.2, top_n=20):
    """Recommend unlabeled items, cluster by cluster, by closeness to the positives.

    The unlabeled embeddings are grouped with average-linkage agglomerative
    clustering on their pairwise cosine distances. Within each cluster the
    members are ranked by their cosine distance to the nearest positive
    embedding, and the closest ``top_n`` members of every cluster are kept.

    Args:
        embeddings_unlabeled: Embeddings of the candidate items, one row per
            item (anything ``pairwise_distances`` accepts).
        embeddings_positive: Embeddings of the known positive items, one row
            per item, with the same feature dimension as the candidates.
        distance_threshold: Cosine-distance linkage threshold at or above
            which clusters are no longer merged.
        top_n: Maximum number of items taken from EACH cluster, so the result
            may hold up to ``top_n * number_of_clusters`` indices.

    Returns:
        A list of row indices into ``embeddings_unlabeled``, grouped by
        ascending cluster label and, within a cluster, ordered from the
        closest to the farthest from the positives.
    """
    n_unlabeled = np.shape(embeddings_unlabeled)[0]
    if n_unlabeled < 2:
        # Agglomerative clustering needs at least two samples. Zero or one
        # candidate forms at most one trivial cluster, so no ranking is needed.
        return list(np.arange(n_unlabeled)[:top_n])

    distances_to_positive = pairwise_distances(embeddings_unlabeled, embeddings_positive, metric='cosine')
    distances_between_unlabeled = pairwise_distances(embeddings_unlabeled, metric='cosine')

    clustering_params = dict(n_clusters=None, distance_threshold=distance_threshold, linkage='average')
    try:
        # scikit-learn >= 1.2 names this keyword `metric`.
        clustering = AgglomerativeClustering(metric='precomputed', **clustering_params)
    except TypeError:
        # Older scikit-learn releases only know the former name `affinity`.
        clustering = AgglomerativeClustering(affinity='precomputed', **clustering_params)
    cluster_labels = clustering.fit(distances_between_unlabeled).labels_

    # Distance from every candidate to its nearest positive; computed once
    # here rather than once per cluster.
    nearest_positive_distance = distances_to_positive.min(axis=1)

    recommended_indices = []
    for cluster_id in np.unique(cluster_labels):
        member_indices = np.where(cluster_labels == cluster_id)[0]
        closest_first = member_indices[np.argsort(nearest_positive_distance[member_indices])]
        recommended_indices.extend(closest_first[:top_n])
    return recommended_indices