Spaces:
Sleeping
Sleeping
| from typing import Dict, List | |
| import pandas as pd | |
| import numpy as np | |
def load_data():
    """Load the GMM point-tracking CSV and expose the row position as a column.

    Returns:
        The DataFrame read from ``gmm_point_tracking_with_centroids.csv``
        (resolved against the current working directory) after
        ``reset_index()``, i.e. with the previous index moved into a column.
    """
    tracking_df = pd.read_csv("gmm_point_tracking_with_centroids.csv")
    return tracking_df.reset_index()
def _parse_probabilities(raw):
    """Parse a stringified list such as ``"[0.1, 0.9]"`` into a list of floats.

    Values may be separated by commas, whitespace, or both, so ``"[0.1,0.9]"``
    and the numpy-style ``"[0.1 0.9]"`` are accepted as well. ``"[]"`` yields
    an empty list.
    """
    inner = raw.strip().strip("[]")
    # Normalise commas to whitespace so a single split() handles every variant.
    return [float(token) for token in inner.replace(",", " ").split()]


def process_data(df, iteration, num_samples):
    """Randomly sample papers and parse their cluster probabilities.

    Args:
        df: DataFrame with the columns ``id``, ``title``, ``keywords``,
            ``author`` and ``probabilities`` (a stringified list of floats).
            An ``index`` column is read from the sampled frame; if ``df`` does
            not have one, ``reset_index()`` creates it from the pre-sampling
            index.
        iteration: Used as ``random_state`` for the sampling, so the same
            iteration always yields the same sample.
        num_samples: Number of rows to sample.

    Returns:
        A tuple ``(sampled_df, probabilities, paper_attributes)`` where
        ``probabilities`` holds one list of floats per sampled paper and
        ``paper_attributes`` holds one dict per sampled paper with the keys
        ``order``, ``index``, ``id``, ``title``, ``keywords`` and ``author``.

    Raises:
        ValueError: If a probability token cannot be converted to ``float``,
            or if pandas rejects ``num_samples`` for this frame.
    """
    # Randomly sample papers; reset_index renumbers the rows 0..n-1 so the
    # iteration label below is the paper's position within the sample.
    sampled_df = df.sample(n=num_samples, random_state=iteration).reset_index()

    # Collect each paper's per-cluster probabilities and display attributes.
    probabilities = []
    paper_attributes = []
    for idx, row in sampled_df.iterrows():
        probabilities.append(_parse_probabilities(row["probabilities"]))
        paper_attributes.append(
            {
                "order": idx,
                "index": row["index"],
                "id": row["id"],
                "title": row["title"],
                "keywords": row["keywords"],
                "author": row["author"],
            }
        )
    return sampled_df, probabilities, paper_attributes
def _select_clusters(prob, top_k, top_p):
    """Return the cluster indices chosen for one paper, most probable first."""
    if top_k is None and top_p is None:
        raise TypeError("build_hyperedges requires either top_k or top_p")

    prob_array = np.asarray(prob)
    # Cluster indices ordered from highest to lowest probability.
    ranked = np.argsort(prob_array)[::-1]

    if top_k is not None:
        # Clamp at zero so top_k=0 selects nothing instead of everything.
        return ranked[: max(top_k, 0)]

    # Nucleus-style selection: take clusters in descending probability until
    # the running total exceeds the threshold.
    # NOTE(review): the original comment described the stop condition as
    # "first time >= p", but the code requires the sum to exceed
    # top_p + 1e-4; that threshold is kept unchanged here - confirm intent.
    selected = []
    cumulative_prob = 0.0
    for cluster in ranked:
        selected.append(cluster)
        cumulative_prob += prob_array[cluster]
        if cumulative_prob > top_p + 1e-4:
            break
    return selected


def build_hyperedges(
    probabilities,
    paper_attributes: List[Dict[str, str]],
    display_attribute_name: str,
    top_k: int = None,
    top_p: float = None,
) -> Dict[str, List[str]]:
    """Group papers into hyperedges, one hyperedge per selected cluster.

    Args:
        probabilities: One sequence of probabilities per paper; position ``i``
            in a sequence is treated as the probability of cluster ``i``.
        paper_attributes: One attribute dict per paper, aligned with
            ``probabilities``.
        display_attribute_name: Key of the attribute used as the paper's
            label. For ``"index"`` and ``"order"`` the label is rendered as
            ``"Paper <value>"``; any other attribute is used verbatim.
        top_k: If given, each paper joins its ``top_k`` most probable
            clusters. Takes precedence over ``top_p``.
        top_p: Used when ``top_k`` is ``None``: each paper joins its most
            probable clusters, highest first, until their cumulative
            probability exceeds ``top_p + 1e-4`` (or all clusters are used).

    Returns:
        A dict mapping ``"Cluster <i>"`` to the labels of the papers assigned
        to cluster ``i``, in input order.

    Raises:
        TypeError: If a paper is processed while both ``top_k`` and ``top_p``
            are ``None``.
    """
    hyperedges: Dict[str, List[str]] = {}
    for prob, paper_attr in zip(probabilities, paper_attributes):
        if display_attribute_name in ("index", "order"):
            display_attribute = f"Paper {paper_attr[display_attribute_name]}"
        else:
            display_attribute = paper_attr[display_attribute_name]

        for cluster in _select_clusters(prob, top_k, top_p):
            hyperedges.setdefault(f"Cluster {cluster}", []).append(display_attribute)
    return hyperedges