import numpy as np from sklearn.metrics.pairwise import cosine_similarity import statistics import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report, confusion_matrix, \ accuracy_score, roc_auc_score, roc_curve, f1_score, recall_score, precision_score import matplotlib.pyplot as plt import copy from sklearn import preprocessing, tree from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.tree import DecisionTreeClassifier from scipy.spatial import distance from sklearn.naive_bayes import GaussianNB import itertools import os from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt import random from sklearn.utils import shuffle from imblearn.under_sampling import NearMiss,TomekLinks from imblearn.over_sampling import SMOTE from collections import Counter from imblearn.combine import SMOTETomek, SMOTEENN from sklearn.model_selection import StratifiedKFold from imblearn.pipeline import make_pipeline from matplotlib import pyplot from scipy import interp from sklearn.metrics import roc_curve,auc #keras from keras.models import Sequential from keras.layers import Dense, SimpleRNN, LSTM # Read ComE node embs per timestep [id, emb] folder = os.listdir('ComE_per_timestep/embs') path = 'ComE_per_timestep/embs' ComE_id_embs = [] for file in folder: ComE_id_embs.append(np.genfromtxt(os.path.join(path, file), dtype=None).tolist()) # Read ComE labels per timestep folder = os.listdir('ComE_per_timestep/labels_pred') path = 'ComE_per_timestep/labels_pred' ComE_lbls = [] for file in folder: ComE_lbls.append(np.genfromtxt(os.path.join(path, file), dtype=None).tolist()) # Node ids per timestep node_ids = [] for step in ComE_id_embs: tmp = [id_emb[0] for id_emb in step] node_ids.append(tmp) # [Node_id, clr] per timestep id_clr = [] for i in range(len(node_ids)): tmp = {} for ind,node in enumerate(node_ids[i]): tmp[node] = ComE_lbls[i][ind] id_clr.append(tmp) # Clustered nodes per timestep clustered_nodes_init = [] for ind,i in enumerate(id_clr): clrids_uniq = set(i.values()) d = {} for clrid in clrids_uniq: d[clrid] = [k for k in i.keys() if i[k] == clrid] clustered_nodes_init.append(d) clustered_nodes = [] for s in clustered_nodes_init: per_step = [] for k,v in sorted(s.items()): per_step.append(v) clustered_nodes.append(per_step) # ------------------------------ READ FEATURES ------------------------------- # ComE FEATURES folder = os.listdir('ComE_features_per_timestep/') path = 'ComE_features_per_timestep/' id_ComE_feats_clr = [] id_ComE_feats_out = [] id_ComE_feats_gbl = [] id_ComE_feats_clrout = [] id_ComE_feats_clrgbl = [] id_ComE_feats_all = [] for file in folder: df_ComE = pd.read_csv(os.path.join(path,file), names=['node_id', \ 'distin_med_eucl', 'distin_med_cos', 'distin_med_l1',\ 'distout_med_eucl', 'distout_med_cos', 'distout_med_l1',\ 'distin_eucl_max', 'distin_eucl_min', 'distin_eucl_avg',\ 'distin_cos_max', 'distin_cos_min', 'distin_cos_avg',\ 'distin_l1_max', 'distin_l1_min', 'distin_l1_avg',\ 'distout_eucl_max', 'distout_eucl_min', 'distout_eucl_avg',\ 'distout_cos_max', 'distout_cos_min', 'distout_cos_avg',\ 'distout_l1_max', 'distout_l1_min', 'distout_l1_avg', \ 'dist_glob_max_eucl', 'dist_glob_min_eucl', 'dist_glob_avg_eucl', \ 'dist_glob_max_cos', 'dist_glob_min_cos', 'dist_glob_avg_cos', \ 'dist_glob_max_l1', 'dist_glob_min_l1', 'dist_glob_avg_l1'], skiprows=1) df_ComE_clr = df_ComE[['node_id', 'distin_med_eucl', \ 'distin_eucl_max', 'distin_eucl_min', 'distin_eucl_avg']] df_ComE_out = df_ComE[['node_id', 'distout_med_eucl', \ 'distout_eucl_max', 'distout_eucl_min', 'distout_eucl_avg']] df_ComE_gbl = df_ComE[['node_id', 'distout_med_eucl', \ 'dist_glob_max_eucl', 'dist_glob_min_eucl', 'dist_glob_avg_eucl']] df_ComE_clrout = df_ComE[['node_id', 'distin_med_eucl', 'distout_med_eucl', \ 'distin_eucl_max', 'distin_eucl_min', 'distin_eucl_avg', \ 'distout_eucl_max', 'distout_eucl_min', 'distout_eucl_avg']] df_ComE_clrgbl = df_ComE[['node_id', 'distin_med_eucl', \ 'distin_eucl_max', 'distin_eucl_min', 'distin_eucl_avg', \ 'dist_glob_max_eucl', 'dist_glob_min_eucl', 'dist_glob_avg_eucl']] df_ComE_all = df_ComE[['node_id', 'distin_med_eucl', 'distout_med_eucl', \ 'distin_eucl_max', 'distin_eucl_min', 'distin_eucl_avg', \ 'distout_eucl_max', 'distout_eucl_min', 'distout_eucl_avg', \ 'dist_glob_max_eucl', 'dist_glob_min_eucl', 'dist_glob_avg_eucl']] df_ComE_clr_lst = df_ComE_clr.values.tolist() df_ComE_out_lst = df_ComE_out.values.tolist() df_ComE_gbl_lst = df_ComE_gbl.values.tolist() df_ComE_clrout_lst = df_ComE_clrout.values.tolist() df_ComE_clrgbl_lst = df_ComE_clrgbl.values.tolist() df_ComE_all_lst = df_ComE_all.values.tolist() id_ComE_feats_clr.append(df_ComE_clr_lst) id_ComE_feats_out.append(df_ComE_out_lst) id_ComE_feats_gbl.append(df_ComE_gbl_lst) id_ComE_feats_clrout.append(df_ComE_clrout_lst) id_ComE_feats_clrgbl.append(df_ComE_clrgbl_lst) id_ComE_feats_all.append(df_ComE_all_lst) #sort by node id for i in id_ComE_feats_clr: i.sort() for i in id_ComE_feats_out: i.sort() for i in id_ComE_feats_gbl: i.sort() for i in id_ComE_feats_clrout: i.sort() for i in id_ComE_feats_clrgbl: i.sort() for i in id_ComE_feats_all: i.sort() # Classic FEATURES folder = os.listdir('classic_features_per_timestep/classic_features') path = 'classic_features_per_timestep/classic_features' id_classic_clr = [] id_classic_gbl = [] id_classic_all = [] id_classic_nodeg = [] for file in folder: df_classic = pd.read_csv(os.path.join(path,file), names=['node_id', \ 'degree', 'betweenness', 'closeness', 'eigenvector', \ 'degree_ntwk', 'betweenness_ntwk', 'closeness_ntwk', 'eigenvector_ntwk'], \ skiprows=1) df_classic_clr = df_classic[['node_id', \ 'degree', 'betweenness', 'closeness', 'eigenvector']] df_classic_gbl = df_classic[['node_id', \ 'degree_ntwk', 'betweenness_ntwk', 'closeness_ntwk', 'eigenvector_ntwk']] df_classic_nodeg = pd.read_csv(os.path.join(path,file), names=['node_id', \ 'betweenness', 'closeness', 'eigenvector', \ 'betweenness_ntwk', 'closeness_ntwk', 'eigenvector_ntwk'], \ skiprows=1) df_classic_all_lst = df_classic.values.tolist() df_classic_clr_lst = df_classic_clr.values.tolist() id_classic_gbl_lst = df_classic_gbl.values.tolist() id_classic_nodeg_lst = df_classic_nodeg.values.tolist() id_classic_all.append(df_classic_all_lst) id_classic_clr.append(df_classic_clr_lst) id_classic_gbl.append(id_classic_gbl_lst) id_classic_nodeg.append(id_classic_nodeg_lst) #sort by node id for i in id_classic_all: i.sort() for i in id_classic_clr: i.sort() for i in id_classic_gbl: i.sort() for i in id_classic_nodeg: i.sort() id_combo_ComE_clrout_classic_all = [] for ind,s in enumerate(id_ComE_feats_clrout): temp = [] for inx,row in enumerate(s): tmp = row[:] tmp.extend(id_classic_all[ind][inx][1:]) temp.append(tmp) id_combo_ComE_clrout_classic_all.append(temp) #-------------------------------- MATCHING ------------------------------------ # [clr_x_tn, clr_y_tn+1, common_nodes_tn_tn+1] #print(clustered_nodes[0]) matching = [] a = 0 while a