|
|
|
|
|
|
|
|
|
|
|
import logging |
|
import os |
|
import sys |
|
|
|
import numpy as np |
|
from sklearn.cluster import MiniBatchKMeans |
|
|
|
import joblib |
|
|
|
# Configure the root logger once, at import time: records go to standard
# output, and the threshold is taken from the LOGLEVEL environment variable
# (upper-cased), defaulting to INFO when the variable is not set.
logging.basicConfig(
    stream=sys.stdout,
    level=os.getenv("LOGLEVEL", "INFO").upper(),
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)

# Named logger for this script; its name fills the %(name)s field above.
logger = logging.getLogger("learn_kmeans")
|
|
|
|
|
def get_km_model(
    n_clusters,
    init,
    max_iter,
    batch_size,
    tol,
    max_no_improvement,
    n_init,
    reassignment_ratio,
):
    """Build an unfitted ``MiniBatchKMeans`` estimator.

    Each of the eight arguments is forwarded unchanged to the constructor
    keyword of the same name.  Three further constructor settings are fixed
    here and cannot be overridden by the caller: ``verbose=1``,
    ``compute_labels=False`` and ``init_size=None``.

    Returns:
        The newly constructed ``MiniBatchKMeans`` instance.
    """
    # Settings that come straight from the caller.
    caller_settings = {
        "n_clusters": n_clusters,
        "init": init,
        "max_iter": max_iter,
        "batch_size": batch_size,
        "tol": tol,
        "max_no_improvement": max_no_improvement,
        "n_init": n_init,
        "reassignment_ratio": reassignment_ratio,
    }
    # Settings this script always uses.
    fixed_settings = {
        "verbose": 1,
        "compute_labels": False,
        "init_size": None,
    }
    return MiniBatchKMeans(**caller_settings, **fixed_settings)
|
|
|
|
|
def load_feature_shard(feat_dir, split, nshard, rank, percent):
    """Load the features of one shard, optionally sampling whole utterances.

    Two files are read: ``{feat_dir}/{split}_{rank}_{nshard}.npy`` holds the
    feature array, and the matching ``.len`` file holds one integer per line.
    The i-th integer is the number of consecutive rows of the feature array
    that belong to the i-th utterance.

    Args:
        feat_dir: Directory containing the shard files.
        split: File-name prefix of the shard files.
        nshard: Total number of shards (part of the file name).
        rank: Index of the shard to load (part of the file name).
        percent: A negative value returns the whole shard.  Otherwise it is
            the fraction of utterances to draw, without replacement, using
            NumPy's global random generator; the utterance count is rounded
            up.

    Returns:
        For ``percent < 0``, the read-only memory-mapped feature array.
        Otherwise an in-memory array with the rows of the sampled utterances
        concatenated along axis 0.  When no utterance is selected
        (``percent == 0`` or an empty ``.len`` file) the result has zero rows
        and keeps the trailing dimensions and dtype of the stored features.
    """
    shard_stem = f"{feat_dir}/{split}_{rank}_{nshard}"
    feat_path = f"{shard_stem}.npy"
    leng_path = f"{shard_stem}.len"

    with open(leng_path, "r") as f:
        lengs = [int(line.rstrip()) for line in f]

    # mmap_mode="r" opens the array read-only without reading it eagerly.
    feat = np.load(feat_path, mmap_mode="r")
    if percent < 0:
        return feat

    nsample = int(np.ceil(len(lengs) * percent))
    if nsample == 0:
        # np.concatenate() raises on an empty list, so an empty selection is
        # returned explicitly as a zero-row array of the right shape/dtype.
        logger.warning(
            "sampled 0 utterances from shard %s/%s", rank, nshard
        )
        return np.empty((0,) + feat.shape[1:], dtype=feat.dtype)

    # Start row of every utterance: running sum of the preceding lengths.
    offsets = [0] + np.cumsum(lengs[:-1]).tolist()
    indices = np.random.choice(len(lengs), nsample, replace=False)
    sampled_feat = np.concatenate(
        [feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
    )
    logger.info(
        (
            f"sampled {nsample} utterances, {len(sampled_feat)} frames "
            f"from shard {rank}/{nshard}"
        )
    )
    return sampled_feat
|
|
|
|
|
def load_feature(feat_dir, split, nshard, seed, percent):
    """Load every shard of ``split`` and stack the results along axis 0.

    Args:
        feat_dir: Directory containing the shard files.
        split: File-name prefix of the shard files.
        nshard: Number of shards; ranks ``0 .. nshard - 1`` are loaded.
        seed: Not used by this function; kept so existing callers that pass
            it keep working.
        percent: Passed to ``load_feature_shard`` for every shard.  Must not
            exceed 1.0; a negative value loads each shard in full.

    Returns:
        A single array with the rows of all shards, in rank order.

    Raises:
        ValueError: If ``percent`` is not ``<= 1.0``.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    # Written as ``not <=`` so that NaN is rejected as well.
    if not percent <= 1.0:
        raise ValueError(f"percent must be <= 1.0, got {percent}")

    shards = [
        load_feature_shard(feat_dir, split, nshard, r, percent)
        for r in range(nshard)
    ]
    feat = np.concatenate(shards, axis=0)
    # Use the script's named logger rather than the root logger so the
    # record carries the same %(name)s as the other messages.
    logger.info("loaded feature with dimension %s", feat.shape)
    return feat
|
|
|
|
|
def learn_kmeans(
    feat_dir,
    split,
    nshard,
    km_path,
    n_clusters,
    seed,
    percent,
    init,
    max_iter,
    batch_size,
    tol,
    n_init,
    reassignment_ratio,
    max_no_improvement,
):
    """Fit a mini-batch k-means model on stored features and save it.

    Args:
        feat_dir, split, nshard, percent: Select which feature shards are
            loaded and how much of them; passed to ``load_feature``.
        km_path: Destination file for the fitted model (written with
            ``joblib.dump``).  Its parent directory is created if missing.
        seed: Seed for NumPy's global random generator, set before loading.
        n_clusters, init, max_iter, batch_size, tol, n_init,
        reassignment_ratio, max_no_improvement: Passed to ``get_km_model``.

    Returns:
        None.  The model is written to ``km_path`` and a summary value is
        logged.
    """
    # Create the output directory before the fit so that a missing directory
    # cannot make joblib.dump() fail after all the training work is done.
    out_dir = os.path.dirname(km_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Seed first: the utterance sampling done while loading draws from
    # NumPy's global generator.
    np.random.seed(seed)
    feat = load_feature(feat_dir, split, nshard, seed, percent)

    km_model = get_km_model(
        n_clusters=n_clusters,
        init=init,
        max_iter=max_iter,
        batch_size=batch_size,
        tol=tol,
        max_no_improvement=max_no_improvement,
        n_init=n_init,
        reassignment_ratio=reassignment_ratio,
    )
    km_model.fit(feat)
    joblib.dump(km_model, km_path)

    # Negated score of the training features divided by the number of rows.
    # The message text (including its spelling) is kept as-is so that
    # existing log output does not change.
    inertia = -km_model.score(feat) / len(feat)
    logger.info("total intertia: %.5f", inertia)
    logger.info("finished successfully")
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Every argument name below must match a parameter of learn_kmeans(),
    # because the parsed namespace is expanded into keyword arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "feat_dir",
        type=str,
        help="directory holding the {split}_{rank}_{nshard}.npy/.len files",
    )
    parser.add_argument(
        "split", type=str, help="file-name prefix of the feature shards"
    )
    parser.add_argument("nshard", type=int, help="number of feature shards")
    parser.add_argument(
        "km_path", type=str, help="output path of the fitted k-means model"
    )
    parser.add_argument("n_clusters", type=int, help="number of clusters")
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument(
        "--percent", default=-1, type=float, help="sample a subset; -1 for all"
    )
    # The remaining options are handed on to get_km_model() unchanged.
    parser.add_argument("--init", default="k-means++")
    parser.add_argument("--max_iter", default=100, type=int)
    parser.add_argument("--batch_size", default=10000, type=int)
    parser.add_argument("--tol", default=0.0, type=float)
    parser.add_argument("--max_no_improvement", default=100, type=int)
    parser.add_argument("--n_init", default=20, type=int)
    parser.add_argument("--reassignment_ratio", default=0.0, type=float)
    args = parser.parse_args()
    # Log through the script's named logger (not the root logger) so this
    # record carries the same %(name)s as the rest of the run.
    logger.info("%s", args)

    learn_kmeans(**vars(args))
|
|