import argparse
import os
import pickle
from copy import deepcopy
import pandas as pd
from tqdm import tqdm
from Preprocessing.multilinguality.SimilaritySolver import SimilaritySolver
from Utility.storage_config import MODELS_DIR
from Utility.utils import load_json_from_path
ISO_LOOKUP_PATH = "iso_lookup.json"
ISO_TO_FULLNAME_PATH = "iso_to_fullname.json"
LANG_PAIRS_MAP_PATH = "lang_1_to_lang_2_to_map_dist.json"
LANG_PAIRS_TREE_PATH = "lang_1_to_lang_2_to_tree_dist.json"
LANG_PAIRS_ASP_PATH = "asp_dict.pkl"
LANG_PAIRS_LEARNED_DIST_PATH = "lang_1_to_lang_2_to_learned_dist.json"
LANG_PAIRS_ORACLE_PATH = "lang_1_to_lang_2_to_oracle_dist.json"
SUPERVISED_LANGUAGES_PATH = "supervised_languages.json"
DATASET_SAVE_DIR = "distance_datasets/"
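# These lookup files are expected to be produced beforehand by create_distance_lookups.py
# (see the FileNotFoundError hint in load_required_distance_lookups below).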
class LangDistDatasetCreator:
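    """Creates language-distance datasets (closest or furthest languages per target language)
    from precomputed map/tree/ASP/learned/oracle distance lookups and writes them to CSV files."""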
def __init__(self, model_path, cache_root="."):
self.model_path = model_path
self.cache_root = cache_root
self.lang_pairs_map = None
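        # largest map distance across all language pairs, presumably used by SimilaritySolver to normalize map distances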
self.largest_value_map_dist = None
self.lang_pairs_tree = None
self.lang_pairs_asp = None
self.lang_pairs_learned_dist = None
self.lang_pairs_oracle = None
        self.supervised_langs = load_json_from_path(os.path.join(cache_root, SUPERVISED_LANGUAGES_PATH))
self.iso_lookup = load_json_from_path(os.path.join(cache_root, ISO_LOOKUP_PATH))
self.iso_to_fullname = load_json_from_path(os.path.join(cache_root, ISO_TO_FULLNAME_PATH))
def load_required_distance_lookups(self, distance_type, excluded_distances=[]):
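        """Lazily load only the distance lookups needed for the given distance_type.

        For the "combined" type, lookups listed in excluded_distances are skipped."""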
# init required distance lookups
print(f"Loading required distance lookups for distance_type '{distance_type}'.")
try:
if distance_type == "combined":
if "map" not in excluded_distances and not self.lang_pairs_map:
self.lang_pairs_map = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_MAP_PATH))
self.largest_value_map_dist = 0.0
for _, values in self.lang_pairs_map.items():
for _, value in values.items():
self.largest_value_map_dist = max(self.largest_value_map_dist, value)
if "tree" not in excluded_distances and not self.lang_pairs_tree:
self.lang_pairs_tree = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_TREE_PATH))
if "asp" not in excluded_distances and not self.lang_pairs_asp:
with open(os.path.join(self.cache_root, LANG_PAIRS_ASP_PATH), "rb") as f:
self.lang_pairs_asp = pickle.load(f)
elif distance_type == "map" and not self.lang_pairs_map:
self.lang_pairs_map = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_MAP_PATH))
self.largest_value_map_dist = 0.0
for _, values in self.lang_pairs_map.items():
for _, value in values.items():
self.largest_value_map_dist = max(self.largest_value_map_dist, value)
elif distance_type == "tree" and not self.lang_pairs_tree:
self.lang_pairs_tree = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_TREE_PATH))
elif distance_type == "asp" and not self.lang_pairs_asp:
with open(os.path.join(self.cache_root, LANG_PAIRS_ASP_PATH), "rb") as f:
self.lang_pairs_asp = pickle.load(f)
elif distance_type == "learned" and not self.lang_pairs_learned_dist:
self.lang_pairs_learned_dist = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_LEARNED_DIST_PATH))
elif distance_type == "oracle" and not self.lang_pairs_oracle:
self.lang_pairs_oracle = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_ORACLE_PATH))
except FileNotFoundError as e:
raise FileNotFoundError("Please create all lookup files via create_distance_lookups.py") from e
def create_dataset(self,
distance_type: str = "learned",
zero_shot: bool = False,
n_closest: int = 50,
excluded_languages: list = [],
excluded_distances: list = [],
find_furthest: bool = False,
individual_distances: bool = False,
write_to_csv=True):
"""Create dataset with a given feature's distance in a dict, and saves it to a CSV file."""
distance_types = ["learned", "map", "tree", "asp", "combined", "random", "oracle"]
if distance_type not in distance_types:
raise ValueError(f"Invalid distance type '{distance_type}'. Expected one of {distance_types}")
dataset_dict = dict()
self.load_required_distance_lookups(distance_type, excluded_distances)
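        # lookups that were not loaded above stay None; SimilaritySolver is assumed to ignore distance types it did not receive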
sim_solver = SimilaritySolver(tree_dist=self.lang_pairs_tree,
map_dist=self.lang_pairs_map,
largest_value_map_dist=self.largest_value_map_dist,
asp_dict=self.lang_pairs_asp,
learned_dist=self.lang_pairs_learned_dist,
oracle_dist=self.lang_pairs_oracle,
iso_to_fullname=self.iso_to_fullname)
supervised_langs = sorted(self.supervised_langs)
remove_langs_suffix = ""
if len(excluded_languages) > 0:
remove_langs_suffix = "_no-illegal-langs"
for excl_lang in excluded_languages:
supervised_langs.remove(excl_lang)
individual_dist_suffix, excluded_feat_suffix = "", ""
if distance_type == "combined":
if individual_distances:
individual_dist_suffix = "_indiv-dists"
if len(excluded_distances) > 0:
excluded_feat_suffix = "_excl-" + "-".join(excluded_distances)
furthest_suffix = "_furthest" if find_furthest else ""
zero_shot_suffix = ""
if zero_shot:
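            # the last entry of iso_lookup appears to hold the ISO-code-to-ID mapping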
iso_codes_to_ids = deepcopy(self.iso_lookup)[-1]
zero_shot_suffix = "_zeroshot"
            # supervised languages already have trained embeddings, so they are excluded from the zero-shot targets
for sup_lang in supervised_langs:
iso_codes_to_ids.pop(sup_lang, None)
lang_codes = list(iso_codes_to_ids)
else:
lang_codes = supervised_langs
failed_langs = []
if distance_type == "random":
random_seed = 0
sorted_by = "closest" if not find_furthest else "furthest"
for lang in tqdm(lang_codes, desc=f"Retrieving {sorted_by} distances"):
if distance_type == "combined":
feature_dict = sim_solver.find_closest_combined_distance(lang,
supervised_langs,
k=n_closest,
individual_distances=individual_distances,
excluded_features=excluded_distances,
find_furthest=find_furthest)
elif distance_type == "random":
random_seed += 1
dataset_dict[lang] = [lang] # target language as first column
feature_dict = sim_solver.find_closest(distance_type,
lang,
supervised_langs,
k=n_closest,
find_furthest=find_furthest,
random_seed=random_seed)
else:
feature_dict = sim_solver.find_closest(distance_type,
lang,
supervised_langs,
k=n_closest,
find_furthest=find_furthest)
# discard incomplete results
if len(feature_dict) < n_closest:
failed_langs.append(lang)
continue
dataset_dict[lang] = [lang] # target language as first column
# create entry for a single close lang (`feature_dict` must be sorted by distance)
            for close_lang in feature_dict:
if distance_type == "combined":
dist_combined = feature_dict[close_lang]["combined_distance"]
close_lang_feature_list = [close_lang, dist_combined]
if individual_distances:
indiv_dists = feature_dict[close_lang]["individual_distances"]
close_lang_feature_list.extend(indiv_dists)
else:
dist = feature_dict[close_lang]
close_lang_feature_list = [close_lang, dist]
# column order: compared close language, {feature}_dist (plus optionally indiv dists)
dataset_dict[lang].extend(close_lang_feature_list)
# prepare df columns
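        # column layout: target_lang, then per neighbor its ISO code and distance; for "combined" with
        # individual distances, the extra columns are assumed to follow the solver's map/asp/tree order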
dataset_columns = ["target_lang"]
for i in range(n_closest):
dataset_columns.extend([f"closest_lang_{i}", f"{distance_type}_dist_{i}"])
if distance_type == "combined" and individual_distances:
if "map" not in excluded_distances:
dataset_columns.append(f"map_dist_{i}")
if "asp" not in excluded_distances:
dataset_columns.append(f"asp_dist_{i}")
if "tree" not in excluded_distances:
dataset_columns.append(f"tree_dist_{i}")
df = pd.DataFrame.from_dict(dataset_dict, orient="index")
df.columns = dataset_columns
        if write_to_csv:
            dataset_dir = os.path.join(self.cache_root, DATASET_SAVE_DIR)
            os.makedirs(dataset_dir, exist_ok=True)
            out_path = os.path.join(dataset_dir, f"dataset_{distance_type}_top{n_closest}{furthest_suffix}{zero_shot_suffix}{remove_langs_suffix}{excluded_feat_suffix}{individual_dist_suffix}.csv")
            df.to_csv(out_path, sep="|", index=False)
print(f"Successfully retrieved distances for {len(lang_codes) - len(failed_langs)}/{len(lang_codes)} languages.")
if len(failed_langs) > 0:
print(f"Failed to retrieve distances for the following {len(failed_langs)} languages:\n{failed_langs}")
return df
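# The generated CSVs are pipe-separated; a minimal sketch for loading one back
# (assuming the default save location and a run with distance_type="learned", n_closest=30):
#   pd.read_csv("distance_datasets/dataset_learned_top30.csv", sep="|")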
if __name__ == "__main__":
    default_model_path = os.path.join(MODELS_DIR, "ToucanTTS_Meta", "best.pt")  # MODELS_DIR must be an absolute path; a relative path will fail here
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", "-m", type=str, default=default_model_path, help="model path from which to obtain pretrained language embeddings")
args = parser.parse_args()
dc = LangDistDatasetCreator(args.model_path)
excluded_langs = []
# create datasets for evaluation of approx. lang emb methods on supervised languages
dataset = dc.create_dataset(distance_type="tree", n_closest=30, zero_shot=False)
dataset = dc.create_dataset(distance_type="map", n_closest=30, zero_shot=False, excluded_languages=excluded_langs)
dataset = dc.create_dataset(distance_type="map", n_closest=30, zero_shot=False, find_furthest=True)
dataset = dc.create_dataset(distance_type="asp", n_closest=30, zero_shot=False)
dataset = dc.create_dataset(distance_type="random", n_closest=30, zero_shot=False, excluded_languages=excluded_langs)
dataset = dc.create_dataset(distance_type="combined", n_closest=30, zero_shot=False, individual_distances=True)
dataset = dc.create_dataset(distance_type="learned", n_closest=30, zero_shot=False)
dataset = dc.create_dataset(distance_type="oracle", n_closest=30, zero_shot=False)