import os
import pickle
import re

import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm

from Utility.utils import load_json_from_path


class Visualizer:
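    """Interactive visualization of pairwise language distances.

    Loads three precomputed lookups from cache_root: distance to the lowest common
    ancestor in the language family tree, physical distance between language centroids
    on the globe, and angular distance between phoneme frequency vectors, plus a mapping
    from ISO 639-3 codes to full language names.
    """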

    def __init__(self, cache_root="."):
        tree_lookup_path = os.path.join(cache_root, "lang_1_to_lang_2_to_tree_dist.json")
        self.tree_dist = load_json_from_path(tree_lookup_path)

        map_lookup_path = os.path.join(cache_root, "lang_1_to_lang_2_to_map_dist.json")
        self.map_dist = load_json_from_path(map_lookup_path)
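        # normalize the geographic distances to [0, 1] by dividing by the largest observed value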
        largest_value_map_dist = 0.0
        for _, values in self.map_dist.items():
            for _, value in values.items():
                largest_value_map_dist = max(largest_value_map_dist, value)
        for key1 in self.map_dist:
            for key2 in self.map_dist[key1]:
                self.map_dist[key1][key2] = self.map_dist[key1][key2] / largest_value_map_dist

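        # the pickled ASP file maps each language to a vector of similarities ordered like
        # lang_list; turn it into a distance lookup via distance = 1 - similarity, storing
        # only one triangle because the measure is symmetric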
        asp_dict_path = os.path.join(cache_root, "asp_dict.pkl")
        with open(asp_dict_path, 'rb') as dictfile:
            asp_sim = pickle.load(dictfile)
        lang_list = list(asp_sim.keys())
        self.asp_dist = dict()
        seen_langs = set()
        for lang_1 in lang_list:
            if lang_1 not in seen_langs:
                seen_langs.add(lang_1)
                self.asp_dist[lang_1] = dict()
            for index, lang_2 in enumerate(lang_list):
                if lang_2 not in seen_langs:  # it's symmetric
                    self.asp_dist[lang_1][lang_2] = 1 - asp_sim[lang_1][index]

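        # human-readable names for the ISO 639-3 codes, with parenthesized qualifiers stripped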
        self.iso_codes_to_names = load_json_from_path(os.path.join(cache_root, "iso_to_fullname.json"))
        for code in self.iso_codes_to_names:
            self.iso_codes_to_names[code] = re.sub(r"\(.*?\)", "", self.iso_codes_to_names[code])

    def visualize(self, distance_type, neighbor, num_neighbors):
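        """Plot the num_neighbors nearest neighbors of `neighbor` under the chosen distance type as a spring-layout graph."""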
        plt.close("all")  # close previous figures so repeated calls through Gradio don't leak memory
        plt.figure(figsize=(12, 12))

        assert distance_type in ["Physical Distance between Language Centroids on the Globe",
                                 "Distance to the Lowest Common Ancestor in the Language Family Tree",
                                 "Angular Distance between the Frequencies of Phonemes"]
        if distance_type == "Distance to the Lowest Common Ancestor in the Language Family Tree":
            distance_measure = self.tree_dist
        elif distance_type == "Angular Distance between the Frequencies of Phonemes":
            distance_measure = self.asp_dist
        elif distance_type == "Physical Distance between Language Centroids on the Globe":
            distance_measure = self.map_dist

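        # collect (name, name, distance) triples, skipping ISO codes without a known full name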
        distances = list()

        for lang_1 in distance_measure:
            if lang_1 not in self.iso_codes_to_names:
                continue
            for lang_2 in distance_measure[lang_1]:
                if lang_2 not in self.iso_codes_to_names:
                    continue
                distances.append((self.iso_codes_to_names[lang_1], self.iso_codes_to_names[lang_2], distance_measure[lang_1][lang_2]))

        G = nx.Graph()
        min_dist = min(d for _, _, d in distances)
        max_dist = max(d for _, _, d in distances)
        normalized_distances = [(entity1, entity2, (d - min_dist) / (max_dist - min_dist)) for entity1, entity2, d in distances]

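        # distances from the selected language to every other language; the value at
        # index num_neighbors of the sorted list becomes the edge-drawing threshold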
        d_dist = list()
        for entity1, entity2, d in tqdm(normalized_distances):
            if neighbor == entity2 or neighbor == entity1:
                if entity1 != entity2:
                    d_dist.append(d)
        thresh = sorted(d_dist)[min(num_neighbors, len(d_dist) - 1)]  # clamp in case fewer candidates exist than requested neighbors
        neighbors = set()
        for entity1, entity2, d in tqdm(normalized_distances):
            if d < thresh and (neighbor == entity2 or neighbor == entity1) and (entity1 != entity2):
                neighbors.add(entity1)
                neighbors.add(entity2)
                spring_tension = (thresh - d) * 10  # for vis purposes
                G.add_edge(entity1, entity2, weight=spring_tension)
        neighbors.discard(neighbor)  # the selected language is not its own neighbor
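        # also connect the neighbors among each other so the layout reflects their mutual distances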
        for entity1, entity2, d in tqdm(normalized_distances):
            if entity2 in neighbors and entity1 in neighbors:
                if entity1 != entity2:
                    spring_tension = thresh - d
                    G.add_edge(entity1, entity2, weight=spring_tension)

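        # spring layout: larger edge weights (= smaller distances) pull nodes closer together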
        pos = nx.spring_layout(G, weight="weight")  # Positions for all nodes
        edges = G.edges(data=True)
        nx.draw_networkx_nodes(G, pos, node_size=1, alpha=0.01)
        edges_connected_to_specific_node = [(u, v) for u, v in G.edges() if u == neighbor or v == neighbor]
        nx.draw_networkx_edges(G, pos, edgelist=edges_connected_to_specific_node, edge_color='orange', alpha=0.4, width=3)
        # edges_not_connected_to_specific_node = [(u, v) for u, v in G.edges() if u != neighbor and v != neighbor]
        # nx.draw_networkx_edges(G, pos, edgelist=edges_not_connected_to_specific_node, edge_color='gray', alpha=0.1, width=1)
        for u, v, d in edges:
            if u == neighbor or v == neighbor:
                nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): round((thresh - (d['weight'] / 10)) * 10, 2)}, font_color="red", alpha=0.4)  # recover the normalized distance from the spring weight (scaled by 10 for readability)
        nx.draw_networkx_labels(G, pos, font_size=14, font_family='sans-serif', font_color='green')
        nx.draw_networkx_labels(G, pos, labels={neighbor: neighbor}, font_size=14, font_family='sans-serif', font_color='red')
        plt.title(f'Graph of {distance_type}')
        plt.subplots_adjust(left=0, right=1, top=0.9, bottom=0)
        plt.tight_layout()
        return plt.gcf()


if __name__ == '__main__':
    vis = Visualizer(cache_root=".")
    text_selection = list(vis.iso_codes_to_names.values())
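    # Gradio front-end: distance type, language, and neighbor count in; rendered neighborhood graph out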
    iface = gr.Interface(fn=vis.visualize,
                         inputs=[gr.Dropdown(["Physical Distance between Language Centroids on the Globe",
                                              "Distance to the Lowest Common Ancestor in the Language Family Tree",
                                              "Angular Distance between the Frequencies of Phonemes"],
                                             type="value",
                                             value='Physical Distance between Language Centroids on the Globe',
                                             label="Select the Type of Distance"),
                                 gr.Dropdown(text_selection,
                                             type="value",
                                             value="German",
                                             label="Select the second Language (type on your keyboard to find it quickly)"),
                                 gr.Slider(minimum=0, maximum=100, step=1,
                                           value=12,
                                           label="How many Nearest Neighbors should be displayed?")
                                 ],
                         outputs=[gr.Plot(label="", show_label=False, format="png", container=True)],
                         description="<br><br> This demo allows you to find the nearest neighbors of a language from the ISO 639-3 list according to several distance measurement functions. "
                                     "For more information, check out our paper: https://arxiv.org/abs/2406.06403 and our text-to-speech tool, in which we make use of "
                                     "this technique: https://github.com/DigitalPhonetics/IMS-Toucan <br><br>",
                         fill_width=True,
                         allow_flagging="never")
    iface.launch()