File size: 10,039 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# Natural Language Toolkit: Clusterer Utilities
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Trevor Cohn <[email protected]>
# Contributor: J Richard Snape
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
from abc import abstractmethod
from math import sqrt
from sys import stdout

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.api import ClusterI


class VectorSpaceClusterer(ClusterI):
    """

    Abstract clusterer which takes tokens and maps them into a vector space.

    Optionally performs singular value decomposition to reduce the

    dimensionality.

    """

    def __init__(self, normalise=False, svd_dimensions=None):
        """

        :param normalise:       should vectors be normalised to length 1

        :type normalise:        boolean

        :param svd_dimensions:  number of dimensions to use in reducing vector

                                dimensionsionality with SVD

        :type svd_dimensions:   int

        """
        self._Tt = None
        self._should_normalise = normalise
        self._svd_dimensions = svd_dimensions

    def cluster(self, vectors, assign_clusters=False, trace=False):
        assert len(vectors) > 0

        # normalise the vectors
        if self._should_normalise:
            vectors = list(map(self._normalise, vectors))

        # use SVD to reduce the dimensionality
        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
            [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
            S = d[: self._svd_dimensions] * numpy.identity(
                self._svd_dimensions, numpy.float64
            )
            T = u[:, : self._svd_dimensions]
            Dt = vt[: self._svd_dimensions, :]
            vectors = numpy.transpose(numpy.dot(S, Dt))
            self._Tt = numpy.transpose(T)

        # call abstract method to cluster the vectors
        self.cluster_vectorspace(vectors, trace)

        # assign the vectors to clusters
        if assign_clusters:
            return [self.classify(vector) for vector in vectors]

    @abstractmethod
    def cluster_vectorspace(self, vectors, trace):
        """

        Finds the clusters using the given set of vectors.

        """

    def classify(self, vector):
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        cluster = self.classify_vectorspace(vector)
        return self.cluster_name(cluster)

    @abstractmethod
    def classify_vectorspace(self, vector):
        """

        Returns the index of the appropriate cluster for the vector.

        """

    def likelihood(self, vector, label):
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return self.likelihood_vectorspace(vector, label)

    def likelihood_vectorspace(self, vector, cluster):
        """

        Returns the likelihood of the vector belonging to the cluster.

        """
        predicted = self.classify_vectorspace(vector)
        return 1.0 if cluster == predicted else 0.0

    def vector(self, vector):
        """

        Returns the vector after normalisation and dimensionality reduction

        """
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return vector

    def _normalise(self, vector):
        """

        Normalises the vector to unit length.

        """
        return vector / sqrt(numpy.dot(vector, vector))


def euclidean_distance(u, v):
    """

    Returns the euclidean distance between vectors u and v. This is equivalent

    to the length of the vector (u - v).

    """
    diff = u - v
    return sqrt(numpy.dot(diff, diff))


def cosine_distance(u, v):
    """

    Returns 1 minus the cosine of the angle between vectors v and u. This is

    equal to ``1 - (u.v / |u||v|)``.

    """
    return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


class _DendrogramNode:
    """Tree node of a dendrogram."""

    def __init__(self, value, *children):
        self._value = value
        self._children = children

    def leaves(self, values=True):
        if self._children:
            leaves = []
            for child in self._children:
                leaves.extend(child.leaves(values))
            return leaves
        elif values:
            return [self._value]
        else:
            return [self]

    def groups(self, n):
        queue = [(self._value, self)]

        while len(queue) < n:
            priority, node = queue.pop()
            if not node._children:
                queue.push((priority, node))
                break
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
                else:
                    queue.append((0, child))
            # makes the earliest merges at the start, latest at the end
            queue.sort()

        groups = []
        for priority, node in queue:
            groups.append(node.leaves())
        return groups

    def __lt__(self, comparator):
        return cosine_distance(self._value, comparator._value) < 0


class Dendrogram:
    """

    Represents a dendrogram, a tree with a specified branching order.  This

    must be initialised with the leaf items, then iteratively call merge for

    each branch. This class constructs a tree representing the order of calls

    to the merge function.

    """

    def __init__(self, items=[]):
        """

        :param  items: the items at the leaves of the dendrogram

        :type   items: sequence of (any)

        """
        self._items = [_DendrogramNode(item) for item in items]
        self._original_items = copy.copy(self._items)
        self._merge = 1

    def merge(self, *indices):
        """

        Merges nodes at given indices in the dendrogram. The nodes will be

        combined which then replaces the first node specified. All other nodes

        involved in the merge will be removed.



        :param  indices: indices of the items to merge (at least two)

        :type   indices: seq of int

        """
        assert len(indices) >= 2
        node = _DendrogramNode(self._merge, *(self._items[i] for i in indices))
        self._merge += 1
        self._items[indices[0]] = node
        for i in indices[1:]:
            del self._items[i]

    def groups(self, n):
        """

        Finds the n-groups of items (leaves) reachable from a cut at depth n.

        :param  n: number of groups

        :type   n: int

        """
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        return root.groups(n)

    def show(self, leaf_labels=[]):
        """

        Print the dendrogram in ASCII art to standard out.



        :param leaf_labels: an optional list of strings to use for labeling the

                            leaves

        :type leaf_labels: list

        """

        # ASCII rendering characters
        JOIN, HLINK, VLINK = "+", "-", "|"

        # find the root (or create one)
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = self._original_items

        if leaf_labels:
            last_row = leaf_labels
        else:
            last_row = ["%s" % leaf._value for leaf in leaves]

        # find the bottom row and the best cell width
        width = max(map(len, last_row)) + 1
        lhalf = width // 2
        rhalf = int(width - lhalf - 1)

        # display functions
        def format(centre, left=" ", right=" "):
            return f"{lhalf * left}{centre}{right * rhalf}"

        def display(str):
            stdout.write(str)

        # for each merge, top down
        queue = [(root._value, root)]
        verticals = [format(" ") for leaf in leaves]
        while queue:
            priority, node = queue.pop()
            child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
            indices = list(map(leaves.index, child_left_leaf))
            if child_left_leaf:
                min_idx = min(indices)
                max_idx = max(indices)
            for i in range(len(leaves)):
                if leaves[i] in child_left_leaf:
                    if i == min_idx:
                        display(format(JOIN, " ", HLINK))
                    elif i == max_idx:
                        display(format(JOIN, HLINK, " "))
                    else:
                        display(format(JOIN, HLINK, HLINK))
                    verticals[i] = format(VLINK)
                elif min_idx <= i <= max_idx:
                    display(format(HLINK, HLINK, HLINK))
                else:
                    display(verticals[i])
            display("\n")
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
            queue.sort()

            for vertical in verticals:
                display(vertical)
            display("\n")

        # finally, display the last line
        display("".join(item.center(width) for item in last_row))
        display("\n")

    def __repr__(self):
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = root.leaves(False)
        return "<Dendrogram with %d leaves>" % len(leaves)