File size: 6,654 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Dan Blanchard <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.txt

import re
from collections import defaultdict
from functools import reduce

from nltk.corpus.reader import CorpusReader


class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin."""

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        """

        Initialize the thesaurus.



        :param root: root directory containing thesaurus LISP files

        :type root: C{string}

        :param badscore: the score to give to words which do not appear in each other's sets of synonyms

        :type badscore: C{float}

        """

        super().__init__(root, r"sim[A-Z]\.lsp")
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            with open(path) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def similarity(self, ngram1, ngram2, fileid=None):
        """

        Returns the similarity score for two ngrams.



        :param ngram1: first ngram to compare

        :type ngram1: C{string}

        :param ngram2: second ngram to compare

        :type ngram2: C{string}

        :param fileid: thesaurus fileid to search in. If None, search all fileids.

        :type fileid: C{string}

        :return: If fileid is specified, just the score for the two ngrams; otherwise,

                 list of tuples of fileids and scores.

        """
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            else:
                return [(fid, 1.0) for fid in self._fileids]
        else:
            if fileid:
                return (
                    self._thesaurus[fileid][ngram1][ngram2]
                    if ngram2 in self._thesaurus[fileid][ngram1]
                    else self._badscore
                )
            else:
                return [
                    (
                        fid,
                        (
                            self._thesaurus[fid][ngram1][ngram2]
                            if ngram2 in self._thesaurus[fid][ngram1]
                            else self._badscore
                        ),
                    )
                    for fid in self._fileids
                ]

    def scored_synonyms(self, ngram, fileid=None):
        """

        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram



        :param ngram: ngram to lookup

        :type ngram: C{string}

        :param fileid: thesaurus fileid to search in. If None, search all fileids.

        :type fileid: C{string}

        :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,

                 list of tuples of fileids and lists, where inner lists consist of tuples of

                 scores and synonyms.

        """
        if fileid:
            return self._thesaurus[fileid][ngram].items()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].items())
                for fileid in self._fileids
            ]

    def synonyms(self, ngram, fileid=None):
        """

        Returns a list of synonyms for the current ngram.



        :param ngram: ngram to lookup

        :type ngram: C{string}

        :param fileid: thesaurus fileid to search in. If None, search all fileids.

        :type fileid: C{string}

        :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and

                 lists, where inner lists contain synonyms.

        """
        if fileid:
            return self._thesaurus[fileid][ngram].keys()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].keys())
                for fileid in self._fileids
            ]

    def __contains__(self, ngram):
        """

        Determines whether or not the given ngram is in the thesaurus.



        :param ngram: ngram to lookup

        :type ngram: C{string}

        :return: whether the given ngram is in the thesaurus.

        """
        return reduce(
            lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
            self._fileids,
            False,
        )


######################################################################
# Demo
######################################################################


def demo():
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))


if __name__ == "__main__":
    demo()