Spaces:
Sleeping
Sleeping
File size: 6,654 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Dan Blanchard <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.txt
import re
from collections import defaultdict
from functools import reduce
from nltk.corpus.reader import CorpusReader
class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin.

    The whole thesaurus is parsed eagerly in ``__init__`` into a nested mapping
    ``fileid -> ngram -> synonym -> score``.  All query methods are read-only:
    looking up an unknown fileid or ngram never adds entries to that mapping.
    """

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        """
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        """
        super().__init__(root, r"sim[A-Z]\.lsp")
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            # Decode with the encoding the corpus reader reports for this file
            # rather than whatever the platform default happens to be.
            with open(path, encoding=encoding) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def _synonym_scores(self, fileid, ngram):
        """
        Return the ``synonym -> score`` mapping stored for ``ngram`` in ``fileid``.

        Uses ``.get`` instead of indexing so that the nested defaultdicts do not
        create empty entries for unknown fileids or ngrams; an empty dict is
        returned in that case.
        """
        return self._thesaurus.get(fileid, {}).get(ngram, {})

    def similarity(self, ngram1, ngram2, fileid=None):
        """
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        """

        def score_in(fid):
            # Entries don't contain themselves, so make sure similarity between
            # item and itself is 1.0
            if ngram1 == ngram2:
                return 1.0
            return self._synonym_scores(fid, ngram1).get(ngram2, self._badscore)

        if fileid:
            return score_in(fileid)
        return [(fid, score_in(fid)) for fid in self._fileids]

    def scored_synonyms(self, ngram, fileid=None):
        """
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, a view of (synonym, score) tuples; otherwise,
                 list of tuples of fileids and such views.
        """
        if fileid:
            return self._synonym_scores(fileid, ngram).items()
        return [
            (fid, self._synonym_scores(fid, ngram).items()) for fid in self._fileids
        ]

    def synonyms(self, ngram, fileid=None):
        """
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, a view of the synonyms; otherwise, list of tuples
                 of fileids and such views.
        """
        if fileid:
            return self._synonym_scores(fileid, ngram).keys()
        return [
            (fid, self._synonym_scores(fid, ngram).keys()) for fid in self._fileids
        ]

    def __contains__(self, ngram):
        """
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        """
        # ``.get`` keeps the membership test from inserting empty per-file tables.
        return any(ngram in self._thesaurus.get(fid, ()) for fid in self._fileids)
######################################################################
# Demo
######################################################################
def demo():
    """Print a short tour of the thesaurus API for the words "business" and "enterprise".

    Shows, in order: synonyms across all fileids, scored synonyms across all
    fileids, synonyms and scored synonyms restricted to ``simN.lsp``, and the
    similarity score of the two words.
    """
    # Imported inside the function so the corpus object is only looked up when
    # the demo is actually run.
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"

    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # This step used to repeat the previous one verbatim; show the scored
    # variant for a single fileid instead.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    demo()
|