File size: 1,789 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Natural Language Toolkit: Word Sense Disambiguation Algorithms
#
# Authors: Liling Tan <[email protected]>,
#          Dmitrijs Milajevs <[email protected]>
#
# Copyright (C) 2001-2023 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import wordnet


def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
    """Return a synset for an ambiguous word in a context.

    Each candidate synset is scored by the number of distinct context words
    that also occur among the whitespace-separated tokens of its definition;
    the candidate with the highest score is returned.

    :param iter context_sentence: The context sentence where the ambiguous word
         occurs, passed as an iterable of words.
    :param str ambiguous_word: The ambiguous word that requires WSD.
    :param str pos: A specified Part-of-Speech (POS). When given, only synsets
         whose ``pos()`` equals this value are considered.
    :param iter synsets: Possible synsets of the ambiguous word. Any iterable
         (including a one-shot iterator) is accepted. When omitted, the
         candidates are looked up with ``wordnet.synsets(ambiguous_word)``.
    :return: ``lesk_sense`` The Synset() object with the highest signature
         overlaps, or ``None`` if there is no candidate synset left after the
         optional POS filtering.

    This function is an implementation of the original Lesk algorithm (1986) [1].

    Usage example::

        >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')
        Synset('savings_bank.n.02')

    [1] Lesk, Michael. "Automatic sense disambiguation using machine
    readable dictionaries: how to tell a pine cone from an ice cream
    cone." Proceedings of the 5th Annual International Conference on
    Systems Documentation. ACM, 1986.
    https://dl.acm.org/citation.cfm?id=318728
    """

    context = set(context_sentence)
    if synsets is None:
        synsets = wordnet.synsets(ambiguous_word)

    # Always materialize the candidates into a list.  A bare iterator is
    # truthy even when it yields nothing, so without this an empty iterator
    # would slip past the emptiness check below and make max() raise
    # ValueError.  ``ss.pos()`` is only consulted when a POS was requested.
    candidates = [ss for ss in synsets if not pos or str(ss.pos()) == pos]

    if not candidates:
        return None

    # Compare (overlap, synset) tuples: the overlap count decides first, and
    # equal counts fall back to comparing the synsets themselves.
    _, sense = max(
        (len(context.intersection(ss.definition().split())), ss)
        for ss in candidates
    )

    return sense