import spacy
import pickle
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to be loaded for processing text. 
        For example, "en_core_web_sm" or a custom model path.

    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary where the keys are tokens 
        (strings) and the values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.

    dict_docs_spacy : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc 
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)
    
    # ---- Load pickle file and reconstruct the dictionary with tokens as keys and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)
    
    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }
    
    return nlp, dict_docs_spacy


def find_antonyms(word):
    '''
    Generate a set of all the antonyms of a given word

    Parameters
    ----------
    word : str
        The word for which to find antonyms.

    Returns
    -------
    antonyms : set of str
        A set of all the antonyms of the word detected using NLTK and WordNet.
    '''
    
    antonyms = set()

    # ---- Get all the synsets (sets of synonyms) of the word from WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:
        # ---- Loop over each lemma (synonym)
        #
        for lemma in syn.lemmas():
            # ---- Add the first antonym of the lemma, if any, to the antonyms set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Find the most similar allowed token for a given word.

    Parameters
    ----------
    word : str
        The word for which to find a synonym among the allowed tokens.

    model : spacy.language.Language
        The loaded spaCy language model used for POS tagging and similarity.

    dict_embedding : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc
        objects used to compute similarities.

    list_2000_tokens : list of str
        The list of allowed tokens among which a replacement is searched.

    Returns
    -------
    str
        The word itself if it is a proper noun or already in the allowed list, a
        normalized form ("IS" / "HAVE") for basic verbs, or otherwise the allowed
        token most similar to the word.
    '''

    # ---- Preserve proper nouns
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Normalize basic verbs
    #
    basic_verbs = {
        "is": "IS",
        "am": "IS",
        "are": "IS",
        "was": "IS",
        "were": "IS",
        "be": "IS",
        "have": "HAVE",
        "has": "HAVE",
        "had": "HAVE"
    }

    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return words that are already in the allowed list unchanged
    #
    if word in list_2000_tokens:
        return word

    # ---- Keep only candidate tokens that share the word's part of speech and are not antonyms
    #
    word_pos = doc[0].pos_

    antonyms = find_antonyms(word)
    filtered_tokens = [token for token in list_2000_tokens
                       if token not in antonyms
                       and model(token)[0].pos_ == word_pos]

    # ---- Compute the embedding similarity between the word and each candidate token
    #
    similarities = []

    for token in filtered_tokens:
        similarities.append((token, dict_embedding.get(token).similarity(doc)))

    # ---- Return the candidate with the highest similarity
    #
    most_similar_token = max(similarities, key=lambda item: item[1])[0]

    return most_similar_token
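

# ---- Minimal usage sketch (illustrative, not part of the original module). It assumes that
# the "en_core_web_md" model is installed, that 'dict_spacy_object.pkl' sits alongside this
# file, that the NLTK WordNet corpus has been downloaded (e.g. via nltk.download('wordnet')),
# and that the keys of the pickled dictionary can serve as the allowed-token list passed to
# find_synonyms.
#
if __name__ == "__main__":
    nlp, dict_docs_spacy = load_spacy_values()
    list_2000_tokens = list(dict_docs_spacy.keys())

    print(find_antonyms("happy"))
    print(find_synonyms("automobile", nlp, dict_docs_spacy, list_2000_tokens))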