import spacy
import pickle
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to be loaded for processing text. 
        For example, "en_core_web_sm" or a custom model path.

    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary where the keys are tokens 
        (strings) and the values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.

    dict_docs_spacy : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc 
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)
    
    # ---- Load pickle file and reconstruct the dictionary with tokens as keys and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)
    
    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }
    
    return nlp, dict_docs_spacy


def find_antonyms(word):
    '''
    Generate a set of all the antonyms of a given word

    Parameters
    ----------
    word : str
        The word for which to find antonyms.

    Returns
    -------
    antonyms : set of str
        A set of all the antonyms of the word detected using NLTK and WordNet.
    '''
    
    antonyms = set()

    # ---- Get all the synsets (sets of synonyms) of the word from WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:
        # ---- Loop over each lemma (synonym)
        #
        for lemma in syn.lemmas():
            # ---- Add the first antonym of the lemma, if any, to the antonyms set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Find the most similar allowed token for a given word.

    Parameters
    ----------
    word : str
        The word for which to find a synonym among the allowed tokens.

    model : spacy.language.Language
        The loaded spaCy language model used for POS tagging and similarity.

    dict_embedding : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc
        objects used to compute similarities.

    list_2000_tokens : list of str
        The list of allowed tokens among which a replacement is searched.

    Returns
    -------
    str
        The word itself if it is a proper noun or already in the allowed list, a
        normalized form ("IS" / "HAVE") for basic verbs, or otherwise the allowed
        token most similar to the word.
    '''

    # ---- Preserve proper nouns
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Normalize basic verbs
    #
    basic_verbs = {
        "is": "IS",
        "am": "IS",
        "are": "IS",
        "was": "IS",
        "were": "IS",
        "be": "IS",
        "have": "HAVE",
        "has": "HAVE",
        "had": "HAVE"
    }

    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return words that are already in the allowed list unchanged
    #
    if word in list_2000_tokens:
        return word

    # ---- Keep only candidate tokens that share the word's part of speech and are not antonyms
    #
    word_pos = doc[0].pos_

    antonyms = find_antonyms(word)
    filtered_tokens = [token for token in list_2000_tokens
                       if token not in antonyms
                       and model(token)[0].pos_ == word_pos]

    # ---- Compute the embedding similarity between the word and each candidate token
    #
    similarities = []

    for token in filtered_tokens:
        similarities.append((token, dict_embedding.get(token).similarity(doc)))

    # ---- Return the candidate with the highest similarity
    #
    most_similar_token = max(similarities, key=lambda item: item[1])[0]

    return most_similar_token
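

# ---- Minimal usage sketch (illustrative, not part of the original module). It assumes that
# the "en_core_web_md" model is installed, that 'dict_spacy_object.pkl' sits alongside this
# file, that the NLTK WordNet corpus has been downloaded (e.g. via nltk.download('wordnet')),
# and that the keys of the pickled dictionary can serve as the allowed-token list passed to
# find_synonyms.
#
if __name__ == "__main__":
    nlp, dict_docs_spacy = load_spacy_values()
    list_2000_tokens = list(dict_docs_spacy.keys())

    print(find_antonyms("happy"))
    print(find_synonyms("automobile", nlp, dict_docs_spacy, list_2000_tokens))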