import spacy
import pickle
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to load for processing text,
        for example "en_core_web_sm" or a custom model path.
    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary whose keys are tokens
        (strings) and whose values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.
    dict_docs_spacy : dict
        A dictionary whose keys are tokens (strings) and whose values are spaCy Doc
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)

    # ---- Load the pickle file and reconstruct the dictionary with tokens as keys
    #      and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }

    return nlp, dict_docs_spacy


def find_antonyms(word):
    '''
    Generates the set of all antonyms of a given word.

    Parameters
    ----------
    word : str
        The word whose antonyms we want to find.

    Returns
    -------
    antonyms : set of str
        The set of all antonyms detected using NLTK and WordNet.
    '''
    antonyms = set()

    # ---- Collect every synset of the word recorded in WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:

        # ---- Loop over each lemma of the synset
        #
        for lemma in syn.lemmas():

            # ---- Add the lemma's antonyms to the antonym set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Maps a word onto the most similar token from a fixed token list, excluding
    antonyms and requiring the same part of speech.

    Parameters
    ----------
    word : str
        The word to map onto the token list.
    model : spacy.language.Language
        The loaded spaCy language model used for POS tagging and embeddings.
    dict_embedding : dict
        A dictionary whose keys are tokens (strings) and whose values are the
        corresponding spaCy Doc objects.
    list_2000_tokens : list of str
        The list of candidate tokens to map the word onto.

    Returns
    -------
    str
        The most similar token from the list, or the word itself when it is a
        proper noun, a basic verb, already in the list, or has no candidate.
    '''
    # ---- Preserve proper nouns as-is
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Map basic verb forms to canonical placeholders
    #
    basic_verbs = {
        "is": "IS", "am": "IS", "are": "IS", "was": "IS", "were": "IS", "be": "IS",
        "have": "HAVE", "has": "HAVE", "had": "HAVE"
    }
    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return words that are already in the list unchanged
    #
    if word in list_2000_tokens:
        return word

    # ---- Keep only candidates with the same part of speech, excluding antonyms
    #
    word_pos = doc[0].pos_
    antonyms = find_antonyms(word)
    filtered_tokens = [
        token for token in list_2000_tokens
        if token not in antonyms and model(token)[0].pos_ == word_pos
    ]

    # ---- Score each candidate by embedding similarity to the word, skipping
    #      tokens that have no precomputed embedding in the dictionary
    #
    similarities = []
    for token in filtered_tokens:
        token_doc = dict_embedding.get(token)
        if token_doc is not None:
            similarities.append((token, token_doc.similarity(doc)))

    # ---- If no candidate survived the filters, return the original word unchanged
    #
    if not similarities:
        return word

    most_similar_token = max(similarities, key=lambda item: item[1])[0]
    return most_similar_token
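

# ---- Usage sketch (illustrative, not part of the original module). It assumes
#      that 'dict_spacy_object.pkl' exists on disk and that the pickled
#      dictionary's keys can double as the candidate token list; both are
#      assumptions for demonstration, not guarantees made by the code above.
#
if __name__ == "__main__":
    nlp, dict_docs_spacy = load_spacy_values()

    # Treat the pickled dictionary's keys as the candidate vocabulary.
    list_2000_tokens = list(dict_docs_spacy.keys())

    # Map an out-of-vocabulary word onto its closest in-vocabulary counterpart.
    print(find_synonyms("joyful", nlp, dict_docs_spacy, list_2000_tokens))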