ginipick committed · Commit 95a5be9 · verified · 1 Parent(s): 784e32f

Update src/synonyms_preprocess.py

Files changed (1)
  1. src/synonyms_preprocess.py +45 -38
src/synonyms_preprocess.py CHANGED
@@ -77,46 +77,53 @@ def find_antonyms(word):
 
 
 def find_synonyms(word, model, dict_embedding, list_2000_tokens):
-    # Preserve proper nouns
-    doc = model(word)
-    if doc[0].pos_ == "PROPN":
-        return word
-
-    # Basic verb mapping
-    basic_verbs = {
-        "is": "IS",
-        "am": "IS",
-        "are": "IS",
-        "was": "IS",
-        "were": "IS",
-        "be": "IS",
-        "have": "HAVE",
-        "has": "HAVE",
-        "had": "HAVE"
-    }
-
-    if word.lower() in basic_verbs:
-        return basic_verbs[word.lower()]
-
-    # Return words already in the list as-is
-    if word in list_2000_tokens:
-        return word
-
-    # Find a synonym with the same part of speech
-    word_doc = model(word)
-    word_pos = word_doc[0].pos_
-
-    antonyms = find_antonyms(word)
-    filtered_tokens = [token for token in list_2000_tokens
-                       if token not in antonyms
-                       and model(token)[0].pos_ == word_pos]
-
-    similarities = []
-    word_embedding = model(word)
-
-    for token in filtered_tokens:
-        similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))
-
-    most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]
-
-    return most_similar_token
+    # Preserve proper nouns
+    doc = model(word)
+    if doc[0].pos_ == "PROPN":
+        return word
+
+    # Basic verb mapping
+    basic_verbs = {
+        "is": "IS",
+        "am": "IS",
+        "are": "IS",
+        "was": "IS",
+        "were": "IS",
+        "be": "IS",
+        "have": "HAVE",
+        "has": "HAVE",
+        "had": "HAVE"
+    }
+
+    if word.lower() in basic_verbs:
+        return basic_verbs[word.lower()]
+
+    # Return words already in the list as-is
+    if word in list_2000_tokens:
+        return word
+
+    # Find a synonym with the same part of speech
+    word_doc = model(word)
+    word_pos = word_doc[0].pos_
+
+    antonyms = find_antonyms(word)
+    filtered_tokens = [
+        token for token in list_2000_tokens
+        if token not in antonyms
+        and model(token)[0].pos_ == word_pos
+    ]
+
+    similarities = []
+    word_embedding = model(word)
+
+    for token in filtered_tokens:
+        similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))
+
+    # ====== Fix: check whether the similarities list is empty ======
+    if not similarities:
+        # No similar candidate was found, so return the original word as-is
+        return word
+    # ================================================================
+
+    most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]
+    return most_similar_token
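
The new guard addresses a real failure mode: when every token in list_2000_tokens is filtered out (wrong part of speech, or listed as an antonym), similarities stays empty and the old sorted(similarities, key=...)[0][0] raised IndexError: list index out of range. Below is a minimal usage sketch of the fixed function; it is illustrative, not from the repo. It assumes the repo root is on PYTHONPATH, that model is a spaCy pipeline with word vectors (e.g. en_core_web_md; the actual model name is not shown in this diff), and that dict_embedding maps each allowed token to a precomputed spaCy Doc. The five-token list stands in for the real 2000-token list.

    import spacy
    from src.synonyms_preprocess import find_synonyms

    # Assumed pipeline: any spaCy model with word vectors will do.
    nlp = spacy.load("en_core_web_md")

    # Stand-ins for the real 2000-token list and its precomputed embeddings.
    list_2000_tokens = ["dog", "cat", "walk", "run", "happy"]
    dict_embedding = {token: nlp(token) for token in list_2000_tokens}

    print(find_synonyms("Alice", nlp, dict_embedding, list_2000_tokens))  # proper noun: returned unchanged
    print(find_synonyms("was", nlp, dict_embedding, list_2000_tokens))    # basic verb: mapped to "IS"
    print(find_synonyms("puppy", nlp, dict_embedding, list_2000_tokens))  # nearest same-POS token from the list
    # With the guard, a word whose candidates are all filtered out now
    # comes back as-is instead of crashing on the empty similarities list.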