Spaces:
Running
Running
File size: 8,110 Bytes
5d0a311 b1aa3b5 5d0a311 b1aa3b5 5d0a311 b1aa3b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 |
import enum
import subprocess
import spacy
import pyinflect
from difflib import ndiff
from typing import List, Union, Tuple, Dict
# Fine-grained verb tags referenced by the tense table below.
# TAG | Description | Morphology | Example
# BES | auxiliary “be” | | Let it **be**.
# HVS | forms of “have” | | I**’ve** seen the Queen
# MD  | verb, modal auxiliary | VerbType=mod | This **could** work.
# VB  | verb, base form | VerbForm=inf | I want to **go**.
# VBD | verb, past tense | VerbForm=fin Tense=past | This **was** a sentence.
# VBG | verb, gerund or present participle | VerbForm=part Tense=pres Aspect=prog | I am **going**.
# VBN | verb, past participle | VerbForm=part Tense=past Aspect=perf | The treasure was **lost**.
# VBP | verb, non-3rd person singular present | VerbForm=fin Tense=pres | I **want** to go.
# VBZ | verb, 3rd person singular present | VerbForm=fin Tense=pres Number=sing Person=3 | He **wants** to go.
class APVoice:
    """Convert simple English active-voice sentences into passive-voice parts.

    The sentence is parsed with a spaCy pipeline; its subject, root verb
    (with ``aux``/``neg`` children) and direct object are extracted, and the
    pieces of the passive sentence are returned by :meth:`active2passive`.
    """

    class Tense(enum.Enum):
        # Each member describes one supported tense:
        #   'aux'  - accepted tags of the first auxiliary (None = no auxiliary)
        #   'main' - accepted tags of the root verb
        #   'tobe' - passive "to be" template keyed by the direct object's
        #            tag ('NN' singular / 'NNS' plural); '{}' receives the
        #            negation (" not" or "").
        simple_present = {
            # 'VBP' covers a non-3rd-person "do" auxiliary
            # ("They do not eat apples").
            'aux': [None, 'VBZ', 'VBP'],
            'main': ['VBZ', 'VBP', 'VB'],
            'tobe': {'NN': 'is{}', 'NNS': 'are{}'},
        }
        simple_past = {
            'aux': [None, 'VBD'],
            'main': ['VBD', 'VB'],
            'tobe': {'NN': 'was{}', 'NNS': 'were{}'},
        }
        # NOTE(review): every modal auxiliary (tag 'MD') is rendered as
        # "will", not only "will"/"shall" -- confirm this is intended.
        future_simple = {
            'aux': ['MD'],
            'main': ['VB'],
            'tobe': {'NN': 'will{} be', 'NNS': 'will{} be'},
        }
        present_cont = {
            'aux': ['VBP', 'VBZ'],
            'main': ['VBG'],
            'tobe': {'NN': 'is{} being', 'NNS': 'are{} being'},
        }
        past_cont = {
            'aux': ['VBD'],
            'main': ['VBG'],
            'tobe': {'NN': 'was{} being', 'NNS': 'were{} being'},
        }
        present_perfect = {
            'aux': ['VBP', 'VBZ'],
            'main': ['VBN'],
            'tobe': {'NN': 'has{} been', 'NNS': 'have{} been'},
        }

    def __init__(
        self
    ) -> None:
        """Create the converter and load the ``en_core_web_sm`` pipeline."""
        self.parser = None
        self.__init_parser(model="en_core_web_sm")

    def __init_parser(
        self,
        model: str
    ) -> None:
        """Load the spaCy pipeline *model*, downloading it if loading fails.

        If the second load attempt (after the download) also fails, its
        exception propagates to the caller.
        """
        # Local import so this block does not depend on the file's import list.
        import sys

        self.parser = None
        try:
            self.parser = spacy.load(model)
        except Exception:
            # Best effort: any load failure triggers one download attempt.
            # Unlike a bare ``except:``, Ctrl-C / SystemExit are not swallowed.
            print(f"* Downloading {model} model...")
            # An argument list plus sys.executable avoids the shell and makes
            # sure the model is installed for the running interpreter.
            subprocess.run(
                [sys.executable, "-m", "spacy", "download", model],
                stdout=subprocess.PIPE,
                check=False,
            )
            self.parser = spacy.load(model)

    def verb2participle(
        self,
        verb: str
    ) -> str:
        """Return the past participle (tag ``VBN``) of *verb*.

        The verb is parsed on its own and inflected through the token's
        ``_.inflect`` extension (presumably registered by the ``pyinflect``
        import at the top of the file).
        """
        tk = self.parser(verb)[0]
        return tk._.inflect('VBN')

    def subjp2objp(
        self,
        pronoun: str
    ) -> Union[str, None]:
        """
        Convert Subject pronouns to Object pronouns.

        The lookup is case-insensitive; ``None`` is returned for a word
        that is not one of the known subject pronouns.
        """
        mapping = {
            "i": "me", "you": "you", "we": "us", "they": "them",
            "he": "him", "she": "her", "it": "it",
        }
        return mapping.get(pronoun.lower(), None)

    def get_gramatical_number(
        self,
        dobj_data: List[Tuple[str, str, str]]
    ) -> Union[str, None]:
        """Return the tag of the direct-object head in *dobj_data*.

        *dobj_data* holds ``(token, dep, tag)`` tuples. The tag of the
        first entry whose dependency is ``'dobj'`` is returned with
        ``'NNP'`` replaced by ``'NN'`` (so ``'NNPS'`` becomes ``'NNS'``);
        ``None`` is returned when there is no such entry.
        """
        tags = [tag for _, dep, tag in dobj_data if dep == 'dobj']
        if not tags:
            return None
        return tags[0].replace('NNP', 'NN')

    def get_verbal_tense(
        self,
        verb_data: List[List[Tuple[str, str, str, int]]]
    ) -> Union[str, None]:
        """Return the name of the first ``Tense`` matching *verb_data*.

        *verb_data* is ``[aux_list, neg_list, root_list]`` as built by
        :meth:`get_verb`. Only the tag of the first auxiliary and of the
        root verb are considered. ``None`` means no tense matched.
        """
        aux, _neg, root = verb_data
        root_tag = root[0][2] if root else None
        aux_tag = aux[0][2] if aux else None

        for tense in self.Tense:
            rule = tense.value
            if aux_tag in rule['aux'] and root_tag in rule['main']:
                return tense.name
        return None

    def get_subj(
        self,
        sentence: str,
    ) -> Tuple[List[Tuple[str, str, str]], str]:
        """Extract the subject phrase of *sentence*.

        Returns ``(data, text)``: *data* lists ``(token, dep, tag)`` for the
        subtree of the first token whose dependency label contains
        ``"subj"``; *text* is those tokens joined by spaces. Both are empty
        when no subject is found.
        """
        out_data = []
        for tk in self.parser(sentence):
            if "subj" in tk.dep_:
                out_data = [(t, t.dep_, t.tag_) for t in tk.subtree]
                break

        out_str = ' '.join(t.text for t, _, _ in out_data)
        return out_data, out_str

    def get_verb(
        self,
        sentence: str,
    ) -> Tuple[List[List[Tuple[str, str, str, int]]], str]:
        """Extract the root verb of *sentence* with its auxiliaries/negation.

        Returns ``(data, text)``: *data* is ``[aux, neg, main]`` where each
        item is a list of ``(token, dep, tag, index)`` tuples (``aux`` and
        ``neg`` are the root's children with those dependency labels);
        *text* is all those tokens in sentence order, joined by spaces.
        *data* is ``[]`` and *text* is ``""`` when no root is found.
        """
        out_data = []
        for tk in self.parser(sentence):
            if "ROOT" in tk.dep_:
                main_data = [(tk, tk.dep_, tk.tag_, tk.i)]
                aux_data = [
                    (t, t.dep_, t.tag_, t.i)
                    for t in tk.children if t.dep_ == "aux"
                ]
                neg_data = [
                    (t, t.dep_, t.tag_, t.i)
                    for t in tk.children if t.dep_ == "neg"
                ]
                out_data = [aux_data, neg_data, main_data]
                break

        # Restore the original word order using the token index.
        in_order = sorted(
            (entry for group in out_data for entry in group),
            key=lambda entry: entry[3],
        )
        out_str = ' '.join(t.text for t, _, _, _ in in_order)
        return out_data, out_str

    def get_dobj(
        self,
        sentence: str,
    ) -> Tuple[List[Tuple[str, str, str]], str]:
        """Extract the direct-object phrase of *sentence*.

        Returns ``(data, text)``: *data* lists ``(token, dep, tag)`` for the
        subtree of the first token whose dependency label contains
        ``"dobj"``; *text* is those tokens joined by spaces. Both are empty
        when no direct object is found.
        """
        out_data = []
        for tk in self.parser(sentence):
            if "dobj" in tk.dep_:
                out_data = [(t, t.dep_, t.tag_) for t in tk.subtree]
                break

        out_str = ' '.join(t.text for t, _, _ in out_data)
        return out_data, out_str

    def get_complement(
        self,
        subj: str,
        verb: str,
        dobj: str,
        full_sentence: str,
    ) -> str:
        """Return the words of *full_sentence* not covered by subj/verb/dobj.

        Both sides are split on whitespace and compared with
        ``difflib.ndiff``; the words reported as added in *full_sentence*
        are joined by spaces. Because whole whitespace-separated words are
        compared, a word with attached punctuation (``"apple."``) counts as
        different from the bare word (``"apple"``).
        """
        core_words = (subj + ' ' + verb + ' ' + dobj).split()
        extra_words = [
            entry[2:]
            for entry in ndiff(core_words, full_sentence.split())
            if entry[0] == '+'
        ]
        return ' '.join(extra_words).strip()

    def active2passive(
        self,
        active_sentence: str,
        debug: bool = False
    ) -> Dict[str, str]:
        """Split *active_sentence* into the parts of its passive form.

        Args:
            active_sentence: Sentence in active voice.
            debug: When true, print the intermediate parse data.

        Returns:
            A dict with the keys ``'subject'`` (the former direct object,
            first letter upper-cased), ``'tobe'`` (conjugated "to be" with
            optional negation), ``'participle'``, ``'agent'`` ("by" plus
            the former subject) and ``'complement'`` (remaining words).

        Raises:
            RuntimeError: If the sentence is empty, if its subject, verb or
                direct object cannot be found, if the tense is not
                supported, or if the direct object is not a noun with a
                supported tag.
        """
        active_sentence = active_sentence.strip()
        if not active_sentence:
            raise RuntimeError(
                "Error: The sentence does not be empty!"
            )

        subj_data, subj_str = self.get_subj(active_sentence)
        if debug: print(subj_data)
        if subj_str == "":
            raise RuntimeError(
                "Error: The sentence's subject has not been found or the sentence does not be the correct format!"
            )

        verb_data, verb_str = self.get_verb(active_sentence)
        if debug: print(verb_data)
        if verb_str == "":
            raise RuntimeError(
                "Error: The sentence's verb has not been found or the sentence does not be the correct format!"
            )

        dobj_data, dobj_str = self.get_dobj(active_sentence)
        if debug: print(dobj_data)
        if dobj_str == "":
            raise RuntimeError(
                "Error: The sentence's direct object has not been found or the sentence does not be the correct format!"
            )

        complement = self.get_complement(
            subj_str, verb_str, dobj_str, active_sentence
        )

        # Passive subject: the former direct object.
        p_subj = dobj_str

        # Tense + participle of the main verb.
        verbal_tense = self.get_verbal_tense(verb_data)
        if debug: print(verbal_tense)
        if verbal_tense is None:
            raise RuntimeError(
                "Error: The sentence does not be the correct format or the verbal tense has not been implemented yet!"
            )

        _, neg_data, main_data = verb_data
        neg = " not" if neg_data else ""

        gramatical_number = self.get_gramatical_number(dobj_data)
        if debug: print(gramatical_number)

        tobe_forms = self.Tense[verbal_tense].value['tobe']
        if gramatical_number not in tobe_forms:
            # E.g. a pronoun object ('PRP'): report it like every other
            # unsupported input instead of leaking a KeyError.
            raise RuntimeError(
                "Error: The direct object's grammatical number could not be determined or is not supported yet!"
            )
        p_tobe = tobe_forms[gramatical_number].format(neg)
        p_verb = self.verb2participle(main_data[0][0].text)

        # Passive agent: "by" + the former subject.
        agent_words = []
        for tk, _, tag in subj_data:
            word = tk.text
            if tag == 'PRP':
                # Keep the original word for pronouns without a mapping
                # (subjp2objp returns None for them).
                word = self.subjp2objp(word) or word
            # Undo sentence-initial capitalisation ("The cat" -> "the cat"),
            # but leave proper nouns and all-caps words untouched.
            if (
                tk.i == 0
                and tag not in ('NNP', 'NNPS')
                and word[:1].isupper()
                and not word[1:].isupper()
            ):
                word = word[0].lower() + word[1:]
            agent_words.append(word)
        p_agent = ("by " + " ".join(agent_words)).rstrip()

        return {
            # Upper-case only the first letter; str.capitalize() would
            # lower-case the rest ("the Louvre" -> "The louvre").
            'subject': p_subj[0].upper() + p_subj[1:],
            'tobe': p_tobe,
            'participle': p_verb,
            'agent': p_agent,
            'complement': complement
        }