# -*- coding:utf-8 -*-
"""
@Last modified date : 2020/12/23
"""
import re
import nltk
from nltk.stem import WordNetLemmatizer
from allennlp.predictors.predictor import Predictor
nltk.download('wordnet')
nltk.download('stopwords')
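

# Restore Wikipedia/FEVER-style '-LRB-'/'-RRB-' bracket tokens to plain parentheses
# (when restore=True) and, if a leading entity such as 'Albert_Einstein' is given,
# prefix the text with 'Things about <entity>: '.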
def deal_bracket(text, restore, leading_ent=None):
    if leading_ent:
        leading_ent = ' '.join(leading_ent.split('_'))
        text = f'Things about {leading_ent}: ' + text
    if restore:
        text = text.replace('-LRB-', '(').replace('-RRB-', ')')
        text = text.replace('LRB', '(').replace('RRB', ')')
    return text
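

# Normalize a Wiki-style entity title: drop a trailing '-LRB- ... -RRB-'
# disambiguation suffix, replace underscores with spaces, and collapse whitespace.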
def refine_entity(entity):
    entity = re.sub(r'-LRB- .+ -RRB-$', '', entity)
    entity = re.sub(r'LRB .+ RRB$', '', entity)
    entity = re.sub(r'_', ' ', entity)
    entity = re.sub(r'\s+', ' ', entity)
    return entity.strip()
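

# Return the [start, end) token indices of the first occurrence of seq_b in seq_a
# at or after position `shift`; matching can be case-insensitive and/or lemmatized.
# Returns (-1, -1) if no match is found.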
def find_sub_seq(seq_a, seq_b, shift=0, uncased=False, lemmatizer=None):
    if uncased:
        seq_a = [token.lower() for token in seq_a]
        seq_b = [token.lower() for token in seq_b]
    if lemmatizer is not None:
        seq_a = [lemmatizer.lemmatize(token) for token in seq_a]
        seq_b = [lemmatizer.lemmatize(token) for token in seq_b]
    for i in range(shift, len(seq_a)):
        if seq_a[i:i+len(seq_b)] == seq_b:
            return i, i + len(seq_b)
    return -1, -1
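

# If [seq_start, seq_end) is contained in one of the spans in all_seqs, return that
# enclosing (start, end, is_candidate) triple; otherwise return None.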
def is_sub_seq(seq_start, seq_end, all_seqs):
    for start, end, is_candidate in all_seqs:
        if start <= seq_start < seq_end <= end:
            return start, end, is_candidate
    return None


# extract named entities tagged with the B-I-L-U-O scheme
def extract_named_entity(tags):
    all_NEs = []
    ne_type, ne_start = '', -1
    for i, t in enumerate(tags):
        if t == 'O':
            ne_type, ne_start = '', -1
            continue
        t1, t2 = t.split('-')
        if t1 == 'B':
            ne_type, ne_start = t2, i
        elif t1 == 'I' and t2 != ne_type:
            ne_type, ne_start = '', -1
        elif t1 == 'L' and t2 != ne_type:
            ne_type, ne_start = '', -1
        elif t1 == 'L' and t2 == ne_type:
            all_NEs.append((ne_start, i + 1, False))
            ne_type, ne_start = '', -1
        elif t1 == 'U':
            all_NEs.append((i, i + 1, False))
            ne_type, ne_start = '', -1
    return all_NEs
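

# Post-process token-level spans: strip leading stopwords from non-candidate spans,
# pull in a preceding article ('a'/'an'/'the'), convert token indices to character
# offsets, and drop spans nested inside an already kept span.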
def refine_results(tokens, spans, stopwords):
    all_spans = []
    for span_start, span_end, is_candidate in spans:
        # remove stopwords
        if not is_candidate:
            while span_start < span_end and tokens[span_start].lower() in stopwords:
                span_start += 1
            if span_start >= span_end:
                continue
        # add prefix
        if span_start > 0 and tokens[span_start - 1] in ['a', 'an', 'A', 'An', 'the', 'The']:
            span_start -= 1
        # convert token-level index into char-level index
        span = ' '.join(tokens[span_start:span_end])
        span_start = len(' '.join(tokens[:span_start])) + 1 * min(1, span_start)  # 1 for blank
        span_end = span_start + len(span)
        all_spans.append((span, span_start, span_end))
    all_spans = sorted(all_spans, key=lambda x: (x[1], x[1] - x[2]))

    # remove overlap
    refined_spans = []
    for span, span_start, span_end in all_spans:
        flag = True
        for _, start, end in refined_spans:
            if start <= span_start < span_end <= end:
                flag = False
                break
        if flag:
            refined_spans.append((span, span_start, span_end))
    return refined_spans
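

# Wraps an AllenNLP NER model and an ELMo-based constituency parser to extract
# noun-phrase, verb, and modifier spans from a sentence.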
class SentenceParser:
    def __init__(self, device='cuda:0',
                 ner_path="https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz",
                 cp_path="https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz"):
        self.device = self.parse_device(device)
        self.ner = Predictor.from_path(ner_path, cuda_device=self.device)
        print('* ner loaded')
        self.cp = Predictor.from_path(cp_path, cuda_device=self.device)
        print('* constituency parser loaded')
        self.lemmatizer = WordNetLemmatizer()
        # some heuristic rules can be added here
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.stopwords.update({'-', '\'s', 'try', 'tries', 'tried', 'trying',
                               'become', 'becomes', 'became', 'becoming',
                               'make', 'makes', 'made', 'making', 'call', 'called', 'calling',
                               'put', 'ever', 'something', 'someone', 'sometime'})
        self.special_tokens = ['only', 'most', 'before', 'after', 'behind']
        for token in self.special_tokens:
            if token in self.stopwords: self.stopwords.remove(token)
        if 'won' in self.stopwords: self.stopwords.remove('won')
        if 'own' in self.stopwords: self.stopwords.remove('own')
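
    # Map a torch-style device string ('cpu', 'cuda:0', ...) to AllenNLP's
    # cuda_device argument: -1 for CPU, otherwise the GPU index (default 0).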
    def parse_device(self, device):
        if 'cpu' in device:
            return -1
        else:
            dev = re.findall(r'\d+', device)
            return 0 if len(dev) == 0 else int(dev[0])
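
    # Parse `text` and return character-level spans: 'NPs' (noun phrases and named
    # entities, with user-supplied candidate_NPs preserved as-is), 'verbs', and
    # 'adjs' (modifier spans: adjectives/adverbs, selected special tokens, and
    # NP fragments left over from splitting around named entities).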
    def identify_NPs(self, text, candidate_NPs=None):
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) == 0: return {'text': '', 'NPs': [], 'verbs': [], 'adjs': []}

        cp_outputs = self.cp.predict(text)
        ner_outputs = self.ner.predict(text)

        tokens = cp_outputs['tokens']
        pos_tags = cp_outputs['pos_tags']
        ner_tags = ner_outputs['tags']
        tree = cp_outputs['hierplane_tree']['root']

        # extract candidate noun phrases passed by the user, with token indices
        all_NPs = []
        candidate_NPs = [refine_entity(np).split() for np in candidate_NPs] if candidate_NPs else []
        for np in sorted(candidate_NPs, key=len, reverse=True):
            np_start, np_end = find_sub_seq(tokens, np, 0, uncased=True, lemmatizer=self.lemmatizer)
            while np_start != -1 and np_end != -1:
                if not is_sub_seq(np_start, np_end, all_NPs):
                    all_NPs.append((np_start, np_end, True))
                np_start, np_end = find_sub_seq(tokens, np, np_end, uncased=True, lemmatizer=self.lemmatizer)

        # extract noun phrases from the constituency tree
        def _get_bottom_NPs(children):
            if 'children' not in children:
                return None
            if {'NP', 'OP', 'XP', 'QP'} & set(children['attributes']):
                is_bottom = True
                for child in children['children']:
                    if 'children' in child:
                        is_bottom = False
                if is_bottom:
                    bottom_NPs.append(children['word'].split())
                else:
                    for child in children['children']:
                        _get_bottom_NPs(child)
            else:
                for child in children['children']:
                    _get_bottom_NPs(child)

        bottom_NPs = []
        _get_bottom_NPs(tree)

        # find token indices of noun phrases
        np_index = -1
        for np in bottom_NPs:
            np_start, np_end = find_sub_seq(tokens, np, np_index + 1)
            if not is_sub_seq(np_start, np_end, all_NPs):
                all_NPs.append((np_start, np_end, False))
            np_index = np_end

        # extract named entities with token indices
        all_NEs = extract_named_entity(ner_tags)

        # extract verbs with token indices
        all_verbs = []
        for i, pos in enumerate(pos_tags):
            if pos[0] == 'V':
                if not is_sub_seq(i, i + 1, all_NPs) and not is_sub_seq(i, i + 1, all_NEs):
                    all_verbs.append((i, i + 1, False))

        # extract modifiers with token indices
        all_modifiers = []
        for i, (token, pos) in enumerate(zip(tokens, pos_tags)):
            if pos in ['JJ', 'RB']:  # adj. and adv.
                if not is_sub_seq(i, i + 1, all_NPs) and not is_sub_seq(i, i + 1, all_NEs):
                    all_modifiers.append((i, i + 1, False))
            elif token in self.special_tokens:
                if not is_sub_seq(i, i + 1, all_NPs) and not is_sub_seq(i, i + 1, all_NEs):
                    all_modifiers.append((i, i + 1, False))

        # split noun phrases with named entities
        all_spans = []
        for np_start, np_end, np_is_candidate in all_NPs:
            if np_is_candidate:  # candidate noun phrases are preserved
                all_spans.append((np_start, np_end, np_is_candidate))
            else:
                match = is_sub_seq(np_start, np_end, all_NEs)
                if match:  # if a noun phrase is a sub-span of a named entity, the named entity is preserved
                    all_spans.append(match)
                else:  # if a named entity is a sub-span of a noun phrase, the noun phrase is split around it
                    index = np_start
                    for ne_start, ne_end, ne_is_candidate in all_NEs:
                        if np_start <= ne_start < ne_end <= np_end:
                            all_modifiers.append((index, ne_start, False))
                            all_spans.append((ne_start, ne_end, ne_is_candidate))
                            index = ne_end
                    all_spans.append((index, np_end, False))

        # named entities without overlap
        for ne_start, ne_end, is_candidate in all_NEs:
            if not is_sub_seq(ne_start, ne_end, all_spans):
                all_spans.append((ne_start, ne_end, is_candidate))

        all_spans = refine_results(tokens, all_spans, self.stopwords)
        all_verbs = refine_results(tokens, all_verbs, self.stopwords)
        all_modifiers = refine_results(tokens, all_modifiers, self.stopwords)

        return {'text': tree['word'], 'NPs': all_spans, 'verbs': all_verbs, 'adjs': all_modifiers}


if __name__ == '__main__':
    import json

    print('Initializing sentence parser.')
    client = SentenceParser(device='cpu')

    print('Parsing sentence.')
    sentence = "The Africa Cup of Nations is held in odd - numbered years due to conflict with the World Cup . "
    entities = ['Africa Cup of Nations', 'Africa_Cup_of_Nations', 'Africa Cup', 'Africa_Cup']
    results = client.identify_NPs(sentence, entities)
    print(json.dumps(results, ensure_ascii=False, indent=4))
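
    # Illustrative output shape only (exact spans depend on the AllenNLP models):
    # {"text": "The Africa Cup of Nations is held in ...",
    #  "NPs": [["Africa Cup of Nations", start, end], ...],
    #  "verbs": [["held", ...], ...],
    #  "adjs": [["odd", ...], ...]}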

    # import random
    # from tqdm import tqdm
    # from utils import read_json_lines, save_json
    #
    # print('Parsing file.')
    # results = []
    # data = list(read_json_lines('data/train.jsonl'))
    # random.shuffle(data)
    # for entry in tqdm(data[:100]):
    #     results.append(client.identify_NPs(entry['claim']))
    # save_json(results, 'data/results.json')