# -*- coding: utf-8 -*-
"""
Created on Tue Aug 11 16:52:40 2020
@author: luol2

A Python 3 refactoring of Vincent Van Asch's Python 2 code at
http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py

Based on
A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
A. Schwartz and M. Hearst
Biocomputing, 2003, pp 451-462.
"""
import logging
import regex
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
log = logging.getLogger('Abbre')
class Candidate(str):
    """A string that also records its start/stop character offsets in the source sentence."""

    def __init__(self, value):
        super().__init__()
        self.start = 0
        self.stop = 0

    def set_position(self, start, stop):
        self.start = start
        self.stop = stop
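
# Illustrative sketch (not from the original source): a Candidate behaves like
# a plain str while carrying its character offsets in the source sentence.
# >>> c = Candidate('HSP')
# >>> c.set_position(24, 27)
# >>> (str(c), c.start, c.stop)
# ('HSP', 24, 27)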
def yield_lines_from_file(file_path):
    with open(file_path, 'rb') as f:
        for line in f:
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                # Fall back to latin-1 for lines that are not valid UTF-8
                line = line.decode('latin-1')
            yield line.strip()
def yield_lines_from_doc(doc_text):
    for line in doc_text.split("\n"):
        yield line.strip()
def best_candidates(sentence):
    """
    :param sentence: line read from input file
    :return: a Candidate iterator
    """
    if '(' in sentence:
        # Check some things first
        if sentence.count('(') != sentence.count(')'):
            raise ValueError("Unbalanced parentheses: {}".format(sentence))
        if sentence.find('(') > sentence.find(')'):
            raise ValueError("Closing parenthesis precedes opening parenthesis: {}".format(sentence))
        closeindex = -1
        while True:
            # Look for an open parenthesis
            openindex = sentence.find('(', closeindex + 1)
            if openindex == -1:
                break
            # Look for the matching closing parenthesis
            closeindex = openindex + 1
            depth = 1
            skip = False
            while depth:
                try:
                    char = sentence[closeindex]
                except IndexError:
                    # We found an opening bracket but no associated closing bracket
                    # Skip the opening bracket
                    skip = True
                    break
                if char == '(':
                    depth += 1
                elif char in [')', ';', ':']:
                    depth -= 1
                closeindex += 1
            if skip:
                closeindex = openindex + 1
                continue
            # Output if conditions are met
            start = openindex + 1
            stop = closeindex - 1
            candidate = sentence[start:stop]
            # Take into account whitespace that should be removed
            start = start + len(candidate) - len(candidate.lstrip())
            stop = stop - len(candidate) + len(candidate.rstrip())
            candidate = sentence[start:stop]
            if conditions(candidate):
                new_candidate = Candidate(candidate)
                new_candidate.set_position(start, stop)
                yield new_candidate
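
# Hedged example of the generator above; the sentence is a made-up illustration
# in the style of Schwartz & Hearst's data, not taken from this project:
# >>> [(str(c), c.start, c.stop)
# ...  for c in best_candidates('The heat shock protein (HSP) family.')]
# [('HSP', 24, 27)]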
def conditions(candidate):
    """
    Based on Schwartz & Hearst:
    2 <= len(str) <= 10
    len(tokens) <= 2
    regex.search(r'\p{L}', str)
    str[0].isalnum()

    and extra:
    if it matches r'(\p{L}\.?\s?){2,}' it is a good candidate.

    :param candidate: candidate abbreviation
    :return: True if this is a good candidate
    """
    viable = True
    if regex.match(r'(\p{L}\.?\s?){2,}', candidate.lstrip()):
        viable = True
    if len(candidate) < 2 or len(candidate) > 10:
        viable = False
    if len(candidate.split()) > 2:
        viable = False
    if not regex.search(r'\p{L}', candidate):
        viable = False
    if not candidate[0].isalnum():
        viable = False
    return viable
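
# A few illustrative checks of the viability rules (example inputs are
# assumptions, chosen to exercise each condition):
# >>> conditions('HSP')                    # 2-10 chars, one token, alnum start
# True
# >>> conditions('x')                      # shorter than 2 characters
# False
# >>> conditions('three token candidate')  # > 2 tokens and > 10 characters
# False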
def get_definition(candidate, sentence):
    """
    Takes a candidate and a sentence and returns the definition candidate.
    The definition candidate is the set of tokens (in front of the candidate)
    that starts with a token starting with the first character of the candidate.

    :param candidate: candidate abbreviation
    :param sentence: current sentence (single line from input file)
    :return: candidate definition for this abbreviation
    """
    # Take the tokens in front of the candidate
    tokens = regex.split(r'[\s\-]+', sentence[:candidate.start - 2].lower())
    # The char that we are looking for
    key = candidate[0].lower()
    # Count the number of tokens that start with the same character as the candidate
    firstchars = [t[0] for t in tokens]
    definition_freq = firstchars.count(key)
    candidate_freq = candidate.lower().count(key)
    # Look for the list of tokens in front of the candidate that
    # have a sufficient number of tokens starting with key
    if candidate_freq <= definition_freq:
        # We should at least have a good number of starts
        count = 0
        start = 0
        startindex = len(firstchars) - 1
        while count < candidate_freq:
            if abs(start) > len(firstchars):
                raise ValueError("candidate {} not found".format(candidate))
            start -= 1
            # Look up key in the definition
            try:
                startindex = firstchars.index(key, len(firstchars) + start)
            except ValueError:
                pass
            # Count the number of keys in the definition
            count = firstchars[startindex:].count(key)
        # We found enough keys in the definition so return the definition as a definition candidate
        start = len(' '.join(tokens[:startindex]))
        stop = candidate.start - 1
        candidate = sentence[start:stop]
        # Remove leading/trailing whitespace
        start = start + len(candidate) - len(candidate.lstrip())
        stop = stop - len(candidate) + len(candidate.rstrip())
        candidate = sentence[start:stop]
        new_candidate = Candidate(candidate)
        new_candidate.set_position(start, stop)
        return new_candidate
    else:
        raise ValueError('There are fewer keys in the tokens in front of the candidate than in the candidate itself')
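
# Hedged walk-through of get_definition on the sentence used above: tokens in
# front of the abbreviation that start with its first letter ('h') anchor the
# definition window (offsets hand-traced for this made-up sentence):
# >>> s = 'The heat shock protein (HSP) family.'
# >>> c = Candidate('HSP')
# >>> c.set_position(24, 27)
# >>> d = get_definition(c, s)
# >>> (str(d), d.start, d.stop)
# ('heat shock protein', 4, 22)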
def select_definition(definition, abbrev):
    """
    Takes a definition candidate and an abbreviation candidate and, if the chars
    of the abbreviation occur in the definition, returns the trimmed definition
    with its offsets.

    Based on
    A simple algorithm for identifying abbreviation definitions in biomedical texts, Schwartz & Hearst
    :param definition: candidate definition
    :param abbrev: candidate abbreviation
    :return: dict with the trimmed definition and its start/stop offsets
    """
    if len(definition) < len(abbrev):
        raise ValueError('Abbreviation is longer than definition')
    if abbrev in definition.split():
        raise ValueError('Abbreviation is full word of definition')
    sindex = -1
    lindex = -1
    while True:
        try:
            longchar = definition[lindex].lower()
        except IndexError:
            # The definition is exhausted; let the caller treat this as a failed match
            raise
        shortchar = abbrev[sindex].lower()
        if not shortchar.isalnum():
            sindex -= 1
        if sindex == -1 * len(abbrev):
            if shortchar == longchar:
                if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
                    break
                else:
                    lindex -= 1
            else:
                lindex -= 1
                if lindex == -1 * (len(definition) + 1):
                    raise ValueError("abbreviation {} was not found in {}".format(abbrev, definition))
        else:
            if shortchar == longchar:
                sindex -= 1
                lindex -= 1
            else:
                lindex -= 1
    new_candidate = Candidate(definition[lindex:len(definition)])
    new_candidate.set_position(definition.start + lindex + len(definition), definition.stop)
    definition = new_candidate
    tokens = len(definition.split())
    length = len(abbrev)
    if tokens > min([length + 5, length * 2]):
        raise ValueError("did not meet min(|A|+5, |A|*2) constraint")
    # Do not return definitions that contain unbalanced parentheses
    if definition.count('(') != definition.count(')'):
        raise ValueError("Unbalanced parentheses not allowed in a definition")
    new_definition_dict = {'definition': definition, 'start': definition.start, 'stop': definition.stop}
    return new_definition_dict
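
# Illustrative continuation of the example above: select_definition matches the
# abbreviation characters right-to-left against the definition candidate and
# returns the trimmed span with its offsets (values are for the made-up sentence):
# >>> d = Candidate('heat shock protein')
# >>> d.set_position(4, 22)
# >>> select_definition(d, 'HSP')
# {'definition': 'heat shock protein', 'start': 4, 'stop': 22}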
def extract_abbreviation_definition_pairs(file_path=None, doc_text=None):
    abbrev_map = []
    omit = 0
    written = 0
    if file_path:
        sentence_iterator = enumerate(yield_lines_from_file(file_path))
    elif doc_text:
        sentence_iterator = enumerate(yield_lines_from_doc(doc_text))
    else:
        return abbrev_map
    for i, sentence in sentence_iterator:
        try:
            for candidate in best_candidates(sentence):
                try:
                    definition = get_definition(candidate, sentence)
                except (ValueError, IndexError) as e:
                    # log.debug("{} Omitting candidate {}. Reason: {}".format(i, candidate, e.args[0]))
                    omit += 1
                else:
                    try:
                        definition_dict = select_definition(definition, candidate)
                    except (ValueError, IndexError) as e:
                        # log.debug("{} Omitting definition for candidate {}. Reason: {}".format(i, candidate, e.args[0]))
                        omit += 1
                    else:
                        definition_dict['abbre'] = candidate
                        abbrev_map.append(definition_dict)
                        written += 1
        except (ValueError, IndexError) as e:
            log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0]))
    log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit))
    return abbrev_map
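
# End-to-end usage sketch with an assumed input sentence; each returned dict
# pairs a definition span with its abbreviation under the key 'abbre':
# >>> pairs = extract_abbreviation_definition_pairs(
# ...     doc_text='The heat shock protein (HSP) family.')
# >>> [(str(p['definition']), str(p['abbre'])) for p in pairs]
# [('heat shock protein', 'HSP')]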
def postprocess_abbr(ner_result, ori_text):
    final_result = {}
    if len(ner_result) == 0:
        return []
    # Abbreviation recognition
    abbr_result = extract_abbreviation_definition_pairs(doc_text=ori_text)
    # Read NER results
    nor_loc_list = {}  # {entity_name_location: entity_information}
    for ele in ner_result:
        nor_loc_list[str(ele[0]) + ' ' + str(ele[1])] = ele
        final_result['\t'.join(ele)] = [int(ele[0]), int(ele[1])]
    # Abbreviation matching
    for abbr in abbr_result:
        abbr_index = str(abbr['start']) + ' ' + str(abbr['stop'])
        if abbr_index in nor_loc_list:
            line = ori_text
            abbr_text = abbr['abbre']
            abbr_eid = 0
            while line.find(abbr_text) >= 0:
                abbr_sid = line.find(abbr_text) + abbr_eid
                abbr_eid = abbr_sid + len(abbr_text)
                # Only keep standalone occurrences (not embedded in a longer word)
                if abbr_sid > 0 and abbr_eid < len(ori_text):
                    if not ori_text[abbr_sid - 1].isalnum() and not ori_text[abbr_eid].isalnum():
                        final_result[str(abbr_sid) + '\t' + str(abbr_eid) + '\t' + nor_loc_list[abbr_index][2] + '\t' + nor_loc_list[abbr_index][3]] = [abbr_sid, abbr_eid]
                elif abbr_sid == 0 and abbr_eid < len(ori_text):
                    if not ori_text[abbr_eid].isalnum():
                        final_result[str(abbr_sid) + '\t' + str(abbr_eid) + '\t' + nor_loc_list[abbr_index][2] + '\t' + nor_loc_list[abbr_index][3]] = [abbr_sid, abbr_eid]
                elif abbr_sid > 0 and abbr_eid == len(ori_text):
                    if not ori_text[abbr_sid - 1].isalnum():
                        final_result[str(abbr_sid) + '\t' + str(abbr_eid) + '\t' + nor_loc_list[abbr_index][2] + '\t' + nor_loc_list[abbr_index][3]] = [abbr_sid, abbr_eid]
                line = ori_text[abbr_eid:]
    sorted_final_result = sorted(final_result.items(), key=lambda kv: kv[1])
    final_result = []
    for ele in sorted_final_result:
        final_result.append(ele[0].split('\t'))
    return final_result
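
# Sketch of the expected I/O shape, inferred from the field indexing above:
# ner_result is a list of [start, stop, mention, type] string lists whose
# offsets refer to ori_text; the output repeats those rows plus one row per
# standalone occurrence of each matched abbreviation (example is an assumption):
# >>> postprocess_abbr(
# ...     [['4', '22', 'heat shock protein', 'Protein']],
# ...     'The heat shock protein (HSP) family. HSP is induced.')
# [['4', '22', 'heat shock protein', 'Protein'],
#  ['24', '27', 'heat shock protein', 'Protein'],
#  ['37', '40', 'heat shock protein', 'Protein']]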
def ner_abbr(ner_result, abbr_result, ori_text):
    # Read NER results
    nor_name_list = {}  # {entity_name: entity_information}
    nor_loc_list = {}   # {entity_name_location: entity_information}
    final_result = {}   # {entity_information: location}, used for sorting
    for ele in ner_result:
        temp_seg = ele.split('\t')
        nor_loc_list[temp_seg[0] + ' ' + temp_seg[1]] = temp_seg
        nor_name_list[temp_seg[2].lower()] = temp_seg
        final_result['\t'.join(temp_seg[0:4])] = [int(temp_seg[0]), int(temp_seg[1])]
    # Abbreviation matching
    for abbr in abbr_result:
        abbr_index = str(abbr['start']) + ' ' + str(abbr['stop'])
        if abbr_index in nor_loc_list:
            line = ori_text
            abbr_text = abbr['abbre']
            abbr_eid = 0
            while line.find(abbr_text) >= 0:
                abbr_sid = line.find(abbr_text) + abbr_eid
                abbr_eid = abbr_sid + len(abbr_text)
                # Only keep standalone occurrences (not embedded in a longer word)
                if abbr_sid > 0 and abbr_eid < len(ori_text):
                    if not ori_text[abbr_sid - 1].isalnum() and not ori_text[abbr_eid].isalnum():
                        final_result[str(abbr_sid) + '\t' + str(abbr_eid) + '\t' + abbr_text + '\t' + nor_loc_list[abbr_index][3]] = [abbr_sid, abbr_eid]
                elif abbr_sid == 0 and abbr_eid < len(ori_text):
                    if not ori_text[abbr_eid].isalnum():
                        final_result[str(abbr_sid) + '\t' + str(abbr_eid) + '\t' + abbr_text + '\t' + nor_loc_list[abbr_index][3]] = [abbr_sid, abbr_eid]
                elif abbr_sid > 0 and abbr_eid == len(ori_text):
                    if not ori_text[abbr_sid - 1].isalnum():
                        final_result[str(abbr_sid) + '\t' + str(abbr_eid) + '\t' + abbr_text + '\t' + nor_loc_list[abbr_index][3]] = [abbr_sid, abbr_eid]
                line = ori_text[abbr_eid:]
    final_result = sorted(final_result.items(), key=lambda kv: kv[1])
    return final_result
if __name__ == '__main__':
    path = '//panfs/pan1/bionlp/lulab/luoling/HPO_project/diseaseTag/data/test/results/'
    fin = open(path + 'NCBI_test_phecr_95.tsv', 'r', encoding='utf-8')
    context = fin.read().strip().split('\n\n')
    fin.close()
    fout = open(path + 'NCBI_test_phecr_abbre_95.tsv', 'w', encoding='utf-8')
    for doc in context:
        lines = doc.split('\n')
        ori_text = lines[1]
        fout.write(lines[0] + '\n' + lines[1] + '\n')
        if len(lines) > 2:
            abbr_result = extract_abbreviation_definition_pairs(doc_text=ori_text)
            print(abbr_result)
            abbr_out = ner_abbr(lines[2:], abbr_result, ori_text)
        else:
            abbr_out = []
        for ele in abbr_out:
            fout.write(ele[0] + '\n')
        fout.write('\n')
    fout.close()