Spaces:

B-patents
/

patent-bert

Build error

patent-bert / app.py

danseith

update wording in description

4b598b3 over 2 years ago

14.9 kB

	import gradio as gr
	import numpy as np
	import torch
	import re
	from nltk.stem import PorterStemmer
	from collections import defaultdict
	from transformers import pipeline
	from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
	from transformers import AutoModelForMaskedLM

	# Example 1: mechanical/food patent abstract and the keywords to analyse in it.
	ex_str1 = (
	    "A crustless sandwich made from two slices of baked bread. The sandwich includes first and second matching "
	    "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery "
	    "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially "
	    "crimped together at the outer peripheral area."
	)
	ex_key1 = "sandwich bread crimped"

	# Example 2: biotech patent abstract.
	ex_str2 = (
	    "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with"
	    " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide"
	    " associated with the target DNA. "
	)
	ex_key2 = "DNA target modification"

	# Example 3: materials patent abstract.
	ex_str3 = (
	    "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a "
	    "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at "
	    "least one of the length, width, and thickness values being 100 nanometers or smaller. "
	)
	ex_key3 = "graphite lattice orthogonal "

	# Rows of [sentences, keywords] offered as clickable examples in the UI.
	tab_two_examples = [
	    [ex_str1, ex_key1],
	    [ex_str2, ex_key2],
	    [ex_str3, ex_key3],
	]
	#
	# tab_one_examples = [['A crustless _ made from two slices of baked bread.'],
	# ['The present disclosure provides a DNA-targeting RNA that comprises a targeting _.'],
	# ['The _ plane is composed of a two-dimensional hexagonal lattice of carbon atoms.']
	# ]

	# Tokens that are never worth masking or suggesting: a few stop words plus single punctuation
	# characters. add_mask() avoids masking them and extract_keywords() skips them as candidates.
	_stop_words = ['a', 'an', 'the', 'is', 'and', 'or']
	_punctuation = list('!()-[]{};:\'"\\,<>./?@#$%^&*_~')
	ignore_str = _stop_words + _punctuation


	def add_mask(text, lower_bound=0, index=None):
	    """Replace one whitespace-delimited token of ``text`` with ``[MASK]``.

	    Args:
	        text: Input string; it is tokenised with ``str.split()``.
	        lower_bound: Smallest token position eligible for random masking. Values outside the
	            token range are clamped so that a random pick is always possible.
	        index: When given, mask exactly this token position and skip every other rule.

	    Returns:
	        A ``(masked_text, masked_token)`` tuple. ``masked_token`` is ``None`` when ``index``
	        was given, ``'[MASK]'`` when the text already held a stand-alone ``_`` placeholder,
	        and the replaced word otherwise.

	    Raises:
	        ValueError: If a random pick is required but ``text`` has no tokens.
	        IndexError: If ``index`` is out of range.
	    """
	    split_text = text.split()
	    if index is not None:
	        split_text[index] = '[MASK]'
	        return ' '.join(split_text), None
	    # If the user supplies a mask, don't add more
	    if '_' in split_text:
	        # Mask the stand-alone placeholder itself, not an earlier word that merely contains '_'.
	        split_text[split_text.index('_')] = '[MASK]'
	        return ' '.join(split_text), '[MASK]'

	    if not split_text:
	        raise ValueError('text contains no tokens to mask')

	    # Clamp so that randint always has a non-empty range to draw from.
	    low = min(max(int(lower_bound), 0), len(split_text) - 1)
	    high = len(split_text)
	    idx = int(np.random.randint(low, high))
	    # Don't mask certain words. Retries honour the same lower bound as the first draw and are
	    # capped, so a text made only of ignored words still terminates (masking an ignored word).
	    num_iters = 0
	    while split_text[idx].lower() in ignore_str:
	        num_iters += 1
	        idx = int(np.random.randint(low, high))
	        if num_iters > 10:
	            break

	    masked_string = split_text[idx]
	    split_text[idx] = '[MASK]'
	    masked_output = ' '.join(split_text)
	    return masked_output, masked_string


	class TempScalePipe(FillMaskPipeline):
	    """Fill-mask pipeline whose logits are divided by a temperature before the softmax."""

	    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
	        """Route call-time keyword arguments to ``postprocess``.

	        Returns the ``(preprocess, forward, postprocess)`` parameter dicts; only the
	        postprocess dict is ever populated.
	        """
	        postprocess_params = {}

	        if targets is not None:
	            target_ids = self.get_target_ids(targets, top_k)
	            postprocess_params["target_ids"] = target_ids

	        if top_k is not None:
	            postprocess_params["top_k"] = top_k

	        if temp is not None:
	            postprocess_params["temp"] = temp
	        return {}, {}, postprocess_params

	    def __call__(self, inputs, *args, **kwargs):
	        """
	        Fill the masked token in the text(s) given as inputs.

	        Args:
	            inputs (`str` or `List[str]`):
	                One or several texts (or one list of prompts) with masked tokens.
	            targets (`str` or `List[str]`, optional):
	                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
	                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
	                resulting token will be used (with a warning, and that might be slower).
	            top_k (`int`, optional):
	                When passed, overrides the number of predictions to return.
	            temp (`float`, optional):
	                When passed, the logits at the masked position are divided by this value before the softmax.

	        Extra positional arguments are accepted but not forwarded to the base pipeline.

	        Return:
	            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

	            - sequence (`str`) -- The corresponding input with the mask token prediction.
	            - score (`float`) -- The corresponding probability.
	            - token (`int`) -- The predicted token id (to replace the masked one).
	            - token_str (`str`) -- The predicted token (to replace the masked one).
	        """
	        outputs = super().__call__(inputs, **kwargs)
	        if isinstance(inputs, list) and len(inputs) == 1:
	            return outputs[0]
	        return outputs

	    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
	        """Turn raw model outputs into ranked ``{score, token, token_str, sequence}`` dicts.

	        Returns a flat list when the input holds a single mask token, otherwise one list per
	        mask position.
	        """
	        # Cap top_k if there are targets
	        if target_ids is not None and target_ids.shape[0] < top_k:
	            top_k = target_ids.shape[0]
	        input_ids = model_outputs["input_ids"][0]
	        outputs = model_outputs["logits"]

	        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
	        # Fill mask pipeline supports only one ${mask_token} per sample

	        # Temperature scaling: the logits are divided by temp before the softmax.
	        logits = outputs[0, masked_index, :] / temp
	        probs = logits.softmax(dim=-1)
	        if target_ids is not None:
	            probs = probs[..., target_ids]
	        values, predictions = probs.topk(top_k)

	        result = []
	        single_mask = values.shape[0] == 1
	        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
	            row = []
	            for v, p in zip(_values, _predictions):
	                # Copy is important since we're going to modify this array in place
	                tokens = input_ids.numpy().copy()
	                if target_ids is not None:
	                    # p indexes into the restricted target list; map it back to a vocab id.
	                    p = target_ids[p].tolist()

	                tokens[masked_index[i]] = p
	                # Filter padding out:
	                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
	                # Originally we skip special tokens to give readable output.
	                # For multi masks though, the other [MASK] would be removed otherwise
	                # making the output look odd, so we add them back
	                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
	                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
	                row.append(proposition)
	            result.append(row)
	        if single_mask:
	            return result[0]
	        return result


	# Register TempScalePipe under the custom task name "temp-scale" so that pipeline() below can
	# build it by name; this must run before the pipeline("temp-scale", ...) call.
	PIPELINE_REGISTRY.register_pipeline(
	"temp-scale",
	pipeline_class=TempScalePipe,
	pt_model=AutoModelForMaskedLM,
	)
	# Temperature-scaled fill-mask pipeline used by unmask_single(), unmask() and extract_keywords().
	scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")

	# GPT-2 text-generation pipeline; in the visible code only autocomplete() calls it.
	generator = pipeline('text-generation', model='gpt2')

	def sample_output(out, sampling):
	    """Pick one candidate token from a ``{token: score}`` mapping.

	    Args:
	        out: Mapping of candidate token string to its score.
	        sampling: ``'multi'`` draws a token with probability proportional to its score;
	            any other value draws uniformly at random.

	    Returns:
	        The selected token string.

	    Raises:
	        ValueError: If ``out`` is empty.
	    """
	    # Sample over the tokens themselves: keying by score would merge candidates whose
	    # scores happen to be equal.
	    tokens = list(out)
	    if not tokens:
	        raise ValueError('no candidate tokens to sample from')

	    if sampling == 'multi':
	        weights = np.asarray([out[t] for t in tokens], dtype=float)
	        total = weights.sum()
	        if total > 0:
	            # Top-k scores do not sum to one; renormalise so that no candidate silently
	            # absorbs the missing probability mass.
	            idx = np.random.choice(len(tokens), p=weights / total)
	        else:
	            idx = np.random.randint(0, len(tokens))
	    else:
	        idx = np.random.randint(0, len(tokens))
	    return tokens[idx]


	def unmask_single(text, temp=1):
	    """Mask one token of ``text`` and return the model's top-10 fills as ``{token: score}``."""
	    masked_text = add_mask(text)[0]
	    predictions = scrambler(masked_text, temp=temp, top_k=10)
	    return {pred["token_str"]: pred["score"] for pred in predictions}


	def unmask(text, temp, rounds, lower_bound=0):
	    """Repeatedly mask a random word of ``text`` and replace it with a sampled model prediction.

	    Args:
	        text: Sentence(s) to remix.
	        temp: Temperature forwarded to the fill-mask pipeline.
	        rounds: Number of mask-and-replace passes; converted with ``int()`` so that a float
	            slider value is accepted.
	        lower_bound: Smallest token position that may be masked (forwarded to ``add_mask``).

	    Returns:
	        The edited text with its first character upper-cased. Words that were actually
	        substituted are wrapped in asterisks (``*word*``); a word that ends up unchanged is
	        left unmarked.
	    """
	    sampling = 'multi'
	    for _ in range(int(rounds)):
	        masked_text, masked = add_mask(text, lower_bound)
	        split_text = masked_text.split()
	        res = scrambler(masked_text, temp=temp, top_k=15)
	        mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
	        out = {item["token_str"]: item["score"] for item in res}
	        new_token = sample_output(out, sampling)
	        # Re-sample a few times when the prediction merely reproduces the masked word.
	        unsuccessful_iters = 0
	        while masked in new_token:
	            if unsuccessful_iters > 5:
	                break
	            print('skipped', new_token)
	            new_token = sample_output(out, sampling=sampling)
	            unsuccessful_iters += 1
	        if masked in new_token:
	            # No genuinely different word was found: put the prediction back unmarked.
	            split_text[mask_pos] = new_token
	        else:
	            # Mark the substitution so the reader can see what changed.
	            split_text[mask_pos] = '*' + new_token + '*'
	        text = ' '.join(split_text)

	    # Slicing (instead of indexing) keeps an empty string from raising IndexError.
	    return text[:1].upper() + text[1:]


	def autocomplete(text, temp):
	    """Continue ``text`` with GPT-2, then let the patent model rewrite words of the continuation.

	    The GPT-2 output is passed through five mask-and-replace rounds; ``lower_bound`` is set to
	    the prompt's word count so the first random pick of each round lands after the prompt.
	    """
	    generated = generator(text, max_length=30, num_return_sequences=1)
	    prompt_word_count = len(text.split())
	    return unmask(
	        generated[0]['generated_text'],
	        temp=temp,
	        rounds=5,
	        lower_bound=prompt_word_count,
	    )


	def extract_keywords(text, queries):
	    """Suggest context-dependent alternatives for each query word found in ``text``.

	    Every occurrence of a query word is masked in turn, the model's top predictions are
	    stemmed and ranked, and the ranks are averaged over the occurrences (a stem that is not
	    predicted for an occurrence contributes 0 for it). The best three stems per query are
	    kept; ties at the cut-off may keep more.

	    Args:
	        text: Sentences to analyse.
	        queries: Whitespace-separated keywords. Matching is case-insensitive and ignores
	            punctuation attached to the words of ``text``.

	    Returns:
	        A list holding one ``{stem: normalised_score}`` dict per distinct query (in input
	        order), followed by a search string such as ``[a OR b] AND [c OR d]``. A query that
	        does not occur in ``text`` yields an empty dict and an empty ``[]`` group.

	    NOTE(review): the list length depends on the number of distinct keywords, while the
	    Gradio interface wires exactly three labels plus one textbox.
	    """
	    temp = 1  # set temperature to 1
	    top_k_range = 30
	    n_keep = 3
	    ps = PorterStemmer()
	    # Strip punctuation token by token so positions stay aligned with text.split(), which is
	    # what add_mask() indexes into.
	    bare_tokens = [re.sub(r'[^\w\s]', '', t).lower() for t in text.split()]

	    q_dict = {}
	    for query in queries.split():
	        wanted = query.lower()
	        indices = [i for i, t in enumerate(bare_tokens) if t == wanted]
	        totals = defaultdict(float)
	        for i in indices:
	            # Mask this occurrence and collect the model's candidates for it.
	            masked_text, _ = add_mask(text, index=i)
	            res = scrambler(masked_text, temp=temp, top_k=top_k_range)
	            out = {item["token_str"]: item["score"] for item in res}
	            # Ascending sort: the most likely token receives the highest rank.
	            round_best = {}
	            for rank, token_str in enumerate(sorted(out, key=out.get)):
	                if token_str in ignore_str:
	                    continue
	                stemmed = ps.stem(token_str)
	                # Several tokens can share a stem; keep the best rank of this round.
	                round_best[stemmed] = max(rank / top_k_range, round_best.get(stemmed, 0.0))
	            for stemmed, norm_rank in round_best.items():
	                totals[stemmed] += norm_rank

	        # Calc mean: dividing by the number of occurrences floors missing rounds to 0.
	        top_scores = {key: total / len(indices) for key, total in totals.items()}
	        # Normalize with a denominator computed once, before any value changes.
	        norm = sum(top_scores.values())
	        if norm > 0:
	            top_scores = {key: value / norm for key, value in top_scores.items()}
	        # Get top_k
	        if len(top_scores) > n_keep:
	            cutoff = sorted(top_scores.values())[-n_keep]
	            top_scores = {key: value for key, value in top_scores.items() if value >= cutoff}
	        q_dict[query] = top_scores

	    keywords = ' AND '.join('[' + ' OR '.join(q_dict[q]) + ']' for q in q_dict)
	    output = list(q_dict.values())
	    output.append(keywords)
	    return output
	# fig, ax = plt.subplots(nrows=1, ncols=3)
	# for q in q_dict:
	# ax.bar(q_dict[q])
	# return fig

	# Output widgets for the keyword tab: one label per keyword, each showing its top three classes.
	label0 = gr.Label(label='keyword 1', num_top_classes=3)
	label01 = gr.Label(label='keyword 2', num_top_classes=3)
	label02 = gr.Label(label='keyword 3', num_top_classes=3)
	# NOTE(review): textbox02 is not referenced by any active interface in the visible code.
	textbox02 = gr.Textbox(label="Input Keywords", lines=3)
	# Input widgets for the keyword tab: the keywords to analyse and the sentences they appear in.
	textbox01 = gr.Textbox(label="Input Keywords", placeholder="Type keywords here", lines=1)
	textbox0 = gr.Textbox(label="Input Sentences", placeholder="Type sentences here", lines=5)

	# Receives the generated "[a OR b] AND [c OR d]" search string.
	output_textbox0 = gr.Textbox(label='Search String of Keywords', placeholder="Output will appear here", lines=4)
	# temp_slider0 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')

	# NOTE(review): textbox1 is only used by the commented-out demo1 interface.
	textbox1 = gr.Textbox(label="Input Sentence", lines=5)
	# output_textbox1 = gr.Textbox(placeholder="Output will appear here", lines=4)
	# Title and HTML description shown above the keyword-generator interface.
	title1 = "Patent-BERT: Context-Dependent Synonym Generator"
	description1 = """<p>
	Try inserting a few sentences from a patent, and pick keywords for the model to analyze. The model will analyze the
	context of the keywords in the sentences and generate the top three most likely candidates for each word.
	This can be used for more creative patent drafting or patent searches using the generated search string. The base model is
	<a href= "https://github.com/google/patents-public-data/blob/master/models/BERT%20for%20Patents.md">Patent BERT</a> created and trained by Google.

	<strong>Note:</strong> Current pipeline only allows for <strong>three</strong> keyword submissions. Stemming (e.g., altering -> alter) is built into the output for
	broader search string. <br/>

	Beta features (currently work-in-progress) include: (<strong>A</strong>) adjustment options for (i) the number of keywords, (ii) the number of context-dependent synonyms,
	and (iii) a 'creativity' parameter of the model; (<strong>B</strong>) analysis of where these words appear in the patent (e.g.,
	claim, summary, etc.); and (<strong>C</strong>) a stemming option for input keywords.
	<br/>
	</p>"""

	# textbox2 = gr.Textbox(label="Input Sentences", lines=5)
	# output_textbox2 = gr.Textbox(placeholder="Output will appear here", lines=4)
	# temp_slider2 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
	# edit_slider2 = gr.Slider(1, 20, step=1, value=1.0, label='Number of edits')


	# title2 = "Patent-BERT Sentence Remix-er: Multiple Edits"
	# description2 = """<p>
	#
	# Try typing in a sentence for the model to remix. Adjust the 'creativity' scale bar to change
	# the model's confidence in its likely substitutions and the 'number of edits' for the number of edits you want
	# the model to attempt to make. The words substituted in the output sentence will be enclosed in asterisks (e.g., *word*).
	# <br/> <p/> """

	# Keyword-generator interface: sentences and keywords go in; three keyword labels and the
	# search string come out. extract_keywords() returns one dict per distinct keyword plus the
	# search string, so the four outputs line up only when exactly three keywords are entered.
	demo0 = gr.Interface(
	fn=extract_keywords,
	inputs=[textbox0, textbox01],
	outputs=[label0, label01, label02, output_textbox0],
	examples=tab_two_examples,
	allow_flagging='never',
	title=title1,
	description=description1
	)

	# demo1 = gr.Interface(
	# fn=unmask_single,
	# inputs=[textbox1],
	# outputs='label',
	# examples=tab_one_examples,
	# allow_flagging='never',
	# title=title1,
	# description=description1
	# )

	# demo2 = gr.Interface(
	# fn=unmask,
	# inputs=[textbox2, temp_slider2, edit_slider2],
	# outputs=[output_textbox2],
	# examples=tab_two_examples,
	# allow_flagging='never',
	# title=title2,
	# description=description2
	# )

	# Wrap the single active interface in a tabbed layout (tab title "Keyword generator") and
	# launch it. This runs at import time; there is no __main__ guard.
	gr.TabbedInterface(
	[demo0], ["Keyword generator"]
	).launch()