import random

import spacy
from spacy.language import Language
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

from src.model_utils import *


def train_transformer(config: dict, train_data: list, components: list,
                      n_iter: int, batch_size=compounding(4.0, 32.0, 1.001),
                      entities: list = None, eval_data: list = None) -> tuple:
    """
    Fine-tune a transformer model or resume training from a fine-tuned model.

    Parameters:
        config: dict, configuration parameters ('dir' and 'config' keys)
        train_data: list, training data
        components: list, pipeline components to be trained
        n_iter: int, number of training iterations
        batch_size: batch-size schedule used for training
        entities: list, entity labels to be trained on for NER
        eval_data: list, evaluation data

    Returns:
        nlp: the fine-tuned spaCy transformer pipeline
        all_losses: list of the losses at every iteration
    """
    if config['dir'] is not None:
        # Resume training from a previously fine-tuned model on disk.
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)
            task = nlp.get_pipe(component)
            # Add entity labels only to the NER component.
            if (component == 'ner') and (entities is not None):
                for label in entities:
                    task.add_label(label)
        nlp.initialize()  # required before training a blank pipeline
        optimizer = nlp.create_optimizer()

    # Convert the raw annotations into training examples.
    train_data_doc = list(make_training_doc(nlp, train_data))

    all_losses = []
    for itn in tqdm(range(1, n_iter + 1)):
        print("Starting iteration " + str(itn))
        # Shuffle the examples that are actually batched below.
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
            itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))
        all_losses.append([losses[component] for component in components])
    return nlp, all_losses
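

# Usage sketch for train_transformer (assumptions: TRAIN_DATA and EVAL_DATA
# are lists of (text, annotations) pairs in spaCy's training format, and the
# transformer config below is one plausible way to fill config['config'];
# it is illustrative, not a tested recipe):
#
#     transformer_config = {
#         "dir": None,  # None -> start from a blank English pipeline
#         "config": {
#             "model": {
#                 "@architectures": "spacy-transformers.TransformerModel.v3",
#                 "name": "roberta-base",
#             }
#         },
#     }
#     nlp, losses = train_transformer(transformer_config, TRAIN_DATA,
#                                     components=['ner'], n_iter=10,
#                                     entities=['ORG'], eval_data=EVAL_DATA)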
def train_spacy(model: str, train_data: list, components: list,
                n_iter: int, batch_size=compounding(4.0, 32.0, 1.001),
                entities: list = None, eval_data: list = None) -> tuple:
    """
    Fine-tune a spaCy model or resume training from a fine-tuned model.

    Parameters:
        model: str, name of the spaCy model
        train_data: list, training data
        components: list, pipeline components to be trained
        n_iter: int, number of training iterations
        batch_size: batch-size schedule used for training
        entities: list, entity labels to be trained on for NER
        eval_data: list, evaluation data

    Returns:
        nlp: the fine-tuned spaCy model
        all_losses: list of the losses at every iteration
    """
    if model is None:
        raise ValueError("A spaCy model name or path is required.")
    # Get the model and optimizer: load an existing spaCy model or a blank one.
    nlp, optimizer = load_model(model)

    # Convert the raw annotations into training examples.
    train_data_doc = list(make_training_doc(nlp, train_data))

    # Create the built-in pipeline components and add them to the pipeline;
    # in spaCy v3, nlp.add_pipe takes the registered component name and
    # returns the component instance.
    for component in components:
        if component not in nlp.pipe_names:
            pipe = nlp.add_pipe(component, last=True)
        else:
            pipe = nlp.get_pipe(component)
        # Add labels if the component is NER.
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                pipe.add_label(ent)
            print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')

    # Get the names of the other pipes so they can be disabled during training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.select_pipes(disable=other_pipes):  # only train the chosen components
        for itn in tqdm(range(1, n_iter + 1)):
            print("Starting iteration " + str(itn))
            # Shuffle the examples that are actually batched below.
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch), losses=losses, drop=0.1, sgd=optimizer)
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
                itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))
            all_losses.append([losses[component] for component in components])
    return nlp, all_losses


def eval_spacy(model: Language, data: list) -> dict:
    """
    Perform evaluation and scoring.

    Parameters:
        model: either a spaCy model or a spaCy transformer pipeline
        data: evaluation data to score against

    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # Accept spaCy-format data: (text, annotations) tuples.
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except TypeError:
        # Accept an alternative format: rows whose values are (text, entities).
        examples = []  # discard any partially built examples
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except Exception as e:
        print(e)
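

# ---------------------------------------------------------------------------
# Usage sketch (assumptions: load_model from src.model_utils accepts a model
# name such as "en_core_web_sm" and returns (nlp, optimizer), as the comments
# above suggest; the toy TRAIN_DATA below is illustrative only).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    TRAIN_DATA = [
        ("Apple is looking at buying a U.K. startup.",
         {"entities": [(0, 5, "ORG")]}),
        ("San Francisco considers banning sidewalk delivery robots.",
         {"entities": [(0, 13, "GPE")]}),
    ]
    nlp, losses = train_spacy("en_core_web_sm", TRAIN_DATA,
                              components=['ner'], n_iter=5,
                              entities=['ORG', 'GPE'])
    print(eval_spacy(nlp, TRAIN_DATA))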