PeteBleackley committed on
Commit c8625dc · 1 Parent(s): 56e5680

Modified training scripts to use PyTorch

Files changed (1)
  1. scripts.py +89 -75
scripts.py CHANGED
@@ -1,8 +1,6 @@
 
 import os
-import re
 import argparse
-import pickle
 import json
 import numpy
 import tokenizers
@@ -13,8 +11,7 @@ import qarac.corpora.Batcher
 import qarac.models.qarac_base_model
 import qarac.models.QaracTrainerModel
 import qarac.corpora.CombinedCorpus
-import keras
-import tensorflow
+import torch
 import spacy
 import pandas
 import qarac.utils.CoreferenceResolver
@@ -23,9 +20,23 @@ import difflib
 import scipy.stats
 import scipy.spatial
 import seaborn
+import tqdm
 
+EPSILON = 1.0e-12
 
-
+class CombinedLoss(torch.nn.Module):
+    def __init__(self):
+        super(CombinedLoss,self).__init__()
+        self.component_losses = (torch.nn.CrossEntropyLoss(),
+                                 torch.nn.MSELoss(),
+                                 torch.nn.CrossEntropyLoss(),
+                                 torch.nn.MSELoss())
+
+    def forward(self,y_pred,y_true):
+        return sum(fn(pred,obs)
+                   for (fn,pred,obs) in zip(self.component_losses,
+                                            y_pred,
+                                            y_true))
 
 
 def capitalise(token,i):
@@ -67,12 +78,12 @@ def train_base_model(task,filename):
                                                  768,
                                                  12,
                                                  task=='decode')
-    optimizer = keras.optimizers.Nadam(learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5, 100, 0.99))
-    model.compile(optimizer=optimizer,loss='sparse_categorical_crossentropy',metrics='accuracy')
-    model.fit(train_data,
-              epochs=100,
-              workers = 16,
-              use_multiprocessing=True)
+    #optimizer = keras.optimizers.Nadam(learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5, 100, 0.99))
+    #model.compile(optimizer=optimizer,loss='sparse_categorical_crossentropy',metrics='accuracy')
+    #model.fit(train_data,
+    #          epochs=100,
+    #          workers = 16,
+    #          use_multiprocessing=True)
     test_data=qarac.corpora.Batcher.Batcher(test)
     print(model.evaluate(test_data))
     model.save(filename)
@@ -121,38 +132,45 @@ def train_models(path):
     trainer = qarac.models.QaracTrainerModel.QaracTrainerModel(encoder_base,
                                                                 decoder_base,
                                                                 tokenizer)
-    losses={'encode_decode':keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-            'question_answering':keras.losses.mean_squared_error,
-            'reasoning':keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-            'consistency':keras.losses.mean_squared_error}
-    optimizer = keras.optimizers.Nadam(learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5, 100, 0.99))
-    trainer.compile(optimizer=optimizer,
-                    loss=losses)
+    loss_fn = CombinedLoss()
+    optimizer = torch.optim.NAdam(trainer.parameters(),lr=5.0e-5)
+    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=0.9)
     training_data = qarac.corpora.CombinedCorpus.CombinedCorpus(tokenizer,
                                                                 all_text='corpora/all_text.csv',
                                                                 question_answering='corpora/question_answering.csv',
                                                                 reasoning='corpora/reasoning_train.csv',
                                                                 consistency='corpora/consistency.csv')
-    history = trainer.fit(training_data,
-                          epochs=10)
-    with open('history.json','w') as jsonfile:
-        json.dump(history.history,jsonfile)
+    n_batches = len(training_data)
+    history = []
+    for epoch in range(10):
+        print("Epoch",epoch)
+        epoch_history = []
+        for (batch,(X,Y)) in enumerate(tqdm.tqdm(training_data)):
+            prediction = trainer(X['all_text'],
+                                 X['offset_text'],
+                                 X['question'],
+                                 X['answer'],
+                                 X['proposition0'],
+                                 X['proposition1'],
+                                 X['conclusion_offset'],
+                                 X['statement0'],
+                                 X['statement1'])
+            loss = loss_fn(prediction,Y)
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            if batch % 1024 == 0 or batch == n_batches-1:
+                epoch_history.append({'batch':batch,
+                                      'loss':loss.item()})
+        scheduler.step()
+        history.append(epoch_history)
+    with open('training_history.json','w') as jsonfile:
+        json.dump(history,jsonfile)
     huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
     trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
     trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
     trainer.decoder.push_to_hub('{}/qarac-roberta-decoder'.format(path))
-    with open('model_summaries.txt') as summaries:
-        summaries.write('TRAINER MODEL\n')
-        summaries.write(trainer.summary())
-        summaries.write('QUESTION ENCODER\n')
-        summaries.write(trainer.question_encoder.summary())
-        summaries.write('ANSWER ENCODER\n')
-        summaries.write(trainer.answer_encoder.summary())
-        summaries.write('DECODER\n')
-        summaries.write(trainer.decoder.summary())
-    keras.utils.plot_model(trainer,'trainer_model.png')
-    keras.utils.plot_model(trainer.answer_encoder,'encoder_model.png')
-    keras.utils.plot_model(trainer.decoder,'decoder_model.png')
+
 
 def test_encode_decode(path):
     encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
@@ -173,9 +191,8 @@ def test_encode_decode(path):
         maxlen = max((len(sentence) for sentence in batch))
         for sample in batch:
             sample.pad(maxlen,pad_id=pad_token)
-        input_ids = tensorflow.constant([sample.ids for sample in batch])
-        attention_mask = tensorflow.constant(numpy.notequal(input_ids.numpy(),
-                                                            pad_token).astype(int))
+        input_ids = torch.tensor([sample.ids for sample in batch])
+        attention_mask = torch.not_equal(input_ids,pad_token)
         vectors = encoder(input_ids,
                           attention_mask)
         decoded = decoder.generate(vector=vectors)
@@ -187,9 +204,8 @@ def test_encode_decode(path):
         maxlen = max((len(sentence) for sentence in batch))
         for sample in batch:
             sample.pad(maxlen,pad_id=pad_token)
-        input_ids = tensorflow.constant([sample.ids for sample in batch])
-        attention_mask = tensorflow.constant(numpy.notequal(input_ids.numpy(),
-                                                            pad_token).astype(int))
+        input_ids = torch.tensor([sample.ids for sample in batch])
+        attention_mask = torch.not_equal(input_ids, pad_token)
         vectors = encoder(input_ids,
                           attention_mask)
         decoded = decoder.generate(vector=vectors)
@@ -234,20 +250,20 @@ def test_question_answering(path):
     pad_token = tokenizer.token_to_id('<pad>')
     for question in questions:
         question.pad(maxlen,pad_id=pad_token)
-    question_ids = tensorflow.constant([question.ids
-                                        for question in questions])
-    attention_mask = tensorflow.constant(numpy.not_equal(question_ids.numpy(),
-                                                         pad_token).astype(int))
+    question_ids = torch.tensor([question.ids
+                                 for question in questions])
+    attention_mask = torch.not_equal(question_ids,
+                                     pad_token)
     q_vectors = question_encoder(question_ids,
                                  attention_mask=attention_mask).numpy()
     answers = tokenize(data['Resolved_answer'])
     maxlen = max((len(answer) for answer in answers))
     for answer in answers:
         answer.pad(maxlen,pad_id=pad_token)
-    answer_ids = tensorflow.constant([answer.ids
-                                      for answer in answers])
-    attention_mask = tensorflow.constant(numpy.not_equal(answer_ids.numpy(),
-                                                         pad_token).astype(int))
+    answer_ids = torch.tensor([answer.ids
+                               for answer in answers])
+    attention_mask = torch.not_equal(answer_ids,
+                                     pad_token)
     answer_lookup = scipy.spatial.KDTree(answer_encoder(answer_ids,
                                                         attention_mask=attention_mask).numpy())
     n_correct = 0
@@ -321,15 +337,15 @@ def test_reasoning(path):
         maxlen=max((len(sample for sample in p0_batch)))
         for sample in p0_batch:
             sample.pad(maxlen,pad_token)
-        p0_in = tensorflow.constant([sample.ids for sample in p0.batch])
-        p0_attn = tensorflow.constant(numpy.not_equal(p0_in.numpy(),
-                                                      pad_token).astype(int))
+        p0_in = torch.tensor([sample.ids for sample in p0_batch])
+        p0_attn = torch.not_equal(p0_in,
+                                  pad_token)
         maxlen=max((len(sample for sample in p1_batch)))
         for sample in p1_batch:
             sample.pad(maxlen,pad_token)
-        p1_in = tensorflow.constant([sample.ids for sample in p1.batch])
-        p1_attn = tensorflow.constant(numpy.not_equal(p0_in.numpy(),
-                                                      pad_token).astype(int))
+        p1_in = torch.tensor([sample.ids for sample in p1_batch])
+        p1_attn = torch.not_equal(p1_in,
+                                  pad_token)
         predictions = decoder.generate(vector=(encoder(p0_in,
                                                        attention_mask=p0_attn)
                                                +encoder(p1_in,
@@ -345,15 +361,15 @@ def test_reasoning(path):
         maxlen=max((len(sample for sample in p0_batch)))
         for sample in p0_batch:
             sample.pad(maxlen,pad_token)
-        p0_in = tensorflow.constant([sample.ids for sample in p0.batch])
-        p0_attn = tensorflow.constant(numpy.not_equal(p0_in.numpy(),
-                                                      pad_token).astype(int))
+        p0_in = torch.tensor([sample.ids for sample in p0_batch])
+        p0_attn = torch.not_equal(p0_in,
+                                  pad_token)
         maxlen=max((len(sample for sample in p1_batch)))
         for sample in p1_batch:
             sample.pad(maxlen,pad_token)
-        p1_in = tensorflow.constant([sample.ids for sample in p1.batch])
-        p1_attn = tensorflow.constant(numpy.not_equal(p0_in.numpy(),
-                                                      pad_token).astype(int))
+        p1_in = torch.tensor([sample.ids for sample in p1_batch])
+        p1_attn = torch.not_equal(p1_in,
+                                  pad_token)
         predictions = decoder.generate(vector=(encoder(p0_in,
                                                        attention_mask=p0_attn)
                                                +encoder(p1_in,
@@ -391,24 +407,22 @@ def test_consistency(path):
     maxlen = max((len(sentence for sentence in s0)))
     for sentence in s0:
         sentence.pad(maxlen,pad_id=pad_token)
-    s0_in = tensorflow.constant([sentence.ids for sentence in s0])
-    s0_attn = tensorflow.constant(numpy.not_equal(s0_in.numpy(),
-                                                  pad_token).astype(int))
+    s0_in = torch.tensor([sentence.ids for sentence in s0])
+    s0_attn = torch.not_equal(s0_in,
+                              pad_token)
     maxlen = max((len(sentence for sentence in s1)))
     for sentence in s1:
         sentence.pad(maxlen,pad_id=pad_token)
-    s1_in = tensorflow.constant([sentence.ids for sentence in s1])
-    s1_attn = tensorflow.constant(numpy.not_equal(s1_in.numpy(),
-                                                  pad_token).astype(int))
-    s0_vec = tensorflow.l2_norm(encoder(s0_in,attention_mask=s0_attn),
-                                axis=1)
-    s1_vec = tensorflow.l2_norm(encoder(s1_in,attention_mask=s1_attn),
-                                axis=1)
-    @tensorflow.function
-    def dotprod(vecs):
-        (x,y)=vecs
-        return tensorflow.tensordot(x,y,axes=1)
-    consistency = tensorflow.vectorized_map(dotprod, (s0_vec,s1_vec)).numpy()
+    s1_in = torch.tensor([sentence.ids for sentence in s1])
+    s1_attn = torch.not_equal(s1_in,
+                              pad_token)
+    s0_vec = encoder(s0_in,attention_mask=s0_attn)
+    s0_norm = torch.clamp(torch.linalg.vector_norm(s0_vec,dim=1,keepdim=True),min=EPSILON)
+    s0 = s0_vec/s0_norm
+    s1_vec = encoder(s1_in,attention_mask=s1_attn)
+    s1_norm = torch.clamp(torch.linalg.vector_norm(s1_vec,dim=1,keepdim=True),min=EPSILON)
+    s1 = s1_vec/s1_norm
+    consistency = torch.einsum('ij,ij->i',s0,s1).numpy()
     results = pandas.DataFrame({'label':data['gold_label'],
                                 'score':consistency})
     third = 1.0/3.0
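For reference, below is a minimal, self-contained sketch of the PyTorch training-loop pattern the commit moves to in train_models: an NAdam optimizer, a per-epoch exponential learning-rate decay via torch.optim.lr_scheduler.ExponentialLR, and per-batch backward/step/zero_grad with simple loss bookkeeping. The DummyTrainer module and the random batches are illustrative stand-ins only, not part of the QARAC code; the real script drives QaracTrainerModel over CombinedCorpus batches with the summed CombinedLoss shown in the diff.

    import torch

    # Illustrative stand-in for the trainer model; the real script uses QaracTrainerModel.
    class DummyTrainer(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(8, 4)

        def forward(self, x):
            return self.linear(x)

    model = DummyTrainer()
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.NAdam(model.parameters(), lr=5.0e-5)
    # The decay schedule lives in torch.optim.lr_scheduler.
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    # Random (input, target) pairs standing in for CombinedCorpus batches.
    batches = [(torch.randn(16, 8), torch.randn(16, 4)) for _ in range(10)]

    history = []
    for epoch in range(3):
        epoch_history = []
        for batch, (X, Y) in enumerate(batches):
            loss = loss_fn(model(X), Y)   # forward pass and loss
            loss.backward()               # accumulate gradients
            optimizer.step()              # update parameters
            optimizer.zero_grad()         # clear gradients before the next batch
            epoch_history.append({'batch': batch, 'loss': loss.item()})
        scheduler.step()                  # decay the learning rate once per epoch
        history.append(epoch_history)

Recording the loss periodically and stepping the scheduler once per epoch mirrors the history list and scheduler.step() bookkeeping that replaces Keras's compile/fit calls in scripts.py.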