# -*- coding: utf-8 -*-

"""
@Author             : Bao
@Date               : 2020/10/13
@Desc               : Generate questions from (context, answer span) pairs with a highlight-format seq2seq model
@Last modified by   : Bao
@Last modified date : 2020/11/11
"""

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


DEFAULT_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


class Generator:
    """
    Examples:
        import json
        import torch

        input_data = [
            {'context': 'My name is Sarah.', 'answer': ('Sarah', 11, 16)},
            {'context': 'My name is Sarah and I live in London.', 'answer': ('London', 31, 37)},
            {'context': 'Sarah lived in London. John lived in Canada.', 'answer': ('Canada', 37, 43)},
            {'context': 'Sarah lived in London. John lived in Canada.', 'answer': ('lived', 28, 33)},
        ]
        generator = Generator(
            'valhalla/t5-base-qg-hl',
            'your_cache_dir',
            'cuda' if torch.cuda.is_available() else 'cpu',
        )

        results = generator(input_data, beam_size=5)
        print(json.dumps(results, ensure_ascii=False, indent=4))
    """

    def __init__(self, model_name_or_path, cache_dir=None, device=DEFAULT_DEVICE, verbose=True):
        self.seed = 1111
        self.device = device
        self.verbose = verbose
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        # Seed both CPU and GPU RNGs so any sampling-based decoding is repeatable.
        torch.manual_seed(self.seed)
        n_gpu = torch.cuda.device_count()
        if n_gpu > 0:
            torch.cuda.manual_seed_all(self.seed)
        self.model.to(device)
        # if n_gpu > 1:
        #     self.model = torch.nn.DataParallel(self.model)
        self.model.eval()

    def __call__(self, input_data, beam_size=1, max_length=100, batch_size=8):
        all_ids_with_beam = []
        num_batches = (len(input_data) + batch_size - 1) // batch_size
        batch_iter = tqdm(range(num_batches), desc='Generate questions') if self.verbose else range(num_batches)
        for step in batch_iter:
            batch_start = step * batch_size
            batch_end = min((step + 1) * batch_size, len(input_data))

            # Build the highlight-format inputs expected by the *-qg-hl checkpoints:
            # wrap the answer span in <hl> tokens and prepend the task prefix.
            batch_text = []
            for entry in input_data[batch_start:batch_end]:
                context = entry['context']
                answer, answer_start, answer_end = entry['answer']
                context = 'generate question: ' + context[:answer_start] + \
                          '<hl> ' + answer + ' <hl>' + context[answer_end:] + ' </s>'
                batch_text.append(context)
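                # e.g. the first example from the docstring becomes:
                # 'generate question: My name is <hl> Sarah <hl>. </s>'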

            inputs = self.tokenizer(
                batch_text,
                padding='max_length',
                truncation='longest_first',
                max_length=max_length,
                return_tensors='pt',
            )

            for key, value in inputs.items():
                inputs[key] = value.to(self.device)

            # Beam-search decode without building an autograd graph; generate()
            # returns (len(batch_text) * beam_size, seq_len) token ids.
            with torch.no_grad():
                ids_with_beam = self.model.generate(num_beams=beam_size,
                                                    num_return_sequences=beam_size,
                                                    no_repeat_ngram_size=3,
                                                    early_stopping=True,
                                                    length_penalty=1.5,
                                                    repetition_penalty=1.5,
                                                    min_length=3,
                                                    **inputs)
            # Regroup the flat output into beam_size candidates per input.
            ids_with_beam = ids_with_beam.reshape([len(batch_text), beam_size, -1])
            all_ids_with_beam.extend(ids_with_beam.detach().cpu().tolist())

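        # Decode each beam candidate back to text; note that input_data is
        # mutated in place and also returned for convenience.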
        for i, ids_with_beam in enumerate(all_ids_with_beam):
            input_data[i]['questions'] = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in ids_with_beam]

        return input_data


if __name__ == '__main__':
    import json

    input_data = [
        {'context': 'My name is Sarah.', 'answer': ('Sarah', 11, 16)},
        {'context': 'My name is Sarah and I live in London.', 'answer': ('London', 31, 37)},
        {'context': 'Sarah lived in London. John lived in Canada.', 'answer': ('Canada', 37, 43)},
        {'context': 'Sarah lived in London. John lived in Canada.', 'answer': ('lived', 28, 33)},
    ]

    generator = Generator(
        'valhalla/t5-base-qg-hl',
        'cache/',
        'cuda' if torch.cuda.is_available() else 'cpu',
    )

    results = generator(input_data, beam_size=5)

    print(json.dumps(results, ensure_ascii=False, indent=4))
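
    # `results` is `input_data` with a 'questions' list of beam_size candidates
    # added to each entry; json.dumps renders the answer tuples as lists, e.g.:
    # {
    #     "context": "My name is Sarah.",
    #     "answer": ["Sarah", 11, 16],
    #     "questions": ["<candidate 1>", "...", "<candidate 5>"]
    # }
    # (Actual question strings depend on the checkpoint version.)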