File size: 1,139 Bytes
d8ffdc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
class CustomDataset:
    """Read-only, index-based view over instruction/context/response records.

    Wraps an indexable collection whose items support key lookup for
    'instruction', 'context' and 'response'. Any other keys on a record
    are dropped when it is fetched through this class.

    Attributes are set once in ``__init__``:
        features: list of the three field names exposed per record.
        num_rows: ``len(data)`` captured at construction time; later
            changes to the size of ``data`` are not reflected.
        data: the wrapped collection itself (not copied).
    """

    def __init__(self, data):
        """Store ``data`` and record its length.

        Args:
            data: Collection supporting ``len()`` and integer indexing.
        """
        self.features = ['instruction', 'context', 'response']
        self.num_rows = len(data)
        self.data = data

    def __len__(self):
        """Return the number of rows recorded at construction time."""
        # Lets ``len(ds)`` work, so consumers that size the dataset before
        # indexing it do not fail with TypeError.
        return self.num_rows

    def __getitem__(self, idx):
        """Return the record at ``idx`` as a new three-key dict.

        Args:
            idx: Zero-based row position. Negative positions are rejected
                rather than being counted from the end.

        Returns:
            A dict with exactly the 'instruction', 'context' and
            'response' entries of the underlying record.

        Raises:
            IndexError: If ``idx`` is negative or not below ``num_rows``.
        """
        if idx < 0 or idx >= self.num_rows:
            raise IndexError("Index out of range")
        row = self.data[idx]
        return {
            'instruction': row['instruction'],
            'context': row['context'],
            'response': row['response'],
        }

    def __repr__(self):
        """Return a summary string listing the features and the row count."""
        # The label is the literal "Dataset", not this class's own name.
        return f"Dataset({{'features': {self.features}, 'num_rows': {self.num_rows}}})"


def format_data(sample):
    """Build the ``<s>[INST] ... [/INST] ...`` prompt string for one record.

    Args:
        sample: Mapping with 'instruction', 'context' and 'response' keys.
            'context' may be empty or ``None``; in either case the context
            segment is left out of the prompt.

    Returns:
        The instruction, optional context and response segments
        concatenated into a single string.

    Raises:
        KeyError: If any of the three keys is missing from ``sample``.
    """
    parts = [f"<s>[INST] {sample['instruction']}"]

    context = sample["context"]
    # A None context means "no context"; calling len() on it would raise
    # TypeError, so it is checked first.
    if context is not None and len(context) > 0:
        # NOTE(review): the segments are joined with no separator, so the
        # instruction text runs straight into "Here's some context:". Kept
        # as-is so the prompt format stays unchanged - confirm it is intended.
        parts.append(f"Here's some context: {context}")

    parts.append(f" [/INST] {sample['response']}")
    return "".join(parts)

# template dataset to add prompt to each sample
def template_dataset(sample, tokenizer):
    """Attach a ``"text"`` entry holding the formatted prompt plus EOS token.

    Args:
        sample: Record passed to ``format_data``; it is modified in place.
        tokenizer: Object exposing an ``eos_token`` attribute.

    Returns:
        The same ``sample`` object, now carrying the ``"text"`` key.
    """
    prompt = format_data(sample)
    eos = tokenizer.eos_token
    sample["text"] = f"{prompt}{eos}"
    return sample