Spaces:
No application file
No application file
class CustomDataset:
    """Read-only, index-addressable view over instruction-tuning records.

    ``data`` must support ``len()`` and integer indexing, and each item must
    support lookup by the keys ``'instruction'``, ``'context'`` and
    ``'response'``. Any other keys on an item are ignored.
    """

    def __init__(self, data):
        """Keep a reference to ``data`` and record its length.

        The row count is captured once here; later changes to the length of
        ``data`` are not reflected in ``num_rows``.
        """
        self.features = ['instruction', 'context', 'response']
        self.num_rows = len(data)
        self.data = data

    def __len__(self):
        """Return the number of rows recorded at construction time."""
        return self.num_rows

    def __getitem__(self, idx):
        """Return row ``idx`` as a new dict holding only the three features.

        Raises:
            IndexError: if ``idx`` is negative or not below ``num_rows``.
                Negative (from-the-end) indexing is deliberately not
                supported.
        """
        if idx < 0 or idx >= self.num_rows:
            raise IndexError("Index out of range")
        # Fetch the row once; the underlying container may be costly to index.
        row = self.data[idx]
        return {
            'instruction': row['instruction'],
            'context': row['context'],
            'response': row['response'],
        }

    def __repr__(self):
        """Return a summary string listing the features and the row count."""
        return f"Dataset({{'features': {self.features}, 'num_rows': {self.num_rows}}})"
def format_data(sample):
    """Render one record as a single ``<s>[INST] ... [/INST] ...`` string.

    Args:
        sample: mapping with ``'instruction'`` and ``'response'`` entries and
            an optional ``'context'`` entry.

    Returns:
        The instruction, then the context sentence (only when the context is
        present and non-empty), then the response, as one string.
    """
    pieces = [f"<s>[INST] {sample['instruction']}"]

    # A missing or None context is treated the same as an empty one.
    try:
        context = sample["context"]
    except KeyError:
        context = None

    if context:
        # Leading space keeps the context sentence from being fused onto the
        # last word of the instruction.
        pieces.append(f" Here's some context: {context}")

    pieces.append(f" [/INST] {sample['response']}")
    return "".join(pieces)
# Template a dataset sample: attach the full prompt text to it.
def template_dataset(sample, tokenizer):
    """Store the formatted prompt plus EOS marker under ``sample["text"]``.

    Args:
        sample: mutable mapping accepted by ``format_data``; it is modified
            in place.
        tokenizer: object exposing an ``eos_token`` attribute.

    Returns:
        The same ``sample`` object, now carrying a ``"text"`` entry.
    """
    eos = tokenizer.eos_token
    if eos is None:
        # An unset EOS token would otherwise be interpolated as the literal
        # string "None" at the end of every sample.
        eos = ""
    sample["text"] = f"{format_data(sample)}{eos}"
    return sample