class CustomDataset:
    """Index-addressable view over instruction/context/response records.

    ``data`` must support ``len()`` and integer indexing, and every item must
    be subscriptable by the keys ``'instruction'``, ``'context'`` and
    ``'response'``. Any other keys on an item are dropped by ``__getitem__``.
    """

    def __init__(self, data):
        """Store ``data`` and record its length as ``num_rows``."""
        self.features = ['instruction', 'context', 'response']
        self.num_rows = len(data)
        self.data = data

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.num_rows

    def __getitem__(self, idx):
        """Return a new dict holding the three feature fields of row ``idx``.

        Raises:
            IndexError: if ``idx`` is negative or not below ``num_rows``.
                Negative (from-the-end) indexing is intentionally rejected.
        """
        if idx < 0 or idx >= self.num_rows:
            raise IndexError("Index out of range")
        record = self.data[idx]
        return {
            'instruction': record['instruction'],
            'context': record['context'],
            'response': record['response'],
        }

    def __repr__(self):
        """Return a summary string listing the features and the row count."""
        return f"Dataset({{'features': {self.features}, 'num_rows': {self.num_rows}}})"


def format_data(sample):
    """Build the ``[INST] ... [/INST] ...`` prompt string for one sample.

    Args:
        sample: mapping with ``'instruction'``, ``'context'`` and
            ``'response'`` entries. An empty or ``None`` context is omitted
            from the prompt.

    Returns:
        The instruction, optional context and response concatenated into a
        single string.
    """
    instruction = f"[INST] {sample['instruction']}"
    # Truthiness rather than len(): a None context is treated like an empty
    # one instead of raising TypeError.
    context = f"Here's some context: {sample['context']}" if sample["context"] else None
    response = f" [/INST] {sample['response']}"
    # NOTE(review): the parts are joined with no separator, so the context
    # text follows the instruction text directly -- confirm this is intended.
    return "".join(part for part in (instruction, context, response) if part is not None)


def template_dataset(sample, tokenizer):
    """Add a ``'text'`` entry (formatted prompt + EOS token) to ``sample``.

    Args:
        sample: mutable mapping accepted by ``format_data``; it is modified
            in place.
        tokenizer: object exposing an ``eos_token`` attribute.

    Returns:
        The same ``sample`` object, now carrying the ``'text'`` key.
    """
    eos_token = tokenizer.eos_token
    # A tokenizer without an EOS token reports None; interpolating that
    # directly would append the literal text "None" to every sample.
    suffix = "" if eos_token is None else eos_token
    sample["text"] = f"{format_data(sample)}{suffix}"
    return sample