# Uploaded by tosi-n7 — "Upload folder using huggingface_hub" (commit d8ffdc4)
class CustomDataset:
    """Read-only view over instruction/context/response records.

    Wraps an indexable collection of mapping-like rows and exposes only the
    three fields listed in ``features``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, data):
        """Wrap ``data`` and record its size.

        Args:
            data: Indexable, sized collection whose items can be subscripted
                with ``'instruction'``, ``'context'`` and ``'response'``.
        """
        # Names of the fields returned by __getitem__.
        self.features = ['instruction', 'context', 'response']
        # Row count captured at construction; later changes to `data` are not tracked.
        self.num_rows = len(data)
        # Underlying row storage, kept by reference (not copied).
        self.data = data

    def __len__(self) -> int:
        """Return the number of rows recorded at construction time."""
        return self.num_rows

    def __getitem__(self, idx):
        """Return the row at ``idx`` as a dict of the three feature fields.

        Args:
            idx: Zero-based row position. Negative positions are rejected
                rather than counted from the end.

        Returns:
            A new dict with the ``'instruction'``, ``'context'`` and
            ``'response'`` values of the selected row.

        Raises:
            IndexError: If ``idx`` is negative or not below ``num_rows``.
        """
        if idx < 0 or idx >= self.num_rows:
            raise IndexError("Index out of range")
        # Fetch the row once; indexing the backing store may be costly.
        row = self.data[idx]
        return {
            'instruction': row['instruction'],
            'context': row['context'],
            'response': row['response'],
        }

    def __repr__(self) -> str:
        """Return a summary showing the feature names and the row count."""
        return f"Dataset({{'features': {self.features}, 'num_rows': {self.num_rows}}})"
def format_data(sample):
    """Render one sample as a single ``<s>[INST] ... [/INST] ...`` prompt string.

    Args:
        sample: Mapping with ``'instruction'``, ``'context'`` and
            ``'response'`` entries. ``'context'`` may be empty or ``None``,
            in which case the context clause is left out.

    Returns:
        The instruction part, the optional context clause and the response
        part concatenated into one string.
    """
    parts = [f"<s>[INST] {sample['instruction']}"]
    context = sample["context"]
    # A None context is skipped like an empty one instead of failing in len().
    if context is not None and len(context) > 0:
        # NOTE(review): parts are joined with no separator, so this clause
        # follows the instruction text directly. Left unchanged because it is
        # part of the prompt format — confirm whether a space was intended.
        parts.append(f"Here's some context: {context}")
    parts.append(f" [/INST] {sample['response']}")
    return "".join(parts)
# Template a dataset sample: add the formatted prompt to each sample.
def template_dataset(sample, tokenizer):
    """Store the formatted prompt plus the EOS token under ``sample["text"]``.

    Args:
        sample: Mutable mapping accepted by ``format_data``; it is modified
            in place.
        tokenizer: Object exposing an ``eos_token`` attribute.

    Returns:
        The same ``sample`` object, with its ``"text"`` entry set.

    Raises:
        ValueError: If ``tokenizer.eos_token`` is ``None``.
    """
    eos_token = tokenizer.eos_token
    if eos_token is None:
        # Without this guard the f-string below would append the literal
        # text "None" to every sample.
        raise ValueError("tokenizer.eos_token is None; cannot terminate the prompt")
    sample["text"] = f"{format_data(sample)}{eos_token}"
    return sample