Spaces:
Configuration error
Configuration error
File size: 1,659 Bytes
b78b52f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
"""
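Convert an Alpaca-format dataset into ShareGPT format (with --data_type set to anything other than 'alpaca', an existing ShareGPT-style dataset is reduced to its `conversations` column).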
Usage: python convert_alpaca.py --in_file alpaca_data.json --out_file alpaca_data_sharegpt.json
"""
import argparse
from datasets import load_dataset
def process_alpaca(examples):
    """Convert a batch of Alpaca records into ShareGPT-style conversations.

    Args:
        examples: Column-oriented batch (the mapping handed over by
            ``Dataset.map(..., batched=True)``) with parallel ``instruction``,
            ``input`` and ``output`` sequences.

    Returns:
        A dict with a single ``conversations`` key. Each entry is a two-turn
        list: a ``human`` turn holding the prompt and a ``gpt`` turn holding
        the answer.
    """
    convs = []
    for instruction, inp, output in zip(
        examples["instruction"], examples["input"], examples["output"]
    ):
        # Any non-blank input is appended to the prompt, including
        # single-character inputs; a missing (None) input is treated as empty.
        if inp and inp.strip():
            instruction = instruction + "\n\n" + inp
        convs.append(
            [
                {"from": "human", "value": instruction},
                {"from": "gpt", "value": output},
            ]
        )
    return {"conversations": convs}


def main():
    """Parse CLI arguments, convert the input dataset and write it as JSON lines.

    Raises:
        ValueError: if ``--data_type`` is not ``alpaca`` and the dataset has
            neither a ``conversations`` nor an ``items`` column.
    """
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them the load/write steps would
    # receive None and fail (or write to a file literally named "None").
    parser.add_argument("--in_file", type=str, required=True)
    parser.add_argument("--out_file", type=str, required=True)
    parser.add_argument("--data_type", type=str, default="alpaca")
    args = parser.parse_args()
    print(args)

    raw_datasets = load_dataset("json", data_files={"train": args.in_file})
    ds = raw_datasets["train"]

    if args.data_type == "alpaca":
        ds = ds.map(
            process_alpaca,
            batched=True,
            remove_columns=ds.column_names,
            desc="Running process",
        )
    else:
        # Other sharegpt dataset, need rename to conversations and remove unused columns
        if "items" in ds.column_names:
            # Dataset exposes rename_column(old, new); it has no rename() method.
            ds = ds.rename_column("items", "conversations")
        if "conversations" not in ds.column_names:
            raise ValueError(
                "Expected a 'conversations' (or 'items') column, "
                f"found: {ds.column_names}"
            )
        columns_to_remove = [
            name for name in ds.column_names if name != "conversations"
        ]
        ds = ds.remove_columns(columns_to_remove)

    ds.to_json(args.out_file, lines=True, force_ascii=False)


if __name__ == "__main__":
    main()
|