Spaces:
Configuration error
Configuration error
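"""
Convert alpaca dataset into sharegpt format.

With a --data_type other than 'alpaca', the input is treated as an existing
sharegpt dataset: an `items` column is renamed to `conversations` and every
other column is dropped.

Usage: python convert_alpaca.py --in_file alpaca_data.json --out_file alpaca_data_sharegpt.json
"""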
import argparse | |
from datasets import load_dataset | |
def process_alpaca(examples):
    """Convert a batch of alpaca records into sharegpt-style conversations.

    Args:
        examples: Column-oriented batch (the shape ``Dataset.map`` passes
            with ``batched=True``) holding parallel ``instruction``,
            ``input`` and ``output`` lists.

    Returns:
        ``{"conversations": [...]}`` with one two-turn conversation per
        record: a ``human`` turn carrying the prompt and a ``gpt`` turn
        carrying the answer.
    """
    convs = []
    for instruction, inp, output in zip(
        examples["instruction"], examples["input"], examples["output"]
    ):
        # Any non-blank input is context for the instruction. The earlier
        # `len(inp.strip()) > 1` test silently dropped one-character inputs
        # (e.g. "5"), and a null input crashed on `.strip()`.
        if inp is not None and inp.strip():
            instruction = instruction + "\n\n" + inp
        convs.append(
            [
                {"from": "human", "value": instruction},
                {"from": "gpt", "value": output},
            ]
        )
    return {"conversations": convs}


def _keep_conversations_only(ds):
    """Reduce an existing sharegpt dataset to a single ``conversations`` column.

    An ``items`` column, if present, is renamed to ``conversations`` first.

    Raises:
        ValueError: if the dataset ends up without a ``conversations`` column.
    """
    if "items" in ds.column_names:
        # `Dataset` has no `rename(columns=...)`; `rename_column` is the API.
        ds = ds.rename_column("items", "conversations")
    if "conversations" not in ds.column_names:
        raise ValueError(
            "expected a 'conversations' (or 'items') column, "
            f"found: {ds.column_names}"
        )
    unused = [name for name in ds.column_names if name != "conversations"]
    return ds.remove_columns(unused)


def main():
    """Parse the command line, convert the dataset and write it as JSON lines."""
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them loading/writing fails obscurely.
    parser.add_argument("--in_file", type=str, required=True)
    parser.add_argument("--out_file", type=str, required=True)
    parser.add_argument("--data_type", type=str, default='alpaca')
    args = parser.parse_args()
    print(args)

    raw_datasets = load_dataset('json', data_files={"train": args.in_file})
    ds = raw_datasets['train']

    if args.data_type in ['alpaca']:
        ds = ds.map(
            process_alpaca,
            batched=True,
            remove_columns=ds.column_names,
            desc="Running process",
        )
    else:
        # Other sharegpt dataset: rename to conversations, drop unused columns.
        ds = _keep_conversations_only(ds)

    ds.to_json(args.out_file, lines=True, force_ascii=False)


if __name__ == "__main__":
    main()