MedicalGPT-main / convert_dataset.py
nengrenjie83's picture
Upload 28 files
b78b52f
"""
Convert alpaca dataset into sharegpt format.
Usage: python convert_dataset.py --in_file alpaca_data.json --out_file alpaca_data_sharegpt.json --data_type alpaca
"""
import argparse
from datasets import load_dataset
def process_alpaca(examples):
    """Convert a batch of alpaca records into sharegpt-style conversations.

    Args:
        examples: Column-oriented batch (the shape ``Dataset.map`` passes
            with ``batched=True``) holding parallel ``instruction``,
            ``input`` and ``output`` sequences.

    Returns:
        A dict with a single ``conversations`` column. Each entry is a
        two-turn list: the human prompt followed by the gpt answer.
    """
    convs = []
    for instruction, inp, output in zip(
        examples['instruction'], examples['input'], examples['output']
    ):
        # Append every non-blank input to the prompt. A one-character input
        # (e.g. "5") is still meaningful, and a null input is treated as empty
        # instead of crashing on ``None.strip()``.
        if inp is not None and inp.strip():
            instruction = instruction + '\n\n' + inp
        convs.append([
            {"from": "human", "value": instruction},
            {"from": "gpt", "value": output},
        ])
    return {"conversations": convs}


def main():
    """Parse CLI arguments, convert the input JSON dataset and write JSON lines.

    With ``--data_type alpaca`` (the default) every record is rewritten by
    ``process_alpaca``. Any other value treats the input as already being in
    sharegpt form: an ``items`` column is renamed to ``conversations`` and
    every other column is dropped.

    Raises:
        ValueError: If a non-alpaca dataset has no ``conversations`` column
            (after the optional ``items`` rename).
    """
    parser = argparse.ArgumentParser()
    # Both paths are mandatory; fail in argparse with a clear message rather
    # than deep inside the dataset loader / writer with a None path.
    parser.add_argument("--in_file", type=str, required=True)
    parser.add_argument("--out_file", type=str, required=True)
    parser.add_argument("--data_type", type=str, default='alpaca')
    args = parser.parse_args()
    print(args)

    data_files = {"train": args.in_file}
    raw_datasets = load_dataset('json', data_files=data_files)
    ds = raw_datasets['train']

    if args.data_type in ['alpaca']:
        ds = ds.map(
            process_alpaca,
            batched=True,
            remove_columns=ds.column_names,
            desc="Running process",
        )
    else:
        # Other sharegpt dataset, need rename to conversations and remove unused columns
        if "items" in ds.column_names:
            # Dataset exposes rename_column(old, new); it has no rename().
            ds = ds.rename_column("items", "conversations")
        if "conversations" not in ds.column_names:
            raise ValueError(
                "Expected a 'conversations' (or 'items') column, "
                f"found: {ds.column_names}"
            )
        columns_to_remove = [
            name for name in ds.column_names if name != 'conversations'
        ]
        ds = ds.remove_columns(columns_to_remove)

    ds.to_json(args.out_file, lines=True, force_ascii=False)


if __name__ == "__main__":
    main()