# Source: open-gpt-3.5-detector / convert_into_distilbert_dataset.py
# Uploaded by aaronday3 ("Upload 2 files", commit 0db33af, verified)
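# The purpose of this file is to take the given texts and write them out as
# labelled rows: AI-written texts get label 1 and human-written texts get label 0.
# It can also split every text word by word (this streaming mode is currently
# disabled) so that searching can start before the text has finished streaming.
# For example, "The dog walked over the pavement." would be turned into:
# The
# The dog
# The dog walked
# The dog walked over
# The dog walked over the
# The dog walked over the pavement   (only if punctuation splitting is re-enabled)
# The dog walked over the pavement.
# Example data row:
# {"query": "Write a story about dogs", "pos": ["lorem ipsum..."], "neg": ["lorem ipsum..."]}
# NOTE(review): the original wording ("AI ones into negative and human ones into
# positive") and the query/pos/neg row above describe a layout this script does
# not write; the rows it emits look like {"text": "...", "label": 0} -- confirm
# whether this example is stale.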
import re
import ujson as json
import random
from tqdm import tqdm
def split_string(text: str) -> list:
    """Split *text* into whitespace-separated tokens.

    Punctuation is NOT separated: it stays attached to the word it touches,
    so ``"pavement."`` is returned as a single token.

    Args:
        text: The text to tokenize.

    Returns:
        The tokens in their original order. Runs of whitespace count as one
        separator and leading/trailing whitespace is ignored, so an empty or
        all-whitespace text yields an empty list.
    """
    # A second pass that split each word further into word / punctuation
    # pieces (regex: \w+|[^\w\s]) used to live here; it was disabled because
    # of issues, so only whitespace splitting is performed for now.
    return text.split()
# Load the source records, keeping only those that contain a human-written
# story ("story_human"); the rest are skipped.
reddit_vs_synth_writing_prompts = []
with open(
    "writing_prompts/reddit_vs_synth_writing_prompts.jsonl", "r", encoding="utf-8"
) as f:
    # Iterate the file object instead of read().splitlines(): it avoids
    # holding the whole file in memory, and it only breaks on real newlines.
    # str.splitlines() also breaks on characters such as U+2028/U+2029, which
    # may legally appear unescaped inside a JSON string and would cut a
    # record in half.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines rather than crashing in json.loads.
            continue
        loaded_object = json.loads(line)
        if "story_human" not in loaded_object:  # Remove ones where we don't have human data
            continue
        reddit_vs_synth_writing_prompts.append(loaded_object)

# Buffer of output rows that have not been written to disk yet.
dataset_entries = []
# Path of the JSON Lines file the labelled rows are written to.
SAVE_FILE_NAME = "bert_reddit_vs_synth_writing_prompts.jsonl"
def add_streamed_data(data, label=None):
    """Expand one text into a row for every word-by-word prefix of it.

    The text is tokenized with ``split_string`` and each prefix of the token
    list is re-joined with single spaces, which imitates a text that is still
    being streamed in.

    Args:
        data: The full text to expand.
        label: Label stored on every produced row. When omitted, the
            module-level ``HUMAN_LABEL`` is used, which is what this function
            always did before the parameter existed. Pass the AI label for
            AI-written texts so they are not recorded as human.

    Returns:
        A list of ``{"text": prefix, "label": label}`` dicts, shortest prefix
        first; an empty list when the text contains no tokens.
    """
    if label is None:
        # Resolved at call time: HUMAN_LABEL is assigned further down the file.
        label = HUMAN_LABEL
    data_parts = split_string(data)
    # `end` is exclusive in the slice, so it runs from 1 to len(data_parts).
    return [
        {"text": " ".join(data_parts[:end]), "label": label}
        for end in range(1, len(data_parts) + 1)
    ]
# Label convention: 1 means AI generated, 0 means human.
# Rows written to the output look like:
#   {"text": "AI-generated text example 1", "label": 1}
HUMAN_LABEL = 0
AI_LABEL = 1

# The buffered rows are flushed to disk every time this many input records
# have been counted, so the dataset is written part by part.
NUM_OF_TURNS_TO_DUMP = 200

# When True, every story is expanded into word-by-word prefixes through
# add_streamed_data(); currently disabled, so each story becomes one row.
STREAMING_ENABLED = False


def _append_entries_to_file(entries):
    """Append *entries* to SAVE_FILE_NAME, one JSON object per line.

    Does nothing for an empty list, so no blank line ends up in the file.
    """
    if not entries:
        return
    dumped_string = "\n".join(json.dumps(entry) for entry in entries) + "\n"
    with open(SAVE_FILE_NAME, "a", encoding="utf-8") as f:
        f.write(dumped_string)


# Opening in "w" mode truncates the output file, so the appends below start
# from an empty file even when a previous run left data behind.
with open(SAVE_FILE_NAME, "w", encoding="utf-8"):
    pass

i = 0
for data in tqdm(reddit_vs_synth_writing_prompts):
    i += 1
    # Below is to enable writing dataset part by part
    if i == NUM_OF_TURNS_TO_DUMP:
        i = 0
        _append_entries_to_file(dataset_entries)
        dataset_entries = []

    # AI-written stories present on this record, in the order they are emitted.
    ai_stories = [
        data[key] for key in ("story_opus", "story_gpt_3_5") if data.get(key)
    ]

    if STREAMING_ENABLED:
        # Add streamed data
        dataset_entries.extend(add_streamed_data(data["story_human"]))
        for story in ai_stories:
            # Stamp the AI label explicitly on every prefix row so AI text is
            # never written with the human label.
            dataset_entries.extend(
                {**entry, "label": AI_LABEL} for entry in add_streamed_data(story)
            )
    else:
        # Add without streaming
        dataset_entries.append({"text": data["story_human"], "label": HUMAN_LABEL})
        dataset_entries.extend(
            {"text": story, "label": AI_LABEL} for story in ai_stories
        )

# Dump as JSONL: write whatever is still buffered after the last record.
_append_entries_to_file(dataset_entries)