from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
import re
datapath = '/cluster/work/lawecon/Work/penghao/dataset/stories/'
pairpath = '../../../work/lawecon/Work/penghao/pairs.csv'
# time_lag appears to be measured in seconds (3600 = one hour)
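# Expected inputs (see the loaders below):
#   datapath  - folder of story CSVs with columns such as prompt_id, prompt,
#               story_id, story_title, story_text, genre, likes, posted_date
#   pairpath  - pairs.csv with columns prompt_id, story1_id, story2_id,
#               rel, time_lag, least_likes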
class StoryPairDataset:
    # builder: loads stories and preference pairs and exposes the result
    # as a DatasetDict via self.dataset (it is used as a plain container,
    # not as a datasets.Dataset)
    def __init__(self, datapath, pairpath, tokenizer, task, used_dataset_size=-1,
                 train_test_split=0.1, split_by='random', max_len=4096 * 2,
                 mode='m3', max_time_window=3000, least_likes=5, margin=True):
self.datapath = datapath
print(self.datapath)
self.train_test_split = train_test_split
self.pairpath = pairpath
self.tokenizer = tokenizer
self.max_len = max_len
self.split_by = split_by
self.least_likes = least_likes
        self.used_dataset_size = used_dataset_size
        # mode 'm2' uses a much larger time window (12009600 s = 139 days);
        # all other modes keep the window passed by the caller
        if mode == 'm2':
            self.max_time_window = 12009600
        else:
            self.max_time_window = max_time_window
self.pair = self.load_pair()
self.task = task
self.margin = margin
self.stories = self.load_stories(self.datapath)
print(self.stories.columns)
print(len(self.stories))
        self.train, self.test = self.train_test_split__()
        self.train = self.marginInclude(self.train)
        self.test = self.marginInclude(self.test)
        # assemble train and test into a single DatasetDict
        self.dataset = self.make_dataset()
        print('current setting mode is ', mode)
        print('current setting split_by is ', split_by)
        print('current setting least_likes is ', least_likes)
    def load_stories(self, path):
        # read every CSV under path and concatenate into one DataFrame;
        # the result contains the columns
        # ['prompt_id', 'prompt', 'story_id', 'story_title', 'story_author',
        #  'story_url', 'link', 'genre', 'is_sensitive', 'categories', 'likes',
        #  'story_text', 'posted_date', 'comments']
        frames = []
        for file in glob.glob(path + '*.csv'):
            try:
                df = pd.read_csv(file)
                if df.empty:
                    print(f"Warning: {file} is empty or not readable.")
                    continue
                frames.append(df)
            except pd.errors.EmptyDataError:
                pass  # silently skip files with no data at all
            except pd.errors.ParserError:
                print(f"Error: {file} cannot be parsed.")
            except Exception as e:
                print(f"Error: An unexpected error occurred while processing {file}. Details: {str(e)}")
        stories = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        return stories
    def load_pair(self):
        pair = pd.read_csv(self.pairpath)
        # contains the columns prompt_id, story1_id, story2_id, rel, time_lag, least_likes
        pair = pair[pair['time_lag'] <= self.max_time_window]
        print('the max time lag is ', pair['time_lag'].max())
        pair = pair[pair['least_likes'] >= self.least_likes]
        # swap story1 and story2 where rel is negative, so rel is always positive
        pair.loc[pair['rel'] < 0, ['story1_id', 'story2_id']] = pair.loc[
            pair['rel'] < 0, ['story2_id', 'story1_id']].values
        pair['rel'] = pair['rel'].abs()
        # drop pairs that compare a story with itself
        pair = pair[pair['story1_id'] != pair['story2_id']]
        if self.used_dataset_size == -1:
            self.used_dataset_size = len(pair)
        else:
            pair = pair.sample(n=self.used_dataset_size)
        print('the total number of pairs is ', len(pair))
        # remove duplicate pairs
        pair = pair.drop_duplicates(subset=['story1_id', 'story2_id'])
        # remove ties (rel == 0)
        pair = pair[pair['rel'] != 0]
        print('the number of effective pairs is ', len(pair))
        return pair
    def marginInclude(self, df):
        # make_dataset expects a 'margin' column whenever self.margin is True,
        # so rename rel to margin in that case and drop it otherwise
        if self.margin:
            df = df.rename(columns={'rel': 'margin'})
        else:
            df = df.drop(columns=['rel'])
        return df
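    # note: keeping the preference strength ('rel') as a 'margin' column is an
    # assumption about the downstream consumer; reward-model trainers such as
    # TRL's RewardTrainer can use a margin field to scale the ranking loss,
    # but nothing in this class enforces that usage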
    def train_test_split__(self):
        '''
        Split the pairs into train and test sets according to self.split_by.
        :return: (train, test) DataFrames
        '''
        if self.split_by == 'time':
            # give every story a comparable YYYYMMDD date string,
            # e.g. 2021-04-27 23:29:00 -> 20210427
            self.stories['posted_date'] = pd.to_datetime(self.stories['posted_date'])
            self.stories['posted_date'] = self.stories['posted_date'].dt.strftime('%Y%m%d')
            # look up each pair's date once via story1_id;
            # pairs whose first story was posted after 2022 form the test set
            date_by_story = self.stories.drop_duplicates('story_id').set_index('story_id')['posted_date']
            pair_dates = self.pair['story1_id'].map(date_by_story).astype(int)
            test = self.pair[pair_dates > 20220000]
            train = self.pair[pair_dates <= 20220000]
            print('the number of test set is ', len(test))
            print('the number of train set is ', len(train))
            print('the ratio of test set is ', len(test) / (len(test) + len(train)))
        elif self.split_by == 'random':
            train, test = train_test_split(self.pair, test_size=self.train_test_split)
        elif self.split_by == 'genre':
            # label each pair with the genre of story1
            self.pair['genre'] = self.pair['story1_id'].apply(
                lambda x: self.stories[self.stories['story_id'] == x]['genre'].values[0])
            # count pairs per genre, sorted from most to fewest pairs
            genre = {}
            for c in self.pair['genre'].unique():
                genre[c] = len(self.pair[self.pair['genre'] == c])
            genre = dict(sorted(genre.items(), key=lambda item: item[1], reverse=True))
            print(genre)
            total = sum(genre.values())
            # greedily take whole genres until the test set is close to the
            # configured test fraction of all pairs
            test_genre = []
            test_count = 0
            while genre and test_count < total * self.train_test_split:
                g = next(iter(genre))
                test_genre.append(g)
                test_count += genre.pop(g)
                # stop early if adding the next genre would overshoot the target
                if genre and test_count + genre[next(iter(genre))] > total * self.train_test_split:
                    break
            test = self.pair[self.pair['genre'].apply(lambda x: x in test_genre)]
            train = self.pair[self.pair['genre'].apply(lambda x: x not in test_genre)]
            print('the genre of test set is ', test_genre)
            print('the percentage of test set is ', test_count / total, 'where total is ', total)
        elif self.split_by == 'chaos':
            # baseline check: replace both stories in each pair with random
            # stories drawn from the same prompt, destroying the preference signal
            self.pair = self.pair.reset_index(drop=True)
            for i in range(len(self.pair)):
                candidates = self.stories[self.stories['prompt_id'] == self.pair.at[i, 'prompt_id']]['story_id'].values
                self.pair.at[i, 'story1_id'] = np.random.choice(candidates)
                self.pair.at[i, 'story2_id'] = np.random.choice(candidates)
            train, test = train_test_split(self.pair, test_size=self.train_test_split)
        else:
            raise ValueError(f'unknown split_by option: {self.split_by}')
return train, test
    def apply_template_to_text(self, row):
        prompt_id, story1_id, story2_id = row[['prompt_id', 'story1_id', 'story2_id']]
        # both stories answer the same prompt, so look it up once
        prompt = self.stories[self.stories['prompt_id'] == prompt_id]['prompt'].values[0]
        chosen_story = self.stories[self.stories['story_id'] == story1_id]['story_title'].values[0] + '\n' + \
                       self.stories[self.stories['story_id'] == story1_id]['story_text'].values[0]
        rejected_story = self.stories[self.stories['story_id'] == story2_id]['story_title'].values[0] + '\n' + \
                         self.stories[self.stories['story_id'] == story2_id]['story_text'].values[0]
        # render chosen and rejected conversations with the chat template
        chosen_text = [{'role': 'user', 'content': prompt},
                       {'role': 'assistant', 'content': chosen_story}]
        rejected_text = [{'role': 'user', 'content': prompt},
                         {'role': 'assistant', 'content': rejected_story}]
        chosen_text = self.tokenizer.apply_chat_template(chosen_text, tokenize=False)
        rejected_text = self.tokenizer.apply_chat_template(rejected_text, tokenize=False)
        res = {}
        # full sequences, wrapped in explicit bos/eos tokens
        res['chosen_text'] = self.tokenizer.bos_token + chosen_text + self.tokenizer.eos_token
        res['rejected_text'] = self.tokenizer.bos_token + rejected_text + self.tokenizer.eos_token
        res['text'] = self.tokenizer.bos_token + chosen_text + self.tokenizer.eos_token
        # split the rendered text into prompt and completion at the assistant turn
        if 'gemma' in self.tokenizer.name_or_path:
            split_words = '<|im_start|>assistant\n'
        elif 'mistral' in self.tokenizer.name_or_path or 'llama' in self.tokenizer.name_or_path:
            split_words = '[/INST]'
        else:
            raise ValueError(f'no assistant-turn delimiter defined for tokenizer {self.tokenizer.name_or_path}')
        chosen_completion = chosen_text.split(split_words)[-1]
        # everything before the completion (delimiter included) is the prompt
        prompt_text = chosen_text.replace(chosen_completion, '')
        rejected_completion = rejected_text.split(split_words)[-1]
        res['prompt'] = self.tokenizer.bos_token + prompt_text
        res['chosen'] = chosen_completion + self.tokenizer.eos_token
        res['rejected'] = rejected_completion + self.tokenizer.eos_token
        return res
    def convert_sft(self, df):
        # for SFT we only need the individual stories, not the pairs:
        # collect every story id that appears on either side of a pair
        story_ids = list(set(df['story1_id'].values) | set(df['story2_id'].values))
        # build a new frame where story1_id == story2_id, so
        # apply_template_to_text renders each story once against its prompt
        df = pd.DataFrame()
        df['story1_id'] = story_ids
        df['story2_id'] = df['story1_id']
        def get_prompt_id(x):
            return self.stories[self.stories['story_id'] == x]['prompt_id'].values[0]
        df['prompt_id'] = df['story1_id'].apply(get_prompt_id)
        return df
    def make_dataset(self):
        # reset the index on fresh copies so the .at writes below
        # never hit a view of self.pair
        self.train = self.train.reset_index(drop=True).copy()
        self.test = self.test.reset_index(drop=True).copy()
        entries = []
        if self.task == 'rm':
            entries = ['chosen_text', 'rejected_text']
        elif self.task == 'dpo':
            entries = ['prompt', 'chosen', 'rejected']
        elif self.task == 'sft':
            self.train = self.convert_sft(self.train)
            self.test = self.convert_sft(self.test)
            entries = ['text']
        print('the columns of train is ', self.train.columns)
        for index, row in self.train.iterrows():
            res = self.apply_template_to_text(row)
            for e in entries:
                self.train.at[index, e] = res[e]
        for index, row in self.test.iterrows():
            res = self.apply_template_to_text(row)
            for e in entries:
                self.test.at[index, e] = res[e]
        print('the first example of train is ', self.train.iloc[0])
        # TODO: since we agreed on max_len = 8192, examples longer than that
        # still need to be filtered out
        if self.margin:
            entries.append('margin')
        train_dataset = Dataset.from_pandas(self.train[entries])
        test_dataset = Dataset.from_pandas(self.test[entries])
        return DatasetDict({'train': train_dataset, 'test': test_dataset})
    def save_dataset(self, path):
        '''
        Save the dataset to the readsy folder (one level above the working directory).
        :param path: target folder name
        :return:
        '''
        self.dataset.save_to_disk('../' + path)
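
# Usage sketch (an assumption, not part of the original pipeline): drives the
# builder end to end. The tokenizer checkpoint and the output folder name are
# hypothetical placeholders; any chat-template tokenizer whose name matches in
# apply_template_to_text ('gemma', 'mistral', 'llama') should work.
if __name__ == '__main__':
    from transformers import AutoTokenizer
    # 'mistral' in the name selects the '[/INST]' delimiter above
    tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2')
    builder = StoryPairDataset(datapath, pairpath, tokenizer, task='dpo',
                               split_by='random', least_likes=5, margin=True)
    print(builder.dataset)  # DatasetDict with 'train' and 'test' splits
    builder.save_dataset('story_pairs_dpo')  # written to ../story_pairs_dpo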