rag-bench-evaluation / scripts /process_data.py
gourisankar85's picture
Upload 159 files
d1eeaf0 verified
raw
history blame
2.85 kB
import logging
import random
import math
def process_data(instance, noise_rate, passage_num, filename, correct_rate=0):
print(filename)
"""Process the data for generating a noisy document set."""
query = instance['query']
ans = instance['answer']
logging.info(f"Query: {query}")
logging.info(f"Answer: {ans}")
neg_num = math.ceil(passage_num * noise_rate)
pos_num = passage_num - neg_num
docs = []
# Handling the '_int' case in filename
if '_int' in filename:
for i in instance['positive']:
random.shuffle(i)
docs = [i[0] for i in instance['positive']]
if len(docs) < pos_num:
maxnum = max([len(i) for i in instance['positive']])
for i in range(1, maxnum):
for j in instance['positive']:
if len(j) > i:
docs.append(j[i])
if len(docs) == pos_num:
break
if len(docs) == pos_num:
break
neg_num = passage_num - len(docs)
if neg_num > 0:
negative = instance['negative'][:neg_num]
docs += negative
# Handling the '_fact' case in filename
elif '_fact' in filename:
correct_num = math.ceil(passage_num * correct_rate)
pos_num = passage_num - neg_num - correct_num
indexs = list(range(len(instance['positive'])))
selected = random.sample(indexs, min(len(indexs), pos_num))
docs = [instance['positive_wrong'][i] for i in selected]
remain = [i for i in indexs if i not in selected]
if correct_num > 0 and len(remain) > 0:
docs += [instance['positive'][i] for i in random.sample(remain, min(len(remain), correct_num))]
if neg_num > 0:
docs += instance['negative'][:neg_num]
# Default case (when filename doesn't match '_int' or '_fact')
else:
if noise_rate == 1:
neg_num = passage_num
pos_num = 0
else:
if neg_num > len(instance['negative']):
neg_num = len(instance['negative'])
elif pos_num > len(instance['positive']):
pos_num = len(instance['positive'])
positive = instance['positive'][:pos_num]
negative = instance['negative'][:neg_num]
docs = positive + negative
# Count the positive and negative documents
num_positive = sum(1 for doc in docs if doc in positive)
num_negative = sum(1 for doc in docs if doc in negative)
logging.info(f"Using {num_positive} positive and {num_negative} negative documents as context")
# Shuffle the final document list
random.shuffle(docs)
return query, ans, docs