import json
import multiprocessing
import os
import re

import tqdm

from process_supervision import load_tokenizer
from test import delete_extra_zero
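
# Post-processes verifier-scored generation results: filters sampled solutions
# into RFT / verifier-filtered training sets (process_line) and derives
# token-level process supervision from verifier-score trajectories, either per
# paragraph (evaluate_expression_para) or per <<expression=result>> calculator
# annotation (evaluate_expression).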
|
def process_line(tokenizer, lines, wf_name, output_dir, thres=0.3):
    """Split sampled outputs into RFT / verifier-filtered training sets.

    An output joins the RFT set when its label is correct, the verifier set
    when its final verifier score reaches `thres`, and the combined set when
    both hold.
    """
    acc = []

    rft_file_line = 0
    rft_list = []
    verifier_file_line = 0
    verifier_list = []
    rft_verifier_file_line = 0
    rft_verifier_list = []

    # wf_name is created for compatibility with the rest of the pipeline;
    # nothing is written to it in this function.
    with open(wf_name, 'w', encoding='utf-8') as wf:
        for line in tqdm.tqdm(lines):
            for output in line['outputs']:
                v_scores = output.get('vscores', [])
                response = output.get('response', "")
                is_true = output.get('label', "")

                if is_true:
                    rft_list.append({"question": line['question'], "answer": response})
                    rft_file_line += 1
                    if v_scores and v_scores[-1] >= thres:
                        rft_verifier_list.append({"question": line['question'], "answer": response})
                        rft_verifier_file_line += 1
                if v_scores and v_scores[-1] >= thres:
                    verifier_list.append({"question": line['question'], "answer": response})
                    verifier_file_line += 1

                acc.append(1 if is_true else 0)

    print("rft:", rft_file_line)
    print("verifier:", verifier_file_line)
    print("rft+verifier:", rft_verifier_file_line)
    print("acc:", sum(acc) / len(acc) if acc else 0)

    with open(f"{output_dir}/rft.json", 'w', encoding='utf-8') as rft_file:
        json.dump(rft_list, rft_file, ensure_ascii=False, indent=2)

    with open(f"{output_dir}/verifier_enhanced.json", 'w', encoding='utf-8') as verifier_file:
        json.dump(verifier_list, verifier_file, ensure_ascii=False, indent=2)

    with open(f"{output_dir}/rft_verifier_enhanced.json", 'w', encoding='utf-8') as rft_verifier_file:
        json.dump(rft_verifier_list, rft_verifier_file, ensure_ascii=False, indent=2)
|
|
def locate_sublist(lst, sublst):
    """Return the index at which `sublst` first occurs inside `lst`."""
    for i in range(len(lst) - len(sublst) + 1):
        if lst[i:i + len(sublst)] == sublst:
            return i
    raise ValueError('sublist not found in list')
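
# Example: locate_sublist([4, 8, 15, 16], [8, 15]) returns 1; a ValueError is
# raised when the sublist does not occur.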
|
|
def split_string_list(a_list, sep='\n'):
    """Split an iterable of characters into strings, keeping each `sep` at
    the end of the piece it terminates."""
    sublists = []
    current_sublist = []
    for item in a_list:
        current_sublist.append(item)
        if item == sep:
            sublists.append(''.join(current_sublist))
            current_sublist = []
    if current_sublist:
        sublists.append(''.join(current_sublist))
    return sublists
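
# Example: split_string_list("step one\nstep two") returns
# ["step one\n", "step two"].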
|
def split_token_list(a_list, number=13):
    """Split a token-id list on a separator id, keeping the separator at the
    end of each piece.  The default id 13 is assumed to be the tokenizer's
    newline token."""
    sublists = []
    current_sublist = []
    for item in a_list:
        current_sublist.append(item)
        if item == number:
            sublists.append(current_sublist)
            current_sublist = []
    if current_sublist:
        sublists.append(current_sublist)
    return sublists
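
# Example (with 13 standing in for the newline id):
#   split_token_list([42, 13, 7, 8, 13, 9]) returns [[42, 13], [7, 8, 13], [9]]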
|
|
def evaluate_expression_para(response_all, v_score, tokenizer, is_true):
    """Derive paragraph-level token supervision from verifier scores.

    Tokens of a paragraph are marked 1 until the verifier score collapses
    (near zero at the paragraph start, or falling by more than 50% across the
    paragraph); everything from the first detected error onward stays 0.
    Unlike evaluate_expression, no expression-level labels or predictions are
    produced, so `labels` and `predictions` are returned empty.
    """
    labels = []
    predictions = []
    sol_tokens = tokenizer(response_all).input_ids
    process_v_score = [0] * len(sol_tokens)
    hallucination = False
    gt_help = False
    error_detection = False
    response_list = split_string_list(response_all)
    token_list = split_token_list(sol_tokens)
    previous_len = 0

    for idx, string in enumerate(response_list):
        if error_detection:
            break

        para_token = token_list[idx]
        para_token_location = sum(len(item) for item in token_list[:idx])

        if abs(v_score[para_token_location]) < 1e-5:
            # A (near-)zero score at the paragraph start marks an error.
            error_detection = True
        elif (v_score[para_token_location + len(para_token) - 1]
              - v_score[para_token_location]) / v_score[para_token_location] < -0.5:
            # The score dropped by more than half across the paragraph.
            error_detection = True
        else:
            process_v_score[para_token_location:para_token_location + len(para_token)] = [1] * len(para_token)

        # If no error was detected but the final answer is wrong, fall back to
        # the ground-truth label and zero out the last paragraph.
        if idx == len(response_list) - 1 and not error_detection and not is_true:
            process_v_score[para_token_location:para_token_location + len(para_token)] = [0] * len(para_token)
            gt_help = True

        previous_len += len(string)

    return {'label': labels, 'prediction': predictions,
            'hallucination': hallucination, 'gt_help': gt_help}, process_v_score
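
# Illustrative (hypothetical) scores: for two 3-token paragraphs with
# v_score = [0.9, 0.8, 0.8, 0.3, 0.2, 0.1], paragraph 1 drops only ~11%
# (0.9 -> 0.8), so its tokens are marked 1; paragraph 2 drops ~67%
# (0.3 -> 0.1), which exceeds the 50% threshold and triggers error_detection.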
|
|
def evaluate_expression(response_all, v_score, tokenizer, is_true):
    """Evaluate <<expression=result>> annotations against verifier scores.

    For each annotation, the prediction ('positive'/'negative') comes from the
    verifier-score trajectory over the annotation's tokens, while the label
    comes from actually evaluating the expression.  Token-level supervision
    (`process_v_score`) is 1 up to the first detected error.
    """
    labels = []
    predictions = []
    sol_tokens = tokenizer(response_all).input_ids
    process_v_score = [0] * len(sol_tokens)
    hallucination = False
    gt_help = False
    error_detection = False
    response_list = split_string_list(response_all)
    token_list = split_token_list(sol_tokens)
    previous_len = 0

    for idx, string in enumerate(response_list):
        match = re.search(r'<<(.+?)>>', string)
        para_token = token_list[idx]
        para_token_location = sum(len(item) for item in token_list[:idx])
        if match:
            expression = match.group(1)
            # Locate the annotation's token span by re-tokenizing the prefix;
            # drop the final prefix token if it straddles the boundary.
            start_token = tokenizer(response_all[:previous_len + match.span()[0]]).input_ids
            if sol_tokens[:len(start_token)] != start_token:
                start_token = start_token[:-1]
            # Clamp indices so a v_score shorter than the token list cannot
            # raise an IndexError.
            seg_token_location = min(len(start_token), len(v_score) - 1)
            seq_token = tokenizer(response_all[:previous_len + match.span()[1]]).input_ids[len(start_token):]

            if abs(v_score[seg_token_location]) < 1e-5:
                prediction = 'negative'
                error_detection = True
            elif (v_score[min(seg_token_location + len(seq_token), len(v_score) - 1)]
                  - v_score[seg_token_location]) / v_score[seg_token_location] < -0.9:
                # The verifier score collapses by more than 90% across the
                # annotation.
                prediction = 'negative'
                error_detection = True
            else:
                prediction = 'positive'
                if not error_detection:
                    process_v_score[para_token_location:para_token_location + len(para_token)] = [1] * len(para_token)

            try:
                before_equal, after_equal = expression.split('=')
                # eval() of the arithmetic left-hand side; the annotations are
                # assumed to come from trusted data.
                computed_value = float(eval(before_equal.strip()))
                actual_value = float(delete_extra_zero(after_equal.strip().replace(",", "")))

                if abs(computed_value - actual_value) <= 1e-3:
                    label = 'positive'
                else:
                    label = 'negative'
                    hallucination = True

                labels.append(label)
                predictions.append(prediction)
            except Exception:
                # Skip annotations that are not a well-formed `expr=value` pair.
                pass
        else:
            if not error_detection:
                process_v_score[para_token_location:para_token_location + len(para_token)] = [1] * len(para_token)

        previous_len += len(string)

    return {'label': labels, 'prediction': predictions,
            'hallucination': hallucination, 'gt_help': gt_help}, process_v_score
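
# Illustrative (hypothetical) annotation: in "He pays 5 * 3 = <<5*3=15>>15
# dollars.", eval("5*3") == 15.0 matches the stated result, so the label is
# 'positive'; "<<5*3=16>>" would be labeled 'negative' and flag hallucination.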
|
|
def process_chunk(tokenizer, chunk, wf_path):
    """Evaluate one chunk of lines and write them back out as JSONL."""
    acc = []
    recall_count = [0, 0]  # [true positives predicted, total positives]
    hallucination = []
    gt_help = []

    with open(wf_path, 'w', encoding='utf-8') as wf:
        for line in tqdm.tqdm(chunk):
            for output in line['outputs']:
                v_scores = output.get('vscores', [])
                response = output.get('response', "")
                is_true = output.get('label', "")
                evaluation_results, process_v_scores = evaluate_expression_para(response, v_scores, tokenizer, is_true)

                hallucination.append(1 if evaluation_results['hallucination'] else 0)
                gt_help.append(1 if evaluation_results['gt_help'] else 0)

                for label, prediction in zip(evaluation_results['label'], evaluation_results['prediction']):
                    acc.append((label, prediction))

                # Recall over the ground-truth positive annotations.
                for idx, prediction in enumerate(evaluation_results['prediction']):
                    label = evaluation_results['label'][idx]
                    if label == 'positive':
                        recall_count[1] += 1
                        if prediction == 'positive':
                            recall_count[0] += 1
            wf.write(json.dumps(line, ensure_ascii=False) + '\n')

    return {
        "accuracy_sum": sum(1 for label, prediction in acc if label == prediction),
        "total": len(acc),
        "recall_correct": recall_count[0],
        "recall_total": recall_count[1],
        "hallucination_sum": sum(hallucination),
        "hallucination_total": len(hallucination),
        "gt_help_sum": sum(gt_help),
        "gt_help_total": len(gt_help),
    }
|
|
def parallel_process_line(tokenizer, lines, wf_path, num_processes=1):
    if num_processes is None:
        num_processes = multiprocessing.cpu_count()

    # Split the input into roughly equal chunks, one per process; max(1, ...)
    # guards against a zero chunk size when there are more processes than lines.
    chunk_size = max(1, len(lines) // num_processes)
    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]

    # Workers write to temp files under multirun/; the merged output goes to
    # multirun2/.
    temp_files = [f"multirun/{wf_path}_temp_{i}.json" for i in range(len(chunks))]

    with multiprocessing.Pool(processes=num_processes) as pool:
        results = pool.starmap(process_chunk, [(tokenizer, chunk, temp_file) for chunk, temp_file in zip(chunks, temp_files)])

    with open(f"multirun2/{wf_path}.json", 'w', encoding='utf-8') as wf:
        for temp_file in temp_files:
            with open(temp_file, 'r', encoding='utf-8') as tf:
                wf.write(tf.read())
            os.remove(temp_file)

    # Aggregate the per-worker statistics.
    total_acc = sum(result['accuracy_sum'] for result in results)
    total = sum(result['total'] for result in results)
    total_recall_correct = sum(result['recall_correct'] for result in results)
    total_recall = sum(result['recall_total'] for result in results)
    total_hallucination = sum(result['hallucination_sum'] for result in results)
    total_hallucination_counts = sum(result['hallucination_total'] for result in results)
    total_gt_help = sum(result['gt_help_sum'] for result in results)
    total_gt_help_counts = sum(result['gt_help_total'] for result in results)

    overall_accuracy = total_acc / total if total else 0
    overall_recall = total_recall_correct / total_recall if total_recall else 0
    overall_hallucination = total_hallucination / total_hallucination_counts if total_hallucination_counts else 0
    overall_gt_help = total_gt_help / total_gt_help_counts if total_gt_help_counts else 0

    print(f"Overall accuracy: {overall_accuracy}")
    print(f"Overall recall: {overall_recall}")
    print(f"Overall hallucination: {overall_hallucination}")
    print(f"Overall gt_help: {overall_gt_help}")
|
|
file_path_list = [
    "eval_results/math/verifier/test/responses_v(lean4_random_15k_all-sample10-osv-gt2)_g(lean4_rand).jsonl",
]
lines = []
for file_path in file_path_list:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines += [json.loads(l) for l in f]

# Drop overly long generations before building the training sets.
for ex in lines:
    ex['outputs'] = [output for output in ex['outputs'] if len(output['tokens']) <= 2048]

model_dir = "../models/lean4_random_15k_all-sample10-osv-gt2/"
tokenizer = load_tokenizer(model_dir)
process_line(tokenizer, lines, 'good.json', "data/continual_training_lean", 0.3)
|