rookiemango
/

auto-info

Model card Files Files and versions Community

rookiemango commited on May 10, 2024

Commit

167df7c

verified ·

1 Parent(s): d559180

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

__pycache__/process_supervision.cpython-39.pyc +0 -0
__pycache__/test.cpython-39.pyc +0 -0
generation_method.py +7 -5
multirun/allresults.json_temp_0.json +0 -0
process_supervision_training_data.py +403 -0
requirements.txt +4 -0
test.py +82 -0

__pycache__/process_supervision.cpython-39.pyc ADDED Viewed

Binary file (3.82 kB). View file

__pycache__/test.cpython-39.pyc ADDED Viewed

Binary file (1.91 kB). View file

generation_method.py CHANGED Viewed

@@ -229,6 +229,8 @@ def parse_arguments():
         if args.dataset == "lean4_5k_test":
             args.data_path = "data/lean4_gpt_5k/test/data.jsonl"
         elif args.dataset == "math_train":
             args.data_path = "data/test/math/train.jsonl"
@@ -237,12 +239,12 @@ def parse_arguments():
             args.data_path = "data/test/gsm8k/train.jsonl"
         elif args.dataset == "wild_test":
-            args.data_path = "/hpc2hdd/home/zyang398/data_2/wild_sample1k.jsonl"
         elif args.dataset == "lean4_basic_test":
-            args.data_path = "data/lean4_basic/1k_test.jsonl"
         elif args.dataset == "lean4_random_test":
-            args.data_path = "data/lean4_random/1k_test.json"
         elif args.dataset == "lean4_random_first_train":
             args.data_path = "data/lean4_random/5k_first.json"
         elif args.dataset == "lean4_random_second_train":
@@ -324,7 +326,7 @@ PROMPT_DICT = {
     ),
     "lean4": (
         "Statement and proof in natural language:\n\n"
-        "{statement_text}\n\n"
         "Translate the statement and proof in natural language to lean4:"
     ),
     "prompt_no_input": (
@@ -366,7 +368,7 @@ def get_question_answer(args):
         questions  = [ PROMPT_DICT['wild'].format(question= questions[id], answer =answers[id][args.data_answer_key] )  for id in range(len(questions))]
     else:
-        questions  = [ PROMPT_DICT['lean4'].format(statement_text = item)  for item in questions]
     return questions, answers

         if args.dataset == "lean4_5k_test":
             args.data_path = "data/lean4_gpt_5k/test/data.jsonl"
+        elif args.dataset == "lean4_15k_train":
+            args.data_path = "data/lean4_random/15k_filtered.json"
         elif args.dataset == "math_train":
             args.data_path = "data/test/math/train.jsonl"
             args.data_path = "data/test/gsm8k/train.jsonl"
         elif args.dataset == "wild_test":
+            args.data_path = "data/wild/wild_sample1k.jsonl"
         elif args.dataset == "lean4_basic_test":
+            args.data_path = "data/lean4_basic/1k_test_filtered.jsonl"
         elif args.dataset == "lean4_random_test":
+            args.data_path = "data/lean4_random/1k_test_filtered.json"
         elif args.dataset == "lean4_random_first_train":
             args.data_path = "data/lean4_random/5k_first.json"
         elif args.dataset == "lean4_random_second_train":
     ),
     "lean4": (
         "Statement and proof in natural language:\n\n"
+        "{model_response}\n\n"
         "Translate the statement and proof in natural language to lean4:"
     ),
     "prompt_no_input": (
         questions  = [ PROMPT_DICT['wild'].format(question= questions[id], answer =answers[id][args.data_answer_key] )  for id in range(len(questions))]
     else:
+        questions  = [ PROMPT_DICT['lean4'].format(model_response = item)  for item in questions]
     return questions, answers

multirun/allresults.json_temp_0.json ADDED Viewed

File without changes

process_supervision_training_data.py ADDED Viewed

	@@ -0,0 +1,403 @@

+import pdb
+import re
+import json
+import tqdm
+from process_supervision import load_tokenizer
+from  test import delete_extra_zero
+def process_line(tokenizer, lines, wf_name):
+    acc = []  # Will store tuples of (label, prediction) for each expression
+    recall_count = [0, 0]  # [number of correct positives, number of actual positives]
+    hullucination = []
+    import json
+    rft_file_line = 0
+    rft_list = []
+    verifier_file_line = 0
+    verifier_list = []
+    rft_verifier_file_line = 0
+    rft_verifier_list = []
+    acc = []
+    with open(wf_name, 'w', encoding='utf-8') as wf:
+        for line in tqdm.tqdm(lines):
+            for output in line['outputs']:
+                v_scores = output.get('vscores', [])
+                response = output.get('response', "")
+                is_true = output.get('label', "")
+                if is_true:
+                    rft_list.append({"question": line['question'], "answer": output['response']})
+                    rft_file_line += 1
+                    if v_scores and v_scores[-1] > 0.5:
+                        # Save to rft_verifier_enhanced.json
+                        rft_verifier_list.append({"question": line['question'], "answer": output['response']})
+                        rft_verifier_file_line += 1
+                if v_scores and v_scores[-1] > 0.5:
+                    verifier_list.append({"question": line['question'],  "answer": output['response']})
+                    verifier_file_line += 1
+                    if is_true:
+                        acc.append(1)
+                    else:
+                        acc.append(0)
+    print(rft_file_line)
+    print(verifier_file_line)
+    print(rft_verifier_file_line)
+    print("acc" , sum(acc)/len(acc))
+    with open("data/continual_training/rft.json", 'w', encoding='utf-8') as rft_file:
+        json.dump(rft_list, rft_file , ensure_ascii=False, indent=2)
+    with open("data/continual_training/verifier_enhanced.json", 'w', encoding='utf-8') as verifier_file:
+        json.dump(verifier_list, verifier_file, ensure_ascii=False, indent=2)
+    with open("data/continual_training/rft_verifier_enhanced.json", 'w', encoding='utf-8') as rft_verifier_file:
+        json.dump(rft_verifier_list, rft_verifier_file , ensure_ascii=False, indent=2)
+def locate_sublist(lst, sublst):
+    for i in range(len(lst) - len(sublst) + 1):
+        if lst[i:i+len(sublst)] == sublst:
+            return i  # Return the starting index of the sublist in the list
+    assert ('not right')
+def split_string_list(a_list, number ='\n'):
+    sublists = []
+    current_sublist = []
+    for item in a_list:
+        current_sublist.append(item)
+        if item == number:
+            if current_sublist:  # if the current sublist is not empty
+                sublists.append(''.join(current_sublist))
+                current_sublist = []  # start a new sublist
+    # Don't forget to add the last sublist if it's not empty
+    if current_sublist:
+        sublists.append(''.join(current_sublist))
+    return sublists
+def split_token_list(a_list, number =13):
+    sublists = []
+    current_sublist = []
+    for item in a_list:
+        current_sublist.append(item)
+        if item == number:
+            if current_sublist:  # if the current sublist is not empty
+                sublists.append(current_sublist)
+                current_sublist = []  # start a new sublist
+    # Don't forget to add the last sublist if it's not empty
+    if current_sublist:
+        sublists.append(current_sublist)
+    return sublists
+# Modify evaluate_expression function to return a list of results
+def evaluate_expression_para(response_all, v_score, tokenizer, is_true):
+    # Initialize lists to hold multiple evaluation results for each expression
+    # here we make the v_score label in a "first error detection"
+    labels = []
+    predictions = []
+    sol_tokens = tokenizer(response_all).input_ids
+    process_v_score = [0] * len(sol_tokens)
+    hullucination = False
+    gt_help = False
+    error_detection = False
+    response_list = split_string_list(response_all)
+    token_list = split_token_list(sol_tokens)
+    previous_len = 0
+    for idx, string in enumerate(response_list):
+        # match = re.search(r'<<(.+?)>>', string)
+        para_token = token_list[idx]
+        para_token_location =  sum([len(item) for item in token_list[:idx]])
+        if error_detection:
+            break
+        if abs(v_score[para_token_location]) < 1e-5:
+            error_detection = True
+        elif  (v_score[para_token_location + len(para_token) - 1] - v_score[para_token_location])/v_score[para_token_location] < -0.5:
+            error_detection = True
+        else:
+            if not error_detection:
+                process_v_score[para_token_location : para_token_location + len(para_token) ] = [1] * len(para_token)
+        # if match:
+        #     expression = match.group(1)
+        #     start_token = tokenizer(response_all[ : previous_len + match.span()[0]]).input_ids
+        #     if sol_tokens[:len(start_token)] != start_token:
+        #         start_token = start_token[:-1]
+        #     # print(tokenizer.decode(start_token))
+        #     seg_token_location = len(start_token)
+        #     seq_token =  tokenizer(response_all[: previous_len + match.span()[1]]).input_ids[len(start_token):]
+        #     # print(tokenizer.decode(seq_token))
+        #     # Check if v_score change is positive
+        #     try:
+        #         if abs(v_score[seg_token_location]) < 1e-5:
+        #             prediction = 'negative'  # there is a extra example in v_score
+        #             error_detection = True
+        #
+        #         elif (v_score[min(seg_token_location + len(seq_token), len(v_score) - 1)] - v_score[seg_token_location]) / (v_score[seg_token_location]) < -0.9:
+        #             prediction = 'negative'  # there is a negative change in v_score
+        #             error_detection = True
+        #         else:
+        #             prediction = 'positive'  # no negative change in v_score
+        #             if not error_detection:
+        #                 process_v_score[para_token_location : para_token_location + len(para_token)] =  [1] * len(para_token)
+        #     except:
+        #         import pdb
+        #         pdb.set_trace()
+        #     try:
+        #         before_equal, after_equal = expression.split('=')
+        #         computed_value = float(eval(before_equal.strip()))
+        #         actual_value = float(delete_extra_zero(after_equal.strip().replace(",", "")))
+        #         # Use the positive v_score change as a proxy for a correct evaluation
+        #         if abs(computed_value - actual_value) <= 1e-3:
+        #             label = 'positive'
+        #         else:
+        #             label = 'negative'
+        #             hullucination = True
+        #
+        #         # Record the label and prediction for this expression
+        #         labels.append(label)
+        #         predictions.append(prediction)
+        #     except Exception as e:
+        #         pass
+        # else:
+        #     if not error_detection:
+        #         process_v_score[para_token_location: para_token_location + len(para_token)] = [1] * len(para_token)
+        #
+        if idx == len(response_list) - 1 and not error_detection and not is_true:
+            process_v_score[para_token_location: para_token_location + len(para_token)] = [0] * len(para_token)
+            gt_help = True
+        previous_len += len(string)
+    # if sum(process_v_score) !=  len(process_v_score) and sum(process_v_score) != 0:
+    #     print(process_v_score)
+    return {'label': labels, 'prediction': predictions, 'hullucination': hullucination, 'gt_help': gt_help}, process_v_score
+def evaluate_expression(response_all, v_score, tokenizer, is_true):
+    # Initialize lists to hold multiple evaluation results for each expression
+    # here we make the v_score label in a "first error detection"
+    labels = []
+    predictions = []
+    sol_tokens = tokenizer(response_all).input_ids
+    process_v_score = [0] * len(sol_tokens)
+    hullucination = False
+    gt_help = False
+    error_detection = False
+    response_list = split_string_list(response_all)
+    token_list = split_token_list(sol_tokens)
+    previous_len = 0
+    for idx, string in enumerate(response_list):
+        match = re.search(r'<<(.+?)>>', string)
+        para_token = token_list[idx]
+        para_token_location =  sum([len(item) for item in token_list[:idx]])
+        if match:
+            expression = match.group(1)
+            start_token = tokenizer(response_all[ : previous_len + match.span()[0]]).input_ids
+            if sol_tokens[:len(start_token)] != start_token:
+                start_token = start_token[:-1]
+            # print(tokenizer.decode(start_token))
+            seg_token_location = len(start_token)
+            seq_token =  tokenizer(response_all[: previous_len + match.span()[1]]).input_ids[len(start_token):]
+            # print(tokenizer.decode(seq_token))
+            # Check if v_score change is positive
+            try:
+                if abs(v_score[seg_token_location]) < 1e-5:
+                    prediction = 'negative'  # there is a extra example in v_score
+                    error_detection = True
+                elif (v_score[min(seg_token_location + len(seq_token), len(v_score) - 1)] - v_score[seg_token_location]) / (v_score[seg_token_location]) < -0.9:
+                    prediction = 'negative'  # there is a negative change in v_score
+                    error_detection = True
+                else:
+                    prediction = 'positive'  # no negative change in v_score
+                    if not error_detection:
+                        process_v_score[para_token_location : para_token_location + len(para_token)] =  [1] * len(para_token)
+            except:
+                import pdb
+                pdb.set_trace()
+            try:
+                before_equal, after_equal = expression.split('=')
+                computed_value = float(eval(before_equal.strip()))
+                actual_value = float(delete_extra_zero(after_equal.strip().replace(",", "")))
+                # Use the positive v_score change as a proxy for a correct evaluation
+                if abs(computed_value - actual_value) <= 1e-3:
+                    label = 'positive'
+                else:
+                    label = 'negative'
+                    hullucination = True
+                # Record the label and prediction for this expression
+                labels.append(label)
+                predictions.append(prediction)
+            except Exception as e:
+                pass
+        else:
+            if not error_detection:
+                process_v_score[para_token_location: para_token_location + len(para_token)] = [1] * len(para_token)
+            # if idx == len(response_list) - 1 and not error_detection and not is_true:
+            #     process_v_score[para_token_location: para_token_location + len(para_token)] = [0] * len(para_token)
+            #     gt_help = True
+        previous_len += len(string)
+    # if sum(process_v_score) !=  len(process_v_score) and sum(process_v_score) != 0:
+    #     print(process_v_score)
+    return {'label': labels, 'prediction': predictions, 'hullucination': hullucination, 'gt_help': gt_help}, process_v_score
+import multiprocessing
+from functools import partial
+import os
+def process_chunk(tokenizer, chunk, wf_path):
+    acc = []  # Will store tuples of (label, prediction) for each expression
+    recall_count = [0, 0]  # [number of correct positives, number of actual positives]
+    hullucination = []
+    gt_help = []
+    with open(wf_path, 'w', encoding='utf-8') as wf:
+        for line in tqdm.tqdm(chunk):
+            for output in line['outputs']:
+                import pdb
+                pdb.set_trace()
+                v_scores = output.get('vscores', [])
+                response = output.get('response', "")
+                is_true = output.get('label', "")
+                evaluation_results, process_v_scores = evaluate_expression_para(response, v_scores, tokenizer, is_true)
+                # output['process_vscores'] = process_v_scores
+                if evaluation_results['hullucination']:
+                    hullucination.append(1)
+                else:
+                    hullucination.append(0)
+                if evaluation_results['gt_help']:
+                    gt_help.append(1)
+                else:
+                    gt_help.append(0)
+                # Add the results to the accuracy list for each expression
+                for label, prediction in zip(evaluation_results['label'], evaluation_results['prediction']):
+                    acc.append((label, prediction))
+                # Update recall counts for each expression
+                for idx, prediction in enumerate(evaluation_results['prediction']):
+                    label = evaluation_results['label'][idx]
+                    if label == 'positive':
+                        recall_count[1] += 1  # Increment the count of actual positives
+                        if prediction == 'positive':
+                            recall_count[0] += 1  # Increment the count of correct positives
+            wf.writelines(json.dumps(line, ensure_ascii=False) + '\n')
+        # Calculate metrics for the chunk
+        accuracy = sum(1 for label, prediction in acc if label == prediction) / len(acc) if acc else 0
+        hullucination_rate = sum(hullucination) / len(hullucination) if hullucination else 0
+        # Return the metrics and counts, not just the rates, to allow aggregation
+    return {
+        "accuracy_sum": sum(1 for label, prediction in acc if label == prediction),
+        "total": len(acc),
+        "recall_correct": recall_count[0],
+        "recall_total": recall_count[1],
+        "hullucination_sum": sum(hullucination),
+        "hullucination_total": len(hullucination),
+        "gt_help_sum": sum(gt_help),
+        "gt_help_total": len(gt_help),
+    }
+        # print(
+        #     f"Chunk accuracy: {accuracy}, Chunk recall: {recall}, Chunk hullucination: {sum(hullucination) / len(hullucination) if hullucination else 0}")
+def parallel_process_line(tokenizer, lines, wf_path, num_processes=1):
+    if num_processes is None:
+        num_processes = multiprocessing.cpu_count()
+    # Split lines into chunks
+    chunk_size = int(len(lines) / num_processes)
+    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]
+    # Generate a unique temporary file path for each chunk
+    temp_files = [f"multirun/{wf_path}_temp_{i}.json" for i in range(len(chunks))]
+    # Create a pool of workers to process data in parallel
+    with multiprocessing.Pool(processes=num_processes) as pool:
+        # Map each chunk to process_chunk function along with a unique temporary file path
+        results = pool.starmap(process_chunk, [(tokenizer, chunk, temp_file) for chunk, temp_file in zip(chunks, temp_files)])
+    # Combine results from temporary files into the final output file
+    with open(f"multirun2/{wf_path}.json", 'w', encoding='utf-8') as wf:
+        for temp_file in temp_files:
+            with open(temp_file, 'r', encoding='utf-8') as tf:
+                wf.write(tf.read())
+            os.remove(temp_file)  # Clean up temporary file
+    # Aggregate metrics from all chunks
+    total_acc = sum(result['accuracy_sum'] for result in results)
+    total = sum(result['total'] for result in results)
+    total_recall_correct = sum(result['recall_correct'] for result in results)
+    total_recall = sum(result['recall_total'] for result in results)
+    total_hullucination = sum(result['hullucination_sum'] for result in results)
+    total_hullucination_counts = sum(result['hullucination_total'] for result in results)
+    total_gt_help  = sum(result['gt_help_sum'] for result in results)
+    total_gt_help_counts = sum(result['gt_help_total'] for result in results)
+    # Calculate overall metrics
+    overall_accuracy = total_acc / total if total else 0
+    overall_recall = total_recall_correct / total_recall if total_recall else 0
+    overall_hullucination = total_hullucination / total_hullucination_counts if total_hullucination_counts else 0
+    overall_gt_help = total_gt_help/ total_gt_help_counts if total_gt_help_counts else 0
+    print(f"Overall accuracy: {overall_accuracy}")
+    print(f"Overall recall: {overall_recall}")
+    print(f"Overall hullucination: {overall_hullucination}")
+    print(f"Overall gt_help: {overall_gt_help}")
+# Example usage
+# line = '{"outputs": [{"solution_str": "The result is <<5 * 3 = 15>>."}, {"solution_str": "The answer is <<2 + 2 = 5>>."}]}'
+# file_path = "eval_results/gsm8k/verifier/train/responses_v(threemodel)_g(threemodel).jsonl"
+file_path = "eval_results/math/verifier/test/responses_v(lean4_random_15k_all-sample10-osv-gt2)_g(lean4_rand).jsonl"
+line = [json.loads(line) for line in open(file_path, 'r', encoding = 'utf-8').readlines()]
+for ex in line:
+    dedup_outputs = []
+    for output in ex['outputs']:
+        if len(output['tokens']) > 2048:
+            continue
+        dedup_outputs.append(output)
+    ex['outputs'] = dedup_outputs
+model_dir = "../models/lean4_random_15k_all-sample10-osv-gt2/"
+tokenizer = load_tokenizer(model_dir)
+process_line(tokenizer, line,'good.json' )
+# Example usage
+# tokenizer = load_tokenizer(model_dir)
+# parallel_process_line(tokenizer, line, "allresults.json")

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+transformers==4.39.2
+DeepSpeed==0.14.0
+SentencePiece
+accelerate>=0.21.0

test.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from fraction import Fraction
+import re
+def is_number(s):
+    try:
+        float(s)
+        return True
+    except ValueError:
+        pass
+    try:
+        import unicodedata
+        unicodedata.numeric(s)
+        return True
+    except (TypeError, ValueError):
+        pass
+    return False
+ANSWER_TRIGGER = 'The answer is'
+def handle_frac(pred):
+    if '/' in pred:
+        denominator = pred.split('/')[1]
+        numerator = pred.split('/')[0]
+        if is_number(denominator) == True and is_number(numerator) == True:
+            if denominator == '0':
+                return round(float(numerator.replace(',', '')))
+            else:
+                frac = Fraction(pred.replace(',', ''))
+                num_numerator = frac.numerator
+                num_denominator = frac.denominator
+                return round(float(num_numerator / num_denominator))
+def delete_extra_zero(n):
+    '''删除小数点后多余的0'''
+    try: n=float(n)
+    except:
+        # print("None {}".format(n))
+        try:
+            rr = str(handle_frac(n))
+            return rr
+        except:
+            return ''
+    if isinstance(n, int):
+        return str(n)
+    if isinstance(n, float):
+        n = str(n).rstrip('0')  # 删除小数点后多余的0
+        n = int(n.rstrip('.')) if n.endswith('.') else float(n)  # 只剩小数点直接转int，否则转回float
+        n=str(n)
+        return n
+def output_answer_clean(model_pred):
+    model_pred = model_pred.lower()
+    preds = model_pred.split(ANSWER_TRIGGER.lower())
+    answer_flag = True if len(preds) > 1 else False
+    if answer_flag:
+        # Pick first answer with flag
+        pred = preds[1]
+    else:
+        # Pick last number without flag
+        pred = preds[-1]
+    pred = pred.replace(",", "")
+    # pred = [s for s in re.findall(r'-?\d+\.?\d*', pred)]
+    # pred  = [s.replace(",", "") for s in re.findall(r'-?\d+/?\.?\d*', pred)]
+    pred = [delete_extra_zero(s.replace(",", "")) for s in re.findall(r'-?\d+/?\.?\d*', pred)]
+    if len(pred) == 0:
+        return None
+    if answer_flag:
+        # choose the first element in list
+        pred = pred[0]
+    else:
+        # choose the last element in list
+        pred = pred
+    try:
+        if pred[-1] == ".":
+            pred = pred[:-1]
+    except:
+        pass
+    if isinstance(pred, list):
+        return pred[-1]
+    else:
+        return pred