rookiemango
commited on
Upload folder using huggingface_hub
Browse files- .gitattributes +3 -0
- .gitignore +1 -0
- __pycache__/code.cpython-39.pyc +0 -0
- compile_result/lean4_basic_test/output.log +6 -0
- compile_result/lean4_random_test/output.log +34 -0
- data/updated_lean4_kv.json +3 -0
- pass_rate_atp_pass.py +0 -3
- pass_rate_multi_pass.py +22 -24
- pass_rate_new_test.py +40 -19
- pass_rate_output.txt +1 -82
- pass_rate_results/compile_result/lean4_basic_test/lean4_random_15kpass5.jsonl +3 -0
- pass_rate_results/compile_result/lean4_random_test/lean4_random_15kpass5.jsonl +3 -0
- update_lean4_kv.py +295 -0
.gitattributes
CHANGED
@@ -68,3 +68,6 @@ gpt_result/lean_random/gpt4/2.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
68 |
gpt_result/lean_random/gpt4/3.jsonl filter=lfs diff=lfs merge=lfs -text
|
69 |
gpt_result/lean_random/gpt4/4.jsonl filter=lfs diff=lfs merge=lfs -text
|
70 |
gpt_result/lean_random/gpt4/5.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
68 |
gpt_result/lean_random/gpt4/3.jsonl filter=lfs diff=lfs merge=lfs -text
|
69 |
gpt_result/lean_random/gpt4/4.jsonl filter=lfs diff=lfs merge=lfs -text
|
70 |
gpt_result/lean_random/gpt4/5.jsonl filter=lfs diff=lfs merge=lfs -text
|
71 |
+
data/updated_lean4_kv.json filter=lfs diff=lfs merge=lfs -text
|
72 |
+
pass_rate_results/compile_result/lean4_basic_test/lean4_random_15kpass5.jsonl filter=lfs diff=lfs merge=lfs -text
|
73 |
+
pass_rate_results/compile_result/lean4_random_test/lean4_random_15kpass5.jsonl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -5,3 +5,4 @@
|
|
5 |
/test/Mathlib/.lake
|
6 |
/test/*.olean
|
7 |
/test/*.olean.tmp
|
|
|
|
5 |
/test/Mathlib/.lake
|
6 |
/test/*.olean
|
7 |
/test/*.olean.tmp
|
8 |
+
pass_rate_results/*/*/*.json
|
__pycache__/code.cpython-39.pyc
CHANGED
Binary files a/__pycache__/code.cpython-39.pyc and b/__pycache__/code.cpython-39.pyc differ
|
|
compile_result/lean4_basic_test/output.log
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
Pass@1: 0.21311475409836064
|
3 |
+
Pass@5: 0.3155737704918033
|
4 |
+
|
5 |
+
|
6 |
+
|
compile_result/lean4_random_test/output.log
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
Traceback (most recent call last):
|
3 |
+
File "/opt/tiger/lean4-compile/pass_rate_new_test.py", line 165, in multi
|
4 |
+
result = future.result()
|
5 |
+
File "/usr/lib/python3.9/concurrent/futures/_base.py", line 435, in result
|
6 |
+
self._condition.wait(timeout)
|
7 |
+
File "/usr/lib/python3.9/threading.py", line 312, in wait
|
8 |
+
waiter.acquire()
|
9 |
+
KeyboardInterrupt
|
10 |
+
|
11 |
+
During handling of the above exception, another exception occurred:
|
12 |
+
|
13 |
+
Traceback (most recent call last):
|
14 |
+
File "/opt/tiger/lean4-compile/pass_rate_new_test.py", line 266, in <module>
|
15 |
+
main(args)
|
16 |
+
File "/opt/tiger/lean4-compile/pass_rate_new_test.py", line 252, in main
|
17 |
+
multi(command_list, args.output_path, args.k)
|
18 |
+
File "/opt/tiger/lean4-compile/pass_rate_new_test.py", line 166, in multi
|
19 |
+
results.append(result)
|
20 |
+
File "/usr/lib/python3.9/concurrent/futures/_base.py", line 628, in __exit__
|
21 |
+
self.shutdown(wait=True)
|
22 |
+
File "/usr/lib/python3.9/concurrent/futures/thread.py", line 229, in shutdown
|
23 |
+
t.join()
|
24 |
+
File "/usr/lib/python3.9/threading.py", line 1033, in join
|
25 |
+
self._wait_for_tstate_lock()
|
26 |
+
File "/usr/lib/python3.9/threading.py", line 1049, in _wait_for_tstate_lock
|
27 |
+
elif lock.acquire(block, timeout):
|
28 |
+
KeyboardInterrupt
|
29 |
+
|
30 |
+
Pass@1: 0.14285714285714285
|
31 |
+
Pass@5: 0.23949579831932774
|
32 |
+
|
33 |
+
|
34 |
+
|
data/updated_lean4_kv.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c798624b063042e0cf22453432c9d640a8e72204203b89ee1e40b915838e7dd2
|
3 |
+
size 352638720
|
pass_rate_atp_pass.py
CHANGED
@@ -89,9 +89,6 @@ input_path_lists = [
|
|
89 |
# "/opt/tiger/auto-info/generate_result/zero_shot/math_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
90 |
# "/opt/tiger/auto-info/generate_result/zero_shot/lean4_15k_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
91 |
# Add more input paths as needed
|
92 |
-
"/opt/tiger/auto-info/generate_result/zero_shot/lean4_basic_test/generation/lean4_random_15k_all_mathrft/2/5/",
|
93 |
-
"/opt/tiger/auto-info/generate_result/zero_shot/lean4_random_test/generation/lean4_random_15k_all_mathrft/2/5/",
|
94 |
-
"/opt/tiger/auto-info/generate_result/zero_shot/wild_test/generation/lean4_random_15k_all_mathrft/2/5/",
|
95 |
]
|
96 |
|
97 |
# Iterate through the input paths and run the command
|
|
|
89 |
# "/opt/tiger/auto-info/generate_result/zero_shot/math_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
90 |
# "/opt/tiger/auto-info/generate_result/zero_shot/lean4_15k_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
91 |
# Add more input paths as needed
|
|
|
|
|
|
|
92 |
]
|
93 |
|
94 |
# Iterate through the input paths and run the command
|
pass_rate_multi_pass.py
CHANGED
@@ -1,18 +1,10 @@
|
|
1 |
import pdb
|
2 |
import subprocess
|
3 |
import re
|
|
|
4 |
|
5 |
-
# Output file
|
6 |
-
output_file = "pass_rate_output.txt"
|
7 |
|
8 |
-
# Clearing the output file before appending new content
|
9 |
-
with open(output_file, "w") as file:
|
10 |
-
file.write("")
|
11 |
|
12 |
-
# List of input paths
|
13 |
-
input_path_lists = [
|
14 |
-
"test/zero_shot/wild_test/generation/lean4_random_15k_all/2/1/",
|
15 |
-
]
|
16 |
|
17 |
def get_output(input_string, k):
|
18 |
pattern = r"zero_shot/(\w+)/(.+?)/(\w+)"
|
@@ -20,7 +12,12 @@ def get_output(input_string, k):
|
|
20 |
if match:
|
21 |
part1 = match.group(1)
|
22 |
part2 = match.group(3) + f"pass{k}.jsonl"
|
23 |
-
result =
|
|
|
|
|
|
|
|
|
|
|
24 |
print(result)
|
25 |
else:
|
26 |
print("No match found.")
|
@@ -89,24 +86,25 @@ input_path_lists = [
|
|
89 |
# "/opt/tiger/auto-info/generate_result/zero_shot/math_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
90 |
# "/opt/tiger/auto-info/generate_result/zero_shot/lean4_15k_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
91 |
# Add more input paths as needed
|
92 |
-
"/opt/tiger/
|
93 |
-
"/opt/tiger/
|
94 |
-
"/opt/tiger/auto-info/generate_result/zero_shot/wild_test/generation/lean4_random_15k_all_mathrft/2/5/",
|
95 |
]
|
96 |
|
97 |
# Iterate through the input paths and run the command
|
98 |
for input_path in input_path_lists:
|
99 |
k = 5
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
command = f"python3 pass_rate_notlean_test.py --input_path {input_path} --output_path {get_output(input_path,k)} --k {k}"
|
104 |
-
subprocess.run(command, shell=True, stdout=open(output_file, "a"), stderr=subprocess.STDOUT)
|
105 |
-
print("\n\n",file=open(output_file, "a"))
|
106 |
|
|
|
|
|
107 |
else:
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
1 |
import pdb
|
2 |
import subprocess
|
3 |
import re
|
4 |
+
import os
|
5 |
|
|
|
|
|
6 |
|
|
|
|
|
|
|
7 |
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def get_output(input_string, k):
|
10 |
pattern = r"zero_shot/(\w+)/(.+?)/(\w+)"
|
|
|
12 |
if match:
|
13 |
part1 = match.group(1)
|
14 |
part2 = match.group(3) + f"pass{k}.jsonl"
|
15 |
+
result = os.path.join("compile_result", part1, part2)
|
16 |
+
|
17 |
+
# Create the parent directory if it doesn't exist
|
18 |
+
parent_dir = os.path.dirname(result)
|
19 |
+
os.makedirs(parent_dir, exist_ok=True)
|
20 |
+
|
21 |
print(result)
|
22 |
else:
|
23 |
print("No match found.")
|
|
|
86 |
# "/opt/tiger/auto-info/generate_result/zero_shot/math_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
87 |
# "/opt/tiger/auto-info/generate_result/zero_shot/lean4_15k_train/generation/lean4_random_15k_all_mathrft/2/10/",
|
88 |
# Add more input paths as needed
|
89 |
+
"/opt/tiger/formal-align/generate_result/zero_shot/lean4_random_test/generation/lean4_random_15k/5",
|
90 |
+
"/opt/tiger/formal-align/generate_result/zero_shot/lean4_basic_test/generation/lean4_random_15k/5"
|
|
|
91 |
]
|
92 |
|
93 |
# Iterate through the input paths and run the command
|
94 |
for input_path in input_path_lists:
|
95 |
k = 5
|
96 |
+
output_path = get_output(input_path, k)
|
97 |
+
output_log_path = os.path.join(os.path.dirname(output_path), 'output.log')
|
98 |
+
|
|
|
|
|
|
|
99 |
|
100 |
+
if "wild_test" in input_path or "gsm8k_train" in input_path or "math_train" in input_path:
|
101 |
+
script_name = "pass_rate_notlean_test.py"
|
102 |
else:
|
103 |
+
script_name = "pass_rate_new_test.py"
|
104 |
+
|
105 |
+
command = f"python3 {script_name} --input_path {input_path} --output_path {output_path} --k {k}"
|
106 |
+
print(command)
|
107 |
+
|
108 |
+
with open(output_log_path, "w") as output_file:
|
109 |
+
subprocess.run(command, shell=True, stdout=output_file, stderr=subprocess.STDOUT)
|
110 |
+
print("\n\n", file=output_file)
|
pass_rate_new_test.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import subprocess
|
3 |
from argparse import ArgumentParser
|
4 |
import json
|
5 |
-
from concurrent.futures import ThreadPoolExecutor
|
6 |
from tqdm import tqdm
|
7 |
import glob
|
8 |
import tempfile
|
@@ -110,7 +110,7 @@ def multi(command_list, output_path, k ):
|
|
110 |
return filtered_data
|
111 |
result_dict = filter_json(item)
|
112 |
result_dict['results'] = []
|
113 |
-
|
114 |
for i, cmd in enumerate(item['cmd']):
|
115 |
temp_file = os.path.join(temp_dir,f"{index}_test_{i}.lean") # Ensure unique filenames
|
116 |
with open(temp_file, "w") as f:
|
@@ -136,16 +136,14 @@ def multi(command_list, output_path, k ):
|
|
136 |
elif not len(stderr) and "messages" in stdout:
|
137 |
flag = 0
|
138 |
for me in stdout['messages']:
|
139 |
-
import pdb
|
140 |
-
pdb.set_trace()
|
141 |
if me['severity'] == 'error':
|
142 |
flag = 1
|
143 |
start_line = me['pos']['line'] - 1
|
144 |
-
current_column =me['pos']['column'] -1
|
145 |
for line_n in range(start_line - 1, 0 , -1):
|
146 |
line_len = len(cmd.split('\n')[line_n])
|
147 |
current_column += line_len + 1
|
148 |
-
if not line_len:
|
149 |
break
|
150 |
result_item = {'stdout': stdout, 'stderr': stderr, 'status': 'nopass', 'string_pos':current_column}
|
151 |
break
|
@@ -161,14 +159,21 @@ def multi(command_list, output_path, k ):
|
|
161 |
|
162 |
total = len(command_list)
|
163 |
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
165 |
futures = [executor.submit(execute_command, cmd, i) for i, cmd in enumerate(command_list)]
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
172 |
def calculate_pass(result_list, k):
|
173 |
pass_1_count = 0
|
174 |
pass_k_count = 0
|
@@ -190,7 +195,7 @@ def multi(command_list, output_path, k ):
|
|
190 |
pass_k = pass_k_count / len(result_list) if result_list else 0
|
191 |
|
192 |
return pass_1, pass_k
|
193 |
-
|
194 |
pass_1, pass_k = calculate_pass(results, k)
|
195 |
print("Pass@1:", pass_1)
|
196 |
print(f"Pass@{k}:", pass_k)
|
@@ -199,7 +204,7 @@ def multi(command_list, output_path, k ):
|
|
199 |
# print(f"total test: {total}")
|
200 |
# print(f"Pass rate: {pass_rate}%")
|
201 |
|
202 |
-
output_file = f"
|
203 |
# Create the directory if it doesn't exist
|
204 |
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
205 |
|
@@ -211,22 +216,38 @@ def remove_simp_pattern_from_end(s):
|
|
211 |
pattern = r'@\[simp\s*.*?\]$'
|
212 |
return re.sub(pattern, '', s)
|
213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
def main(args):
|
215 |
command_list = []
|
|
|
216 |
file_pattern = os.path.join(args.input_path, '[0-1]*.json')
|
217 |
for file_path in glob.glob(file_pattern):
|
218 |
-
with open(file_path, 'r', encoding='utf-8') as rf:
|
|
|
219 |
for line in rf.readlines():
|
220 |
try:
|
221 |
json_item = json.loads(line)
|
222 |
-
working_env =
|
223 |
-
|
224 |
# statement = json_item['total output'][0]
|
225 |
json_item['cmd'] = []
|
226 |
for output in json_item['total output'][:min(args.k, len(json_item['total output']))]:
|
227 |
statement = output.split("#align")[0]
|
228 |
json_item['cmd'].append('\n\n'.join([working_env, statement]))
|
229 |
-
json_item['answer'] = json_item['content']['
|
230 |
assert len(statement) > 0
|
231 |
# json_item['cmd'] = '\n'.join([working_env, json_item['total output'][0]])
|
232 |
except:
|
|
|
2 |
import subprocess
|
3 |
from argparse import ArgumentParser
|
4 |
import json
|
5 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
6 |
from tqdm import tqdm
|
7 |
import glob
|
8 |
import tempfile
|
|
|
110 |
return filtered_data
|
111 |
result_dict = filter_json(item)
|
112 |
result_dict['results'] = []
|
113 |
+
|
114 |
for i, cmd in enumerate(item['cmd']):
|
115 |
temp_file = os.path.join(temp_dir,f"{index}_test_{i}.lean") # Ensure unique filenames
|
116 |
with open(temp_file, "w") as f:
|
|
|
136 |
elif not len(stderr) and "messages" in stdout:
|
137 |
flag = 0
|
138 |
for me in stdout['messages']:
|
|
|
|
|
139 |
if me['severity'] == 'error':
|
140 |
flag = 1
|
141 |
start_line = me['pos']['line'] - 1
|
142 |
+
current_column =me['pos']['column'] -1
|
143 |
for line_n in range(start_line - 1, 0 , -1):
|
144 |
line_len = len(cmd.split('\n')[line_n])
|
145 |
current_column += line_len + 1
|
146 |
+
if not line_len:
|
147 |
break
|
148 |
result_item = {'stdout': stdout, 'stderr': stderr, 'status': 'nopass', 'string_pos':current_column}
|
149 |
break
|
|
|
159 |
|
160 |
total = len(command_list)
|
161 |
|
162 |
+
|
163 |
+
# Get the number of available CPU cores
|
164 |
+
max_workers = os.cpu_count()
|
165 |
+
|
166 |
+
results = []
|
167 |
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
168 |
futures = [executor.submit(execute_command, cmd, i) for i, cmd in enumerate(command_list)]
|
169 |
+
|
170 |
+
with tqdm(total=len(futures), desc="Processing Commands") as progress_bar:
|
171 |
+
for future in as_completed(futures):
|
172 |
+
result = future.result()
|
173 |
+
results.append(result)
|
174 |
+
progress_bar.update(1)
|
175 |
+
|
176 |
+
|
177 |
def calculate_pass(result_list, k):
|
178 |
pass_1_count = 0
|
179 |
pass_k_count = 0
|
|
|
195 |
pass_k = pass_k_count / len(result_list) if result_list else 0
|
196 |
|
197 |
return pass_1, pass_k
|
198 |
+
|
199 |
pass_1, pass_k = calculate_pass(results, k)
|
200 |
print("Pass@1:", pass_1)
|
201 |
print(f"Pass@{k}:", pass_k)
|
|
|
204 |
# print(f"total test: {total}")
|
205 |
# print(f"Pass rate: {pass_rate}%")
|
206 |
|
207 |
+
output_file = f"output_path"
|
208 |
# Create the directory if it doesn't exist
|
209 |
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
210 |
|
|
|
216 |
pattern = r'@\[simp\s*.*?\]$'
|
217 |
return re.sub(pattern, '', s)
|
218 |
|
219 |
+
def update_dict(lean_kv):
|
220 |
+
update_kv = {}
|
221 |
+
for k, v in lean_kv.items():
|
222 |
+
# print(k)
|
223 |
+
# print(k.split("#align")[0])
|
224 |
+
update_kv[k.split("#align")[0]] = v
|
225 |
+
# Write the combined data to a new JSON file
|
226 |
+
with open('up_lean4_kv.json', 'w') as output_file:
|
227 |
+
json.dump(update_kv, output_file, indent=4)
|
228 |
+
return update_kv
|
229 |
+
|
230 |
+
|
231 |
+
|
232 |
+
|
233 |
def main(args):
|
234 |
command_list = []
|
235 |
+
retrieval_path = "data/updated_lean4_kv.json"
|
236 |
file_pattern = os.path.join(args.input_path, '[0-1]*.json')
|
237 |
for file_path in glob.glob(file_pattern):
|
238 |
+
with open(file_path, 'r', encoding='utf-8') as rf, open(retrieval_path, 'r', encoding='utf-8') as retrival_kv:
|
239 |
+
lean4_kv = json.load(retrival_kv)
|
240 |
for line in rf.readlines():
|
241 |
try:
|
242 |
json_item = json.loads(line)
|
243 |
+
# working_env = content']['working_file']
|
244 |
+
working_env = lean4_kv[json_item['content']['formal']]
|
245 |
# statement = json_item['total output'][0]
|
246 |
json_item['cmd'] = []
|
247 |
for output in json_item['total output'][:min(args.k, len(json_item['total output']))]:
|
248 |
statement = output.split("#align")[0]
|
249 |
json_item['cmd'].append('\n\n'.join([working_env, statement]))
|
250 |
+
json_item['answer'] = json_item['content']['formal']
|
251 |
assert len(statement) > 0
|
252 |
# json_item['cmd'] = '\n'.join([working_env, json_item['total output'][0]])
|
253 |
except:
|
pass_rate_output.txt
CHANGED
@@ -1,82 +1 @@
|
|
1 |
-
Running for input path:
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_random_test/generation/deepseek-math-7b-base/5/
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
Running for input path: ../auto-info/generate_result/zero_shot/wild_test/generation/deepseek-math-7b-base/5/
|
10 |
-
|
11 |
-
total len: 1000
|
12 |
-
Pass@1: 0.0
|
13 |
-
Pass@5: 0.0
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_basic_test/generation/deepseek-math-7b-instruct/5/
|
18 |
-
|
19 |
-
total len: 981
|
20 |
-
Pass@1: 0.004077471967380225
|
21 |
-
Pass@5: 0.029561671763506627
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_random_test/generation/deepseek-math-7b-instruct/5/
|
26 |
-
|
27 |
-
total len: 970
|
28 |
-
Pass@1: 0.002061855670103093
|
29 |
-
Pass@5: 0.016494845360824743
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_basic_test/generation/llemma_7b/5/
|
34 |
-
|
35 |
-
total len: 981
|
36 |
-
Pass@1: 0.0010193679918450561
|
37 |
-
Pass@5: 0.004077471967380225
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_random_test/generation/llemma_7b/5/
|
42 |
-
|
43 |
-
total len: 970
|
44 |
-
Pass@1: 0.0
|
45 |
-
Pass@5: 0.007216494845360825
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
Running for input path: ../auto-info/generate_result/zero_shot/wild_test/generation/llemma_7b/5/
|
50 |
-
|
51 |
-
total len: 1000
|
52 |
-
Pass@1: 0.0
|
53 |
-
Pass@5: 0.0
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_basic_test/generation/llemma_34b/5/
|
58 |
-
|
59 |
-
total len: 981
|
60 |
-
Pass@1: 0.0
|
61 |
-
Pass@5: 0.0
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_random_test/generation/llemma_34b/5/
|
66 |
-
|
67 |
-
total len: 970
|
68 |
-
Pass@1: 0.0
|
69 |
-
Pass@5: 0.0010309278350515464
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
Running for input path: ../auto-info/generate_result/zero_shot/wild_test/generation/llemma_34b/5/
|
74 |
-
|
75 |
-
total len: 1000
|
76 |
-
Pass@1: 0.0
|
77 |
-
Pass@5: 0.0
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
Running for input path: ../auto-info/generate_result/zero_shot/lean4_basic_test/generation/internlm2-math-7b/5/
|
82 |
-
|
|
|
1 |
+
Running for input path: /opt/tiger/formal-align/generate_result/zero_shot/lean4_random_test/generation/lean4_random_15k/5/opt/tiger/formal-align/generate_result/zero_shot/lean4_basic_test/generation/lean4_random_15k/5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pass_rate_results/compile_result/lean4_basic_test/lean4_random_15kpass5.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4040c382e0ee3c796951b7c60b55f91616ea8653c1730c75978c3464d9562ecc
|
3 |
+
size 12184988
|
pass_rate_results/compile_result/lean4_random_test/lean4_random_15kpass5.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ba08a0aea4c769c9f16378f48121a50f7df9379fec11518773896607a9ad1a0
|
3 |
+
size 55140396
|
update_lean4_kv.py
ADDED
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
from argparse import ArgumentParser
|
4 |
+
import json
|
5 |
+
from concurrent.futures import ThreadPoolExecutor
|
6 |
+
from tqdm import tqdm
|
7 |
+
import glob
|
8 |
+
import tempfile
|
9 |
+
import random
|
10 |
+
|
11 |
+
def wrapped_function(item):
|
12 |
+
results = []
|
13 |
+
passed = 0
|
14 |
+
total = 0
|
15 |
+
|
16 |
+
temp_dir = tempfile.gettempdir()
|
17 |
+
temp_file = os.path.join(temp_dir, f"test.lean")
|
18 |
+
|
19 |
+
with open(temp_file, "w") as f:
|
20 |
+
f.write(item['cmd'])
|
21 |
+
|
22 |
+
# Rest of the function code...
|
23 |
+
# Process the item using the temporary file
|
24 |
+
# ...
|
25 |
+
|
26 |
+
# Clean up the temporary file
|
27 |
+
data = '{"path": "%s", "allTactics": true}' %(temp_file)
|
28 |
+
command = 'echo \'%s\' | lake exe repl' % data
|
29 |
+
|
30 |
+
try:
|
31 |
+
result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
32 |
+
stdout = result.stdout.decode('utf-8')
|
33 |
+
stderr = result.stderr.decode('utf-8')
|
34 |
+
# stdout = result.stdout.decode('utf-8')
|
35 |
+
json_stdout = json.loads(stdout)
|
36 |
+
if "messages" not in json_stdout.keys():
|
37 |
+
passed += 1
|
38 |
+
# results.append({'item': item['content'], 'stdout': stdout, 'stderr': stderr, 'status': 'pass'})
|
39 |
+
results.append({ 'stdout': stdout, 'stderr': stderr, 'status': 'pass'})
|
40 |
+
except subprocess.CalledProcessError as e:
|
41 |
+
# results.append({'item': item['content'], 'error': str(e), 'status': 'nopass'})
|
42 |
+
results.append({ 'error': str(e), 'status': 'nopass'})
|
43 |
+
total += 1
|
44 |
+
|
45 |
+
pass_rate = passed / (passed + total) * 100
|
46 |
+
|
47 |
+
|
48 |
+
return {'results': results, 'pass_rate': pass_rate}
|
49 |
+
|
50 |
+
# Set the directory where your .lean files are located
|
51 |
+
|
52 |
+
# Get a list of all .lean files in the directory
|
53 |
+
# lean_files = [f for f in os.listdir(directory) if f.endswith(".lean")]
|
54 |
+
# lean_files = ["test/file.lean"]
|
55 |
+
def single(command_list, args):
|
56 |
+
results = []
|
57 |
+
passed = 0
|
58 |
+
total = 0
|
59 |
+
for item in tqdm(command_list):
|
60 |
+
with open("test/test.lean", "w", encoding = 'utf-8') as f:
|
61 |
+
f.write(item['cmd'])
|
62 |
+
data = '{"path": "test/test.lean", "allTactics": true}'
|
63 |
+
# data = '{"cmd": "%s", "allTactics": true}' % item['cmd']
|
64 |
+
command = 'echo \'%s\' | lake exe repl' % data
|
65 |
+
try:
|
66 |
+
# process = subprocess.Popen(['lake', 'exe', 'repl'], stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
67 |
+
# stderr=subprocess.PIPE)
|
68 |
+
# stdout, stderr = process.communicate(input=data.encode(encoding='utf-8'))
|
69 |
+
# stdout = stdout.decode('utf-8')
|
70 |
+
result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
71 |
+
stdout = result.stdout.decode('utf-8')
|
72 |
+
json_stdout = json.loads(stdout)
|
73 |
+
if "messages" not in json_stdout.keys():
|
74 |
+
passed += 1
|
75 |
+
stderr = result.stderr.decode('utf-8')
|
76 |
+
results.append({
|
77 |
+
# 'item': item['content'],
|
78 |
+
'stdout': stdout,
|
79 |
+
'stderr': stderr,
|
80 |
+
'status': 'pass'
|
81 |
+
})
|
82 |
+
except subprocess.CalledProcessError as e:
|
83 |
+
results.append({
|
84 |
+
# 'item': item['content'],
|
85 |
+
'error': str(e),
|
86 |
+
'status': 'nopass'
|
87 |
+
})
|
88 |
+
total += 1
|
89 |
+
|
90 |
+
# Calculate pass rate
|
91 |
+
pass_rate = passed / total * 100
|
92 |
+
print(pass_rate)
|
93 |
+
|
94 |
+
# Save results to a JSON file
|
95 |
+
with open('results.json', 'w') as f:
|
96 |
+
json.dump({'results': results, 'pass_rate': pass_rate}, f, indent=2, ensure_ascii=False)
|
97 |
+
|
98 |
+
|
99 |
+
def multi(command_list, output_path, k ):
|
100 |
+
results = []
|
101 |
+
passed = 0
|
102 |
+
total = 0
|
103 |
+
def execute_command(item, index):
|
104 |
+
temp_dir = '/opt/jianqiao'
|
105 |
+
def filter_json(json_data):
|
106 |
+
filtered_data = {}
|
107 |
+
for key in json_data.keys():
|
108 |
+
if key in ['question', 'answer', 'total output', 'results']:
|
109 |
+
filtered_data[key] = json_data[key]
|
110 |
+
return filtered_data
|
111 |
+
result_dict = filter_json(item)
|
112 |
+
result_dict['results'] = []
|
113 |
+
|
114 |
+
for i, cmd in enumerate(item['cmd']):
|
115 |
+
temp_file = os.path.join(temp_dir,f"{index}_test_{i}.lean") # Ensure unique filenames
|
116 |
+
with open(temp_file, "w") as f:
|
117 |
+
f.write(cmd)
|
118 |
+
|
119 |
+
data = '{"path": "%s", "allTactics": true}' % temp_file
|
120 |
+
command = f'echo \'{data}\' | lake exe repl'
|
121 |
+
|
122 |
+
try:
|
123 |
+
result = subprocess.run(command, shell=True, check=True,timeout=600, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
124 |
+
stdout = json.loads(result.stdout.decode('utf-8'))
|
125 |
+
stderr = result.stderr.decode('utf-8')
|
126 |
+
|
127 |
+
except subprocess.TimeoutExpired as e:
|
128 |
+
result_item = {'error': str(e), 'status': 'nopass_limit'}
|
129 |
+
|
130 |
+
except subprocess.CalledProcessError as e:
|
131 |
+
result_item = {'error': str(e), 'status': 'nopass_error'}
|
132 |
+
|
133 |
+
else:
|
134 |
+
if "messages" not in stdout and not len(stderr):
|
135 |
+
result_item = {'stdout': stdout, 'stderr': stderr, 'status': 'pass' }
|
136 |
+
elif not len(stderr) and "messages" in stdout:
|
137 |
+
flag = 0
|
138 |
+
for me in stdout['messages']:
|
139 |
+
import pdb
|
140 |
+
pdb.set_trace()
|
141 |
+
if me['severity'] == 'error':
|
142 |
+
flag = 1
|
143 |
+
start_line = me['pos']['line'] - 1
|
144 |
+
current_column =me['pos']['column'] -1
|
145 |
+
for line_n in range(start_line - 1, 0 , -1):
|
146 |
+
line_len = len(cmd.split('\n')[line_n])
|
147 |
+
current_column += line_len + 1
|
148 |
+
if not line_len:
|
149 |
+
break
|
150 |
+
result_item = {'stdout': stdout, 'stderr': stderr, 'status': 'nopass', 'string_pos':current_column}
|
151 |
+
break
|
152 |
+
if not flag :
|
153 |
+
result_item = {'stdout': stdout, 'stderr': stderr, 'status': 'pass'}
|
154 |
+
else:
|
155 |
+
assert len(stderr)
|
156 |
+
result_item = {'stdout': stdout, 'stderr': stderr, 'status': 'nopass', 'string_pos': 0 }
|
157 |
+
|
158 |
+
result_dict['results'].append(result_item)
|
159 |
+
return result_dict
|
160 |
+
|
161 |
+
|
162 |
+
total = len(command_list)
|
163 |
+
|
164 |
+
with ThreadPoolExecutor(max_workers=1) as executor:
|
165 |
+
futures = [executor.submit(execute_command, cmd, i) for i, cmd in enumerate(command_list)]
|
166 |
+
for future in tqdm(futures, total=total, desc="Processing Commands"):
|
167 |
+
result = future.result()
|
168 |
+
results.append(result)
|
169 |
+
# if result['status'] == 'pass':
|
170 |
+
# passed += 1
|
171 |
+
|
172 |
+
def calculate_pass(result_list, k):
|
173 |
+
pass_1_count = 0
|
174 |
+
pass_k_count = 0
|
175 |
+
|
176 |
+
for result in result_list:
|
177 |
+
results = result.get('results', [])
|
178 |
+
if results:
|
179 |
+
for j in range(min(1, len(results))):
|
180 |
+
if results[j].get('status') == 'pass':
|
181 |
+
pass_1_count += 1
|
182 |
+
break
|
183 |
+
|
184 |
+
for j in range(min(k, len(results))):
|
185 |
+
if results[j].get('status') == 'pass':
|
186 |
+
pass_k_count += 1
|
187 |
+
break
|
188 |
+
|
189 |
+
pass_1 = pass_1_count / len(result_list) if result_list else 0
|
190 |
+
pass_k = pass_k_count / len(result_list) if result_list else 0
|
191 |
+
|
192 |
+
return pass_1, pass_k
|
193 |
+
|
194 |
+
pass_1, pass_k = calculate_pass(results, k)
|
195 |
+
print("Pass@1:", pass_1)
|
196 |
+
print(f"Pass@{k}:", pass_k)
|
197 |
+
|
198 |
+
# pass_rate = (passed / total) * 100
|
199 |
+
# print(f"total test: {total}")
|
200 |
+
# print(f"Pass rate: {pass_rate}%")
|
201 |
+
|
202 |
+
output_file = f"pass_rate_results/{output_path}"
|
203 |
+
# Create the directory if it doesn't exist
|
204 |
+
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
205 |
+
|
206 |
+
with open(f"{output_file}", 'w') as f:
|
207 |
+
json.dump({'results': results, 'pass_1': pass_1, f"pass_{k}":pass_k}, f, indent=2, ensure_ascii=False)
|
208 |
+
|
209 |
+
import re
|
210 |
+
def remove_simp_pattern_from_end(s):
|
211 |
+
pattern = r'@\[simp\s*.*?\]$'
|
212 |
+
return re.sub(pattern, '', s)
|
213 |
+
|
214 |
+
def update_dict(lean_kv):
|
215 |
+
update_kv = {}
|
216 |
+
for k, v in lean_kv.items():
|
217 |
+
# print(k)
|
218 |
+
# print(k.split("#align")[0])
|
219 |
+
update_kv[k.split("#align")[0]] = v
|
220 |
+
# Write the combined data to a new JSON file
|
221 |
+
with open('up_lean4_kv.json', 'w') as output_file:
|
222 |
+
json.dump(update_kv, output_file, indent=4)
|
223 |
+
return update_kv
|
224 |
+
|
225 |
+
|
226 |
+
def find_key(lean_kv):
|
227 |
+
update_kv = {}
|
228 |
+
for k, v in lean_kv.items():
|
229 |
+
if "theorem odd_sub" in k:
|
230 |
+
import pdb
|
231 |
+
pdb.set_trace()
|
232 |
+
|
233 |
+
|
234 |
+
import os
|
235 |
+
import json
|
236 |
+
import tqdm
|
237 |
+
|
238 |
+
def main(args):
|
239 |
+
# Define the directory and file names
|
240 |
+
file_dir = '../formal-align/data'
|
241 |
+
file_names = ['FormL4_basic_test.json', 'FormL4_random_test.json', 'FormL4_train.json']
|
242 |
+
|
243 |
+
# Initialize a list to store all items from JSON files
|
244 |
+
all_items = []
|
245 |
+
|
246 |
+
# Loop through each file and load its content
|
247 |
+
for file_name in file_names:
|
248 |
+
file_path = os.path.join(file_dir, file_name)
|
249 |
+
|
250 |
+
with open(file_path, 'r', encoding='utf-8') as file:
|
251 |
+
data = json.load(file)
|
252 |
+
if isinstance(data, list):
|
253 |
+
all_items.extend(data) # Assuming each file contains a list of items
|
254 |
+
else:
|
255 |
+
all_items.append(data) # If the file contains a single item
|
256 |
+
|
257 |
+
# Load the original key-value pairs from the file
|
258 |
+
filename = 'data/lean4_kv.json'
|
259 |
+
with open(filename, 'r') as file:
|
260 |
+
original_kv = json.load(file)
|
261 |
+
|
262 |
+
new_kv = {}
|
263 |
+
|
264 |
+
# Iterate over each item in all_items and update the new_kv dictionary
|
265 |
+
for item in tqdm.tqdm(all_items):
|
266 |
+
formal = item['formal']
|
267 |
+
for k, v in list(original_kv.items()):
|
268 |
+
if formal in k:
|
269 |
+
new_kv[formal] = v
|
270 |
+
del original_kv[k]
|
271 |
+
break # Exit the inner loop since a match is found
|
272 |
+
|
273 |
+
# Save the updated key-value pairs to a new file
|
274 |
+
output_filename = 'data/updated_lean4_kv.json'
|
275 |
+
with open(output_filename, 'w') as file:
|
276 |
+
json.dump(new_kv, file, indent=4)
|
277 |
+
|
278 |
+
print(f"Updated key-value pairs saved to: {output_filename}")
|
279 |
+
|
280 |
+
if __name__ == '__main__':
|
281 |
+
arg_parser = ArgumentParser()
|
282 |
+
arg_parser.add_argument('--data_path', type=str,
|
283 |
+
default='data/grade-school-math-master/grade_school_math/data/test.jsonl')
|
284 |
+
arg_parser.add_argument('--input_path', type=str, default='')
|
285 |
+
arg_parser.add_argument('--cuda_num', type=int, default=8)
|
286 |
+
arg_parser.add_argument('--k', type=int, default=1)
|
287 |
+
arg_parser.add_argument('--output_path', type=str, default='total.json')
|
288 |
+
arg_parser.add_argument('--generate_method', type=str,
|
289 |
+
choices=['single', 'sft', 'comp', 'self_consistency', 'single_consistency'])
|
290 |
+
arg_parser.add_argument('--method', type=str, choices=['main', 'test', 'get_data'])
|
291 |
+
args = arg_parser.parse_args()
|
292 |
+
main(args)
|
293 |
+
|
294 |
+
|
295 |
+
|