Spaces:
Runtime error
Runtime error
import gradio as gr | |
from huggingface_hub import login | |
import re | |
# from vllm import LLM, SamplingParams | |
import pandas as pd | |
from collections import Counter | |
from datasets import load_dataset, Dataset, concatenate_datasets | |
from dataclasses import dataclass | |
from concurrent.futures import ThreadPoolExecutor, TimeoutError | |
import os | |
from typing import Dict, Any, List | |
# code execution | |
import os | |
import re | |
import signal | |
import subprocess | |
import tempfile | |
from contextlib import contextmanager | |
from typing import Tuple | |
from transformers import PreTrainedTokenizer, set_seed | |
import torch | |
from tqdm import tqdm | |
import time | |
from sympy import N, simplify | |
from sympy.parsing.latex import parse_latex | |
import random | |
from pathlib import Path | |
from openai import OpenAI | |
client = OpenAI( | |
base_url="https://ji0rhe7rvh6wrfmq.us-east-1.aws.endpoints.huggingface.cloud/v1/", | |
api_key=os.environ.get("HF_TOKEN"), | |
) | |
class Config: | |
model_id: str # SELECT MODEL | |
revision: str # SELECT REVISION | |
# Append an optional system prompt to each problem | |
system_prompt: str | |
# Number of samples to generate per problem | |
num_samples: int | |
num_generations: int | |
# Generation parameters | |
do_sample: bool | |
temperature: float | |
top_p: float | |
top_k: int | |
max_new_tokens: int | |
restart_on_fail: bool | |
# Enable 4-bit quantization | |
is_quantized: bool | |
# Run on train or test data? | |
is_submission: bool = True if os.getenv("KAGGLE_IS_COMPETITION_RERUN") else False | |
validation_set: str = "kaggle-validation-set-medium" | |
notebook_time_limit: int = 9 * 60 * 60 - 15 * 60 # 9 hours - 15 minute buffer | |
# Debug by solving only the first problem | |
debug: bool = False | |
# Push solutions to the Hub | |
push_to_hub: bool = False | |
class PythonREPL: | |
def __init__(self, timeout=5): | |
self.timeout = timeout | |
def execute(self, query: str) -> Tuple[bool, str]: | |
query = "import math\nimport numpy as np\nimport sympy as sp\n" + query | |
query = query.strip().split("\n") | |
if "print(" not in query[-1]: | |
if "#" in query[-1]: | |
query[-1] = query[-1].split("#")[0] | |
query[-1] = "print(" + query[-1] + ")" | |
query = "\n".join(query) | |
with tempfile.TemporaryDirectory() as temp_dir: | |
temp_file_path = os.path.join(temp_dir, "tmp.py") | |
with open(temp_file_path, "w") as f: | |
f.write(query) | |
result = subprocess.run( | |
["python3", temp_file_path], | |
capture_output=True, | |
check=False, | |
text=True, | |
timeout=self.timeout, | |
) | |
if result.returncode == 0: | |
output = result.stdout | |
return True, output.strip() | |
else: | |
error_msg = result.stderr.strip() | |
msgs = error_msg.split("\n") | |
new_msgs = [] | |
want_next = False | |
for m in msgs: | |
if "Traceback" in m: | |
new_msgs.append(m) | |
elif m == msgs[-1]: | |
new_msgs.append(m) | |
elif temp_file_path in m: | |
st = m.index('"/') + 1 if '"/' in m else 0 | |
ed = m.index(temp_file_path) + 1 if temp_file_path in m else None | |
clr = m[st:ed] if not ed else m[st:] | |
m = m.replace(clr, "") | |
new_msgs.append(m) | |
want_next = True | |
elif want_next: | |
new_msgs.append(m) | |
want_next = False | |
error_msg = "\n".join(new_msgs) | |
return False, error_msg.strip() | |
def __call__(self, query: str) -> Tuple[bool, str]: | |
with ThreadPoolExecutor() as executor: | |
future = executor.submit(self.execute, query) | |
try: | |
return future.result(timeout=self.timeout) | |
except TimeoutError: | |
return False, f"Timed out after {self.timeout} seconds." | |
def execute_completion( | |
executor: PythonREPL, | |
completion: str, | |
return_status: bool = False, | |
last_code_block: bool = False, | |
) -> str | Tuple[str, bool]: | |
# executions = ["!" + code for code in re.findall(r"```bash(.*?)```", completion, re.DOTALL) if "!" not in code] | |
executions = re.findall(r"```python(.*?)```", completion, re.DOTALL) | |
if len(executions) == 0: # directly return cot result | |
return completion, False if return_status else completion | |
else: | |
if last_code_block: | |
executions = [executions[-1]] | |
# Python | |
execution_outputs = [] | |
successes = [] | |
for code in executions: | |
success = False | |
if "subprocess" in code: | |
output = "subprocess is not allowed" | |
execution_outputs.append(output) | |
successes.append(success) | |
continue | |
if "venv" in code: | |
output = "venv is not allowed" | |
execution_outputs.append(output) | |
successes.append(success) | |
continue | |
try: | |
success, output = executor(code) | |
except TimeoutError as e: | |
print("time out") | |
output = e | |
if not success and not return_status: | |
output = "" | |
execution_outputs.append(output) | |
successes.append(success) | |
output = str(execution_outputs[-1]).strip() | |
success = successes[-1] | |
if return_status: | |
return output, success | |
else: | |
return output | |
def postprocess_completion( | |
text: str, return_status: bool = False, last_code_block=False, timeout=5 | |
) -> str | Tuple[str, bool]: | |
executor = PythonREPL(timeout=timeout) | |
result = execute_completion(executor, text, return_status=return_status, last_code_block=last_code_block) | |
del executor | |
return result | |
def apply_template(example: Dict[str, Any], prompt: str) -> Dict[str, Any]: | |
return prompt.format(example["prompt"], "{}") | |
def last_boxed_only_string(string): | |
""" | |
Extracts the last LaTeX boxed or framed expression from a string. | |
Args: | |
string (str): The input string containing LaTeX expressions. | |
Returns: | |
str or None: The last boxed or framed expression, if found; | |
otherwise, None. | |
""" | |
idx = string.rfind("\\boxed") | |
if idx < 0: | |
idx = string.rfind("\\fbox") | |
if idx < 0: | |
return None | |
i = idx | |
right_brace_idx = None | |
num_left_braces_open = 0 | |
while i < len(string): | |
if string[i] == "{": | |
num_left_braces_open += 1 | |
if string[i] == "}": | |
num_left_braces_open -= 1 | |
if num_left_braces_open == 0: | |
right_brace_idx = i | |
break | |
i += 1 | |
if right_brace_idx is None: | |
retval = None | |
else: | |
retval = string[idx : right_brace_idx + 1] | |
return retval | |
def remove_boxed(s): | |
""" | |
Removes the LaTeX boxed command, returning the content inside the braces. | |
Args: | |
s (str): The string containing a LaTeX boxed expression. | |
Returns: | |
str or None: The content inside the boxed command, if valid; | |
otherwise, None. | |
""" | |
left = "\\boxed{" | |
try: | |
assert s[: len(left)] == left | |
assert s[-1] == "}" | |
length = len(left) | |
return s[length:-1] | |
except Exception: | |
return None | |
def extract_boxed_answer(pred_str, strip_double_curly_brace=False): | |
""" | |
Extracts the answer from a LaTeX boxed expression within | |
a prediction string. | |
Args: | |
pred_str (str): The string containing one or more LaTeX | |
boxed expressions. | |
strip_double_curly_brace (bool): If True, removes an additional | |
layer of braces. | |
Returns: | |
str or None: The extracted answer, if any; otherwise, None. | |
""" | |
boxed_str = last_boxed_only_string(pred_str) | |
if boxed_str is None: | |
return None | |
answer = remove_boxed(boxed_str) | |
if answer is None: | |
return None | |
if strip_double_curly_brace: | |
match = re.match("^\{(.*)\}$", answer) # noqa: W605 | |
if match: | |
answer = match.group(1) | |
return answer | |
def normalize_final_answer(final_answer: str) -> str: | |
""" | |
Normalizes a final answer string by removing or replacing various LaTeX | |
and text elements. | |
Args: | |
final_answer (str): The answer string to normalize. | |
Returns: | |
str: The normalized answer string. | |
""" | |
match = re.search(r"(.*?)Problem:", final_answer, flags=re.S) | |
if match: | |
final_answer = match.group(1) # 返回匹配的第一部分,即"Problem"之前的所有文本 | |
"""Normalize a final answer to a quantitative reasoning question.""" | |
# final_answer = final_answer.split('=')[-1] | |
SUBSTITUTIONS = [ | |
("an ", ""), | |
("a ", ""), | |
(".$", "$"), | |
("\\$", ""), | |
(r"\ ", ""), | |
(" ", ""), | |
("mbox", "text"), | |
(",\\text{and}", ","), | |
("\\text{and}", ","), | |
("\\text{m}", "\\text{}"), | |
("\\le", "<"), | |
] | |
REMOVED_EXPRESSIONS = [ | |
"square", | |
"ways", | |
"integers", | |
"dollars", | |
"mph", | |
"inches", | |
"ft", | |
"hours", | |
"km", | |
"units", | |
"\\ldots", | |
"sue", | |
"points", | |
"feet", | |
"minutes", | |
"digits", | |
"cents", | |
"degrees", | |
"cm", | |
"gm", | |
"pounds", | |
"meters", | |
"meals", | |
"edges", | |
"students", | |
"childrentickets", | |
"multiples", | |
"\\text{s}", | |
"\\text{.}", | |
"\\text{\ns}", | |
"\\text{}^2", | |
"\\text{}^3", | |
"\\text{\n}", | |
"\\text{}", | |
r"\mathrm{th}", | |
r"^\circ", | |
r"^{\circ}", | |
r"\;", | |
r",\!", | |
"{,}", | |
'"', | |
"\\dots", | |
"\n", | |
"\r", | |
"\f", | |
"\%", | |
] | |
for before, after in SUBSTITUTIONS: | |
final_answer = final_answer.replace(before, after) | |
for expr in REMOVED_EXPRESSIONS: | |
final_answer = final_answer.replace(expr, "") | |
# Extract answer that is in LaTeX math, is bold, | |
# is surrounded by a box, etc. | |
final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) | |
final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) | |
final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) | |
final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) | |
assert "\n" not in final_answer | |
assert "\r" not in final_answer | |
assert "\f" not in final_answer | |
if len(re.findall(r"finalansweris(.*)", final_answer)) > 0: | |
final_answer = re.findall(r"finalansweris(.*)", final_answer)[-1] | |
if len(re.findall(r"answer?is:?(.*)", final_answer)) > 0: | |
final_answer = re.findall(r"answer?is:?(.*)", final_answer)[-1] | |
if len(re.findall(r"oxed\{(.*?)\}", final_answer)) > 0: | |
final_answer = re.findall(r"oxed\{(.*?)\}", final_answer)[-1] | |
if len(re.findall(r"\$(.*?)\$", final_answer)) > 0: | |
final_answer = re.findall(r"\$(.*?)\$", final_answer)[-1] | |
final_answer = final_answer.strip() | |
if "rac" in final_answer and "\\frac" not in final_answer: | |
final_answer = final_answer.replace("rac", "\\frac") | |
final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) | |
final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) | |
final_answer = final_answer.replace("$", "") | |
if final_answer.replace(",", "").isdigit(): | |
final_answer = final_answer.replace(",", "") | |
return final_answer | |
def naive_parse(answer: str) -> str: | |
""" | |
Extracts and returns the numeric digits from the input string, processing them in reverse order | |
until a non-numeric character is encountered after encountering the first numeric character. | |
Args: | |
answer (str): The input string to parse. | |
Returns: | |
str: A string consisting of the numeric digits extracted from the input, in their original order. | |
Example: | |
>>> naive_parse("abc123def") | |
'123' | |
>>> naive_parse("def456ghi") | |
'456' | |
>>> naive_parse("no numbers here") | |
'' | |
""" | |
out = [] | |
start = False | |
end = False | |
for l in reversed(list(answer)): | |
if l in "0123456789" and not end: | |
start = True | |
out.append(l) | |
else: | |
if start: | |
end = True | |
out = reversed(out) | |
return "".join(out) | |
def validate_answer_is_numeric(x: str | int | float) -> int: | |
FLOAT_TOLERANCE = 0.2 | |
try: | |
x = round(float(x)) | |
f = float(x) | |
if abs(x - f) > FLOAT_TOLERANCE: | |
x = -1 | |
except Exception: | |
x = -1 | |
return x | |
def get_majority_vote(responses: List[int]) -> int: | |
if len(responses) < 1: | |
return 0 | |
else: | |
c = Counter(responses) | |
value, count = c.most_common()[0] | |
return value | |
def filter_answers(answers: List[str]) -> List[int]: | |
formatted_answers = [validate_answer_is_numeric(a) for a in answers] | |
# Filter for non-negative answers | |
formatted_answers = [a for a in formatted_answers if a >= 0] | |
# Compute modulo | |
formatted_answers = [a % 1_000 for a in formatted_answers] | |
# less than 2.1 billion or cannot convert to C int (32-bit) | |
formatted_answers = [a for a in formatted_answers if a <= 999] | |
return formatted_answers | |
def check_sympy_equivalence(ref_answer: str, model_answer: str) -> bool: | |
def do_answers_match(ref_answer: str, model_answer: str) -> bool: | |
ref_sympy = parse_latex(ref_answer) | |
model_sympy = parse_latex(model_answer) | |
diff = simplify(ref_sympy - model_sympy) | |
return True if -1e-12 < N(diff) < 1e-12 or diff.is_zero else False | |
try: | |
result = do_answers_match(ref_answer, model_answer) | |
return result | |
except Exception as e: | |
print(e) | |
return False | |
def check_string_match(ref_answer: str, model_answer: str) -> bool: | |
try: | |
return ref_answer == model_answer | |
except Exception as e: | |
print(e) | |
return False | |
def check_answer(ref_answer: str, model_answer: str) -> bool: | |
# check if strings are the same | |
correct = check_string_match(ref_answer, model_answer) | |
if correct: | |
return True | |
# use the sympy library to check if the expressions are the same | |
correct = check_sympy_equivalence(ref_answer, model_answer) | |
if correct: | |
return True | |
return False | |
debug = False | |
model_id = "Numina-Math-7B" | |
revision = "main" | |
system_prompt = "{}" | |
validation_set = "kaggle-validation-set-medium" | |
is_submission = True | |
num_samples = 4 | |
num_generations = 4 | |
temperature = 0.8 | |
is_quantized = False | |
restart_on_fail = False | |
top_p = 1.0 | |
top_k = 0 | |
max_new_tokens = 2048 | |
# Papermill related variables | |
push_to_hub = False | |
notebook_name = "" | |
config = Config( | |
debug=debug, | |
push_to_hub=push_to_hub, | |
model_id=model_id, | |
revision=revision, | |
system_prompt=system_prompt, | |
validation_set=validation_set, | |
is_quantized=is_quantized, | |
restart_on_fail=restart_on_fail, | |
is_submission=is_submission, | |
num_samples=num_samples, | |
num_generations=num_generations, | |
do_sample=True, | |
temperature=temperature, | |
top_p=top_p, | |
top_k=top_k, | |
max_new_tokens=max_new_tokens, | |
) | |
print(f"=== Running submission with config ===\n\n{config}") | |
def generate(message): | |
chat_completion = client.chat.completions.create( | |
model="tgi", | |
messages=message, | |
stream=True, | |
max_tokens=1024, | |
stop=["```output\n"], | |
temperature=temperature, | |
) | |
for message in chat_completion: | |
yield message.choices[0].delta.content | |
def get_majority_text(data): | |
from collections import Counter | |
# Count the frequency of each answer in model_answers | |
answer_counts = Counter(data["model_answers"]) | |
# Find the majority response | |
majority_response = answer_counts.most_common(1)[0][0] | |
# Find the index of the first occurrence of the majority response | |
majority_index = data["model_answers"].index(majority_response) | |
# Return the corresponding text in gen_texts | |
return data["gen_texts"][majority_index] | |
def extract_solution(text): | |
# Split the text at "### Solution:" | |
parts = text.split("### Solution:", 1) | |
if len(parts) > 1: | |
# Return everything after "### Solution:" | |
return parts[1].strip() | |
else: | |
# Return an empty string if "### Solution:" is not found | |
return "" | |
def process_code( | |
example: Dict[str, Any], | |
config: Config, | |
restart_on_fail: bool = False, | |
last_step: bool = False, | |
) -> Dict[str, Any]: | |
gen_text = example["gen_texts"] | |
num_python_blocks = len(re.findall(r"```python(.*?)```", gen_text, re.DOTALL)) | |
if num_python_blocks == 0: | |
if restart_on_fail: | |
print("no code has ever been generated, RESTARTING") | |
# reset the text to the original | |
example["gen_texts"] = example["text"] | |
else: | |
print("no code has ever been generated, STOP") | |
example["should_prune"] = True | |
example["has_code"] = False | |
return example | |
if gen_text[-10:] != "```output\n" and ("answer is" in gen_text[-100:] or "\\boxed" in gen_text[-100:]): | |
num_output_blocks = len(re.findall(r"```output(.*?)```", gen_text, re.DOTALL)) | |
if num_output_blocks == 0: | |
print("the model hallucinated the code answer") | |
example["should_prune"] = True | |
return example | |
if "boxed" in gen_text[-100:]: | |
try: | |
answer = normalize_final_answer(extract_boxed_answer(gen_text[-100:])) | |
except Exception: | |
answer = "-1" | |
else: | |
answer = normalize_final_answer(gen_text[-100:]) | |
example["model_answers"] = answer | |
if not config.is_submission: | |
example["corrects"] = check_answer(example["ground_truth"], answer) | |
example["should_prune"] = True | |
print("Answer is: ", answer, example["ground_truth"], example["corrects"]) | |
return example | |
if last_step: | |
# no point in continuing if we are at the last step | |
return example | |
if gen_text[-10:] != "```output\n": | |
# something else has gone wrong with the generation | |
print("warning: output block not found: ", gen_text[-40:]) | |
if restart_on_fail: | |
example["gen_texts"] = example["text"] | |
else: | |
example["should_prune"] = True | |
return example | |
code_result, status = postprocess_completion(gen_text, return_status=True, last_code_block=True) | |
# add the code result for the next round of generation | |
TRUNCATION_LIMIT = 200 | |
if len(code_result) > TRUNCATION_LIMIT: | |
code_result = code_result[:TRUNCATION_LIMIT] + " ... (output truncated)" | |
example["gen_texts"] = gen_text + f"{code_result}\n```" | |
return example | |
# load the vllm instance and set sampling parameters | |
# vllm = build_vllm(config) | |
def solve_problem(problem, temperature, progress=gr.Progress()): | |
problem = apply_template({"prompt": problem}, prompt=config.system_prompt) | |
print(f"Problem: {problem}") | |
sample = { | |
"problem": problem, # not used for the submission TODO Remove | |
"ground_truth": "unknown", # not used for the submission TODO Remove | |
"text": "### Solution:\n", | |
"gen_texts": "### Solution:\n", # used to store all the generated text | |
"should_prune": False, | |
"problem_index": -1, # not used for the submission TODO Remove | |
"model_answers": "-1", | |
"has_code": True, | |
"corrects": False, # not used for the submission TODO Remove | |
} | |
for step in progress.tqdm( | |
range(config.num_generations), desc="Generating candidates" | |
): # Depth of the tree (e.g. 6 steps = 5 code blocks) | |
step_reponse = sample["gen_texts"] | |
messages = [ | |
{"role": "user", "content": sample["problem"]}, | |
{"role": "assistant", "content": sample["gen_texts"]}, | |
] | |
for reponse_message in generate(messages, temperature): | |
if reponse_message is not None: | |
step_reponse += reponse_message | |
yield step_reponse | |
sample["gen_texts"] = step_reponse | |
# TODO: Maybe it should just return the result of running the code | |
sample = process_code( | |
sample, | |
config=config, | |
restart_on_fail=config.restart_on_fail, | |
last_step=(step == (config.num_generations - 1)), | |
) | |
sample["gen_texts"] = sample["gen_texts"] + "\n" | |
run_code_reponse = sample["gen_texts"].replace(step_reponse, "") | |
for output_mseeage in run_code_reponse: | |
if output_mseeage is not None: | |
step_reponse += output_mseeage | |
yield step_reponse | |
if sample["should_prune"]: | |
break | |
yield sample["gen_texts"] | |
with gr.Blocks() as demo: | |
with gr.Row(): | |
inp = gr.Textbox(placeholder="Problem", label="Problem", lines=5) | |
with gr.Accordion("Advanced Options", open=False): | |
temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.1, label="Temperature") | |
with gr.Row(): | |
out = gr.Markdown() | |
btn = gr.Button("Run") | |
btn.click(fn=solve_problem, inputs=[inp, temperature], outputs=out) | |
if __name__ == "__main__": | |
demo.queue(default_concurrency_limit=5).launch() | |