# Source: Hugging Face Space duckdb-nsql-hub/DuckDB-SQL-Eval (status: Running)
# File size: 5,731 Bytes
# Revision: e9713ec

import argparse
from typing import List, Dict, Any, Tuple
import pickle as pkl
import tqdm
from .exec_eval import exec_on_db, result_eq
import os
from collections import defaultdict
import time
from multiprocessing import cpu_count, Pool, Manager
from itertools import repeat

# Default worker-pool size: a third of the reported CPU count, but never
# fewer than one process.
NUM_PROCESSES = max(cpu_count() // 3, 1)

# The timeout granted to a predicted query is derived from how long the gold
# query took on the same database:
#     ADDITIVE_OVERHEAD + MULTIPLICATIVE_OVERHEAD * gold_duration
MULTIPLICATIVE_OVERHEAD = 3
ADDITIVE_OVERHEAD = 30
# Timeout handed to exec_on_db when running the gold query itself.
GOLD_TIMEOUT = 100

# Pickle file that the judgement cache is loaded from and dumped to.
cache_path = "cache.pkl"
# Manager-backed dict so that judgements can be shared with pool workers.
m = Manager()
cache = m.dict()


def load_predictions(f_path: str) -> List[str]:
    """Read a predictions file and return one stripped string per line.

    Every line of the file yields exactly one entry (blank lines become
    empty strings), so positions in the returned list match line numbers.
    """
    with open(f_path, "r") as in_file:
        return [line.strip() for line in in_file]


def acc(l, idxes=None):
    """Return the fraction of truthy entries of ``l`` at the given positions.

    Args:
        l: indexable sequence of judgement results (truthy means "passed").
        idxes: positions of ``l`` to take into account; ``None`` means every
            position of ``l``.

    Returns:
        The number of truthy selected entries divided by the number of
        selected entries, as a float. When nothing is selected (empty ``l``
        or empty ``idxes``) the result is ``0.0`` instead of a
        ``ZeroDivisionError``.
    """
    if idxes is None:
        idxes = range(len(l))
    total = len(idxes)
    if total == 0:
        # Nothing to score, e.g. a subset filter that matched no examples.
        return 0.0
    passed = sum(1 for idx in idxes if l[idx])
    return float(passed) / total


# the input is a tuple of gold_dict, model prediction and whether to use cache
# and the output is whether the model prediction passes the entire test suite
def judge(args: Tuple[Dict[str, Any], str, bool]) -> bool:
    """Decide whether a predicted query matches the gold query on every test database.

    ``args`` is ``(gold_dict, pred, use_cache)``. ``gold_dict`` must provide
    the keys ``"testsuite"`` (iterable of database paths), ``"query"`` (the
    gold SQL) and ``"db_path"``.

    For each test database the gold query is run first. If it does not
    produce a result, a warning is printed and that database is skipped.
    Otherwise the prediction is run with a timeout derived from the gold
    query's running time, and must produce a result that ``result_eq``
    accepts. The first failure ends the loop.

    When ``use_cache`` is true, a verdict stored under
    ``(db_path, gold_query, pred)`` is returned directly, and freshly
    computed verdicts are stored under that key.
    """
    gold_dict, pred, use_cache = args

    testsuite_paths = gold_dict["testsuite"]
    gold_query = gold_dict["query"]
    db_path = gold_dict["db_path"]
    # Row order is only compared when the gold query sorts its output.
    order_matters = "order by" in gold_query.lower()

    # Reuse an earlier verdict when caching is allowed.
    cache_key = (db_path, gold_query, pred)
    if use_cache and cache_key in cache:
        return cache[cache_key]

    verdict = True
    for testcase_path in testsuite_paths:
        began = time.time()
        gold_flag, gold_result = exec_on_db(
            testcase_path, gold_query, timeout=GOLD_TIMEOUT
        )
        elapsed = time.time() - began

        if gold_flag != "result":
            print("Warning: executing gold query results in an exception")
            continue

        # Give the prediction a budget proportional to the gold running time.
        pred_timeout = int(ADDITIVE_OVERHEAD + MULTIPLICATIVE_OVERHEAD * elapsed)
        pred_flag, pred_result = exec_on_db(testcase_path, pred, timeout=pred_timeout)
        if pred_flag != "result" or not result_eq(
            gold_result, pred_result, order_matters
        ):
            verdict = False
            break

    # Remember the verdict for later runs.
    if use_cache:
        cache[cache_key] = verdict
    return verdict


# cache is a dictionary
# the key is a ternary tuple (empty_database_path, SQL1, SQL2)
# the value is whether SQL1 and SQL2 are equivalent, judged by the test suites
def load_cache() -> Dict[Tuple[str, str, str], bool]:
    """Merge the pickled cache at ``cache_path`` into the shared ``cache``.

    Does nothing when the cache file does not exist.

    Returns:
        The module-level shared ``cache`` (a manager dict proxy).
    """
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code; only point cache_path
        # at files produced by save_cache() from a trusted run.
        with open(cache_path, "rb") as in_file:
            stored = pkl.load(in_file)
        # One bulk update instead of one proxy round-trip per entry.
        cache.update(stored)
    return cache


# dump the cache
def save_cache():
    """Pickle a plain-dict snapshot of the shared ``cache`` to ``cache_path``."""
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open(cache_path, "wb") as out_file:
        pkl.dump(dict(cache), out_file)


def main(
    preds: List[str],
    gold_file: str = "classical_test.pkl",
    verbose: bool = True,
    num_processes: int = NUM_PROCESSES,
    subset: str = "full",
    use_cache: bool = True,
) -> List[bool]:
    """Judge every prediction against its gold example in a process pool.

    Args:
        preds: predicted queries, aligned by position with the gold examples
            that remain after subset filtering.
        gold_file: path to a pickle holding the list of gold dicts. The file
            is unpickled, so it must come from a trusted source.
        verbose: when true, print the overall accuracy and the accuracy per
            ``db_id`` group.
        num_processes: size of the worker pool.
        subset: ``"full"`` keeps every gold example; any other value keeps
            only the examples whose ``db_path`` equals
            ``database/<subset>/<subset>.sqlite``.
        use_cache: forwarded to ``judge`` for every example.

    Returns:
        One boolean per prediction, in input order.

    Raises:
        AssertionError: if the number of gold examples and predictions differ.
    """
    # Close the gold file deterministically instead of leaking the handle.
    with open(gold_file, "rb") as in_file:
        gold_dicts = pkl.load(in_file)
    if subset != "full":
        gold_dicts = [
            d
            for d in gold_dicts
            if d["db_path"] == "database/{db_id}/{db_id}.sqlite".format(db_id=subset)
        ]
    assert len(gold_dicts) == len(
        preds
    ), "number of gold and prediction should be equal"

    # Positions of the examples belonging to each database, for per-group accuracy.
    group_name2idxes = defaultdict(list)
    for idx, gold_dict in enumerate(gold_dicts):
        group_name2idxes[gold_dict["db_id"]].append(idx)

    with Pool(num_processes) as pool:
        # imap keeps results in input order while letting tqdm show progress.
        result = list(
            tqdm.tqdm(
                pool.imap(judge, zip(gold_dicts, preds, repeat(use_cache, len(preds)))),
                total=len(gold_dicts),
            )
        )

    if verbose:
        print("overall accuracy: ", acc(result))
        for group, idxes in group_name2idxes.items():
            print("accuracy for ", group, acc(result, idxes))
    return result


# Command-line entry point: evaluate a predictions file against the gold
# pickle, write the per-example verdicts to --out_file and, unless caching is
# disabled, load the judgement cache beforehand and save it afterwards.
if __name__ == "__main__":
    start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gold",
        dest="gold",
        type=str,
        default="classical_test.pkl",
        help="the path to the pickled gold examples",
    )
    parser.add_argument(
        "--pred",
        dest="pred",
        type=str,
        # Without a predictions file there is nothing to evaluate; fail with a
        # usage error instead of a TypeError from open(None).
        required=True,
        help="the path to the predicted queries",
    )
    parser.add_argument(
        "--out_file", type=str, required=True, help="the output file path"
    )
    parser.add_argument(
        "--num_processes",
        # Parse as int: Pool() rejects the raw string argparse would pass on.
        type=int,
        default=NUM_PROCESSES,
        help="number of processes to use",
    )
    parser.add_argument(
        "--subset",
        default="full",
        choices=(
            "atis",
            "advising",
            "academic",
            "imdb",
            "restaurants",
            "geography",
            "scholar",
            "yelp",
            "full",
        ),
        help="which subset to evaluate on.",
    )
    parser.add_argument(
        "--disable_cache",
        default=False,
        action="store_true",
        help="whether to directly apply previously computed result and cache the current results. "
        "use this flag to disable caching.",
    )
    args = parser.parse_args()

    preds = load_predictions(args.pred)
    # Refuse to overwrite an earlier result file.
    assert not os.path.exists(args.out_file), (
        "output file path %s already exists" % args.out_file
    )

    use_cache = not args.disable_cache
    if use_cache:
        load_cache()

    result = main(
        preds=preds,
        gold_file=args.gold,
        verbose=True,
        num_processes=args.num_processes,
        subset=args.subset,
        use_cache=use_cache,
    )
    # Close the output file deterministically so the verdicts are flushed.
    with open(args.out_file, "wb") as out_file:
        pkl.dump(result, out_file)
    print("total time used: ", time.time() - start)

    if use_cache:
        save_cache()