"""Helpers for tokenizing SQL queries and masking / re-inserting their literal values."""

import itertools
import re
from collections import namedtuple
from typing import Any, Dict, Iterator, List, Set, Tuple, Union

import sqlparse
from sqlparse.sql import Comparison, Identifier
from sqlparse.tokens import Whitespace

# Lightweight (token type, token text) pair produced by tokenize().
Token = namedtuple("Token", ["ttype", "value"])

# Placeholder substituted for every literal value found in a query.
# Note that strip_query() lower-cases its output tokens, so slots show up
# there as VALUE_NUM_SYMBOL.lower().
VALUE_NUM_SYMBOL = "VALUERARE"

# Characters that process_str_value() treats as string delimiters.
QUOTE_CHARS = {"`", "'", '"'}


def _leaf_tokens(query: str) -> List[sqlparse.sql.Token]:
    """Return the flattened tokens of the first statement parsed from *query*.

    Only the first statement returned by ``sqlparse.parse`` is considered.
    An empty list is returned when sqlparse yields no statement at all, so
    callers do not have to guard against an ``IndexError``.
    """
    statements = sqlparse.parse(query)
    if not statements:
        return []
    return list(statements[0].flatten())


def tokenize(query: str) -> List[Token]:
    """Split *query* into ``Token(ttype, value)`` pairs using sqlparse.

    Returns an empty list when sqlparse yields no statement for the input.
    """
    return [Token(t.ttype, t.value) for t in _leaf_tokens(query)]


def join_tokens(tokens: List[Token]) -> str:
    """Concatenate token values, strip the ends and collapse double spaces."""
    joined = "".join([tok.value for tok in tokens]).strip()
    # Single pass: each "  " becomes " " (the earlier `replace(" ", " ")`
    # was a no-op).
    return joined.replace("  ", " ")


def round_trip_test(query: str) -> None:
    """Assert that concatenating the tokens of *query* reproduces it exactly.

    Raises:
        AssertionError: if the reconstructed string differs from *query*.
    """
    reconstructed = "".join([token.value for token in tokenize(query)])
    assert query == reconstructed, "Round trip test fails for string %s" % query


def postprocess(query: str) -> str:
    """Re-join comparison operators that were split as ``> =``, ``< =``, ``! =``."""
    return query.replace("> =", ">=").replace("< =", "<=").replace("! =", "!=")


# strip_query, reformat_query and replace_values
# were implemented by Yu Tao for processing CoSQL
def strip_query(query: str) -> Tuple[List[str], List[str]]:
    """Mask the literal values of *query* and return its tokens and values.

    String literals (tokens sqlparse tags as ``Literal.String.Single`` or
    ``Literal.String.Symbol``) are replaced textually by ``VALUE_NUM_SYMBOL``.
    The query is then split on whitespace, and every whitespace-delimited
    token that is exactly a float or integer match is replaced as well.

    Returns:
        A pair ``(query_keywords, all_values)``. ``query_keywords`` is the
        lower-cased token list (so slots appear as
        ``VALUE_NUM_SYMBOL.lower()``), with ``T<n>.col`` style tokens split
        into ``t<n>``, ``.``, ``col``. ``all_values`` lists the masked values:
        strings first, then floats, then integers.
    """
    query_keywords, all_values = [], []

    # String literals are taken from sqlparse's tokenizer rather than from
    # quote-matching regular expressions.
    string_ttypes = (
        sqlparse.tokens.Literal.String.Single,
        sqlparse.tokens.Literal.String.Symbol,
    )
    values = [t.value for t in _leaf_tokens(query) if t.ttype in string_ttypes]

    for val in values:
        all_values.append(val)
        query = query.replace(val.strip(), VALUE_NUM_SYMBOL)

    query_tokenized = query.split()

    # Floats: optional sign, optional integer part, mandatory fractional part.
    float_nums = set(re.findall(r"[-+]?\d*\.\d+", query))
    all_values += [qt for qt in query_tokenized if qt in float_nums]
    query_tokenized = [
        VALUE_NUM_SYMBOL if qt in float_nums else qt for qt in query_tokenized
    ]

    # Integers: digits preceded by any character other than t/T, so that
    # alias names such as T1 are not matched. The preceding character is part
    # of the match; stripping removes it when it is whitespace.
    query = " ".join(query_tokenized)
    int_nums = {m.strip() for m in re.findall(r"[^tT]\d+", query)}
    all_values += [qt for qt in query_tokenized if qt in int_nums]
    query_tokenized = [
        VALUE_NUM_SYMBOL if qt in int_nums else qt for qt in query_tokenized
    ]

    for tok in query_tokenized:
        if "." in tok:
            if re.search(r"[Tt]\d+\.", tok):
                # Alias-qualified column: emit alias, ".", column separately.
                parts = tok.replace(".", " . ").split()
                query_keywords.extend(part.lower() for part in parts)
            else:
                query_keywords.append(tok.lower())
        elif tok:
            query_keywords.append(tok.lower())
    return query_keywords, all_values


def reformat_query(query: str) -> str:
    """Normalize the spacing of *query*.

    Removes semicolons and tabs, re-joins the sqlparse tokens with single
    spaces (dropping tokens whose type equals ``sqlparse.tokens.Whitespace``)
    and rewrites ``t1.*`` / ``t2.*`` / ``t3.*`` (either case of ``t``) to ``*``.
    """
    query = query.strip().replace(";", "").replace("\t", "")
    query = " ".join(
        [t.value for t in tokenize(query) if t.ttype != sqlparse.tokens.Whitespace]
    )
    t_stars = ["t1.*", "t2.*", "t3.*", "T1.*", "T2.*", "T3.*"]
    for ts in t_stars:
        query = query.replace(ts, "*")
    return query


def replace_values(sql: str) -> Tuple[List[str], Set[str]]:
    """Upper-case the keywords of *sql* and mask its values via strip_query().

    Returns:
        The masked token list and the set of distinct values found.
    """
    sql = sqlparse.format(sql, reindent=False, keyword_case="upper")
    # Glue "T1. col" back together so strip_query() sees a single token.
    sql = re.sub(r"(T\d+\.)\s", r"\1", sql)
    query_toks_no_value, values = strip_query(sql)
    return query_toks_no_value, set(values)


def extract_query_values(sql: str) -> Tuple[List[str], Set[str]]:
    """Return the non-value tokens and the set of values of a SQL query."""
    reformated = reformat_query(query=sql)
    query_value_replaced, values = replace_values(reformated)
    return query_value_replaced, values


def plugin(query_value_replaced: List[str], values_in_order: List[str]) -> str:
    """Fill the value slots of a masked query with *values_in_order*.

    Slots are the tokens equal to ``VALUE_NUM_SYMBOL.lower()``; they are
    filled left to right and the tokens are joined with single spaces.

    Raises:
        AssertionError: if the number of values differs from the number of
            slots.
    """
    slot = VALUE_NUM_SYMBOL.lower()
    value_idx = [idx for idx, tok in enumerate(query_value_replaced) if tok == slot]
    assert len(value_idx) == len(
        values_in_order
    ), "number of values must match the number of value slots"

    # Work on a copy so the caller's token list is left untouched.
    query_w_values = query_value_replaced[:]
    for idx, value in zip(value_idx, values_in_order):
        query_w_values[idx] = value
    return " ".join(query_w_values)


def plugin_all_permutations(
    query_value_replaced: List[str], values: Set[str]
) -> Iterator[str]:
    """Yield the masked query filled with every possible assignment of *values*.

    Each slot may take any element of *values* (with repetition), so
    ``len(values) ** num_slots`` queries are generated.
    """
    num_slots = query_value_replaced.count(VALUE_NUM_SYMBOL.lower())
    candidates = list(values)
    for assignment in itertools.product(candidates, repeat=num_slots):
        yield plugin(query_value_replaced, list(assignment))


def get_all_preds_for_execution(gold: str, pred: str) -> Tuple[int, Iterator[str]]:
    """Build every variant of *pred* with the values of *gold* plugged in.

    Values are extracted from the gold query, the predicted query is masked,
    and the gold values are then plugged into the prediction's slots.

    Returns:
        A pair of 1) the number of possible ways to plug in the gold values
        and 2) an iterator over the predictions with values plugged in.
    """
    _, gold_values = extract_query_values(gold)
    pred_query_value_replaced, _ = extract_query_values(pred)
    num_slots = pred_query_value_replaced.count(VALUE_NUM_SYMBOL.lower())
    num_alternatives = len(gold_values) ** num_slots
    return (
        num_alternatives,
        plugin_all_permutations(pred_query_value_replaced, gold_values),
    )


def remove_distinct(s: str) -> str:
    """Return *s* with every token equal to ``distinct`` (any case) removed.

    Surrounding whitespace tokens are kept as they are.
    """
    toks = [t.value for t in _leaf_tokens(s)]
    return "".join([t for t in toks if t.lower() != "distinct"])


def extract_all_comparison_from_node(node: sqlparse.sql.Token) -> List[Comparison]:
    """Collect every ``Comparison`` in the parse tree rooted at *node*.

    Children are visited before the node itself, so nested comparisons come
    before the comparison that contains them.
    """
    comparison_list = []
    # Leaf tokens have no ``tokens`` attribute; only groups are descended into.
    for child in getattr(node, "tokens", ()):
        comparison_list.extend(extract_all_comparison_from_node(child))
    if type(node) is Comparison:
        comparison_list.append(node)
    return comparison_list


def extract_all_comparison(query: str) -> List[Comparison]:
    """Return all ``Comparison`` nodes of the first statement in *query*.

    Returns an empty list when sqlparse yields no statement for the input.
    """
    statements = sqlparse.parse(query)
    if not statements:
        return []
    return extract_all_comparison_from_node(statements[0])


def extract_toks_from_comparison(
    comparison_node: Comparison,
) -> List[sqlparse.sql.Token]:
    """Return the direct children of *comparison_node* that are not whitespace."""
    return [t for t in comparison_node.tokens if t.ttype != Whitespace]


def extract_info_from_comparison(comparison_node: Comparison) -> Dict[str, Any]:
    """Describe a comparison as a dict.

    The result always has the keys ``"left"``, ``"op"`` and ``"right"``.
    ``"table_col"`` (a ``(table, COLUMN)`` pair) and ``"val"`` are added only
    when the left side is an ``Identifier`` and the right side is a plain
    token (possibly wrapped in a one-token ``Identifier``).

    ``table`` is the lower-cased qualifier of a three-token identifier unless
    that qualifier looks like an alias (``t``/``T`` plus one digit); otherwise
    it is ``None``.

    Raises:
        ValueError: if the comparison does not consist of exactly three
            non-whitespace tokens.
    """
    left, op, right = extract_toks_from_comparison(comparison_node)
    returned_dict = {"left": left, "op": op.value, "right": right}

    if type(left) is not Identifier:
        return returned_dict

    table = None
    qualifier = left.tokens[0].value
    if len(left.tokens) == 3 and re.match(r"^[tT][0-9]$", qualifier) is None:
        table = qualifier.lower()
    col = left.tokens[-1].value

    # Exact type checks are intentional: every sqlparse node subclasses
    # sqlparse.sql.Token, so isinstance() would also accept token groups.
    if type(right) is Identifier:
        if len(right.tokens) == 1 and type(right.tokens[0]) is sqlparse.sql.Token:
            right_val = right.tokens[0].value
        else:
            return returned_dict
    elif type(right) is sqlparse.sql.Token:
        right_val = right.value
    else:
        return returned_dict

    returned_dict["table_col"] = (table, col.upper())
    returned_dict["val"] = process_str_value(right_val)
    return returned_dict


def extract_all_comparison_from_query(query: str) -> List[Dict[str, Any]]:
    """Return extract_info_from_comparison() for every comparison in *query*."""
    return [extract_info_from_comparison(c) for c in extract_all_comparison(query)]


def extract_typed_value_in_comparison_from_query(
    query: str,
) -> List[Tuple[Tuple[Union[str, None], str], str]]:
    """Return ``((table, COLUMN), value)`` pairs for the values used in *query*.

    Values come from the comparisons for which a ``"table_col"`` entry could
    be extracted, followed by both bounds of every
    ``[table.]col BETWEEN a AND b`` expression found by a case-insensitive
    regular expression. BETWEEN bounds are returned verbatim (quotes are not
    removed).
    """
    cmps = extract_all_comparison_from_query(query)
    typed_values = [
        (cmp["table_col"], cmp["val"]) for cmp in cmps if "table_col" in cmp
    ]

    between_matches = re.findall(
        r"(?:([^\.\s]*)\.)?([^\.\s]+) between ([^\s;]+) and ([^\s;]+)",
        query,
        re.IGNORECASE,
    )
    for table, col, val1, val2 in between_matches:
        # An unqualified column yields an empty table group.
        table = None if table == "" else table.lower()
        col = col.upper()
        for v in (val1, val2):
            typed_values.append(((table, col), v))
    return typed_values


def process_str_value(v: str) -> str:
    """Strip one leading and one trailing quote character and un-double quotes.

    A quote character is any member of ``QUOTE_CHARS``. After stripping,
    every doubled quote character (e.g. ``''``) is replaced by a single one.
    """
    if len(v) > 0 and v[0] in QUOTE_CHARS:
        v = v[1:]
    if len(v) > 0 and v[-1] in QUOTE_CHARS:
        v = v[:-1]
    for c in QUOTE_CHARS:
        v = v.replace(c + c, c)
    return v