Simon Sorg committed on
Commit b954f30 · 1 Parent(s): c897860

feat: add compute

Files changed (1)
  1. valid_efficiency_score.py +107 -29
valid_efficiency_score.py CHANGED
@@ -11,46 +11,49 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
- """TODO: Add a description here."""
+ #
+ # This is a module to compute the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks as
+ # proposed in "Can LLM Already Serve as a Database Interface?
+ # A Big Bench for Large-Scale Database Grounded Text-to-SQLs" (Li et al., 2023)

 import evaluate
 import datasets
+ from time import time
+ import numpy as np
+ from math import sqrt

-
- # TODO: Add BibTeX citation
 _CITATION = """\
- @InProceedings{huggingface:module,
- title = {A great new module},
- authors={huggingface, Inc.},
- year={2020}
+ @article{li2023can,
+ title={Can llm already serve as a database interface? a big bench for large-scale database grounded text-to-sqls},
+ author={Li, Jinyang and Hui, Binyuan and Qu, Ge and Li, Binhua and Yang, Jiaxi and Li, Bowen and Wang, Bailin and Qin, Bowen and Cao, Rongyu and Geng, Ruiying and others},
+ journal={arXiv preprint arXiv:2305.03111},
+ year={2023}
 }
 """

- # TODO: Add description of the module here
 _DESCRIPTION = """\
- This new module is designed to solve this great ML task and is crafted with a lot of care.
+ This module computes the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks.
 """

-
- # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
- Calculates how good are predictions given some references, using certain scores
+ Calculates how good the predictions are given some ground truth sql queries, using the Valid Efficiency Score (VES).
 Args:
     predictions: list of predictions to score. Each predictions
         should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
         reference should be a string with tokens separated by spaces.
+     execute: function that takes a list of sql queries and returns a list of results, one for each query.
+         Results should be a list of tuples, each tuple containing the values of a row.
+     filter_func: function that takes a string and returns a boolean.
+         If True, the string is kept, otherwise it is dropped.
+     num_executions: number of times to execute each sql query to get the execution time.
 Returns:
-     accuracy: description of the first score,
-     another_score: description of the second score,
+     ves: Valid Efficiency Score of the predictions compared to the references.
 Examples:
-     Examples should be written in doctest format, and should illustrate how
-     to use the function.
-
-     >>> my_new_module = evaluate.load("my_new_module")
+     >>> my_new_module = evaluate.load("valid_efficiency_score")
     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
     >>> print(results)
-     {'accuracy': 1.0}
+     {'ves': 1.0}
 """

 # TODO: Define external resources urls if needed
@@ -59,10 +62,9 @@ BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"

 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ValidEfficiencyScore(evaluate.Metric):
-     """TODO: Short description of my evaluation module."""
+     """Valid Efficiency Score (VES) metric for text-to-SQL tasks."""

     def _info(self):
-         # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MetricInfo(
             # This is the description that will appear on the modules page.
             module_type="metric",
@@ -71,14 +73,14 @@ class ValidEfficiencyScore(evaluate.Metric):
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
-                 'predictions': datasets.Value('int64'),
-                 'references': datasets.Value('int64'),
+                 'predictions': datasets.Value('string'),
+                 'references': datasets.Value('string'),
             }),
             # Homepage of the module for documentation
             homepage="http://module.homepage",
             # Additional links to the codebase or references
             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-             reference_urls=["http://path.to.reference.url/new_module"]
+             reference_urls=[]
         )

     def _download_and_prepare(self, dl_manager):
@@ -86,10 +88,86 @@ class ValidEfficiencyScore(evaluate.Metric):
         # TODO: Download external resources if needed
         pass

-     def _compute(self, predictions, references):
-         """Returns the scores"""
+     def _compute(self, predictions, references, execute, filter_func=None, num_executions=100):
+         """Returns the valid efficiency score of the predictions compared to the references."""
         # TODO: Compute the different scores of the module
-         accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
+         if len(predictions) != len(references):
+             raise ValueError("Predictions and references must have the same number of elements.")
+         # Run filter_func on predictions and references if needed
+         filtered_predictions = []
+         filtered_references = []
+         passing_reference_only = 0
+         if filter_func is not None:
+             for prediction, reference in zip(predictions, references):
+                 # Only keep if both prediction and reference pass the filter
+                 if filter_func(prediction) and filter_func(reference):
+                     filtered_predictions.append(prediction)
+                     filtered_references.append(reference)
+                 # If only the reference passes the filter, count it
+                 elif filter_func(reference):
+                     passing_reference_only += 1
+
+         # Execute ground truth sql queries to get the ground truth results and the time it takes to execute them
+         ground_results = execute(filtered_references)
+         reference_times = np.zeros(num_executions)
+         for i in range(num_executions):
+             start_time = time()
+             execute(filtered_references)
+             end_time = time()
+             reference_times[i] = end_time - start_time
+
+         # Execute predicted sql queries to get the predicted results and the time it takes to execute them
+         predicted_results = execute(filtered_predictions)
+         prediction_times = np.zeros(num_executions)
+         for i in range(num_executions):
+             start_time = time()
+             execute(filtered_predictions)
+             end_time = time()
+             prediction_times[i] = end_time - start_time
+
+         # Get mean, std and 3 sigma interval
+         reference_mean = np.mean(reference_times)
+         reference_std = np.std(reference_times)
+         lower_bound = reference_mean - 3 * reference_std
+         upper_bound = reference_mean + 3 * reference_std
+         # Drop outliers
+         filtered_reference_times = reference_times[(reference_times >= lower_bound) & (reference_times <= upper_bound)]
+
+         # Get mean, std and 3 sigma interval
+         prediction_mean = np.mean(prediction_times)
+         prediction_std = np.std(prediction_times)
+         lower_bound = prediction_mean - 3 * prediction_std
+         upper_bound = prediction_mean + 3 * prediction_std
+         # Drop outliers
+         filtered_prediction_times = prediction_times[
+             (prediction_times >= lower_bound) & (prediction_times <= upper_bound)]
+
+         # Get new means as e_scores
+         reference_mean = np.mean(filtered_reference_times)
+         prediction_mean = np.mean(filtered_prediction_times)
+
+         r_value = sqrt(reference_mean / prediction_mean)
+
+         # Run filter_func on predictions and references if needed
+         filtered_predictions = []
+         filtered_references = []
+         divider = 0
+         if filter_func is not None:
+             for prediction, reference in zip(predictions, references):
+                 # Only keep if both prediction and reference pass the filter
+                 pred_bool = filter_func(prediction)
+                 ref_bool = filter_func(reference)
+                 if pred_bool and ref_bool:
+                     filtered_predictions.append(prediction)
+                     filtered_references.append(reference)
+                     divider += 1
+                 # If only the reference passes the filter, count it
+                 elif pred_bool != ref_bool:
+                     divider += 1
+
+         accuracy = sum(
+             execute(i) == execute(j) for i, j in zip(filtered_predictions, filtered_references)) / divider
+
         return {
-             "accuracy": accuracy,
-         }
+             "ves": accuracy * r_value,
+         }
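
Below is a minimal usage sketch for the compute signature introduced in this commit, not part of the commit itself. It assumes the module loads under the id used in its own docstring ("valid_efficiency_score"); the example.db path, the execute_sqlite helper, the is_select filter, and the sample query are illustrative stand-ins for whatever executor, filter, and text-to-SQL outputs the caller actually has. As the added code shows, the returned score is the product of an execution-accuracy term and sqrt(reference execution time / prediction execution time).

import sqlite3

import evaluate


def execute_sqlite(queries):
    # Hypothetical executor for the `execute` argument: run each query against a
    # local SQLite file and return one list of result rows per query.
    conn = sqlite3.connect("example.db")  # database path is illustrative
    try:
        return [conn.execute(query).fetchall() for query in queries]
    finally:
        conn.close()


def is_select(query):
    # Hypothetical filter_func: keep only plain SELECT statements.
    return query.strip().lower().startswith("select")


ves_metric = evaluate.load("valid_efficiency_score")  # assumes the module is available under this id
results = ves_metric.compute(
    predictions=["SELECT name FROM singer WHERE age > 30"],
    references=["SELECT name FROM singer WHERE age > 30"],
    execute=execute_sqlite,
    filter_func=is_select,
    num_executions=10,
)
print(results)  # close to {'ves': 1.0} when results match and run about as fast as the reference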