Upload metrics.py with huggingface_hub
metrics.py CHANGED (+111, -34)
@@ -1879,6 +1879,7 @@ class BertScore(HuggingfaceBulkMetric):
     hf_metric_fields = ["f1", "precision", "recall"]
     ci_scores = ["f1", "precision", "recall"]
     model_name: str
+    model_layer: int = None

     prediction_type = "str"

@@ -1886,7 +1887,9 @@ class BertScore(HuggingfaceBulkMetric):

     def prepare(self):
         super().prepare()
-        self.hf_compute_args = {"model_type": self.model_name, "batch_size": …
+        self.hf_compute_args = {"model_type": self.model_name, "batch_size": 32}
+        if self.model_layer:
+            self.hf_compute_args["num_layers"] = self.model_layer


 class SentenceBert(BulkInstanceMetric):
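The new model_layer field is forwarded as num_layers in hf_compute_args. A minimal sketch of the equivalent direct call, assuming the HuggingFace evaluate "bertscore" metric is what BertScore wraps (which is what hf_compute_args suggests); the model name and layer index below are illustrative:

# Sketch: how model_type / num_layers / batch_size are consumed by the
# HF evaluate "bertscore" metric; the concrete values are illustrative.
import evaluate

bertscore = evaluate.load("bertscore")
results = bertscore.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
    model_type="microsoft/deberta-large-mnli",  # plays the role of model_name
    num_layers=18,                              # plays the role of model_layer
    batch_size=32,
)
print(results["f1"])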
@@ -1947,6 +1950,9 @@ class Reward(BulkInstanceMetric):

     model_name: str

+    prediction_type = "str"
+    single_reference_per_prediction = True
+
     _requirements_list: List[str] = ["transformers", "torch"]

     def prepare(self):
@@ -2141,9 +2147,13 @@ class Perplexity(BulkInstanceMetric):
     reduction_map = {"mean": ["perplexity"]}
     prediction_type = "str"

-    …
+    source_template: str
+    target_template: str
     batch_size: int = 32
     model_name: str
+    single_token_mode: bool = False
+
+    lm = None

     _requirements_list: List[str] = ["transformers", "torch"]

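source_template and target_template are filled per (prediction, reference) pair in compute() below. A hypothetical instantiation, using only the field names from this diff (the model and template strings are invented):

# Hypothetical configuration; only the field names come from the diff above.
perplexity = Perplexity(
    model_name="google/flan-t5-small",
    source_template="Generate a text based on the following content: {reference}",
    target_template="{prediction}",
)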
@@ -2160,24 +2170,41 @@ class Perplexity(BulkInstanceMetric):

         :return: the likelihood of generating text Y_i after each text X_i_j = P(Y_i|X_i_1), ..., P(Y_i|X_i_n) for every i.
         """
+        if self.lm is None:
+            from transformers import AutoConfig
+
+            config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=True)
+            self.lm = (
+                self.EncoderDecoderLM(
+                    model_name=self.model_name, single_token_mode=self.single_token_mode
+                )
+                if config.is_encoder_decoder is True
+                else self.DecoderOnlyLM(
+                    model_name=self.model_name, single_token_mode=self.single_token_mode
+                )
+            )
+
         sources = []
         targets = []
         for prediction, instance_references in zip(predictions, references):
             for instance_reference in instance_references:
-                sources.append(
-                    …
+                sources.append(
+                    self.Template.apply(
+                        self.source_template,
+                        prediction=prediction,
+                        reference=instance_reference,
+                    )
+                )
+                targets.append(
+                    self.Template.apply(
+                        self.target_template,
+                        prediction=prediction,
+                        reference=instance_reference,
+                    )
+                )

         # compute P(Q|P) and store in queue
-        scores = lm.compute_lm(
+        scores = self.lm.compute_lm(
             source=sources, target=targets, batch_size=self.batch_size
         )

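The lazy self.lm initialization above dispatches on config.is_encoder_decoder, a standard transformers PretrainedConfig attribute. A standalone sketch of that check (the model names are examples):

# Standalone sketch of the encoder-decoder vs decoder-only dispatch.
from transformers import AutoConfig

for name in ["t5-small", "gpt2"]:
    config = AutoConfig.from_pretrained(name)
    kind = "encoder-decoder" if config.is_encoder_decoder else "decoder-only"
    print(name, "->", kind)  # t5-small -> encoder-decoder, gpt2 -> decoder-only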
@@ -2200,8 +2227,25 @@ class Perplexity(BulkInstanceMetric):

         return all_instances_scores

+    class Template:
+        regex = re.compile(r"\{(\w+)}")
+
+        @classmethod
+        def apply(cls, template, **kwargs):
+            matches = Perplexity.Template.regex.finditer(template)
+            output = []
+            cursor = 0
+            for match in matches:
+                start = match.start()
+                end = match.end()
+                output.append(template[cursor:start])
+                output.append(kwargs[match.group(1)])
+                cursor = end
+            output.append(template[cursor:])
+            return "".join(output)
+
     class AbstractLM(ABC):
-        def __init__(self, model_name):
+        def __init__(self, model_name, single_token_mode):
             import torch
             from transformers import AutoTokenizer

@@ -2211,6 +2255,7 @@ class Perplexity(BulkInstanceMetric):
                 self.model_class().from_pretrained(self.model_name).to(self.device)
             )
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.single_token_mode = single_token_mode

         def compute_lm(
             self, source: List[str], target: List[str], batch_size: int
@@ -2232,7 +2277,10 @@ class Perplexity(BulkInstanceMetric):
                 batch_source, padding=True, return_tensors="pt"
             )
             tokens_target = self.tokenizer(
-                batch_target, …
+                batch_target,
+                padding=True,
+                return_tensors="pt",
+                add_special_tokens=not self.single_token_mode,
             )

             # compute the logits
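Template.apply is a plain regex substitution over {name} placeholders, so a quick sanity check looks like this (the template string is invented):

# What Template.apply does with the {reference}/{prediction} placeholders.
filled = Perplexity.Template.apply(
    "Answer the question: {reference}",
    reference="Who wrote Hamlet?",
    prediction="",  # unused by this template; extra kwargs are ignored
)
assert filled == "Answer the question: Who wrote Hamlet?"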
@@ -3353,7 +3401,7 @@ class BinaryMaxAccuracy(GlobalMetric):
     def compute(
         self,
         references: List[List[str]],
-        predictions: List[…
+        predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
         float_predictions = [to_float_or_default(p) for p in predictions]
@@ -3361,24 +3409,53 @@ class BinaryMaxAccuracy(GlobalMetric):
             ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
         ]

-        …
+        # Sticking to the test >= thr, the accuracy induced by threshold thr is the number of float predictions
+        # that pass the test (are >= thr) and are paired with reference "1", plus the number of float predictions
+        # that fail the test (are < thr) and are paired with reference "0".
+        # A given threshold thr induces the same partition of the float predictions into passing and failing
+        # as threshold thr' does, with thr' being the smallest of the predictions passing the test of thr.
+        # Hence, we only need to review thresholds that are float predictions, plus one threshold larger than
+        # the largest float prediction, which induces the partition into all-failing, none-passing.
+
+        fp = [
+            (float_predictions[i], i, -1 if references[i][0] == "1" else +1)
+            for i in range(len(float_predictions))
+        ]
+        fp.sort()
+        # each triplet above: float-prediction f; f's ordinal position in float_predictions, which is also
+        # a means to obtain distinct triplets; and: the change in the number of predictions that the test sends
+        # to the reference they are paired with, a change implied by a move of thr that transfers f
+        # from the set passing the test to the set failing it.
+
+        rightmost_thr = 1.0 if fp[-1][0] < 1 else fp[-1][0] + 0.01
+        # trying to be esthetic, have the threshold within [0,1], although this is not a requirement,
+        # and even the float predictions are not guaranteed to be within the range [0,1]
+
+        current_thr = fp[0][0]
+        # partitions float_predictions into all-passing, none-failing
+        current_acc = sum(r[0] == "1" for r in references)
+        # number of predictions that thr sends to the reference they are paired with
+
+        best_acc = current_acc
+        best_thr = current_thr
+
+        i = 0
+        while (i < len(predictions)) and (best_acc < len(predictions)):
+            # best_acc cannot exceed len(predictions)
+            delta = fp[i][2]
+            i += 1
+            while i < len(predictions) and fp[i][0] <= fp[i - 1][0]:
+                delta += fp[i][2]
+                i += 1
+            current_acc += delta
+            if current_acc > best_acc:
+                best_acc = current_acc
+                best_thr = fp[i][0] if i < len(predictions) else rightmost_thr

-        return {…
+        return {
+            self.main_score: float(best_acc) / len(predictions),
+            "best_thr_max_acc": best_thr,
+        }


 ######################
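A brute-force cross-check of the sweep on invented data makes the logic concrete: the accuracy of the rule "predict 1 iff p >= thr" only changes at thresholds equal to some prediction, plus one value past the maximum.

# Invented toy data: the sweep should land on thr = 0.6 with accuracy 1.0.
preds = [0.2, 0.6, 0.9]
refs = ["0", "1", "1"]

def acc(thr):
    return sum(("1" if p >= thr else "0") == r for p, r in zip(preds, refs)) / len(preds)

for thr in sorted(preds) + [1.0]:  # candidate thresholds, incl. one past the max
    print(thr, acc(thr))
# 0.2 -> 0.67, 0.6 -> 1.0 (best), 0.9 -> 0.67, 1.0 -> 0.33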