Spaces:
Sleeping
Sleeping
File size: 10,066 Bytes
f78ed8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 |
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import nltk
import math
import sys
from fractions import Fraction
import warnings
from collections import Counter
from nltk.translate.bleu_score import modified_precision, closest_ref_length, brevity_penalty, SmoothingFunction
import warnings
def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
    averaging_mode="geometric",
    no_length_penalty=False
):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below show that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function: callable applied to the per-order precisions;
        defaults to ``SmoothingFunction().method0``
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :param averaging_mode: ``"geometric"`` returns the brevity penalty times
        the exponential of the weighted sum of log-precisions;
        ``"arithmetic"`` returns the weighted sum of the precisions and
        applies no brevity penalty.
    :type averaging_mode: str
    :param no_length_penalty: if True, the brevity penalty is not computed.
        Must be True for ``"arithmetic"`` averaging.
    :type no_length_penalty: bool
    :return: The corpus-level BLEU score; ``0`` when there is no unigram
        match at all (including an empty corpus).
    :rtype: float
    :raises AssertionError: if the numbers of hypotheses and reference sets
        differ, or if the combination of ``averaging_mode`` and
        ``no_length_penalty`` is not supported.
    :raises ValueError: if ``averaging_mode`` is neither ``"geometric"`` nor
        ``"arithmetic"`` and the score actually has to be computed.
    """
    # Corpus-level accumulators, both keyed by n-gram order: the summed
    # numerators and denominators of the per-sentence modified precisions.
    p_numerators = Counter()
    p_denominators = Counter()
    hyp_lengths, ref_lengths = 0, 0

    # Before proceeding to compute BLEU, perform sanity checks.
    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the " "same "
    )

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    if no_length_penalty and averaging_mode == 'geometric':
        bp = 1.0
    elif no_length_penalty and averaging_mode == 'arithmetic':
        # Placeholder only: the arithmetic branch below never reads ``bp``.
        bp = 0.0
    else:
        assert not no_length_penalty
        assert averaging_mode != 'arithmetic', 'Not sure how to apply length penalty when aurithmetic mode'
        bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Returns 0 if there's no matching n-grams.
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    # This check runs before the re-weighting and the Fraction construction
    # because both divide by totals that are zero for an empty corpus or
    # for empty hypotheses.
    if p_numerators[1] == 0:
        return 0

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    # hyp_lengths >= 1 here: at least one unigram matched.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # NOTE(review): Python 3.12 dropped the private ``_normalize`` argument
    # of ``fractions.Fraction``. NLTK releases that support 3.12 expose a
    # compatible ``Fraction`` from ``bleu_score`` (older releases re-export
    # the stdlib class there), so prefer that name and fall back to the
    # module-level import only if it is missing.
    try:
        from nltk.translate.bleu_score import Fraction as _RawFraction
    except ImportError:
        _RawFraction = Fraction

    # Collects the various precision values for the different ngram orders.
    # The fractions are deliberately not reduced: smoothing functions read
    # the raw numerator and denominator.
    p_n = [
        _RawFraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # If there's no smoothing, set use method0 from SmoothinFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0

    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    # ``references`` and ``hypothesis`` are those of the last corpus entry
    # (left over from the loop above).
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )

    if averaging_mode == "geometric":
        log_terms = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
        return bp * math.exp(math.fsum(log_terms))
    if averaging_mode == "arithmetic":
        return math.fsum(w_i * p_i for w_i, p_i in zip(weights, p_n))

    # Previously this fell through to ``return s`` with ``s`` unbound.
    raise ValueError(f"Unknown averaging mode {averaging_mode!r}")
def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
    averaging_mode="geometric",
    no_length_penalty=False
):
    """
    Score a single hypothesis against its references.

    This is a thin wrapper: the hypothesis and its reference list are each
    wrapped in a one-element list and handed to ``corpus_bleu`` together
    with all remaining options, unchanged.

    :param references: reference sentences for this hypothesis
    :param hypothesis: the hypothesis sentence
    :param weights: per-order n-gram weights, forwarded as-is
    :param smoothing_function: forwarded as-is
    :param auto_reweigh: forwarded as-is
    :param averaging_mode: forwarded as-is
    :param no_length_penalty: forwarded as-is
    :return: whatever ``corpus_bleu`` returns for the one-sentence corpus
    """
    single_sentence_corpus = {
        "list_of_references": [references],
        "hypotheses": [hypothesis],
    }
    return corpus_bleu(
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=auto_reweigh,
        averaging_mode=averaging_mode,
        no_length_penalty=no_length_penalty,
        **single_sentence_corpus,
    )
def get_target_sequences(manifest, ground_truth, to_take=1000):
    """
    Pick the manifest rows that belong to the ``to_take`` shortest sequences.

    ``ground_truth`` is a JSON file holding a mapping from sequence name to a
    list whose first element is the sequence length (presumably a duration
    in seconds, judging by the ``6 seconds`` check below -- confirm against
    the data producer). ``manifest`` is a text file with one header line
    followed by one entry per line; the first whitespace-separated field of
    an entry is a path whose file name starts with ``<sequence name>__``.

    :param manifest: path to the manifest file
    :param ground_truth: path to the JSON file with the sequence lengths
    :param to_take: number of shortest sequences to keep
    :return: set of 0-based row indices (the header line is not counted)
        whose sequence name is among the selected ones
    :raises AssertionError: if any length is below 6.0
    """
    import json
    import pathlib

    with open(ground_truth, 'r') as fin:
        original_continuations = json.load(fin)

    # Convert lengths to float once, so that validation and ordering agree:
    # sorting the raw values would order string lengths lexicographically
    # ("10.0" before "6.5").
    sequence2length = [
        (name, float(info[0])) for name, info in original_continuations.items()
    ]
    assert all(length >= 6.0 for _, length in sequence2length)  # 6 seconds

    sequence2length.sort(key=lambda pair: pair[1])
    to_take_sequences = {name for name, _ in sequence2length[:to_take]}

    to_take_ids = set()
    with open(manifest, 'r') as f:
        f.readline()  # skip the header line
        for i, line in enumerate(f):
            fields = line.split()
            if not fields:
                # Blank line: it still occupies an index but names nothing.
                continue
            seq_id = pathlib.Path(fields[0]).name.split('__')[0]
            if seq_id in to_take_sequences:
                to_take_ids.add(i)

    print(f'Took {len(to_take_ids)} ids')
    return to_take_ids
def get_self_bleu(utterances, averaging_mode, weights):
    """
    Compute leave-one-out BLEU for every utterance.

    Each utterance in turn is used as the hypothesis, with all the other
    utterances (order preserved) serving as its references. The brevity
    penalty is always disabled.

    :param utterances: sequence of tokenized utterances; must support
        ``len``, indexing, slicing and ``+`` concatenation
    :param averaging_mode: forwarded to ``sentence_bleu``
    :param weights: per-order n-gram weights, forwarded to ``sentence_bleu``
    :return: list with one score per utterance, in input order
    """
    return [
        sentence_bleu(
            utterances[:held_out] + utterances[held_out + 1:],
            utterances[held_out],
            weights,
            no_length_penalty=True,
            averaging_mode=averaging_mode,
        )
        for held_out in range(len(utterances))
    ]
def get_self_bleu2_arithmetic(utterances):
    """Return per-utterance self-BLEU with equal unigram/bigram weights, arithmetic averaging."""
    # equal weight for unigrams and bigrams
    return get_self_bleu(utterances, 'arithmetic', (0.5, 0.5))
def get_self_bleu2_geometric(utterances):
    """Return per-utterance self-BLEU with equal unigram/bigram weights, geometric averaging."""
    # equal weight for unigrams and bigrams
    return get_self_bleu(utterances, 'geometric', (0.5, 0.5))
def get_auto_bleu2_arithmetic(utterances):
    """Return ``auto_bleu`` of each utterance with equal unigram/bigram weights, arithmetic mean."""
    scores = []
    for utterance in utterances:
        score = auto_bleu(utterance, weights=(0.5, 0.5), mean_mode='arithmetic')
        scores.append(score)
    return scores
def get_auto_bleu2_geometric(utterances):
    """Return ``auto_bleu`` of each utterance with equal unigram/bigram weights, geometric mean."""
    scores = []
    for utterance in utterances:
        score = auto_bleu(utterance, weights=(0.5, 0.5), mean_mode='geometric')
        scores.append(score)
    return scores
def get_auto_bleu3_geometric(utterances):
    """Return ``auto_bleu`` of each utterance with equal 1/2/3-gram weights, geometric mean."""
    uniform_thirds = (1. / 3,) * 3
    scores = []
    for utterance in utterances:
        scores.append(auto_bleu(utterance, weights=uniform_thirds, mean_mode='geometric'))
    return scores
def get_auto_bleu3_arithmetic(utterances):
    """Return ``auto_bleu`` of each utterance with equal 1/2/3-gram weights, arithmetic mean."""
    uniform_thirds = (1. / 3,) * 3
    scores = []
    for utterance in utterances:
        scores.append(auto_bleu(utterance, weights=uniform_thirds, mean_mode='arithmetic'))
    return scores
def get_self_bleu3_arithmetic(utterances):
    """Return per-utterance self-BLEU with equal 1/2/3-gram weights, arithmetic averaging."""
    uniform_thirds = (1. / 3,) * 3
    return get_self_bleu(utterances, 'arithmetic', uniform_thirds)
def get_self_bleu3_geometric(utterances):
    """Return per-utterance self-BLEU with equal 1/2/3-gram weights, geometric averaging."""
    uniform_thirds = (1. / 3,) * 3
    return get_self_bleu(utterances, 'geometric', uniform_thirds)
def auto_bleu(sentence, weights, mean_mode='arithmetic'):
    """
    Measure how much a sentence repeats its own n-grams.

    For every n-gram order ``1..len(weights)``, compute the proportion of
    the sentence's n-grams that also occur elsewhere in the sentence without
    overlapping the occurrence under consideration. The per-order
    proportions are then combined with ``weights``.

    :param sentence: sequence of tokens; must support ``len`` and slicing
    :param weights: one weight per n-gram order (unigrams first)
    :param mean_mode: ``'arithmetic'`` for the weighted sum of the
        proportions, ``'geometric'`` for the product of the proportions
        raised to their weights
    :return: ``0`` for sentences of at most one token, otherwise the
        combined score as a NumPy float
    :raises ValueError: if ``mean_mode`` is not one of the two known modes
        (only checked for sentences longer than one token)
    """
    if len(sentence) <= 1:
        return 0

    N = len(weights)
    bleu_n = np.zeros([N])
    for n in range(N):
        order = n + 1
        # All n-grams of this order, by start position. Built once per order
        # from shifted slices; entry ``p`` covers tokens ``p .. p + n``.
        targ_ngrams = list(zip(*(sentence[shift:] for shift in range(order))))
        if not targ_ngrams:
            # The sentence is shorter than this order. Keep the proportion
            # at 0 instead of dividing 0 by 0, which would turn the final
            # score into NaN.
            continue

        repeated = 0
        for p, ngram in enumerate(targ_ngrams):
            # N-grams lying entirely before token p, resp. entirely after
            # token p + n, i.e. those sharing no token with this occurrence.
            before = targ_ngrams[:max(p - n, 0)]
            after = targ_ngrams[p + order:]
            if ngram in before or ngram in after:
                repeated += 1

        # average them to get a proportion
        bleu_n[n] = repeated / len(targ_ngrams)

    weights = np.array(weights)
    if mean_mode == 'arithmetic':
        return (bleu_n * weights).sum()
    elif mean_mode == 'geometric':
        return (bleu_n ** weights).prod()
    else:
        raise ValueError(f'Unknown agggregation mode {mean_mode}')
def run_f(task_params):
    """
    Unpack a ``(callable, argument)`` pair and return ``callable(argument)``.

    Presumably meant for map-style APIs that pass a single object per task
    (confirm against the callers).

    :param task_params: iterable of exactly two items, the callable first
    :return: the result of calling the first item on the second
    """
    func, argument = task_params
    result = func(argument)
    return result
|