Spaces:
Sleeping
Sleeping
File size: 7,221 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
# Natural Language Toolkit: Text Segmentation Metrics
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# Steven Bird <[email protected]>
# David Doukhan <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Text Segmentation Metrics
1. Windowdiff
Pevzner, L., and Hearst, M., A Critique and Improvement of
an Evaluation Metric for Text Segmentation,
Computational Linguistics 28, 19-36
2. Generalized Hamming Distance
Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375
Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
Study describing benefits of Generalized Hamming Distance Versus
WindowDiff for evaluating text segmentation tasks
Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009
3. Pk text segmentation metric
Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
"""
try:
import numpy as np
except ImportError:
pass
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
"""
Compute the windowdiff score for a pair of segmentations. A
segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.
>>> s1 = "000100000010"
>>> s2 = "000010000100"
>>> s3 = "100000010000"
>>> '%.2f' % windowdiff(s1, s1, 3)
'0.00'
>>> '%.2f' % windowdiff(s1, s2, 3)
'0.30'
>>> '%.2f' % windowdiff(s2, s3, 3)
'0.80'
:param seg1: a segmentation
:type seg1: str or list
:param seg2: a segmentation
:type seg2: str or list
:param k: window width
:type k: int
:param boundary: boundary value
:type boundary: str or int or bool
:param weighted: use the weighted variant of windowdiff
:type weighted: boolean
:rtype: float
"""
if len(seg1) != len(seg2):
raise ValueError("Segmentations have unequal length")
if k > len(seg1):
raise ValueError(
"Window width k should be smaller or equal than segmentation lengths"
)
wd = 0
for i in range(len(seg1) - k + 1):
ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
if weighted:
wd += ndiff
else:
wd += min(1, ndiff)
return wd / (len(seg1) - k + 1.0)
# Generalized Hamming Distance
def _init_mat(nrows, ncols, ins_cost, del_cost):
mat = np.empty((nrows, ncols))
mat[0, :] = ins_cost * np.arange(ncols)
mat[:, 0] = del_cost * np.arange(nrows)
return mat
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
for i, rowi in enumerate(rowv):
for j, colj in enumerate(colv):
shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
if rowi == colj:
# boundaries are at the same location, no transformation required
tcost = mat[i, j]
elif rowi > colj:
# boundary match through a deletion
tcost = del_cost + mat[i, j + 1]
else:
# boundary match through an insertion
tcost = ins_cost + mat[i + 1, j]
mat[i + 1, j + 1] = min(tcost, shift_cost)
def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
"""
Compute the Generalized Hamming Distance for a reference and a hypothetical
segmentation, corresponding to the cost related to the transformation
of the hypothetical segmentation into the reference segmentation
through boundary insertion, deletion and shift operations.
A segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.
Recommended parameter values are a shift_cost_coeff of 2.
Associated with a ins_cost, and del_cost equal to the mean segment
length in the reference segmentation.
>>> # Same examples as Kulyukin C++ implementation
>>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
0.5
>>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
2.0
>>> ghd('011', '110', 1.0, 1.0, 0.5)
1.0
>>> ghd('1', '0', 1.0, 1.0, 0.5)
1.0
>>> ghd('111', '000', 1.0, 1.0, 0.5)
3.0
>>> ghd('000', '111', 1.0, 2.0, 0.5)
6.0
:param ref: the reference segmentation
:type ref: str or list
:param hyp: the hypothetical segmentation
:type hyp: str or list
:param ins_cost: insertion cost
:type ins_cost: float
:param del_cost: deletion cost
:type del_cost: float
:param shift_cost_coeff: constant used to compute the cost of a shift.
``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
are the positions indicating the shift
:type shift_cost_coeff: float
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
"""
ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
nref_bound = len(ref_idx)
nhyp_bound = len(hyp_idx)
if nref_bound == 0 and nhyp_bound == 0:
return 0.0
elif nref_bound > 0 and nhyp_bound == 0:
return nref_bound * ins_cost
elif nref_bound == 0 and nhyp_bound > 0:
return nhyp_bound * del_cost
mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
_ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
return mat[-1, -1]
# Beeferman's Pk text segmentation evaluation metric
def pk(ref, hyp, k=None, boundary="1"):
"""
Compute the Pk metric for a pair of segmentations A segmentation
is any sequence over a vocabulary of two items (e.g. "0", "1"),
where the specified boundary value is used to mark the edge of a
segmentation.
>>> '%.2f' % pk('0100'*100, '1'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0100'*100, 2)
'0.00'
:param ref: the reference segmentation
:type ref: str or list
:param hyp: the segmentation to evaluate
:type hyp: str or list
:param k: window size, if None, set to half of the average reference segment length
:type boundary: str or int or bool
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
"""
if k is None:
k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
err = 0
for i in range(len(ref) - k + 1):
r = ref[i : i + k].count(boundary) > 0
h = hyp[i : i + k].count(boundary) > 0
if r != h:
err += 1
return err / (len(ref) - k + 1.0)
|