.. Copyright (C) 2001-2023 NLTK Project
.. For license information, see LICENSE.TXT

=======
Metrics
=======

-----
Setup
-----

>>> import pytest
>>> _ = pytest.importorskip("numpy")

The `nltk.metrics` package provides a variety of *evaluation measures*
which can be used for a wide variety of NLP tasks.

>>> from nltk.metrics import *

------------------
Standard IR Scores
------------------

We can use standard scores from information retrieval to test the
performance of taggers, chunkers, etc.

>>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
>>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
>>> print(accuracy(reference, test))
0.8
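
Accuracy here is simply the proportion of positions at which the two
sequences agree, which we can verify by hand:

>>> sum(1 for r, t in zip(reference, test) if r == t) / len(reference)
0.8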

The following measures apply to sets:

>>> reference_set = set(reference)
>>> test_set = set(test)
>>> precision(reference_set, test_set)
1.0
>>> print(recall(reference_set, test_set))
0.8
>>> print(f_measure(reference_set, test_set))
0.88888888888...
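
``f_measure`` also accepts an ``alpha`` parameter that weights precision
against recall as ``1/(alpha/p + (1-alpha)/r)`` (the same formula used in
the Confusion Matrix section below), so with ``alpha=1.0`` it should reduce
to the precision:

>>> f_measure(reference_set, test_set, alpha=1.0)
1.0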

Measuring the likelihood of the data, given probability distributions:

>>> from nltk import FreqDist, MLEProbDist
>>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
>>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
>>> print(log_likelihood(['a', 'd'], [pdist1, pdist2]))
-2.7075187496...
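
The score should just be the average base-2 log probability that each
distribution assigns to the corresponding reference value, which we can
check using the distributions' ``logprob`` method:

>>> avg_logprob = (pdist1.logprob('a') + pdist2.logprob('d')) / 2
>>> abs(log_likelihood(['a', 'd'], [pdist1, pdist2]) - avg_logprob) < 1e-9
True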

----------------
Distance Metrics
----------------

String edit distance (Levenshtein):

>>> edit_distance("rain", "shine")
3
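
``edit_distance`` also takes optional ``substitution_cost`` and
``transpositions`` arguments; allowing transpositions of adjacent
characters should make a single swap cost 1 instead of 2 (a small
illustrative check, assuming the keyword arguments available in recent
NLTK releases):

>>> edit_distance("abc", "acb")
2
>>> edit_distance("abc", "acb", transpositions=True)
1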

>>> edit_distance_align("shine", "shine")
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
>>> edit_distance_align("rain", "brainy")
[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)]
>>> edit_distance_align("", "brainy")
[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)]
>>> edit_distance_align("", "")
[(0, 0)]

Other distance measures:

>>> s1 = set([1,2,3,4])
>>> s2 = set([3,4,5])
>>> binary_distance(s1, s2)
1.0
>>> print(jaccard_distance(s1, s2))
0.6
>>> print(masi_distance(s1, s2))
0.868
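
Jaccard distance is one minus the ratio of the intersection size to the
union size, which we can confirm directly (using the same style of
floating-point comparison as the WindowDiff examples below):

>>> abs(jaccard_distance(s1, s2) - (1 - len(s1 & s2) / len(s1 | s2))) < 1e-6
True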

----------------------
Miscellaneous Measures
----------------------

Rank Correlation works with two dictionaries mapping keys to ranks.
The dictionaries should have the same set of keys.

>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3})
0.5
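
When the two rankings agree exactly, the correlation should reach its
maximum value of 1.0:

>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 't':2, 'a':3})
1.0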

Windowdiff uses a sliding window in comparing two segmentations of the same input (e.g. tokenizations, chunkings).
Segmentations are represented using strings of zeros and ones.

>>> s1 = "000100000010"
>>> s2 = "000010000100"
>>> s3 = "100000010000"
>>> s4 = "000000000000"
>>> s5 = "111111111111"
>>> windowdiff(s1, s1, 3)
0.0
>>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6  # windowdiff(s1, s2, 3) == 0.3
True
>>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6  # windowdiff(s2, s3, 3) == 0.8
True
>>> windowdiff(s1, s4, 3)
0.5
>>> windowdiff(s1, s5, 3)
1.0
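
Under the usual WindowDiff definition, the score is the fraction of
length-``k`` windows in which the two segmentations disagree on the number
of boundaries, so the ``windowdiff(s1, s4, 3)`` score above can be
reproduced by hand (a sketch, assuming that standard definition):

>>> k = 3
>>> n_windows = len(s1) - k + 1
>>> disagreements = sum(s1[i:i + k].count("1") != s4[i:i + k].count("1")
...                     for i in range(n_windows))
>>> disagreements / n_windows
0.5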

----------------
Confusion Matrix
----------------

>>> reference = 'This is the reference data.  Testing 123.  aoaeoeoe'
>>> test = 'Thos iz_the rifirenci data.  Testeng 123.  aoaeoeoe'
>>> print(ConfusionMatrix(reference, test))
  |   . 1 2 3 T _ a c d e f g h i n o r s t z |
--+-------------------------------------------+
  |<8>. . . . . 1 . . . . . . . . . . . . . . |
. | .<2>. . . . . . . . . . . . . . . . . . . |
1 | . .<1>. . . . . . . . . . . . . . . . . . |
2 | . . .<1>. . . . . . . . . . . . . . . . . |
3 | . . . .<1>. . . . . . . . . . . . . . . . |
T | . . . . .<2>. . . . . . . . . . . . . . . |
_ | . . . . . .<.>. . . . . . . . . . . . . . |
a | . . . . . . .<4>. . . . . . . . . . . . . |
c | . . . . . . . .<1>. . . . . . . . . . . . |
d | . . . . . . . . .<1>. . . . . . . . . . . |
e | . . . . . . . . . .<6>. . . 3 . . . . . . |
f | . . . . . . . . . . .<1>. . . . . . . . . |
g | . . . . . . . . . . . .<1>. . . . . . . . |
h | . . . . . . . . . . . . .<2>. . . . . . . |
i | . . . . . . . . . . 1 . . .<1>. 1 . . . . |
n | . . . . . . . . . . . . . . .<2>. . . . . |
o | . . . . . . . . . . . . . . . .<3>. . . . |
r | . . . . . . . . . . . . . . . . .<2>. . . |
s | . . . . . . . . . . . . . . . . . .<2>. 1 |
t | . . . . . . . . . . . . . . . . . . .<3>. |
z | . . . . . . . . . . . . . . . . . . . .<.>|
--+-------------------------------------------+
(row = reference; col = test)
<BLANKLINE>

>>> cm = ConfusionMatrix(reference, test)
>>> print(cm.pretty_format(sort_by_count=True))
  |   e a i o s t . T h n r 1 2 3 c d f g _ z |
--+-------------------------------------------+
  |<8>. . . . . . . . . . . . . . . . . . 1 . |
e | .<6>. 3 . . . . . . . . . . . . . . . . . |
a | . .<4>. . . . . . . . . . . . . . . . . . |
i | . 1 .<1>1 . . . . . . . . . . . . . . . . |
o | . . . .<3>. . . . . . . . . . . . . . . . |
s | . . . . .<2>. . . . . . . . . . . . . . 1 |
t | . . . . . .<3>. . . . . . . . . . . . . . |
. | . . . . . . .<2>. . . . . . . . . . . . . |
T | . . . . . . . .<2>. . . . . . . . . . . . |
h | . . . . . . . . .<2>. . . . . . . . . . . |
n | . . . . . . . . . .<2>. . . . . . . . . . |
r | . . . . . . . . . . .<2>. . . . . . . . . |
1 | . . . . . . . . . . . .<1>. . . . . . . . |
2 | . . . . . . . . . . . . .<1>. . . . . . . |
3 | . . . . . . . . . . . . . .<1>. . . . . . |
c | . . . . . . . . . . . . . . .<1>. . . . . |
d | . . . . . . . . . . . . . . . .<1>. . . . |
f | . . . . . . . . . . . . . . . . .<1>. . . |
g | . . . . . . . . . . . . . . . . . .<1>. . |
_ | . . . . . . . . . . . . . . . . . . .<.>. |
z | . . . . . . . . . . . . . . . . . . . .<.>|
--+-------------------------------------------+
(row = reference; col = test)
<BLANKLINE>

>>> print(cm.pretty_format(sort_by_count=True, truncate=10))
  |   e a i o s t . T h |
--+---------------------+
  |<8>. . . . . . . . . |
e | .<6>. 3 . . . . . . |
a | . .<4>. . . . . . . |
i | . 1 .<1>1 . . . . . |
o | . . . .<3>. . . . . |
s | . . . . .<2>. . . . |
t | . . . . . .<3>. . . |
. | . . . . . . .<2>. . |
T | . . . . . . . .<2>. |
h | . . . . . . . . .<2>|
--+---------------------+
(row = reference; col = test)
<BLANKLINE>

>>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False))
   |                   1 |
   | 1 2 3 4 5 6 7 8 9 0 |
---+---------------------+
 1 |<8>. . . . . . . . . |
 2 | .<6>. 3 . . . . . . |
 3 | . .<4>. . . . . . . |
 4 | . 1 .<1>1 . . . . . |
 5 | . . . .<3>. . . . . |
 6 | . . . . .<2>. . . . |
 7 | . . . . . .<3>. . . |
 8 | . . . . . . .<2>. . |
 9 | . . . . . . . .<2>. |
10 | . . . . . . . . .<2>|
---+---------------------+
(row = reference; col = test)
Value key:
     1:
     2: e
     3: a
     4: i
     5: o
     6: s
     7: t
     8: .
     9: T
    10: h
<BLANKLINE>
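
Individual cell counts can also be read off the matrix directly by
indexing with a (reference, test) pair; for example, reference "e" was
transcribed as "i" three times:

>>> cm['e', 'i']
3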
For "e", the number of true positives should be 6, while the number of false negatives is 3. | |
So, the recall ought to be 6 / (6 + 3): | |
>>> cm.recall("e") # doctest: +ELLIPSIS | |
0.666666... | |
For "e", the false positive is just 1, so the precision should be 6 / (6 + 1): | |
>>> cm.precision("e") # doctest: +ELLIPSIS | |
0.857142... | |
The f-measure with default value of ``alpha = 0.5`` should then be: | |
* *1/(alpha/p + (1-alpha)/r) =* | |
* *1/(0.5/p + 0.5/r) =* | |
* *2pr / (p + r) =* | |
* *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =* | |
* *0.749999...* | |
>>> cm.f_measure("e") # doctest: +ELLIPSIS | |
0.749999... | |

--------------------
Association measures
--------------------

These measures are useful to determine whether the co-occurrence of two random
events is meaningful. They are used, for instance, to distinguish collocations
from other pairs of adjacent words.

The following examples of bigram association calculations are taken from
Manning and Schutze's SNLP, 2nd Ed., chapter 5.

>>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668
>>> bam = BigramAssocMeasures
>>> bam.raw_freq(20, (42, 20), N) == 20. / N
True
>>> bam.student_t(n_new_companies, (n_new, n_companies), N)
0.999...
>>> bam.chi_sq(n_new_companies, (n_new, n_companies), N)
1.54...
>>> bam.likelihood_ratio(150, (12593, 932), N)
1291...

For the other association measures, we check the relative ordering of the scores:

>>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N)
True
>>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N)
True
>>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N)
True
>>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N)
True
>>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N)
True
>>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
True
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
False
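
As a sanity check on one of these, PMI is the base-2 log ratio of the
observed joint count to the count expected under independence,
``log2(n_ii * N / (n_ix * n_xi))``, so the first ``bam.pmi`` call above can
be reproduced by hand (a sketch under that standard definition):

>>> import math
>>> abs(bam.pmi(20, (42, 20), N) - math.log(20 * N / (42 * 20), 2)) < 1e-6
True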

For trigrams, we have to provide more count information:

>>> n_w1_w2_w3 = 20
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
>>> n_w1, n_w2, n_w3 = 100, 200, 300
>>> uni_counts = (n_w1, n_w2, n_w3)
>>> N = 14307668
>>> tam = TrigramAssocMeasures
>>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N
True
>>> uni_counts2 = (n_w1, n_w2, 100)
>>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
True

For fourgrams, we have to provide more count information:

>>> n_w1_w2_w3_w4 = 5
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
>>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
>>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
>>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
>>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
>>> N = 14307668
>>> qam = QuadgramAssocMeasures
>>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
True