Spaces:
Running
Running
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Edit distances between Unicode International Phonetic Alphabet strings.

This is a Hugging Face wrapper around the panphon library's distance module.
"""
import datasets
import evaluate
import numpy as np
import panphon.distance
# BibTeX entry for the PanPhon paper; handed to MetricInfo as the metric's citation.
_CITATION = """\
@inproceedings{Mortensen-et-al:2016,
  author = {David R. Mortensen and
            Patrick Littell and
            Akash Bharadwaj and
            Kartik Goyal and
            Chris Dyer and
            Lori S. Levin},
  title = {PanPhon: {A} Resource for Mapping {IPA} Segments to Articulatory Feature Vectors},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  pages = {3475--3484},
  publisher = {{ACL}},
  year = {2016}
}
"""
# Short summary of the metric; handed to MetricInfo as the module description.
_DESCRIPTION = """
Error rates in terms of distance between articulatory phonological features can help understand differences
between strings in the International Phonetic Alphabet (IPA) in a linguistically motivated way.
This is useful when evaluating speech recognition or orthographic to IPA conversion tasks.
"""
# Usage documentation (measures, arguments, return values, examples); handed to
# MetricInfo as the inputs description.
# NOTE: each backslash continuation is preceded by a space so the joined lines
# keep a word break regardless of how the following line is indented.
_KWARGS_DESCRIPTION = """
Calculates the following measures of difference that rely on phonetic features:
- Phone error rate (PER) gives edit distance in terms of phones, rather than Unicode characters, since phones can consist of \
multiple characters. It is normalized by the number of phones of the reference string.
- Phone feature error rate (PFER) is Levenshtein distance between strings where distance between individual phones \
is computed using Hamming distance between phonetic features. By default it is a metric that obeys the triangle \
inequality, but can also be normalized by number of phones.
- Feature error rate (FER) is the edit distance in terms of articulatory features normalized by the number of phones in the reference.
Each measure is given for each prediction, reference pair along with the mean value across all pairs.
Args:
    predictions: list of predictions to score. Each prediction should be a string of unicode characters.
    references: list of references, one for each prediction. Each reference should be a string of unicode characters.
    feature_model: string to set which panphon.distance.Distance feature parsing model is used, choose from "strict", "permissive", "segment". Defaults to "segment".
    is_normalize_pfer: bool, set to True to normalize PFER by the largest number of phones in the prediction, reference pair. Defaults to False.
Returns:
    phone_error_rates: list of floats giving PER for each prediction, reference pair
    mean_phone_error_rate: float, average PER across all examples
    phone_feature_error_rates: list of floats giving PFER for each prediction, reference pair
    mean_phone_feature_error_rate: float, average PFER across all examples
    feature_error_rates: list of floats giving FER for each prediction, reference pair
    mean_feature_error_rate: float, average FER across all examples
Examples:
    Compare articulatory differences in voicing in "bob" vs. "pop" and different pronunciations of "the":
        phone_distance = evaluate.load("ginic/phone_errors")
        phone_distance.compute(predictions=["bob", "ði"], references=["pop", "ðə"])
        {'phone_error_rates': [0.6666666666666666, 0.5], 'mean_phone_error_rate': 0.5833333333333333, 'phone_feature_error_rates': [0.08333333333333333, 0.125], 'mean_phone_feature_error_rate': 0.10416666666666666, 'feature_error_rates': [0.027777777777777776, 0.0625], 'mean_feature_error_rate': 0.04513888888888889}
    Normalize PFER by the length of string with largest number of phones:
        phone_distance = evaluate.load("ginic/phone_errors")
        phone_distance.compute(predictions=["bob", "ði"], references=["pop", "ðə"], is_normalize_pfer=True)
"""
def phone_error_rate(prediction: str, reference: str, distance_computer: panphon.distance.Distance) -> float:
    """Compute the phone error rate (PER) of a prediction against a reference.

    This is similar to the Distance.phoneme_error_rate function, but
    is more efficient and fixes some bugs related to normalization in the original function.

    Both strings are split into phones with ``distance_computer.fm.ipa_segs`` and
    the unit-cost edit distance between the two phone sequences is divided by the
    number of phones in the reference.

    Args:
        prediction (str): predicted transcription
        reference (str): reference transcription; must contain at least one phone
        distance_computer (panphon.distance.Distance): computes edit distance and resolves characters to phones

    Returns:
        float: the phone error rate

    Raises:
        ValueError: if the reference is an empty string, or if segmenting it
            yields no phones (the rate would require a division by zero).

    >>> phone_error_rate("bob", "po", panphon.distance.Distance())
    1.0
    >>> phone_error_rate("ði", "ðə", panphon.distance.Distance())
    0.5
    """
    # Can only be computed when the length of the reference is greater than 0.
    if not reference:
        raise ValueError("one or more references are empty strings")

    pred_phones = distance_computer.fm.ipa_segs(prediction)
    ref_phones = distance_computer.fm.ipa_segs(reference)

    # A non-empty string can still segment to zero phones; fail with the same
    # exception type as the empty-string case instead of a ZeroDivisionError.
    if not ref_phones:
        raise ValueError("one or more references contain no recognizable phones")

    phone_edits = distance_computer.min_edit_distance(
        lambda x: 1,  # deletion cost
        lambda x: 1,  # insertion cost
        lambda x, y: 0 if x == y else 1,  # substitution cost
        [[]],
        pred_phones,
        ref_phones,
    )
    return phone_edits / len(ref_phones)
class PhoneDistance(evaluate.Metric):
    """Class for computing distance between Unicode IPA strings."""

    def _info(self):
        """Return the MetricInfo describing this metric (docs, citation, input schema, links)."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('string', id="sequence"),
                'references': datasets.Value('string', id="sequence"),
            }),
            # Additional links to the codebase or references
            codebase_urls=["https://github.com/dmort27/panphon", "https://huggingface.co/spaces/ginic/phone_distance/tree/main"],
            reference_urls=["https://pypi.org/project/panphon/", "https://arxiv.org/abs/2308.03917"],
        )

    def _compute(self, predictions: list[str], references: list[str], feature_model: str = "segment", is_normalize_pfer: bool = False):
        """Computes phoneme error rates, phone feature error rate (Hamming feature edit distance) and feature error rates between prediction and reference strings

        Args:
            predictions (list[str]): Predicted transcriptions.
            references (list[str]): Reference transcriptions.
            feature_model (str, optional): panphon.distance.Distance feature parsing model to be used, choose from "strict", "permissive", "segment". Defaults to "segment".
            is_normalize_pfer (bool, optional): Set to true to normalize phone feature error rates by maximum length (measure won't be a true metric). Defaults to False.

        Returns:
            dict: {"phone_error_rates": list[float], "mean_phone_error_rate": float, "phone_feature_error_rates": list[float], "mean_phone_feature_error_rate": float,
                "feature_error_rates": list[float], "mean_feature_error_rate": float}

        Raises:
            ValueError: propagated from phone_error_rate when a reference is an empty string.
        """
        distance_computer = panphon.distance.Distance(feature_model=feature_model)

        # Which PFER variant to use depends only on the flag, so choose it once
        # rather than re-testing the flag for every pair.
        if is_normalize_pfer:
            phone_feature_distance = distance_computer.hamming_feature_edit_distance_div_maxlen
        else:
            phone_feature_distance = distance_computer.hamming_feature_edit_distance

        phone_error_rates = []
        feature_error_rates = []
        hamming_distances = []
        # zip pairs predictions and references positionally.
        for prediction, reference in zip(predictions, references):
            hamming_distances.append(phone_feature_distance(prediction, reference))
            phone_error_rates.append(phone_error_rate(prediction, reference, distance_computer))
            feature_error_rates.append(distance_computer.feature_error_rate(prediction, reference))

        return {
            "phone_error_rates": phone_error_rates,
            "mean_phone_error_rate": np.mean(phone_error_rates),
            "phone_feature_error_rates": hamming_distances,
            "mean_phone_feature_error_rate": np.mean(hamming_distances),
            "feature_error_rates": feature_error_rates,
            "mean_feature_error_rate": np.mean(feature_error_rates),
        }