Commit
·
58b2225
1
Parent(s):
0a897e8
evaluated without lm
Browse files
eval.py
CHANGED
@@ -46,12 +46,25 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
46 |
|
47 |
result.map(write_to_file, with_indices=True)
|
48 |
|
|
|
|
|
|
|
|
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
def normalize_text(text: str) -> str:
|
51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
52 |
|
53 |
chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
54 |
-
|
55 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
56 |
|
57 |
# In addition, we can normalize the target text, e.g. removing new lines characters etc...
|
@@ -60,7 +73,10 @@ def normalize_text(text: str) -> str:
|
|
60 |
|
61 |
for t in token_sequences_to_ignore:
|
62 |
text = " ".join(text.split(t))
|
63 |
-
|
|
|
|
|
|
|
64 |
return text
|
65 |
|
66 |
|
@@ -69,8 +85,14 @@ def main(args):
|
|
69 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
70 |
|
71 |
# for testing: only process the first two examples as a test
|
72 |
-
#
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
# load processor
|
75 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
76 |
sampling_rate = feature_extractor.sampling_rate
|
@@ -88,14 +110,12 @@ def main(args):
|
|
88 |
prediction = asr(
|
89 |
batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
|
90 |
)
|
91 |
-
|
92 |
batch["prediction"] = prediction["text"]
|
93 |
batch["target"] = normalize_text(batch["sentence"])
|
94 |
return batch
|
95 |
|
96 |
# run inference on all examples
|
97 |
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
|
98 |
-
|
99 |
# compute and log_results
|
100 |
# do not change function below
|
101 |
log_results(result, args)
|
|
|
46 |
|
47 |
result.map(write_to_file, with_indices=True)
|
48 |
|
49 |
+
def clean_batch(text):
    """Strip everything that is not a basic or Latin-1 letter or a space.

    First pass removes any character outside ``[A-Za-zÀ-ú ]`` and
    lower-cases the result; second pass drops the few Latin-1 letters
    (ß, ð, æ) that survive the first pass but are not expected in
    Spanish transcripts.

    Fix: the original character class was written ``[ß|þ|ð|æ]`` — inside
    ``[...]`` the ``|`` is a literal character, not alternation, so the
    bars were stray class members. They are removed here (harmless in
    this function only because the first pass already deleted any ``|``).
    NOTE(review): þ (U+00FE) sorts above ú (U+00FA) and is already
    removed by the first pass; kept in the class to preserve intent.

    :param text: raw transcript string.
    :return: lower-cased string containing only letters and spaces.
    """
    text = re.sub(r"[^A-Za-zÀ-ú ]", "", text).lower()
    text = re.sub(r"[ßþðæ]", "", text)
    return text
|
53 |
|
54 |
+
def homologate_accents(text):
    """Map selected accented Latin-1 letters to their plain base letter.

    Bug fix: the original classes contained literal pipes, e.g.
    ``([â|ã|ä|å|à])`` — inside ``[...]`` a ``|`` is a literal member of
    the class, so any ``|`` in the input was silently replaced by "a".
    The bars are removed so only the intended accented letters match.

    NOTE(review): Spanish acute vowels á/í/ó/ú are left untouched while
    é IS folded to "e" — this looks inconsistent; presumably the training
    vocabulary contains á/í/ó/ú but not é. Confirm against the charset
    used during training.

    :param text: input string.
    :return: string with the listed accented letters replaced.
    """
    # One (pattern, replacement) pass per base letter, applied in order.
    folds = (
        (r"[âãäåà]", "a"),
        (r"[éêë]", "e"),
        (r"[ìîï]", "i"),
        (r"[öõôòø]", "o"),
        (r"ù", "u"),
        (r"ç", "c"),
    )
    for pattern, base in folds:
        text = re.sub(pattern, base, text)
    return text
|
62 |
+
|
63 |
def normalize_text(text: str) -> str:
|
64 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
65 |
|
66 |
chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
67 |
+
text = text.lower()
|
68 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
69 |
|
70 |
# In addition, we can normalize the target text, e.g. removing new lines characters etc...
|
|
|
73 |
|
74 |
for t in token_sequences_to_ignore:
|
75 |
text = " ".join(text.split(t))
|
76 |
+
|
77 |
+
#added functions
|
78 |
+
text = homologate_accents(text)
|
79 |
+
text = clean_batch(text)
|
80 |
return text
|
81 |
|
82 |
|
|
|
85 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
86 |
|
87 |
# for testing: only process the first two examples as a test
|
88 |
+
#dataset = dataset.select(range(15))
|
89 |
+
# vocab = [character for character in "aábcdeéfghiíjklmnñoópqrstuúüvwxyz·-."]
|
90 |
+
|
91 |
+
# dataset = dataset.filter(
|
92 |
+
# lambda example: not any((c not in vocab) for c in example),
|
93 |
+
# input_columns='sentence',
|
94 |
+
# desc="remove examples with weird characters"
|
95 |
+
# )
|
96 |
# load processor
|
97 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
98 |
sampling_rate = feature_extractor.sampling_rate
|
|
|
110 |
prediction = asr(
|
111 |
batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
|
112 |
)
|
|
|
113 |
batch["prediction"] = prediction["text"]
|
114 |
batch["target"] = normalize_text(batch["sentence"])
|
115 |
return batch
|
116 |
|
117 |
# run inference on all examples
|
118 |
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
|
|
|
119 |
# compute and log_results
|
120 |
# do not change function below
|
121 |
log_results(result, args)
|
mozilla-foundation_common_voice_8_0_es_test_eval_results.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
WER: 0.12618083227750462
|
2 |
+
CER: 0.035028395923434555
|
mozilla-foundation_common_voice_8_0_es_validation_eval_results.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
WER: 0.10670647680293982
|
2 |
+
CER: 0.0284079393233586
|