Commit
·
58b2225
1
Parent(s):
0a897e8
evaluated without lm
Browse files
eval.py
CHANGED
@@ -46,12 +46,25 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
46 |
|
47 |
result.map(write_to_file, with_indices=True)
|
48 |
|
|
|
|
|
|
|
|
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
def normalize_text(text: str) -> str:
|
51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
52 |
|
53 |
chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
54 |
-
|
55 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
56 |
|
57 |
# In addition, we can normalize the target text, e.g. removing new lines characters etc...
|
@@ -60,7 +73,10 @@ def normalize_text(text: str) -> str:
|
|
60 |
|
61 |
for t in token_sequences_to_ignore:
|
62 |
text = " ".join(text.split(t))
|
63 |
-
|
|
|
|
|
|
|
64 |
return text
|
65 |
|
66 |
|
@@ -69,8 +85,14 @@ def main(args):
|
|
69 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
70 |
|
71 |
# for testing: only process the first two examples as a test
|
72 |
-
#
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
# load processor
|
75 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
76 |
sampling_rate = feature_extractor.sampling_rate
|
@@ -88,14 +110,12 @@ def main(args):
|
|
88 |
prediction = asr(
|
89 |
batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
|
90 |
)
|
91 |
-
|
92 |
batch["prediction"] = prediction["text"]
|
93 |
batch["target"] = normalize_text(batch["sentence"])
|
94 |
return batch
|
95 |
|
96 |
# run inference on all examples
|
97 |
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
|
98 |
-
|
99 |
# compute and log_results
|
100 |
# do not change function below
|
101 |
log_results(result, args)
|
|
|
46 |
|
47 |
result.map(write_to_file, with_indices=True)
|
48 |
|
49 |
+
def clean_batch(text):
    """Strip everything that is not a basic or Latin-1 letter or a space.

    First pass removes any character outside ``[A-Za-zÀ-ú ]`` and
    lower-cases the result; second pass drops the few Latin-1 letters
    (ß, ð, æ) that survive the first pass but are not expected in
    Spanish transcripts.

    Fix: the original character class was written ``[ß|þ|ð|æ]`` — inside
    ``[...]`` the ``|`` is a literal character, not alternation, so the
    bars were stray class members. They are removed here (harmless in
    this function only because the first pass already deleted any ``|``).
    NOTE(review): þ (U+00FE) sorts above ú (U+00FA) and is already
    removed by the first pass; kept in the class to preserve intent.

    :param text: raw transcript string.
    :return: lower-cased string containing only letters and spaces.
    """
    text = re.sub(r"[^A-Za-zÀ-ú ]", "", text).lower()
    text = re.sub(r"[ßþðæ]", "", text)
    return text
|
53 |
|
54 |
+
def homologate_accents(text):
    """Map selected accented Latin-1 letters to their plain base letter.

    Bug fix: the original classes contained literal pipes, e.g.
    ``([â|ã|ä|å|à])`` — inside ``[...]`` a ``|`` is a literal member of
    the class, so any ``|`` in the input was silently replaced by "a".
    The bars are removed so only the intended accented letters match.

    NOTE(review): Spanish acute vowels á/í/ó/ú are left untouched while
    é IS folded to "e" — this looks inconsistent; presumably the training
    vocabulary contains á/í/ó/ú but not é. Confirm against the charset
    used during training.

    :param text: input string.
    :return: string with the listed accented letters replaced.
    """
    # One (pattern, replacement) pass per base letter, applied in order.
    folds = (
        (r"[âãäåà]", "a"),
        (r"[éêë]", "e"),
        (r"[ìîï]", "i"),
        (r"[öõôòø]", "o"),
        (r"ù", "u"),
        (r"ç", "c"),
    )
    for pattern, base in folds:
        text = re.sub(pattern, base, text)
    return text
|
62 |
+
|
63 |
def normalize_text(text: str) -> str:
|
64 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
65 |
|
66 |
chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
67 |
+
text = text.lower()
|
68 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
69 |
|
70 |
# In addition, we can normalize the target text, e.g. removing new lines characters etc...
|
|
|
73 |
|
74 |
for t in token_sequences_to_ignore:
|
75 |
text = " ".join(text.split(t))
|
76 |
+
|
77 |
+
#added functions
|
78 |
+
text = homologate_accents(text)
|
79 |
+
text = clean_batch(text)
|
80 |
return text
|
81 |
|
82 |
|
|
|
85 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
86 |
|
87 |
# for testing: only process the first two examples as a test
|
88 |
+
#dataset = dataset.select(range(15))
|
89 |
+
# vocab = [character for character in "aábcdeéfghiíjklmnñoópqrstuúüvwxyz·-."]
|
90 |
+
|
91 |
+
# dataset = dataset.filter(
|
92 |
+
# lambda example: not any((c not in vocab) for c in example),
|
93 |
+
# input_columns='sentence',
|
94 |
+
# desc="remove examples with weird characters"
|
95 |
+
# )
|
96 |
# load processor
|
97 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
98 |
sampling_rate = feature_extractor.sampling_rate
|
|
|
110 |
prediction = asr(
|
111 |
batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
|
112 |
)
|
|
|
113 |
batch["prediction"] = prediction["text"]
|
114 |
batch["target"] = normalize_text(batch["sentence"])
|
115 |
return batch
|
116 |
|
117 |
# run inference on all examples
|
118 |
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
|
|
|
119 |
# compute and log_results
|
120 |
# do not change function below
|
121 |
log_results(result, args)
|
mozilla-foundation_common_voice_8_0_es_test_eval_results.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
WER: 0.12618083227750462
|
2 |
+
CER: 0.035028395923434555
|
mozilla-foundation_common_voice_8_0_es_validation_eval_results.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
WER: 0.10670647680293982
|
2 |
+
CER: 0.0284079393233586
|