Plim committed on
Commit
373963f
·
1 Parent(s): 65c46e9

Model save

Browse files
.ipynb_checkpoints/run_speech_recognition_ctc-checkpoint.py CHANGED
@@ -434,14 +434,11 @@ def main():
434
  # that make training complicated and do not help in transcribing the speech
435
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
436
  # that could be easily picked up by the model
437
- chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'
438
  text_column_name = data_args.text_column_name
439
 
440
  def remove_and_replace_special_characters(batch):
441
- if chars_to_ignore_regex is not None:
442
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
443
- else:
444
- batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
445
  return batch
446
 
447
  with training_args.main_process_first(desc="dataset map special characters removal"):
 
434
  # that make training complicated and do not help in transcribing the speech
435
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
436
  # that could be easily picked up by the model
437
+ chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\'’ ]'
438
  text_column_name = data_args.text_column_name
439
 
440
  def remove_and_replace_special_characters(batch):
441
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name].lower()).replace('’', "'") + " "
 
 
 
442
  return batch
443
 
444
  with training_args.main_process_first(desc="dataset map special characters removal"):
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49ea08a4f479298cf334d5e185c3436c6386682d50c2b013bf79ba2880dd7fb2
3
  size 1263088113
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ec1c7675b56877de46f623ac51149e73d4f88cac691dc72595621d35344ce9b
3
  size 1263088113
run_speech_recognition_ctc.py CHANGED
@@ -434,14 +434,11 @@ def main():
434
  # that make training complicated and do not help in transcribing the speech
435
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
436
  # that could be easily picked up by the model
437
- chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'
438
  text_column_name = data_args.text_column_name
439
 
440
  def remove_and_replace_special_characters(batch):
441
- if chars_to_ignore_regex is not None:
442
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
443
- else:
444
- batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
445
  return batch
446
 
447
  with training_args.main_process_first(desc="dataset map special characters removal"):
 
434
  # that make training complicated and do not help in transcribing the speech
435
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
436
  # that could be easily picked up by the model
437
+ chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\'’ ]'
438
  text_column_name = data_args.text_column_name
439
 
440
  def remove_and_replace_special_characters(batch):
441
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name].lower()).replace('’', "'") + " "
 
 
 
442
  return batch
443
 
444
  with training_args.main_process_first(desc="dataset map special characters removal"):