Irpan committed on
Commit
4f70bd6
·
1 Parent(s): 9510f4a
Files changed (22) hide show
  1. kaztts_male2_tacotron2_train.loss.ave/exp/tts_stats_raw_char/train/feats_stats.npz +3 -0
  2. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/config.yaml +231 -0
  3. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/attn_loss.png +0 -0
  4. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/backward_time.png +0 -0
  5. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/bce_loss.png +0 -0
  6. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/forward_time.png +0 -0
  7. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/gpu_max_cached_mem_GB.png +0 -0
  8. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/iter_time.png +0 -0
  9. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/l1_loss.png +0 -0
  10. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/loss.png +0 -0
  11. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/mse_loss.png +0 -0
  12. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/optim0_lr0.png +0 -0
  13. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/optim_step_time.png +0 -0
  14. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/train_time.png +0 -0
  15. kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/train.loss.ave_5best.pth +3 -0
  16. kaztts_male2_tacotron2_train.loss.ave/meta.yaml +9 -0
  17. parallelwavegan_male2_checkpoint/checkpoint-400000steps.pkl +3 -0
  18. parallelwavegan_male2_checkpoint/config.yml +104 -0
  19. requirements.txt +4 -1
  20. tts.py +47 -2
  21. turkicTTS_ipa_convert.py +1815 -0
  22. turkicTTS_utils.py +24 -0
kaztts_male2_tacotron2_train.loss.ave/exp/tts_stats_raw_char/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dca35226cf181d840baedf98032bea16d3cb4b69496cd59adb8f04b49d298fc
3
+ size 1402
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/config.yaml ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_raw_char
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 200
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 5120000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_char/train/text_shape.char
72
+ - exp/tts_stats_raw_char/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_char/valid/text_shape.char
75
+ - exp/tts_stats_raw_char/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - dump/raw/tr_no_dev/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ optim: adam
106
+ optim_conf:
107
+ lr: 0.001
108
+ eps: 1.0e-06
109
+ weight_decay: 0.0
110
+ scheduler: null
111
+ scheduler_conf: {}
112
+ token_list:
113
+ - <blank>
114
+ - <unk>
115
+ - <space>
116
+ - а
117
+ - ы
118
+ - е
119
+ - н
120
+ - т
121
+ - р
122
+ - і
123
+ - л
124
+ - с
125
+ - д
126
+ - қ
127
+ - м
128
+ - к
129
+ - о
130
+ - б
131
+ - ж
132
+ - у
133
+ - з
134
+ - и
135
+ - ғ
136
+ - п
137
+ - ң
138
+ - ш
139
+ - й
140
+ - г
141
+ - .
142
+ - ү
143
+ - ұ
144
+ - ө
145
+ - ','
146
+ - ә
147
+ - '-'
148
+ - я
149
+ - в
150
+ - х
151
+ - ц
152
+ - ф
153
+ - э
154
+ - ь
155
+ - ю
156
+ - ч
157
+ - ':'
158
+ - '?'
159
+ - ;
160
+ - ъ
161
+ - һ
162
+ - '!'
163
+ - щ
164
+ - ё
165
+ - <sos/eos>
166
+ odim: null
167
+ model_conf: {}
168
+ use_preprocessor: true
169
+ token_type: char
170
+ bpemodel: null
171
+ non_linguistic_symbols: null
172
+ cleaner: null
173
+ g2p: null
174
+ feats_extract: fbank
175
+ feats_extract_conf:
176
+ n_fft: 1024
177
+ hop_length: 256
178
+ win_length: null
179
+ fs: 22050
180
+ fmin: 80
181
+ fmax: 7600
182
+ n_mels: 80
183
+ normalize: global_mvn
184
+ normalize_conf:
185
+ stats_file: exp/tts_stats_raw_char/train/feats_stats.npz
186
+ tts: tacotron2
187
+ tts_conf:
188
+ embed_dim: 512
189
+ elayers: 1
190
+ eunits: 512
191
+ econv_layers: 3
192
+ econv_chans: 512
193
+ econv_filts: 5
194
+ atype: location
195
+ adim: 512
196
+ aconv_chans: 32
197
+ aconv_filts: 15
198
+ cumulate_att_w: true
199
+ dlayers: 2
200
+ dunits: 1024
201
+ prenet_layers: 2
202
+ prenet_units: 256
203
+ postnet_layers: 5
204
+ postnet_chans: 512
205
+ postnet_filts: 5
206
+ output_activation: null
207
+ use_batch_norm: true
208
+ use_concate: true
209
+ use_residual: false
210
+ dropout_rate: 0.5
211
+ zoneout_rate: 0.1
212
+ reduction_factor: 1
213
+ spk_embed_dim: null
214
+ use_masking: true
215
+ bce_pos_weight: 5.0
216
+ use_guided_attn_loss: true
217
+ guided_attn_loss_sigma: 0.4
218
+ guided_attn_loss_lambda: 1.0
219
+ pitch_extract: null
220
+ pitch_extract_conf: {}
221
+ pitch_normalize: null
222
+ pitch_normalize_conf: {}
223
+ energy_extract: null
224
+ energy_extract_conf: {}
225
+ energy_normalize: null
226
+ energy_normalize_conf: {}
227
+ required:
228
+ - output_dir
229
+ - token_list
230
+ version: 0.10.3a4
231
+ distributed: false
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/attn_loss.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/backward_time.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/bce_loss.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/forward_time.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/gpu_max_cached_mem_GB.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/iter_time.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/l1_loss.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/loss.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/mse_loss.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/optim0_lr0.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/optim_step_time.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/images/train_time.png ADDED
kaztts_male2_tacotron2_train.loss.ave/exp/tts_train_raw_char/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deb9cb49703b012d8f1406d15f4fa182b39ecb320bc4e2be5a557ac58766ea75
3
+ size 106809178
kaztts_male2_tacotron2_train.loss.ave/meta.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.3a4
2
+ files:
3
+ model_file: exp/tts_train_raw_char/train.loss.ave_5best.pth
4
+ python: "3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:59:51) \n[GCC\
5
+ \ 9.4.0]"
6
+ timestamp: 1641919865.515724
7
+ torch: 1.7.0
8
+ yaml_files:
9
+ train_config: exp/tts_train_raw_char/config.yaml
parallelwavegan_male2_checkpoint/checkpoint-400000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb415182fdf84dbe6a0bbca72e5ed9ed39009b622cf39e20ba630cb830594329
3
+ size 17694194
parallelwavegan_male2_checkpoint/config.yml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 25600
3
+ batch_size: 6
4
+ config: conf/parallel_wavegan.v1.yaml
5
+ dev_dumpdir: dump/dev/norm
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_grad_norm: 1
10
+ discriminator_optimizer_params:
11
+ eps: 1.0e-06
12
+ lr: 5.0e-05
13
+ weight_decay: 0.0
14
+ discriminator_params:
15
+ bias: true
16
+ conv_channels: 64
17
+ in_channels: 1
18
+ kernel_size: 3
19
+ layers: 10
20
+ nonlinear_activation: LeakyReLU
21
+ nonlinear_activation_params:
22
+ negative_slope: 0.2
23
+ out_channels: 1
24
+ use_weight_norm: true
25
+ discriminator_scheduler_params:
26
+ gamma: 0.5
27
+ step_size: 200000
28
+ discriminator_train_start_steps: 100000
29
+ distributed: false
30
+ eval_interval_steps: 1000
31
+ fft_size: 1024
32
+ fmax: 7600
33
+ fmin: 80
34
+ format: hdf5
35
+ generator_grad_norm: 10
36
+ generator_optimizer_params:
37
+ eps: 1.0e-06
38
+ lr: 0.0001
39
+ weight_decay: 0.0
40
+ generator_params:
41
+ aux_channels: 80
42
+ aux_context_window: 2
43
+ dropout: 0.0
44
+ gate_channels: 128
45
+ in_channels: 1
46
+ kernel_size: 3
47
+ layers: 30
48
+ out_channels: 1
49
+ residual_channels: 64
50
+ skip_channels: 64
51
+ stacks: 3
52
+ upsample_net: ConvInUpsampleNetwork
53
+ upsample_params:
54
+ upsample_scales:
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 4
59
+ use_weight_norm: true
60
+ generator_scheduler_params:
61
+ gamma: 0.5
62
+ step_size: 200000
63
+ global_gain_scale: 1.0
64
+ hop_size: 256
65
+ lambda_adv: 4.0
66
+ log_interval_steps: 100
67
+ num_mels: 80
68
+ num_save_intermediate_results: 4
69
+ num_workers: 2
70
+ outdir: exp/train_nodev_parallel_wavegan.v1
71
+ pin_memory: true
72
+ pretrain: ''
73
+ rank: 0
74
+ remove_short_samples: true
75
+ resume: ''
76
+ sampling_rate: 22050
77
+ save_interval_steps: 5000
78
+ stft_loss_params:
79
+ fft_sizes:
80
+ - 1024
81
+ - 2048
82
+ - 512
83
+ hop_sizes:
84
+ - 120
85
+ - 240
86
+ - 50
87
+ win_lengths:
88
+ - 600
89
+ - 1200
90
+ - 240
91
+ window: hann_window
92
+ train_dumpdir: dump/train_nodev/norm
93
+ train_feats_scp: null
94
+ train_max_steps: 400000
95
+ train_segments: null
96
+ train_wav_scp: null
97
+ trim_frame_size: 2048
98
+ trim_hop_size: 512
99
+ trim_silence: false
100
+ trim_threshold_in_db: 60
101
+ verbose: 1
102
+ version: 0.4.8
103
+ win_length: null
104
+ window: hann
requirements.txt CHANGED
@@ -4,4 +4,7 @@ torchaudio
4
  transformers
5
  numpy
6
  scipy==1.13.1
7
- umsc
 
 
 
 
4
  transformers
5
  numpy
6
  scipy==1.13.1
7
+ umsc
8
+ parallel_wavegan==0.6.1
9
+ espnet==202412
10
+ espnet-tts-frontend==0.0.3
tts.py CHANGED
@@ -1,8 +1,13 @@
1
  from transformers import VitsModel, AutoTokenizer
2
  import torch
3
  import scipy.io.wavfile
 
 
 
4
  import util
5
 
 
 
6
  # Load processor and model
7
  models_info = {
8
  "Meta-MMS": {
@@ -10,10 +15,37 @@ models_info = {
10
  "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
11
  "arabic_script": True
12
  },
 
13
  }
14
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def synthesize(text, model_id):
 
 
 
17
  if models_info[model_id]["arabic_script"]:
18
  text = util.ug_latn_to_arab(text)
19
  processor = models_info[model_id]["processor"]
@@ -27,4 +59,17 @@ def synthesize(text, model_id):
27
  sample_rate = model.config.sampling_rate
28
  scipy.io.wavfile.write(output_path, rate=sample_rate, data=output.numpy()[0])
29
 
30
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from transformers import VitsModel, AutoTokenizer
2
  import torch
3
  import scipy.io.wavfile
4
+ from parallel_wavegan.utils import load_model
5
+ from espnet2.bin.tts_inference import Text2Speech
6
+ from turkicTTS_utils import normalization
7
  import util
8
 
9
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10
+
11
  # Load processor and model
12
  models_info = {
13
  "Meta-MMS": {
 
15
  "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
16
  "arabic_script": True
17
  },
18
+ "IS2AI-TurkicTTS": None
19
  }
20
+
21
+ vocoder_checkpoint="parallelwavegan_male2_checkpoint/checkpoint-400000steps.pkl" ### specify vocoder path
22
+ vocoder = load_model(vocoder_checkpoint).to(device).eval()
23
+ vocoder.remove_weight_norm()
24
+
25
+ ### specify path to the main model(transformer/tacotron2/fastspeech) and its config file
26
+ config_file = "exp/tts_train_raw_char/config.yaml"
27
+ model_path = "exp/tts_train_raw_char/train.loss.ave_5best.pth"
28
+
29
+ text2speech = Text2Speech(
30
+ config_file,
31
+ model_path,
32
+ device=device, ## if cuda not available use cpu
33
+ ### only for Tacotron 2
34
+ threshold=0.5,
35
+ minlenratio=0.0,
36
+ maxlenratio=10.0,
37
+ use_att_constraint=True,
38
+ backward_window=1,
39
+ forward_window=3,
40
+ ### only for FastSpeech & FastSpeech2
41
+ speed_control_alpha=1.0,
42
+ )
43
+ text2speech.spc2wav = None ### disable griffin-lim
44
 
45
  def synthesize(text, model_id):
46
+ if model_id == 'IS2AI-TurkicTTS':
47
+ return synthesize_turkic_tts(text)
48
+
49
  if models_info[model_id]["arabic_script"]:
50
  text = util.ug_latn_to_arab(text)
51
  processor = models_info[model_id]["processor"]
 
59
  sample_rate = model.config.sampling_rate
60
  scipy.io.wavfile.write(output_path, rate=sample_rate, data=output.numpy()[0])
61
 
62
+ return output_path
63
+
64
def synthesize_turkic_tts(text):
    """Synthesize speech for ``text`` with the IS2AI TurkicTTS pipeline.

    The text is normalized with ``normalization(text, 'uyghur')``, turned into
    generated features by the module-level ``text2speech`` model, and converted
    to a waveform by the module-level ``vocoder``.

    Args:
        text: Input text to be spoken.

    Returns:
        Path of the WAV file that was written ("tts_output.wav"), mirroring
        what ``synthesize`` returns for the other models.
    """
    text = normalization(text, 'uyghur')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        c_mel = text2speech(text)['feat_gen']
        wav = vocoder.inference(c_mel)

    # Flatten to a 1-D sample vector on the CPU.
    output = wav.view(-1).cpu()

    output_path = "tts_output.wav"
    # Write the whole waveform: after view(-1) the tensor is 1-D, so indexing
    # it with [0] would pass a single scalar sample instead of the audio.
    # 22050 Hz matches `sampling_rate` / `fs` in the shipped model configs.
    scipy.io.wavfile.write(output_path, rate=22050, data=output.numpy())

    # synthesize() returns this value to its caller, so the path must be
    # returned explicitly.
    return output_path
75
+
turkicTTS_ipa_convert.py ADDED
@@ -0,0 +1,1815 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ '''
6
+ 2022.06.03
7
+ kazakh_to_ipa() <> ipa_to_kazakh()
8
+ test_kazakh()
9
+ turkish_to_ipa() <> ipa_to_turkish()
10
+ test_turkish()
11
+
12
+ 2022.07.05
13
+ kyrgyz_to_ipa() <> ipa_to_kyrgyz()
14
+ test_kyrgyz()
15
+ uzbek_to_ipa() <> ipa_to_uzbek()
16
+ test_uzbek()
17
+ azerbaijani_to_ipa() <> ipa_to_azerbaijani()
18
+ test_azerbaijani()
19
+ turkmen_to_ipa() <> ipa_to_turkmen()
20
+ test_turkmen()
21
+
22
+ 2022.07.07
23
+ tatar_to_ipa() <> ipa_to_tatar()
24
+ test_tatar()
25
+ bashkir_to_ipa() <> ipa_to_bashkir()
26
+ test_bashkir()
27
+ sakha_to_ipa() <> ipa_to_sakha()
28
+ test_sakha()
29
+
30
+ 2022.07.12
31
+ experimentally added î and â to turkish_to_ipa()
32
+
33
+ 2022.08.04
34
+ uyghur_to_ipa() <> ipa_to_uyghur()
35
+ '''
36
+
37
+ import re
38
+
39
+
40
+ # kazakh scripts
41
+
42
# Kazakh Cyrillic letter (upper and lower case) -> working IPA symbol.
# For convenience, multi-sound letters are denoted by a single symbol
# (e.g. "ю" -> "ǔ", "ц" -> "š", "ч" -> "ʆ"); they can later be converted
# to conventional symbols.
_KAZAKH_TO_IPA_TABLE = {
    ord(letter): symbol
    for letters, symbol in (
        # three-sound convenience vowels
        ("Юю", "ǔ"),
        # two-sound convenience consonants
        ("Цц", "š"),
        ("Чч", "ʆ"),
        # two-sound convenience vowels
        ("Яя", "ǎ"),
        ("Ее", "ě"),
        ("Ёё", "ǒ"),
        ("Ии", "ǐ"),
        ("Уу", "u"),
        # single-sound consonants
        ("Бб", "b"),
        ("Вв", "v"),
        ("Гг", "g"),
        ("Ғғ", "ɣ"),
        ("Дд", "d"),
        ("Жж", "ʒ"),
        ("Зз", "z"),
        ("Йй", "j"),
        ("Кк", "k"),
        ("Ққ", "q"),
        ("Лл", "l"),
        ("Мм", "m"),
        ("Нн", "n"),
        ("Ңң", "ŋ"),
        ("Пп", "p"),
        ("Рр", "r"),
        ("Сс", "s"),
        ("Тт", "t"),
        ("Фф", "f"),
        ("Хх", "x"),
        ("Һһ", "h"),
        ("Шш", "ʃ"),
        ("Щщ", "ɕ"),
        ("Ъъ", "ʔ"),
        ("Ьь", "ʲ"),
        # single-sound vowels
        ("Аа", "ɑ"),
        ("Әә", "æ"),
        ("Оо", "ɔ"),
        ("Өө", "ɵ"),
        ("Ұұ", "ʊ"),
        ("Үү", "ʏ"),
        ("Ыы", "ɤ"),
        ("Іі", "ɪ"),
        ("Ээ", "e"),
    )
    for letter in letters
}


def kazakh_to_ipa(text):
    """Convert Kazakh Cyrillic text to the module's working IPA notation.

    Every Kazakh letter (either case) is first replaced by its symbol in a
    single ``str.translate`` pass; characters without a mapping are kept
    as they are. Four context rules are then applied in order.

    Args:
        text: Kazakh text in Cyrillic script.

    Returns:
        The transliterated string.
    """
    # No replacement symbol is itself a Cyrillic source letter, so one
    # simultaneous pass gives the same result as substituting letter by letter.
    text = text.translate(_KAZAKH_TO_IPA_TABLE)

    # rule 1:
    # if [æ], [ě], [ɵ], [ʏ], [ɪ] are followed by [l] and [l] is NOT followed
    # by [æ], [ě], [ɵ], [ʏ], [ɪ], or [ʲ], use [ł] instead of [l]
    # (e.g., [kěłdɪ], but [kělěmɪn], [marsělʲ]).
    # NOTE: the pattern consumes the character after [l], so an [l] that is
    # the very last character of the input is left unchanged.
    text = re.sub(r"([æěɵʏɪ])(l)([^æěɵʏɪʲ])", r"\1ł\3", text)

    # rule 2:
    # [ɔ] and [ɵ] (the letters "о" and "ө") at the beginning of a word are
    # preceded by [w] (e.g., осы [wɔsɤ] not [ɔsɤ], өзі [wɵzɪ] not [ɵzɪ]).
    text = re.sub(r"\b([ɔɵ])", r"w\1", text)

    # rule 3:
    # if the letter "у" [u] is followed by a consonant, use [w] instead of [u].
    text = re.sub(r"u([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"w\1", text)

    # rule 4:
    # if the letter "и" [ǐ] is followed by a consonant, use [i] instead of [ǐ].
    text = re.sub(r"ǐ([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"i\1", text)

    return text
131
+
132
# Working IPA symbol -> lower-case Kazakh Cyrillic letter.
_IPA_TO_KAZAKH_TABLE = str.maketrans({
    # three-sound convenience vowels
    "ǔ": "ю",
    # two-sound convenience consonants
    "š": "ц",
    "ʆ": "ч",
    # two-sound convenience vowels
    "ǎ": "я",
    "ě": "е",
    "ǒ": "ё",
    "ǐ": "и",
    "u": "у",
    # single-sound consonants
    "b": "б",
    "v": "в",
    "g": "г",
    "ɣ": "ғ",
    "d": "д",
    "ʒ": "ж",
    "z": "з",
    "j": "й",
    "k": "к",
    "q": "қ",
    "l": "л",
    "m": "м",
    "n": "н",
    "ŋ": "ң",
    "p": "п",
    "r": "р",
    "s": "с",
    "t": "т",
    "f": "ф",
    "x": "х",
    "h": "һ",
    "ʃ": "ш",
    "ɕ": "щ",
    "ʔ": "ъ",
    "ʲ": "ь",
    # single-sound vowels
    "ɑ": "а",
    "æ": "ә",
    "ɔ": "о",
    "ɵ": "ө",
    "ʊ": "ұ",
    "ʏ": "ү",
    "ɤ": "ы",
    "ɪ": "і",
    "e": "э",
})


def ipa_to_kazakh(text):
    """Convert the module's working IPA notation to Kazakh Cyrillic text.

    Symbols are first replaced by lower-case Cyrillic letters in a single
    ``str.translate`` pass; the context-dependent symbols ([ł], [w], [i])
    and the symbols used by other languages (Ǯ, ḡ, ɲ) are then resolved
    by the anti-rules below, in order.

    Args:
        text: Text in the working IPA notation.

    Returns:
        The text written with Kazakh Cyrillic letters.
    """
    # No produced Cyrillic letter is itself a source symbol, so one
    # simultaneous pass gives the same result as substituting symbol by symbol.
    text = text.translate(_IPA_TO_KAZAKH_TABLE)

    # anti-rule 1: [ł] after ә/е/ө/ү/і and before anything other than
    # ә/е/ө/ү/і/ь goes back to л.
    text = re.sub(r"([әеөүі])(ł)([^әеөүіь])", r"\1л\3", text)

    # anti-rule 2: drop the [w] placed before a word-initial о/ө.
    text = re.sub(r"\bw([оө])", r"\1", text)

    # anti-rule 3: [w] before a consonant goes back to у.
    text = re.sub(r"w([бвгғджзйкқлмнңпрстфхһцчшщъьчц])", r"у\1", text)

    # anti-rule 4:
    # the symbol [i] is used in one case only, so we can just replace it with и.
    text = re.sub(r"i", r"и", text)

    # anti-rules for Turkish and Kyrgyz Ǯ, Turkish ł, Azerbaijani ḡ, Sakha ɲ
    text = re.sub(r"w([Ǯ])", r"у\1", text)
    text = re.sub(r"Ǯ", r"дж", text)
    text = re.sub(r"ł", r"ль", text)
    text = re.sub(r"ḡ", r"гь", text)
    text = re.sub(r"ɲ", r"нь", text)

    return text
218
+
219
+ # testing kazakh scripts
220
+
221
def test_kazakh(text):
    """Print whether ``text`` survives a Kazakh -> IPA -> Kazakh round trip.

    The lower-cased, whitespace-split input is compared with the
    whitespace-split result of ``ipa_to_kazakh(kazakh_to_ipa(text))``.
    When the two word lists differ, the words found on only one side
    are printed as well.
    """
    expected_words = text.lower().split()
    actual_words = ipa_to_kazakh(kazakh_to_ipa(text)).split()

    # Words present on one side of the round trip but absent from the other.
    input_difference = [word for word in expected_words if word not in actual_words]
    output_difference = [word for word in actual_words if word not in expected_words]

    if expected_words != actual_words:
        print("input text and output text -- different")
        print("input:", input_difference)
        print("output:", output_difference)
    else:
        print("input text and output text -- identical")
240
+
241
+ # turkish scripts
242
+
243
# Turkish letter (upper and lower case) -> working IPA symbol.
# For convenience, two-sound letters are denoted by a single symbol
# ("c" -> "Ǯ", "ç" -> "ʆ"); they can later be converted to conventional
# symbols.
_TURKISH_TO_IPA_TABLE = {
    ord(letter): symbol
    for letters, symbol in (
        # two-sound convenience consonants
        ("Cc", "Ǯ"),
        ("Çç", "ʆ"),
        # two-sound convenience vowels
        ("İi", "ǐ"),
        ("Uu", "u"),
        # single-sound consonants
        ("Jj", "ʒ"),
        ("Yy", "j"),
        ("Bb", "b"),
        ("Dd", "d"),
        ("Ff", "f"),
        ("Gg", "g"),
        ("Ğğ", "ɣ"),
        ("Hh", "h"),
        ("Kk", "k"),
        ("Ll", "l"),
        ("Mm", "m"),
        ("Nn", "n"),
        ("Pp", "p"),
        ("Rr", "r"),
        ("Ss", "s"),
        ("Şş", "ʃ"),
        ("Tt", "t"),
        ("Vv", "v"),
        ("Zz", "z"),
        # single-sound vowels
        ("Aa", "ɑ"),
        ("Ee", "e"),
        ("Iı", "ɤ"),
        ("Oo", "ɔ"),
        ("Öö", "ɵ"),
        ("Üü", "ʏ"),
        ("Îî", "ǐ"),  # experimentally added
        ("Ââ", "ɑ"),  # experimentally added
    )
    for letter in letters
}


def turkish_to_ipa(text):
    """Convert Turkish text to the module's working IPA notation.

    Every Turkish letter (either case) is first replaced by its symbol in a
    single ``str.translate`` pass; characters without a mapping are kept
    as they are. Three context rules are then applied in order.

    Args:
        text: Turkish text in Latin script.

    Returns:
        The transliterated string.
    """
    # A simultaneous pass is equivalent to the letter-by-letter order
    # (e.g. "j" -> [ʒ] and "y" -> [j] cannot feed into each other).
    text = text.translate(_TURKISH_TO_IPA_TABLE)

    # rule 1:
    # if [e], [ɵ], [ʏ], [ǐ] are followed by [l] and [l] is NOT followed by
    # [e], [ɵ], [ʏ], or [ǐ], use [ł] instead of [l]
    # (e.g., [gełdi], but [gelecek]).
    # NOTE: the pattern consumes the character after [l], so an [l] that is
    # the very last character of the input is left unchanged.
    text = re.sub(r"([eɵʏǐ])(l)([^eɵʏǐ])", r"\1ł\3", text)

    # rule 2:
    # if the letter "u" [u] is followed by a consonant, use [w] instead of [u].
    text = re.sub(r"u([bvgɣdʒzklłmnprstfhʃʆǮ])", r"w\1", text)

    # rule 3:
    # if the letter "i" [ǐ] is followed by a consonant, use [i] instead of [ǐ].
    text = re.sub(r"ǐ([bvgɣdʒzklłmnprstfhʃʆǮ])", r"i\1", text)

    return text
312
+
313
# Working IPA symbol -> lower-case Turkish letter. Symbols that are already
# the Turkish letter (b, d, f, g, h, k, l, m, n, p, r, s, t, v, z, e) need
# no entry.
_IPA_TO_TURKISH_TABLE = str.maketrans({
    # two-sound convenience consonants
    "Ǯ": "c",
    "ʆ": "ç",
    # single-sound consonants
    "j": "y",
    "ʒ": "j",
    "ɣ": "ğ",
    "ʃ": "ş",
    # single-sound vowels
    "ɑ": "a",
    "ɤ": "ı",
    "ǐ": "i",
    "ɔ": "o",
    "ɵ": "ö",
    "ʏ": "ü",
})


def ipa_to_turkish(text):
    """Convert the module's working IPA notation to Turkish text.

    Symbols are first replaced by lower-case Turkish letters in a single
    ``str.translate`` pass; the context-dependent symbols [ł] and [w] are
    then resolved by the anti-rules below.

    Args:
        text: Text in the working IPA notation.

    Returns:
        The text written with Turkish letters.
    """
    # The simultaneous pass maps [j] -> "y" and [ʒ] -> "j" without the two
    # interfering, exactly like the ordered substitutions did.
    text = text.translate(_IPA_TO_TURKISH_TABLE)

    # anti-rule 1: [ł] after e/ö/ü/i and before anything other than e/ö/ü/i
    # goes back to l.
    text = re.sub(r"([eöüi])(ł)([^eöüi])", r"\1l\3", text)

    # anti-rule 2:
    # the symbol [w] is used in one case only, so we can just replace it with u.
    text = re.sub(r"w", r"u", text)

    # anti-rule 3:
    # the symbol [i] is used in one case only and is already the Turkish
    # letter "i", so no substitution is needed.

    return text
366
+
367
+ # testing turkish scripts
368
+
369
def test_turkish(text):
    """Print whether ``text`` survives a Turkish -> IPA -> Turkish round trip.

    The lower-cased, whitespace-split input is compared with the
    whitespace-split result of ``ipa_to_turkish(turkish_to_ipa(text))``.
    When the two word lists differ, the words found on only one side
    are printed as well.
    """
    # str.lower() is not Turkish-aware: it turns "I" into "i" (not "ı") and
    # "İ" into "i" followed by a combining dot. Map the two dotted/dotless
    # capitals by hand first, otherwise every word containing them is
    # reported as a round-trip difference even though the conversion is right.
    input_text = text.replace("İ", "i").replace("I", "ı").lower().split()
    output_text = ipa_to_turkish(turkish_to_ipa(text)).split()

    # Words present on one side of the round trip but absent from the other.
    input_difference = [item for item in input_text if item not in output_text]
    output_difference = [item for item in output_text if item not in input_text]

    if input_text == output_text:
        print("input text and output text -- identical")
    else:
        print("input text and output text -- different")
        print("input:", input_difference)
        print("output:", output_difference)
388
+
389
+ # kyrgyz scripts
390
+
391
def kyrgyz_to_ipa(text):
    """Transliterate Kyrgyz Cyrillic text into the IPA-like working alphabet.

    The substitutions are applied strictly in table order: letters standing
    for several sounds first (each collapsed into a single placeholder
    symbol), then consonants, then vowels, and finally the context rules.
    """
    letter_table = (
        # three-sound convenience vowels
        ("[Юю]", "ǔ"),
        # two-sound convenience consonants
        ("[Цц]", "š"),
        ("[Чч]", "ʆ"),
        ("[Жж]", "Ǯ"),
        # two-sound convenience vowels
        ("[Яя]", "ǎ"),
        ("[Ее]", "ě"),
        ("[Ёё]", "ǒ"),
        ("[Ии]", "ǐ"),
        ("[Уу]", "u"),
        # single-sound consonants
        ("[Бб]", "b"),
        ("[Вв]", "v"),
        ("[Гг]", "g"),
        ("[Дд]", "d"),
        ("[Зз]", "z"),
        ("[Йй]", "j"),
        ("[Кк]", "k"),
        ("[Лл]", "l"),
        ("[Мм]", "m"),
        ("[Нн]", "n"),
        ("[Ңң]", "ŋ"),
        ("[Пп]", "p"),
        ("[Рр]", "r"),
        ("[Сс]", "s"),
        ("[Тт]", "t"),
        ("[Фф]", "f"),
        ("[Хх]", "x"),
        ("[Шш]", "ʃ"),
        ("[Щщ]", "ɕ"),
        ("[Ъъ]", "ʔ"),
        ("[Ьь]", "ʲ"),
        # single-sound vowels
        ("[Аа]", "ɑ"),
        ("[Оо]", "ɔ"),
        ("[Өө]", "ɵ"),
        ("[Үү]", "ʏ"),
        ("[Ыы]", "ɤ"),
        ("[Ээ]", "e"),
    )
    for pattern, symbol in letter_table:
        text = re.sub(pattern, symbol, text)

    context_table = (
        # rule 1: [l] after [ɵ], [ʏ], [ě] becomes [ł] unless the next
        # character is [ɵ], [ʏ], [ě] or [ʲ]
        (r"([ɵʏě])(l)([^ɵʏěʲ])", r"\1ł\3"),
        # rule 2: a word-initial [ɔ] or [ɵ] gets a [w] in front of it
        (r"\b([ɔɵ])", r"w\1"),
        # rule 3: [u] directly before a consonant becomes [w]
        (r"u([bvgɣdzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"w\1"),
        # rule 4: [ǐ] directly before a consonant becomes [i]
        (r"ǐ([bvgɣdzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"i\1"),
        # rule 5 (Kyrgyz-specific): [k] next to ɑ|ɔ|u|ɤ becomes [q]
        (r"([ɑɔwɤ])k", r"\1q"),
        (r"k([ɑɔuɤ])", r"q\1"),
        # rule 6 (Kyrgyz-specific): [g] next to ɑ|ɔ|u|ɤ becomes [ɣ]
        (r"([ɑɔwɤ])g", r"\1ɣ"),
        (r"g([ɑɔuɤ])", r"ɣ\1"),
    )
    for pattern, replacement in context_table:
        text = re.sub(pattern, replacement, text)

    return text
490
+
491
def ipa_to_kyrgyz(text):
    """Convert the IPA-like working alphabet back into Kyrgyz Cyrillic.

    Symbols are mapped to lower-case Cyrillic letters in table order, after
    which the anti-rules undo the context-dependent symbols ([ł], [w], [i]).
    """
    symbol_table = (
        # three-sound convenience vowels
        ("ǔ", "ю"),
        # two-sound convenience consonants
        ("š", "ц"),
        ("ʆ", "ч"),
        ("Ǯ", "ж"),
        # two-sound convenience vowels
        ("ǎ", "я"),
        ("ě", "е"),
        ("ǒ", "ё"),
        ("ǐ", "и"),
        ("u", "у"),
        # single-sound consonants ([g]/[ɣ] and [k]/[q] share one letter each)
        ("b", "б"),
        ("v", "в"),
        ("g", "г"),
        ("ɣ", "г"),
        ("d", "д"),
        ("z", "з"),
        ("j", "й"),
        ("k", "к"),
        ("l", "л"),
        ("m", "м"),
        ("n", "н"),
        ("ŋ", "ң"),
        ("p", "п"),
        ("q", "к"),
        ("r", "р"),
        ("s", "с"),
        ("t", "т"),
        ("f", "ф"),
        ("x", "х"),
        ("ʃ", "ш"),
        ("ɕ", "щ"),
        ("ʔ", "ъ"),
        ("ʲ", "ь"),
        # single-sound vowels
        ("ɑ", "а"),
        ("ɔ", "о"),
        ("ɵ", "ө"),
        ("ʏ", "ү"),
        ("ɤ", "ы"),
        ("e", "э"),
    )
    for symbol, letter in symbol_table:
        text = re.sub(symbol, letter, text)

    anti_rules = (
        # anti-rule 1: [ł] after ө/ү/е goes back to л
        (r"([өүе])(ł)([^өүеʲ])", r"\1л\3"),
        # anti-rule 2: drop the [w] placed before a word-initial о/ө
        (r"\bw([оө])", r"\1"),
        # anti-rule 3: [w] before a consonant goes back to у
        (r"w([бвгдзйклмнңпрстфхцчшщъьчцж])", r"у\1"),
        # anti-rule 4: [i] before a consonant goes back to и
        (r"i([бвгдзйклмнңпрстфхцчшщъьчцж])", r"и\1"),
    )
    for pattern, replacement in anti_rules:
        text = re.sub(pattern, replacement, text)

    return text
563
+
564
+ # testing kyrgyz scripts
565
+
566
def test_kyrgyz(text):
    """Print a round-trip report for the Kyrgyz converters.

    The lower-cased, whitespace-split input is compared with the words
    obtained by converting ``text`` to IPA and back again.  When the two
    word lists differ, the words found on only one side are printed.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_kyrgyz(kyrgyz_to_ipa(text)).split()

    # Words that appear on one side of the round trip but not on the other.
    missing_after = [word for word in source_words if word not in restored_words]
    added_after = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", missing_after)
        print("output:", added_after)
        return
    print("input text and output text -- identical")
585
+
586
+ # uzbek scripts
587
+
588
def uzbek_to_ipa(text):
    """Transliterate Uzbek Latin text into the IPA-like working alphabet.

    Substitutions run strictly in table order.  Letters standing for several
    sounds are collapsed into single placeholder symbols, and the context
    rules at the end introduce [ł], [w] and [i].

    Compared with the earlier version:
      * the "ch" digraph is matched in any letter case ("CH", "cH" used to
        leave a raw "C" behind);
      * both apostrophe-like characters, U+2018 and U+02BB, are accepted after
        "g" and "o" (previously "g" only took U+2018 and "o" only U+02BB, so
        text produced by ``ipa_to_uzbek`` could not be read back).

    Args:
        text: Uzbek text in Latin script.

    Returns:
        The converted string; characters without a mapping are left as is.
    """
    letter_table = (
        # two-sound convenience consonants
        ("[Jj]", "Ǯ"),
        ("[Cc][Hh]", "ʆ"),
        # two-sound convenience vowels
        ("[Ii]", "ǐ"),
        ("[Uu]", "u"),
        # single-sound consonants
        ("[Bb]", "b"),
        ("[Dd]", "d"),
        ("[Ff]", "f"),
        # must run before the plain "g" mapping
        ("[Gg][\u2018\u02bb]", "ɣ"),
        ("[Gg]", "g"),
        ("[Hh]", "h"),
        ("[Kk]", "k"),
        ("[Ll]", "l"),
        ("[Mm]", "m"),
        ("[Nn]", "n"),
        # "n" and "g" are already lower-cased here, so this covers every case variant
        ("ng", "ŋ"),
        ("[Pp]", "p"),
        ("[Qq]", "q"),
        ("[Rr]", "r"),
        ("[Ss]", "s"),
        # "s" and "h" are already lower-cased here, so this covers every case variant
        ("sh", "ʃ"),
        ("[Tt]", "t"),
        ("[Vv]", "v"),
        ("[Xx]", "x"),
        ("[Yy]", "j"),
        ("[Zz]", "z"),
        # single-sound vowels
        ("[Aa]", "æ"),
        ("[Ee]", "e"),
        # must run before the plain "o" mapping
        ("[Oo][\u2018\u02bb]", "ɵ"),
        ("[Oo]", "ɔ"),
        # hard sign (ASCII apostrophe)
        ("'", "ʔ"),
    )
    for pattern, symbol in letter_table:
        text = re.sub(pattern, symbol, text)

    context_table = (
        # rule 1: [l] after [æ], [e], [ɵ], [ǐ] becomes [ł] unless the next
        # character is one of those vowels
        (r"([æɵǐe])(l)([^æɵǐe])", r"\1ł\3"),
        # rule 2: [u] directly before a consonant becomes [w]
        (r"u([bvgɣdzjkqlłmnŋprstfxhʃʔʆǮ])", r"w\1"),
        # rule 3: [ǐ] directly before a consonant becomes [i]
        (r"ǐ([bvgɣdzjkqlłmnŋprstfxhʃʔʆǮ])", r"i\1"),
    )
    for pattern, replacement in context_table:
        text = re.sub(pattern, replacement, text)

    return text
662
+
663
def ipa_to_uzbek(text):
    """Convert the IPA-like working alphabet back into Uzbek Latin script.

    Only symbols whose spelling differs from the symbol itself are listed;
    the identity substitutions of the earlier version ("b" -> "b", ...) and
    its no-op third anti-rule ("i" + consonant -> "i" + consonant) did
    nothing and are omitted.

    Fix: the [w] -> "u" anti-rule now also fires before an apostrophe.  The
    forward conversion turns "u" into [w] before [ʔ], so a word such as
    "shu'la" previously came back as "shw'la".

    Args:
        text: Text in the working alphabet produced by ``uzbek_to_ipa``.

    Returns:
        Lower-case Uzbek Latin text; [ɣ] and [ɵ] are written with U+2018.
    """
    symbol_table = (
        ("j", "y"),  # must precede Ǯ -> j, otherwise the new "j" would be rewritten
        ("Ǯ", "j"),
        ("ʆ", "ch"),
        ("ǐ", "i"),
        ("ɣ", "g\u2018"),
        ("ŋ", "ng"),
        ("ʃ", "sh"),
        ("æ", "a"),
        ("ɵ", "o\u2018"),
        ("ɔ", "o"),
        ("ʔ", "'"),  # hard sign
    )
    for symbol, spelling in symbol_table:
        text = text.replace(symbol, spelling)

    # anti-rule 1: [ł] after a/e/i or after the U+2018 of "o‘" goes back to "l"
    text = re.sub("([aei\u2018])(ł)([^aei\u2018])", r"\1l\3", text)

    # anti-rule 2: [w] before a consonant letter (or the hard-sign apostrophe)
    # goes back to "u"
    text = re.sub(r"w([bcvgɣdjzklmnpqrstfhyx'])", r"u\1", text)

    return text
720
+
721
+ # testing uzbek scripts
722
+
723
def test_uzbek(text):
    """Print a round-trip report for the Uzbek converters.

    The lower-cased, whitespace-split input is compared with the words
    obtained by converting ``text`` to IPA and back again.  When the two
    word lists differ, the words found on only one side are printed.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_uzbek(uzbek_to_ipa(text)).split()

    # Words that appear on one side of the round trip but not on the other.
    missing_after = [word for word in source_words if word not in restored_words]
    added_after = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", missing_after)
        print("output:", added_after)
        return
    print("input text and output text -- identical")
742
+
743
+ # azerbaijani scripts
744
+
745
def azerbaijani_to_ipa(text):
    """Transliterate Azerbaijani Latin text into the IPA-like working alphabet.

    Substitutions run strictly in table order; several entries depend on it
    (for example "j" is rewritten before "y" produces a new "j", and dotted
    "i" is handled before dotless "ı").
    """
    letter_table = (
        # two-sound convenience consonants
        ("[Cc]", "Ǯ"),
        ("[Çç]", "ʆ"),
        ("[Gg]", "ḡ"),
        # two-sound convenience vowels
        ("[İi]", "ǐ"),
        ("[Uu]", "u"),
        # single-sound consonants
        ("[Jj]", "ʒ"),
        ("[Yy]", "j"),
        ("[Bb]", "b"),
        ("[Dd]", "d"),
        ("[Ff]", "f"),
        ("[Ğğ]", "ɣ"),
        ("[Hh]", "h"),
        ("[Xx]", "x"),
        ("[Kk]", "k"),
        ("[Qq]", "g"),
        ("[Ll]", "l"),
        ("[Mm]", "m"),
        ("[Nn]", "n"),
        ("[Pp]", "p"),
        ("[Rr]", "r"),
        ("[Ss]", "s"),
        ("[Şş]", "ʃ"),
        ("[Tt]", "t"),
        ("[Vv]", "v"),
        ("[Zz]", "z"),
        # single-sound vowels
        ("[Aa]", "ɑ"),
        ("[Ee]", "e"),
        ("[Əə]", "æ"),
        ("[Iı]", "ɤ"),
        ("[Oo]", "ɔ"),
        ("[Öö]", "ɵ"),
        ("[Üü]", "ʏ"),
    )
    for pattern, symbol in letter_table:
        text = re.sub(pattern, symbol, text)

    context_table = (
        # rule 1: [l] after [æ], [e], [ɵ], [ʏ], [ǐ] becomes [ł] unless the
        # next character is one of those vowels
        (r"([æeɵʏǐ])(l)([^æeɵʏǐ])", r"\1ł\3"),
        # rule 2: [u] directly before a consonant becomes [w]
        (r"u([bvgḡɣdʒzklłmnprstfhxʃʆǮ])", r"w\1"),
        # rule 3: [ǐ] directly before a consonant becomes [i]
        (r"ǐ([bvgḡɣdʒzklłmnprstfhxʃʆǮ])", r"i\1"),
    )
    for pattern, replacement in context_table:
        text = re.sub(pattern, replacement, text)

    return text
814
+
815
def ipa_to_azerbaijani(text):
    """Convert the IPA-like working alphabet back into Azerbaijani Latin script.

    Substitutions run strictly in table order.  Two pairs are order-sensitive:
    "g" -> "q" runs before "ḡ" -> "g", and "j" -> "y" before "ʒ" -> "j".
    """
    symbol_table = (
        # two-sound convenience consonants
        ("Ǯ", "c"),
        ("ʆ", "ç"),
        ("g", "q"),  # precedence issue: before ḡ -> g
        ("ḡ", "g"),
        # single-sound consonants
        ("j", "y"),
        ("ʒ", "j"),
        ("b", "b"),
        ("d", "d"),
        ("f", "f"),
        ("ɣ", "ğ"),
        ("h", "h"),
        ("x", "x"),
        ("k", "k"),
        ("l", "l"),
        ("m", "m"),
        ("n", "n"),
        ("p", "p"),
        ("r", "r"),
        ("s", "s"),
        ("ʃ", "ş"),
        ("t", "t"),
        ("v", "v"),
        ("z", "z"),
        # single-sound vowels
        ("ɑ", "a"),
        ("e", "e"),
        ("æ", "ə"),
        ("ɤ", "ı"),
        ("ǐ", "i"),
        ("ɔ", "o"),
        ("ɵ", "ö"),
        ("ʏ", "ü"),
    )
    for symbol, letter in symbol_table:
        text = re.sub(symbol, letter, text)

    anti_rules = (
        # anti-rule 1: [ł] after a front vowel goes back to "l"
        (r"([əeöüiě])(ł)([^əeöüiě])", r"\1l\3"),
        # anti-rule 2: [w] before a consonant goes back to "u"
        (r"w([bvgğdjzkqlmnprstfhxşçc])", r"u\1"),
        # anti-rule 3: [i] before a consonant stays "i"
        (r"i([bcvgğdjzkqlmnprstfhxşç])", r"i\1"),
    )
    for pattern, replacement in anti_rules:
        text = re.sub(pattern, replacement, text)

    return text
869
+
870
+ # testing azerbaijani scripts
871
+
872
def test_azerbaijani(text):
    """Print a round-trip report for the Azerbaijani converters.

    The lower-cased, whitespace-split input is compared with the words
    obtained by converting ``text`` to IPA and back again.  When the two
    word lists differ, the words found on only one side are printed.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_azerbaijani(azerbaijani_to_ipa(text)).split()

    # Words that appear on one side of the round trip but not on the other.
    missing_after = [word for word in source_words if word not in restored_words]
    added_after = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", missing_after)
        print("output:", added_after)
        return
    print("input text and output text -- identical")
891
+
892
+ # turkmen scripts
893
+
894
def turkmen_to_ipa(text):
    """Transliterate Turkmen Latin text into the IPA-like working alphabet.

    Substitutions run strictly in table order: multi-sound letters first,
    then consonants, then vowels, and finally the context rules.

    Fix: the Turkmen alphabet writes the vowel as "I i", but the earlier
    character class only listed "İ" and "i", so a plain capital "I" passed
    through unconverted.  It is now mapped like its lower-case form.

    Args:
        text: Turkmen text in Latin script.

    Returns:
        The converted string; characters without a mapping are left as is.
    """
    letter_table = (
        # two-sound convenience consonants
        ("[Çç]", "ʆ"),
        ("[Jj]", "Ǯ"),
        # two-sound convenience vowels
        ("[İIi]", "ǐ"),
        ("[Uu]", "u"),
        # single-sound consonants
        ("[Bb]", "b"),
        ("[Dd]", "d"),
        ("[Ff]", "f"),
        ("[Gg]", "g"),
        ("[Hh]", "h"),
        ("[Žž]", "ʒ"),
        ("[Kk]", "k"),
        ("[Ll]", "l"),
        ("[Mm]", "m"),
        ("[Nn]", "n"),
        ("[Ňň]", "ŋ"),
        ("[Pp]", "p"),
        ("[Rr]", "r"),
        ("[Ss]", "s"),  # θ
        ("[Şş]", "ʃ"),
        ("[Tt]", "t"),
        ("[Ww]", "v"),
        ("[Ýý]", "j"),
        ("[Zz]", "z"),  # ð
        # single-sound vowels
        ("[Aa]", "ɑ"),
        ("[Ää]", "æ"),
        ("[Ee]", "e"),
        ("[Oo]", "ɔ"),
        ("[Öö]", "ɵ"),
        ("[Üü]", "ʏ"),
        ("[Yy]", "ɤ"),
    )
    for pattern, symbol in letter_table:
        text = re.sub(pattern, symbol, text)

    context_table = (
        # rule 1: [l] after [æ], [e], [ɵ], [ʏ], [ǐ] becomes [ł] unless the
        # next character is one of those vowels
        (r"([æeɵʏǐ])(l)([^æeɵʏǐ])", r"\1ł\3"),
        # rule 2: [u] directly before a consonant becomes [w]
        (r"u([bvgɣqdʒzkqlłmnprstfhʃʆǮw])", r"w\1"),
        # rule 3: [ǐ] directly before a consonant becomes [i]
        (r"ǐ([bvgɣqdʒzkqlłmnprstfhʃʆǮ])", r"i\1"),
        # rule 4 (Turkmen-specific): [k] next to a, o, u, y becomes [q]
        (r"k([ɑɔuɤ])", r"q\1"),
        (r"([ɑɔwɤ])k", r"\1q"),
        # rule 5 (Turkmen-specific): [g] next to a, o, u, y becomes [ɣ]
        (r"g([ɑɔuɤ])", r"ɣ\1"),
        (r"([ɑɔwɤ])g", r"\1ɣ"),
    )
    for pattern, replacement in context_table:
        text = re.sub(pattern, replacement, text)

    return text
979
+
980
def ipa_to_turkmen(text):
    """Convert the IPA-like working alphabet back into Turkmen Latin script.

    Substitutions run strictly in table order.  "j" -> "ý" runs before
    "Ǯ" -> "j", and "v" -> "w" is delayed until [w] has been turned back
    into "u".
    """
    symbol_table = (
        # two-sound convenience consonants
        ("j", "ý"),  # precedence issue: before Ǯ -> j
        ("Ǯ", "j"),
        ("ʆ", "ç"),
        # single-sound consonants (v -> w is handled with the anti-rules)
        ("b", "b"),
        ("d", "d"),
        ("f", "f"),
        ("g", "g"),
        ("ɣ", "g"),
        ("h", "h"),
        ("ʒ", "ž"),
        ("k", "k"),
        ("q", "k"),
        ("l", "l"),
        ("m", "m"),
        ("n", "n"),
        ("ŋ", "ň"),
        ("p", "p"),
        ("r", "r"),
        ("s", "s"),
        ("ʃ", "ş"),
        ("t", "t"),
        ("z", "z"),
        # single-sound vowels
        ("ɑ", "a"),
        ("e", "e"),
        ("æ", "ä"),
        ("ǐ", "i"),
        ("ɔ", "o"),
        ("ɵ", "ö"),
        ("ʏ", "ü"),
        ("ɤ", "y"),
    )
    for symbol, letter in symbol_table:
        text = re.sub(symbol, letter, text)

    anti_rules = (
        # anti-rule 1: [ł] after a front vowel goes back to "l"
        (r"([äeöüiě])(ł)([^äeöüiě])", r"\1l\3"),
        # anti-rule 2: [w] before a consonant goes back to "u" ...
        (r"w([bdfghžklmnňprsştýzjçɣqv])", r"u\1"),
        # ... and only then may [v] become the letter "w"
        ("v", "w"),
        # anti-rule 3: [i] before a consonant stays "i"
        (r"i([bdfghžklmnňprsştwýzjçɣq])", r"i\1"),
    )
    for pattern, replacement in anti_rules:
        text = re.sub(pattern, replacement, text)

    return text
1036
+
1037
+ # testing turkmen scripts
1038
+
1039
def test_turkmen(text):
    """Print a round-trip report for the Turkmen converters.

    The lower-cased, whitespace-split input is compared with the words
    obtained by converting ``text`` to IPA and back again.  When the two
    word lists differ, the words found on only one side are printed.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_turkmen(turkmen_to_ipa(text)).split()

    # Words that appear on one side of the round trip but not on the other.
    missing_after = [word for word in source_words if word not in restored_words]
    added_after = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", missing_after)
        print("output:", added_after)
        return
    print("input text and output text -- identical")
1058
+
1059
+ # tatar scripts
1060
+
1061
def tatar_to_ipa(text):
    """Transliterate Tatar Cyrillic text into the IPA-like working alphabet.

    The substitutions are applied strictly in table order: letters standing
    for several sounds first (each collapsed into a single placeholder
    symbol), then consonants, then vowels, and finally the context rules.
    """
    letter_table = (
        # three-sound convenience vowels
        ("[Юю]", "ǔ"),
        # two-sound convenience consonants
        ("[Цц]", "š"),
        ("[Чч]", "ʆ"),
        ("[Җҗ]", "Ǯ"),
        # two-sound convenience vowels
        ("[Яя]", "ǎ"),
        ("[Ее]", "ě"),
        ("[Ёё]", "ǒ"),
        ("[Ии]", "ǐ"),
        ("[Уу]", "u"),
        # single-sound consonants
        ("[Бб]", "b"),
        ("[Вв]", "v"),
        ("[Гг]", "g"),
        ("[Дд]", "d"),
        ("[Жж]", "ʒ"),
        ("[Зз]", "z"),
        ("[Йй]", "j"),
        ("[Кк]", "k"),
        ("[Лл]", "l"),
        ("[Мм]", "m"),
        ("[Нн]", "n"),
        ("[Ңң]", "ŋ"),
        ("[Пп]", "p"),
        ("[Рр]", "r"),
        ("[Сс]", "s"),
        ("[Тт]", "t"),
        ("[Фф]", "f"),
        ("[Хх]", "x"),
        ("[Һһ]", "h"),
        ("[Шш]", "ʃ"),
        ("[Щщ]", "ɕ"),
        ("[Ъъ]", "ʔ"),
        ("[Ьь]", "ʲ"),
        # single-sound vowels
        ("[Аа]", "ɑ"),
        ("[Әә]", "æ"),
        ("[Оо]", "ɔ"),
        ("[Өө]", "ɵ"),
        ("[Үү]", "ʏ"),
        ("[Ыы]", "ɤ"),
        ("[Ээ]", "e"),
    )
    for pattern, symbol in letter_table:
        text = re.sub(pattern, symbol, text)

    context_table = (
        # rule 1: [l] after [æ], [ɵ], [ʏ], [ě] becomes [ł] unless the next
        # character is one of those vowels or [ʲ]
        (r"([æɵʏě])(l)([^æɵʏěʲ])", r"\1ł\3"),
        # rule 2: a word-initial [ɔ] or [ɵ] gets a [w] in front of it
        (r"\b([ɔɵ])", r"w\1"),
        # rule 3: [u] directly before a consonant becomes [w]
        (r"u([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"w\1"),
        # rule 4: [ǐ] directly before a consonant becomes [i]
        (r"ǐ([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"i\1"),
        # rule 5 (Tatar-specific): [k] next to ɑ, ɔ, w, ɤ, ʔ becomes [q]
        (r"k([ɑɔwɤʔ])", r"q\1"),
        (r"([ɑɔwɤʔ])k", r"\1q"),
        # rule 6 (Tatar-specific): [g] next to ɑ, ɔ, w, ɤ, ʔ becomes [ɣ]
        (r"g([ɑɔwɤʔ])", r"ɣ\1"),
        (r"([ɑɔwɤʔ])g", r"\1ɣ"),
    )
    for pattern, replacement in context_table:
        text = re.sub(pattern, replacement, text)

    return text
1163
+
1164
def ipa_to_tatar(text):
    """Convert the IPA-like working alphabet back into Tatar Cyrillic.

    Symbols are mapped to lower-case Cyrillic letters in table order, after
    which the anti-rules undo the context-dependent symbols ([ł], [w], [i]).
    """
    symbol_table = (
        # three-sound convenience vowels
        ("ǔ", "ю"),
        # two-sound convenience consonants
        ("š", "ц"),
        ("ʆ", "ч"),
        ("Ǯ", "җ"),
        # two-sound convenience vowels
        ("ǎ", "я"),
        ("ě", "е"),
        ("ǒ", "ё"),
        ("ǐ", "и"),
        ("u", "у"),
        # single-sound consonants ([g]/[ɣ] and [k]/[q] share one letter each)
        ("b", "б"),
        ("v", "в"),
        ("g", "г"),
        ("ɣ", "г"),
        ("d", "д"),
        ("ʒ", "ж"),
        ("z", "з"),
        ("j", "й"),
        ("k", "к"),
        ("l", "л"),
        ("m", "м"),
        ("n", "н"),
        ("ŋ", "ң"),
        ("p", "п"),
        ("q", "к"),
        ("r", "р"),
        ("s", "с"),
        ("t", "т"),
        ("f", "ф"),
        ("x", "х"),
        ("h", "һ"),
        ("ʃ", "ш"),
        ("ɕ", "щ"),
        ("ʔ", "ъ"),
        ("ʲ", "ь"),
        # single-sound vowels
        ("ɑ", "а"),
        ("æ", "ә"),
        ("ɔ", "о"),
        ("ɵ", "ө"),
        ("ʏ", "ү"),
        ("ɤ", "ы"),
        ("e", "э"),
    )
    for symbol, letter in symbol_table:
        text = re.sub(symbol, letter, text)

    anti_rules = (
        # anti-rule 1: [ł] after ә/ө/ү/е goes back to л
        (r"([әөүе])(ł)([^әөүеʲ])", r"\1л\3"),
        # anti-rule 2: drop the [w] placed before a word-initial о/ө
        (r"\bw([оө])", r"\1"),
        # anti-rule 3: [w] before a consonant goes back to у
        (r"w([бвгдзйклмнңпрстфхһцчшщъьчцжҗqɣ])", r"у\1"),
        # anti-rule 4: [i] before a consonant goes back to и
        (r"i([бвгдзйклмнңпрстфхһцчшщъьчцжҗqɣ])", r"и\1"),
    )
    for pattern, replacement in anti_rules:
        text = re.sub(pattern, replacement, text)

    return text
1239
+
1240
+ # testing tatar scripts
1241
+
1242
def test_tatar(text):
    """Print a round-trip report for the Tatar converters.

    The lower-cased, whitespace-split input is compared with the words
    obtained by converting ``text`` to IPA and back again.  When the two
    word lists differ, the words found on only one side are printed.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_tatar(tatar_to_ipa(text)).split()

    # Words that appear on one side of the round trip but not on the other.
    missing_after = [word for word in source_words if word not in restored_words]
    added_after = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", missing_after)
        print("output:", added_after)
        return
    print("input text and output text -- identical")
1261
+
1262
+ # bashkir scripts
1263
+
1264
def bashkir_to_ipa(text):
    """Transliterate Bashkir Cyrillic text into the IPA-like working alphabet.

    The substitutions are applied strictly in table order: letters standing
    for several sounds first (each collapsed into a single placeholder
    symbol), then consonants, then vowels, and finally the context rules.
    Note that Ҙ/З both become [z] and Ҫ/С both become [s].
    """
    letter_table = (
        # three-sound convenience vowels
        ("[Юю]", "ǔ"),
        # two-sound convenience consonants
        ("[Цц]", "š"),
        ("[Чч]", "ʆ"),
        # two-sound convenience vowels
        ("[Яя]", "ǎ"),
        ("[Ее]", "ě"),
        ("[Ёё]", "ǒ"),
        ("[Ии]", "ǐ"),
        ("[Уу]", "u"),
        # single-sound consonants
        ("[Бб]", "b"),
        ("[Вв]", "v"),
        ("[Гг]", "g"),
        ("[Ғғ]", "ɣ"),
        ("[Дд]", "d"),
        ("[Ҙҙ]", "z"),
        ("[Жж]", "ʒ"),
        ("[Зз]", "z"),
        ("[Йй]", "j"),
        ("[Кк]", "k"),
        ("[Ҡҡ]", "q"),
        ("[Лл]", "l"),
        ("[Мм]", "m"),
        ("[Нн]", "n"),
        ("[Ңң]", "ŋ"),
        ("[Пп]", "p"),
        ("[Рр]", "r"),
        ("[Сс]", "s"),
        ("[Ҫҫ]", "s"),
        ("[Тт]", "t"),
        ("[Хх]", "x"),
        ("[Фф]", "f"),
        ("[Һһ]", "h"),
        ("[Шш]", "ʃ"),
        ("[Щщ]", "ɕ"),
        ("[Ъъ]", "ʔ"),
        ("[Ьь]", "ʲ"),
        # single-sound vowels
        ("[Аа]", "ɑ"),
        ("[Әә]", "æ"),
        ("[Оо]", "ɔ"),
        ("[Өө]", "ɵ"),
        ("[Үү]", "ʏ"),
        ("[Ыы]", "ɤ"),
        ("[Ээ]", "e"),
    )
    for pattern, symbol in letter_table:
        text = re.sub(pattern, symbol, text)

    context_table = (
        # rule 1: [l] after [æ], [ɵ], [ʏ], [ě] becomes [ł] unless the next
        # character is one of those vowels or [ʲ]
        (r"([æɵʏě])(l)([^æɵʏěʲ])", r"\1ł\3"),
        # rule 2: a word-initial [ɔ] or [ɵ] gets a [w] in front of it
        (r"\b([ɔɵ])", r"w\1"),
        # rule 3: [u] directly before a consonant becomes [w]
        (r"u([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"w\1"),
        # rule 4: [ǐ] directly before a consonant becomes [i]
        (r"ǐ([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"i\1"),
    )
    for pattern, replacement in context_table:
        text = re.sub(pattern, replacement, text)

    return text
1353
+
1354
def ipa_to_bashkir(text):
    """Convert the IPA-like working alphabet back into Bashkir Cyrillic.

    Symbols are mapped to lower-case Cyrillic letters in table order, after
    which the anti-rules undo the context-dependent symbols ([ł], [w], [i]).
    """
    symbol_table = (
        # three-sound convenience vowels
        ("ǔ", "ю"),
        # two-sound convenience consonants
        ("š", "ц"),
        ("ʆ", "ч"),
        # two-sound convenience vowels
        ("ě", "е"),
        ("ǒ", "ё"),
        ("ǐ", "и"),
        ("u", "у"),
        ("ǎ", "я"),
        # single-sound consonants
        ("b", "б"),
        ("v", "в"),
        ("g", "г"),
        ("ɣ", "ғ"),
        ("d", "д"),
        ("z", "з"),
        ("ʒ", "ж"),
        ("j", "й"),
        ("k", "к"),
        ("q", "ҡ"),
        ("l", "л"),
        ("m", "м"),
        ("n", "н"),
        ("ŋ", "ң"),
        ("p", "п"),
        ("r", "р"),
        ("s", "с"),
        ("t", "т"),
        ("f", "ф"),
        ("x", "х"),
        ("h", "һ"),
        ("ʃ", "ш"),
        ("ɕ", "щ"),
        ("ʔ", "ъ"),
        ("ʲ", "ь"),
        # single-sound vowels
        ("ɑ", "а"),
        ("æ", "ә"),
        ("ɔ", "о"),
        ("ɵ", "ө"),
        ("ʏ", "ү"),
        ("ɤ", "ы"),
        ("e", "э"),
    )
    for symbol, letter in symbol_table:
        text = re.sub(symbol, letter, text)

    anti_rules = (
        # anti-rule 1: [ł] after ә/ө/ү/е goes back to л
        (r"([әөүе])(ł)([^әөүеʲ])", r"\1л\3"),
        # anti-rule 2: drop the [w] placed before a word-initial о/ө
        (r"\bw([оө])", r"\1"),
        # anti-rule 3: [w] before a consonant goes back to у
        (r"w([бвгғдзйкҡлмнңпрстфхһцчшщъьчцж])", r"у\1"),
        # anti-rule 4: [i] before a consonant goes back to и
        (r"i([бвгғдзйкҡлмнңпрстфхһцчшщъьчцж])", r"и\1"),
    )
    for pattern, replacement in anti_rules:
        text = re.sub(pattern, replacement, text)

    return text
1430
+
1431
+ # testing bashkir scripts
1432
+
1433
def test_bashkir(text):
    """Print whether Bashkir text survives the Bashkir -> IPA -> Bashkir trip.

    The lower-cased, whitespace-split input is compared with the
    whitespace-split result of ``ipa_to_bashkir(bashkir_to_ipa(text))``.
    When the two word lists differ, the words found in only one of them
    are printed.

    :param text: Bashkir text to push through both converters.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_bashkir(bashkir_to_ipa(text)).split()

    only_in_input = [word for word in source_words if word not in restored_words]
    only_in_output = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", only_in_input)
        print("output:", only_in_output)
        return

    print("input text and output text -- identical")
1452
+
1453
+ # sakha scripts
1454
+
1455
def sakha_to_ipa(text):
    """Rewrite Sakha Cyrillic text in the module's IPA-like notation.

    Substitutions run sequentially over the whole string in the order
    listed: letter combinations first, then single consonants, then single
    vowels.  Multi-sound letters are written with single "convenience"
    symbols.  Four context-dependent rules are applied at the end.

    :param text: Sakha text (upper or lower case letters).
    :return: the converted string.
    """
    letter_to_symbol = (
        # three-sound convenience vowels
        ("[Юю]", "ǔ"),
        # two-sound convenience consonants
        ("[Цц]", "š"),
        ("[Чч]", "ʆ"),
        ("ДЬ", "Ǯ"),
        ("дь", "Ǯ"),
        ("Дь", "Ǯ"),
        ("дЬ", "Ǯ"),
        ("НЬ", "ɲ"),
        ("нь", "ɲ"),
        ("Нь", "ɲ"),
        ("нЬ", "ɲ"),
        # two-sound convenience vowels
        ("[Яя]", "ǎ"),
        ("[Ее]", "ě"),
        ("[Ёё]", "ǒ"),
        ("[Ии]", "ǐ"),
        ("[Уу]", "u"),
        # single-sound consonants
        ("[Бб]", "b"),
        ("[Вв]", "v"),
        ("[Гг]", "g"),
        ("[Ҕҕ]", "ɣ"),
        ("[Дд]", "d"),
        ("[Жж]", "ʒ"),
        ("[Зз]", "z"),
        ("[Йй]", "j"),
        ("[Кк]", "k"),
        ("[Лл]", "l"),
        ("[Мм]", "m"),
        ("[Нн]", "n"),
        ("[Ҥҥ]", "ŋ"),
        ("[Пп]", "p"),
        ("[Рр]", "r"),
        ("[Сс]", "s"),
        ("[Тт]", "t"),
        ("[Хх]", "x"),
        ("[Фф]", "f"),
        ("[Һһ]", "h"),
        ("[Шш]", "ʃ"),
        ("[Щщ]", "ɕ"),
        ("[Ъъ]", "ʔ"),
        ("[Ьь]", "ʲ"),
        # single-sound vowels
        ("[Аа]", "ɑ"),
        ("[Әә]", "æ"),
        ("[Оо]", "ɔ"),
        ("[Өө]", "ɵ"),
        ("[Үү]", "ʏ"),
        ("[Ыы]", "ɤ"),
        ("[Ээ]", "e"),
    )
    for letters, symbol in letter_to_symbol:
        text = re.sub(letters, symbol, text)

    # rules 1-4 are similar to those for Kazakh
    context_rules = (
        # rule 1: [l] after one of æ/ɵ/ʏ/ě and before any character that is
        # not one of æ/ɵ/ʏ/ě/ʲ becomes [ł]
        (r"([æɵʏě])(l)([^æɵʏěʲ])", r"\1ł\3"),
        # rule 2: a word-initial [ɔ] or [ɵ] gets a [w] in front of it
        (r"\b([ɔɵ])", r"w\1"),
        # rule 3: [u] in front of a listed consonant becomes [w]
        (r"u([bvgɣdʒzjklłmnŋɲprstfxhʃɕʔšʆǮʲ])", r"w\1"),
        # rule 4: [ǐ] in front of a listed consonant becomes [i]
        (r"ǐ([bvgɣdʒzjklłmnŋɲprstfxhʃɕʔšʆǮʲ])", r"i\1"),
    )
    for pattern, replacement in context_rules:
        text = re.sub(pattern, replacement, text)

    return text
1549
+
1550
def ipa_to_sakha(text):
    """Map the module's IPA-like notation back to Sakha Cyrillic letters.

    Every substitution is applied to the whole string, one after another,
    in the order listed below; the context-dependent "anti-rules" run last
    and therefore see Cyrillic letters around the few symbols still left
    in IPA form (ł, w, i).

    :param text: string in the IPA-like notation.
    :return: the converted string.
    """
    symbol_to_letter = (
        # three-sound convenience vowels
        ("ǔ", "ю"),
        # two-sound convenience consonants
        ("š", "ц"),
        ("ʆ", "ч"),
        ("Ǯ", "дь"),
        ("ɲ", "нь"),
        # two-sound convenience vowels
        ("ě", "е"),
        ("ǒ", "ё"),
        ("ǐ", "и"),
        ("u", "у"),
        ("ǎ", "я"),
        # single-sound consonants
        ("b", "б"),
        ("v", "в"),
        ("g", "г"),
        ("ɣ", "ҕ"),
        ("d", "д"),
        ("z", "з"),
        ("ʒ", "ж"),
        ("j", "й"),
        ("k", "к"),
        ("l", "л"),
        ("m", "м"),
        ("n", "н"),
        ("ŋ", "ҥ"),
        ("p", "п"),
        ("r", "р"),
        ("s", "с"),
        ("t", "т"),
        ("f", "ф"),
        ("x", "х"),
        ("h", "һ"),
        ("ʃ", "ш"),
        ("ɕ", "щ"),
        ("ʔ", "ъ"),
        ("ʲ", "ь"),
        # single-sound vowels
        ("ɑ", "а"),
        ("æ", "ә"),
        ("ɔ", "о"),
        ("ɵ", "ө"),
        ("ʏ", "ү"),
        ("ɤ", "ы"),
        ("e", "э"),
    )
    for symbol, letter in symbol_to_letter:
        text = re.sub(symbol, letter, text)

    # anti-rules 1-4 are similar to those for Kazakh
    anti_rules = (
        # anti-rule 1: ł after one of ә/ө/ү/е and before any character that
        # is not one of ә/ө/ү/е/ʲ becomes л
        (r"([әөүе])(ł)([^әөүеʲ])", r"\1л\3"),
        # anti-rule 2: drop a word-initial w standing in front of о/ө
        (r"\bw([оө])", r"\1"),
        # anti-rule 3: w in front of дь, нь or a listed consonant becomes у
        (r"w(дь)", r"у\1"),
        (r"w(нь)", r"у\1"),
        (r"w([бвгҕдзйклмнҥпрстфхһцчшщъьчцж])", r"у\1"),
        # anti-rule 4: i in front of дь, нь or a listed consonant becomes и
        (r"i(дь)", r"и\1"),
        (r"i(нь)", r"и\1"),
        (r"i([бвгҕдзйклмнҥпрстфхһцчшщъьчцж])", r"и\1"),
    )
    for pattern, replacement in anti_rules:
        text = re.sub(pattern, replacement, text)

    return text
1629
+
1630
+ # testing sakha scripts
1631
+
1632
+ # (round-trip check: sakha -> IPA -> sakha)
1633
+
1634
def test_sakha(text):
    """Print whether Sakha text survives the Sakha -> IPA -> Sakha trip.

    The lower-cased, whitespace-split input is compared with the
    whitespace-split result of ``ipa_to_sakha(sakha_to_ipa(text))``.
    When the two word lists differ, the words found in only one of them
    are printed.

    :param text: Sakha text to push through both converters.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_sakha(sakha_to_ipa(text)).split()

    only_in_input = [word for word in source_words if word not in restored_words]
    only_in_output = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", only_in_input)
        print("output:", only_in_output)
        return

    print("input text and output text -- identical")
1653
+
1654
+ # uyghur scripts
1655
+
1656
def uyghur_to_ipa(text):
    """Rewrite Uyghur Latin-script text in the module's IPA-like notation.

    Substitutions run sequentially over the whole string: digraphs first,
    then single letters, then three context-dependent rules.  Multi-sound
    letters are written with single "convenience" symbols.

    The digraphs are matched in every upper/lower-case combination.  The
    previous patterns only covered e.g. "Ch"/"ch", so "CH" and "GH" in
    all-caps text were never turned into [ʆ] / [ɣ].

    :param text: Uyghur text in Latin script (upper or lower case).
    :return: the converted string.
    """
    substitutions = (
        # two-sound convenience consonants
        ("[Jj]", "Ǯ"),
        # Digraphs must be consumed before their component letters are
        # mapped one by one.  "gh" stays ahead of "ng" so that "ngh" is
        # read as n + gh.
        ("[Cc][Hh]", "ʆ"),
        ("[Gg][Hh]", "ɣ"),
        ("[Nn][Gg]", "ŋ"),
        ("[Ss][Hh]", "ʃ"),
        ("[Zz][Hh]", "ʒ"),
        # two-sound convenience vowels
        ("[Ii]", "ǐ"),
        ("[Uu]", "u"),
        # single-sound consonants
        ("[Bb]", "b"),
        ("[Dd]", "d"),
        ("[Ff]", "f"),
        ("[Gg]", "g"),
        ("[Hh]", "h"),
        ("[Kk]", "k"),
        ("[Ll]", "l"),
        ("[Mm]", "m"),
        ("[Nn]", "n"),
        ("[Pp]", "p"),
        ("[Qq]", "q"),
        ("[Rr]", "r"),
        ("[Ss]", "s"),
        ("[Tt]", "t"),
        ("[Ww]", "v"),
        ("[Xx]", "x"),
        ("[Yy]", "j"),  # safe: every original J/j is already Ǯ
        ("[Zz]", "z"),
        # single-sound vowels
        ("[Aa]", "ɑ"),
        ("[Ee]", "æ"),  # must precede the next line, which produces "e"
        ("[ËÉëé]", "e"),
        ("[Oo]", "ɔ"),
        ("[Öö]", "ɵ"),
        ("[Üü]", "ʏ"),
        # hard sign
        ("'", "ʔ"),
    )
    for letters, symbol in substitutions:
        text = re.sub(letters, symbol, text)

    context_rules = (
        # rule 1: [l] after one of æ/ɵ/ǐ/e/ʏ and before any character that
        # is not one of them becomes [ł]
        (r"([æɵǐeʏ])(l)([^æɵǐeʏ])", r"\1ł\3"),
        # rule 2: [u] in front of a consonant becomes [w]
        (r"u([bvgɣdzjkqlłmnŋprstfxhʃʆǮʒ])", r"w\1"),
        # rule 3: [ǐ] in front of a consonant becomes [i]
        (r"ǐ([bvgɣdzjkqlłmnŋprstfxhʃʆǮʒ])", r"i\1"),
    )
    for pattern, replacement in context_rules:
        text = re.sub(pattern, replacement, text)

    return text
1733
+
1734
def ipa_to_uyghur(text):
    """Map the module's IPA-like notation back to Uyghur Latin script.

    Inverse of ``uyghur_to_ipa``.  Substitutions run sequentially over the
    whole string in the order given below.

    Fix over the previous version: the [w] that encodes "u" before a
    consonant is now restored while the string is still pure IPA.  It used
    to be restored after [v] had been rewritten as the Latin letter "w",
    so a genuine "w" before a consonant (e.g. "tewrimek") came back as
    "u".  The substitutions that replaced a symbol with itself, and the
    no-op anti-rule 3 ("i" -> "i"), have been dropped.

    :param text: string in the IPA-like notation.
    :return: the converted string.
    """
    # anti-rule 2: [w] in front of an IPA consonant goes back to "u".
    # This has to happen before v -> w below, otherwise the two are
    # indistinguishable.  The lookahead leaves the consonant in place.
    text = re.sub(r"w(?=[bvgɣdzjkqlłmnŋprstfxhʃʆǮʒ])", "u", text)

    symbol_to_letters = (
        # two-sound convenience consonants
        ("j", "y"),  # must precede Ǯ -> j, which produces new "j"s
        ("Ǯ", "j"),
        ("ʆ", "ch"),
        ("ʒ", "zh"),
        # two-sound convenience vowels
        ("ǐ", "i"),
        # single-sound consonants (only those whose spelling differs)
        ("ɣ", "gh"),
        ("ŋ", "ng"),
        ("ʃ", "sh"),
        ("v", "w"),
        # single-sound vowels
        ("ɑ", "a"),
        ("e", "ë"),  # must precede æ -> e, which produces new "e"s
        ("æ", "e"),
        ("ɵ", "ö"),
        ("ɔ", "o"),
        ("ʏ", "ü"),
        # hard sign
        ("ʔ", "'"),
    )
    for symbol, letters in symbol_to_letters:
        text = re.sub(symbol, letters, text)

    # anti-rule 1: ł after one of e/ë/i/ö/ü and before any character that
    # is not one of them becomes l
    text = re.sub(r"([eëiöü])(ł)([^eëiöü])", r"\1l\3", text)

    return text
1794
+
1795
+ # testing uyghur scripts
1796
+
1797
def test_uyghur(text):
    """Print whether Uyghur text survives the Uyghur -> IPA -> Uyghur trip.

    The lower-cased, whitespace-split input is compared with the
    whitespace-split result of ``ipa_to_uyghur(uyghur_to_ipa(text))``.
    When the two word lists differ, the words found in only one of them
    are printed.

    :param text: Uyghur text to push through both converters.
    """
    source_words = text.lower().split()
    restored_words = ipa_to_uyghur(uyghur_to_ipa(text)).split()

    only_in_input = [word for word in source_words if word not in restored_words]
    only_in_output = [word for word in restored_words if word not in source_words]

    if source_words != restored_words:
        print("input text and output text -- different")
        print("input:", only_in_input)
        print("output:", only_in_output)
        return

    print("input text and output text -- identical")
turkicTTS_utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import turkicTTS_ipa_convert as ipa_convert
2
+
3
# Lookup table from a key string to the converter function registered for it.
# Ten keys are language names mapped to the matching ``*_to_ipa`` converter;
# 'tts_sent' maps to ``ipa_to_kazakh``.
dispatcher = {
    'kazakh': ipa_convert.kazakh_to_ipa,
    'turkish': ipa_convert.turkish_to_ipa,
    'azerbaijani': ipa_convert.azerbaijani_to_ipa,
    'kyrgyz': ipa_convert.kyrgyz_to_ipa,
    'uzbek': ipa_convert.uzbek_to_ipa,
    'turkmen': ipa_convert.turkmen_to_ipa,
    'tatar': ipa_convert.tatar_to_ipa,
    'bashkir': ipa_convert.bashkir_to_ipa,
    'sakha': ipa_convert.sakha_to_ipa,
    'uyghur': ipa_convert.uyghur_to_ipa,
    'tts_sent': ipa_convert.ipa_to_kazakh,
}
14
+
15
def call_func(x, func):
    """Apply the converter registered under ``func`` in ``dispatcher`` to ``x``.

    Only the dictionary lookup is guarded.  The previous bare ``except:``
    also wrapped the converter call, so any failure inside a converter
    (and even KeyboardInterrupt/SystemExit) was silently reported as
    "Invalid function"; such errors now propagate to the caller.

    :param x: text handed to the converter.
    :param func: key into ``dispatcher``.
    :return: the converter's result, or the string "Invalid function" when
        ``func`` is not a usable ``dispatcher`` key.
    """
    try:
        converter = dispatcher[func]
    except (KeyError, TypeError):
        # KeyError: unknown key; TypeError: unhashable key (e.g. a list).
        return "Invalid function"
    return converter(x)
20
+
21
def normalization(x, lang="kazakh"):
    """Run ``x`` through the ``lang`` converter, then through 'tts_sent'.

    Both steps go through ``call_func``.  Note that when ``call_func``
    returns its "Invalid function" string for the first step, that string
    is what gets passed to the 'tts_sent' converter.

    :param x: input text.
    :param lang: ``dispatcher`` key selecting the first converter.
    :return: the result of the second ``call_func`` call.
    """
    return call_func(call_func(x, lang), 'tts_sent')