diff --git a/CosyVoice-300M-Instruct/.msc b/CosyVoice-300M-Instruct/.msc new file mode 100644 index 0000000000000000000000000000000000000000..9c6ab0a1b66e3322baa8ac17e08c5bab838b031d Binary files /dev/null and b/CosyVoice-300M-Instruct/.msc differ diff --git a/CosyVoice-300M-Instruct/.mv b/CosyVoice-300M-Instruct/.mv new file mode 100644 index 0000000000000000000000000000000000000000..48b6557d9d3dda7b170b873a629b42df41dfb952 --- /dev/null +++ b/CosyVoice-300M-Instruct/.mv @@ -0,0 +1 @@ +Revision:master,CreatedAt:1720198244 \ No newline at end of file diff --git a/CosyVoice-300M-Instruct/README.md b/CosyVoice-300M-Instruct/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d10e157809fbcded1299bd31bbbfa78ab8e63f65 --- /dev/null +++ b/CosyVoice-300M-Instruct/README.md @@ -0,0 +1,150 @@ +# CosyVoice +## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 +[[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] + +For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice). 
+ +## Install + +**Clone and install** + +- Clone the repo +``` sh +git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git +# If you fail to clone the submodules due to network failures, run the following commands until they succeed +cd CosyVoice +git submodule update --init --recursive +``` + +- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html +- Create Conda env: + +``` sh +conda create -n cosyvoice python=3.8 +conda activate cosyvoice +pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com + +# If you encounter sox compatibility issues +# ubuntu +sudo apt-get install sox libsox-dev +# centos +sudo yum install sox sox-devel +``` + +**Model download** + +We strongly recommend that you download our pretrained `CosyVoice-300M`, `CosyVoice-300M-SFT`, and `CosyVoice-300M-Instruct` models and the `speech_kantts_ttsfrd` resource. + +If you are an expert in this field and are only interested in training your own CosyVoice model from scratch, you can skip this step. 
+ +``` python +# Download the models with the ModelScope SDK +from modelscope import snapshot_download +snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') +snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') +snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct') +snapshot_download('iic/speech_kantts_ttsfrd', local_dir='pretrained_models/speech_kantts_ttsfrd') +``` + +``` sh +# Download the models with git; make sure git lfs is installed +mkdir -p pretrained_models +git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M +git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT +git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct +git clone https://www.modelscope.cn/iic/speech_kantts_ttsfrd.git pretrained_models/speech_kantts_ttsfrd +``` + +Unzip the `ttsfrd` resource and install the `ttsfrd` package +``` sh +cd pretrained_models/speech_kantts_ttsfrd/ +unzip resource.zip -d . +pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl +``` + +**Basic Usage** + +For zero_shot/cross_lingual inference, please use the `CosyVoice-300M` model. +For sft inference, please use the `CosyVoice-300M-SFT` model. +For instruct inference, please use the `CosyVoice-300M-Instruct` model. +First, add `third_party/AcademiCodec` and `third_party/Matcha-TTS` to your `PYTHONPATH`. 
+ +``` sh +export PYTHONPATH=third_party/AcademiCodec:third_party/Matcha-TTS +``` + +``` python +from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.utils.file_utils import load_wav +import torchaudio + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-SFT') +# sft usage +print(cosyvoice.list_avaliable_spks()) +output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女') +torchaudio.save('sft.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M') +# zero_shot usage +prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) +output = cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k) +torchaudio.save('zero_shot.wav', output['tts_speech'], 22050) +# cross_lingual usage +prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) +output = cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k) +torchaudio.save('cross_lingual.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-Instruct') +# instruct usage +output = cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.') +torchaudio.save('instruct.wav', output['tts_speech'], 22050) +``` + +**Start web demo** + +You can use our web demo page to get familiar with CosyVoice quickly. +We support sft/zero_shot/cross_lingual/instruct inference in web demo. + +Please see the demo website for details. 
+ +``` sh +# change to speech_tts/CosyVoice-300M-SFT for sft inference, or speech_tts/CosyVoice-300M-Instruct for instruct inference +python3 webui.py --port 50000 --model_dir speech_tts/CosyVoice-300M +``` + +**Advanced Usage** + +For advanced users, we have provided training and inference scripts in `examples/libritts/cosyvoice/run.sh`. +You can get familiar with CosyVoice by following this recipe. + +**Build for deployment** + +Optionally, if you want to use grpc for service deployment, +you can run the following steps. Otherwise, you can just ignore this step. + +``` sh +cd runtime/python +docker build -t cosyvoice:v1.0 . +# change speech_tts/CosyVoice-300M to speech_tts/CosyVoice-300M-Instruct if you want to use instruct inference +docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir speech_tts/CosyVoice-300M && sleep infinity" +python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct> +``` + +## Discussion & Communication + +You can directly discuss on [GitHub Issues](https://github.com/FunAudioLLM/CosyVoice/issues). + +You can also scan the QR code to join our official Dingding chat group. + + + +## Acknowledge + +1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR). +2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec). +3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS). +4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). +5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). + +## Disclaimer +The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. 
diff --git a/CosyVoice-300M-Instruct/configuration.json b/CosyVoice-300M-Instruct/configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5e812fae901c12933ac69ebf3eb79d0eb49bbab4 --- /dev/null +++ b/CosyVoice-300M-Instruct/configuration.json @@ -0,0 +1 @@ +{"framework":"Pytorch","task":"text-to-speech"} \ No newline at end of file diff --git a/CosyVoice-300M-Instruct/cosyvoice.yaml b/CosyVoice-300M-Instruct/cosyvoice.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc5eee088d053314d2054cc6978e6897387692f1 --- /dev/null +++ b/CosyVoice-300M-Instruct/cosyvoice.yaml @@ -0,0 +1,197 @@ +# set random seed, so that you may reproduce your result. +__set_seed1: !apply:random.seed [1986] +__set_seed2: !apply:numpy.random.seed [1986] +__set_seed3: !apply:torch.manual_seed [1986] +__set_seed4: !apply:torch.cuda.manual_seed_all [1986] + +# fixed params +sample_rate: 22050 +text_encoder_input_size: 512 +llm_input_size: 1024 +llm_output_size: 1024 +spk_embed_dim: 192 + +# model params +# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml. +# for system/third_party class/function, we do not require this. 
+llm: !new:cosyvoice.llm.llm.TransformerLM + text_encoder_input_size: !ref <text_encoder_input_size> + llm_input_size: !ref <llm_input_size> + llm_output_size: !ref <llm_output_size> + text_token_size: 51866 + speech_token_size: 4096 + length_normalized_loss: True + lsm_weight: 0 + spk_embed_dim: !ref <spk_embed_dim> + text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder + input_size: !ref <text_encoder_input_size> + output_size: 1024 + attention_heads: 16 + linear_units: 4096 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + use_cnn_module: False + macaron_style: False + use_dynamic_chunk: False + use_dynamic_left_chunk: False + static_chunk_size: 1 + llm: !new:cosyvoice.transformer.encoder.TransformerEncoder + input_size: !ref <llm_input_size> + output_size: !ref <llm_output_size> + attention_heads: 16 + linear_units: 4096 + num_blocks: 14 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + input_layer: 'linear_legacy' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + static_chunk_size: 1 + +flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec + input_size: 512 + output_size: 80 + spk_embed_dim: !ref <spk_embed_dim> + output_type: 'mel' + vocab_size: 4096 + input_frame_rate: 50 + only_mask_loss: True + encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + input_size: 512 + use_cnn_module: False + macaron_style: False + length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator + channels: 80 + sampling_ratios: [1, 1, 1, 1] + decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM + in_channels: 240 + n_spks: 1 + spk_emb_dim: 80 + cfm_params: 
!new:omegaconf.DictConfig + content: + sigma_min: 1e-06 + solver: 'euler' + t_scheduler: 'cosine' + training_cfg_rate: 0.2 + inference_cfg_rate: 0.7 + reg_loss_type: 'l1' + estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder + in_channels: 320 + out_channels: 80 + channels: [256, 256] + dropout: 0 + attention_head_dim: 64 + n_blocks: 4 + num_mid_blocks: 12 + num_heads: 8 + act_fn: 'gelu' + +hift: !new:cosyvoice.hifigan.generator.HiFTGenerator + in_channels: 80 + base_channels: 512 + nb_harmonics: 8 + sampling_rate: !ref <sample_rate> + nsf_alpha: 0.1 + nsf_sigma: 0.003 + nsf_voiced_threshold: 10 + upsample_rates: [8, 8] + upsample_kernel_sizes: [16, 16] + istft_params: + n_fft: 16 + hop_len: 4 + resblock_kernel_sizes: [3, 7, 11] + resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + source_resblock_kernel_sizes: [7, 11] + source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] + lrelu_slope: 0.1 + audio_limit: 0.99 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + num_class: 1 + in_channels: 80 + cond_channels: 512 + +# processor functions +parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener +get_tokenizer: !name:whisper.tokenizer.get_tokenizer + multilingual: True + num_languages: 100 + language: 'en' + task: 'transcribe' +allowed_special: 'all' +tokenize: !name:cosyvoice.dataset.processor.tokenize + get_tokenizer: !ref <get_tokenizer> + allowed_special: !ref <allowed_special> +filter: !name:cosyvoice.dataset.processor.filter + max_length: 40960 + min_length: 0 + token_max_length: 200 + token_min_length: 1 +resample: !name:cosyvoice.dataset.processor.resample + resample_rate: !ref <sample_rate> +feat_extractor: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1024 + num_mels: 80 + sampling_rate: !ref <sample_rate> + hop_size: 256 + win_size: 1024 + fmin: 0 + fmax: 8000 + center: False +compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank + feat_extractor: !ref <feat_extractor> +parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding + normalize: True +shuffle: 
!name:cosyvoice.dataset.processor.shuffle + shuffle_size: 1000 +sort: !name:cosyvoice.dataset.processor.sort + sort_size: 500 # sort_size should be less than shuffle_size +batch: !name:cosyvoice.dataset.processor.batch + batch_type: 'dynamic' + max_frames_in_batch: 2000 +padding: !name:cosyvoice.dataset.processor.padding + +# dataset processor pipeline +data_pipeline: [ + !ref <parquet_opener>, + !ref <tokenize>, + !ref <filter>, + !ref <resample>, + !ref <compute_fbank>, + !ref <parse_embedding>, + !ref <shuffle>, + !ref <sort>, + !ref <batch>, + !ref <padding>, +] + +# train conf +train_conf: + optim: adam + optim_conf: + lr: 0.001 + scheduler: warmuplr + scheduler_conf: + warmup_steps: 2500 + max_epoch: 200 + grad_clip: 5 + accum_grad: 2 + log_interval: 100 + save_per_step: -1 \ No newline at end of file diff --git a/CosyVoice-300M-SFT/.msc b/CosyVoice-300M-SFT/.msc new file mode 100644 index 0000000000000000000000000000000000000000..e63490f61b4ca23426b2472050f4514e197d1e2c Binary files /dev/null and b/CosyVoice-300M-SFT/.msc differ diff --git a/CosyVoice-300M-SFT/.mv b/CosyVoice-300M-SFT/.mv new file mode 100644 index 0000000000000000000000000000000000000000..94bd57ecb59b116f7e5f93a8f0ce32d24a4ac5c0 --- /dev/null +++ b/CosyVoice-300M-SFT/.mv @@ -0,0 +1 @@ +Revision:master,CreatedAt:1720196168 \ No newline at end of file diff --git a/CosyVoice-300M-SFT/README.md b/CosyVoice-300M-SFT/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d10e157809fbcded1299bd31bbbfa78ab8e63f65 --- /dev/null +++ b/CosyVoice-300M-SFT/README.md @@ -0,0 +1,150 @@ +# CosyVoice +## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 +[[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] + +For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice). 
+ +## Install + +**Clone and install** + +- Clone the repo +``` sh +git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git +# If you fail to clone the submodules due to network failures, run the following commands until they succeed +cd CosyVoice +git submodule update --init --recursive +``` + +- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html +- Create Conda env: + +``` sh +conda create -n cosyvoice python=3.8 +conda activate cosyvoice +pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com + +# If you encounter sox compatibility issues +# ubuntu +sudo apt-get install sox libsox-dev +# centos +sudo yum install sox sox-devel +``` + +**Model download** + +We strongly recommend that you download our pretrained `CosyVoice-300M`, `CosyVoice-300M-SFT`, and `CosyVoice-300M-Instruct` models and the `speech_kantts_ttsfrd` resource. + +If you are an expert in this field and are only interested in training your own CosyVoice model from scratch, you can skip this step. 
+ +``` python +# Download the models with the ModelScope SDK +from modelscope import snapshot_download +snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') +snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') +snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct') +snapshot_download('iic/speech_kantts_ttsfrd', local_dir='pretrained_models/speech_kantts_ttsfrd') +``` + +``` sh +# Download the models with git; make sure git lfs is installed +mkdir -p pretrained_models +git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M +git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT +git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct +git clone https://www.modelscope.cn/iic/speech_kantts_ttsfrd.git pretrained_models/speech_kantts_ttsfrd +``` + +Unzip the `ttsfrd` resource and install the `ttsfrd` package +``` sh +cd pretrained_models/speech_kantts_ttsfrd/ +unzip resource.zip -d . +pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl +``` + +**Basic Usage** + +For zero_shot/cross_lingual inference, please use the `CosyVoice-300M` model. +For sft inference, please use the `CosyVoice-300M-SFT` model. +For instruct inference, please use the `CosyVoice-300M-Instruct` model. +First, add `third_party/AcademiCodec` and `third_party/Matcha-TTS` to your `PYTHONPATH`. 
+ +``` sh +export PYTHONPATH=third_party/AcademiCodec:third_party/Matcha-TTS +``` + +``` python +from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.utils.file_utils import load_wav +import torchaudio + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-SFT') +# sft usage +print(cosyvoice.list_avaliable_spks()) +output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女') +torchaudio.save('sft.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M') +# zero_shot usage +prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) +output = cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k) +torchaudio.save('zero_shot.wav', output['tts_speech'], 22050) +# cross_lingual usage +prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) +output = cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k) +torchaudio.save('cross_lingual.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-Instruct') +# instruct usage +output = cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.') +torchaudio.save('instruct.wav', output['tts_speech'], 22050) +``` + +**Start web demo** + +You can use our web demo page to get familiar with CosyVoice quickly. +We support sft/zero_shot/cross_lingual/instruct inference in web demo. + +Please see the demo website for details. 
+ +``` sh +# change to speech_tts/CosyVoice-300M-SFT for sft inference, or speech_tts/CosyVoice-300M-Instruct for instruct inference +python3 webui.py --port 50000 --model_dir speech_tts/CosyVoice-300M +``` + +**Advanced Usage** + +For advanced users, we have provided training and inference scripts in `examples/libritts/cosyvoice/run.sh`. +You can get familiar with CosyVoice by following this recipe. + +**Build for deployment** + +Optionally, if you want to use grpc for service deployment, +you can run the following steps. Otherwise, you can just ignore this step. + +``` sh +cd runtime/python +docker build -t cosyvoice:v1.0 . +# change speech_tts/CosyVoice-300M to speech_tts/CosyVoice-300M-Instruct if you want to use instruct inference +docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir speech_tts/CosyVoice-300M && sleep infinity" +python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct> +``` + +## Discussion & Communication + +You can directly discuss on [GitHub Issues](https://github.com/FunAudioLLM/CosyVoice/issues). + +You can also scan the QR code to join our official Dingding chat group. + + + +## Acknowledge + +1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR). +2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec). +3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS). +4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). +5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). + +## Disclaimer +The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. 
diff --git a/CosyVoice-300M-SFT/configuration.json b/CosyVoice-300M-SFT/configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5e812fae901c12933ac69ebf3eb79d0eb49bbab4 --- /dev/null +++ b/CosyVoice-300M-SFT/configuration.json @@ -0,0 +1 @@ +{"framework":"Pytorch","task":"text-to-speech"} \ No newline at end of file diff --git a/CosyVoice-300M-SFT/cosyvoice.yaml b/CosyVoice-300M-SFT/cosyvoice.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc5eee088d053314d2054cc6978e6897387692f1 --- /dev/null +++ b/CosyVoice-300M-SFT/cosyvoice.yaml @@ -0,0 +1,197 @@ +# set random seed, so that you may reproduce your result. +__set_seed1: !apply:random.seed [1986] +__set_seed2: !apply:numpy.random.seed [1986] +__set_seed3: !apply:torch.manual_seed [1986] +__set_seed4: !apply:torch.cuda.manual_seed_all [1986] + +# fixed params +sample_rate: 22050 +text_encoder_input_size: 512 +llm_input_size: 1024 +llm_output_size: 1024 +spk_embed_dim: 192 + +# model params +# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml. +# for system/third_party class/function, we do not require this. 
+llm: !new:cosyvoice.llm.llm.TransformerLM + text_encoder_input_size: !ref <text_encoder_input_size> + llm_input_size: !ref <llm_input_size> + llm_output_size: !ref <llm_output_size> + text_token_size: 51866 + speech_token_size: 4096 + length_normalized_loss: True + lsm_weight: 0 + spk_embed_dim: !ref <spk_embed_dim> + text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder + input_size: !ref <text_encoder_input_size> + output_size: 1024 + attention_heads: 16 + linear_units: 4096 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + use_cnn_module: False + macaron_style: False + use_dynamic_chunk: False + use_dynamic_left_chunk: False + static_chunk_size: 1 + llm: !new:cosyvoice.transformer.encoder.TransformerEncoder + input_size: !ref <llm_input_size> + output_size: !ref <llm_output_size> + attention_heads: 16 + linear_units: 4096 + num_blocks: 14 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + input_layer: 'linear_legacy' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + static_chunk_size: 1 + +flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec + input_size: 512 + output_size: 80 + spk_embed_dim: !ref <spk_embed_dim> + output_type: 'mel' + vocab_size: 4096 + input_frame_rate: 50 + only_mask_loss: True + encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + input_size: 512 + use_cnn_module: False + macaron_style: False + length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator + channels: 80 + sampling_ratios: [1, 1, 1, 1] + decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM + in_channels: 240 + n_spks: 1 + spk_emb_dim: 80 + cfm_params: 
!new:omegaconf.DictConfig + content: + sigma_min: 1e-06 + solver: 'euler' + t_scheduler: 'cosine' + training_cfg_rate: 0.2 + inference_cfg_rate: 0.7 + reg_loss_type: 'l1' + estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder + in_channels: 320 + out_channels: 80 + channels: [256, 256] + dropout: 0 + attention_head_dim: 64 + n_blocks: 4 + num_mid_blocks: 12 + num_heads: 8 + act_fn: 'gelu' + +hift: !new:cosyvoice.hifigan.generator.HiFTGenerator + in_channels: 80 + base_channels: 512 + nb_harmonics: 8 + sampling_rate: !ref <sample_rate> + nsf_alpha: 0.1 + nsf_sigma: 0.003 + nsf_voiced_threshold: 10 + upsample_rates: [8, 8] + upsample_kernel_sizes: [16, 16] + istft_params: + n_fft: 16 + hop_len: 4 + resblock_kernel_sizes: [3, 7, 11] + resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + source_resblock_kernel_sizes: [7, 11] + source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] + lrelu_slope: 0.1 + audio_limit: 0.99 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + num_class: 1 + in_channels: 80 + cond_channels: 512 + +# processor functions +parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener +get_tokenizer: !name:whisper.tokenizer.get_tokenizer + multilingual: True + num_languages: 100 + language: 'en' + task: 'transcribe' +allowed_special: 'all' +tokenize: !name:cosyvoice.dataset.processor.tokenize + get_tokenizer: !ref <get_tokenizer> + allowed_special: !ref <allowed_special> +filter: !name:cosyvoice.dataset.processor.filter + max_length: 40960 + min_length: 0 + token_max_length: 200 + token_min_length: 1 +resample: !name:cosyvoice.dataset.processor.resample + resample_rate: !ref <sample_rate> +feat_extractor: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1024 + num_mels: 80 + sampling_rate: !ref <sample_rate> + hop_size: 256 + win_size: 1024 + fmin: 0 + fmax: 8000 + center: False +compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank + feat_extractor: !ref <feat_extractor> +parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding + normalize: True +shuffle: 
!name:cosyvoice.dataset.processor.shuffle + shuffle_size: 1000 +sort: !name:cosyvoice.dataset.processor.sort + sort_size: 500 # sort_size should be less than shuffle_size +batch: !name:cosyvoice.dataset.processor.batch + batch_type: 'dynamic' + max_frames_in_batch: 2000 +padding: !name:cosyvoice.dataset.processor.padding + +# dataset processor pipeline +data_pipeline: [ + !ref <parquet_opener>, + !ref <tokenize>, + !ref <filter>, + !ref <resample>, + !ref <compute_fbank>, + !ref <parse_embedding>, + !ref <shuffle>, + !ref <sort>, + !ref <batch>, + !ref <padding>, +] + +# train conf +train_conf: + optim: adam + optim_conf: + lr: 0.001 + scheduler: warmuplr + scheduler_conf: + warmup_steps: 2500 + max_epoch: 200 + grad_clip: 5 + accum_grad: 2 + log_interval: 100 + save_per_step: -1 \ No newline at end of file diff --git a/CosyVoice-300M/.msc b/CosyVoice-300M/.msc new file mode 100644 index 0000000000000000000000000000000000000000..74d04c1303cac022fc692720139fb2857dd1f63c Binary files /dev/null and b/CosyVoice-300M/.msc differ diff --git a/CosyVoice-300M/.mv b/CosyVoice-300M/.mv new file mode 100644 index 0000000000000000000000000000000000000000..b4b9faf6a9443614c74bec63023ac4f21bab249d --- /dev/null +++ b/CosyVoice-300M/.mv @@ -0,0 +1 @@ +Revision:master,CreatedAt:1720194552 \ No newline at end of file diff --git a/CosyVoice-300M/README.md b/CosyVoice-300M/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d10e157809fbcded1299bd31bbbfa78ab8e63f65 --- /dev/null +++ b/CosyVoice-300M/README.md @@ -0,0 +1,150 @@ +# CosyVoice +## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 +[[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] + +For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice). 
+ +## Install + +**Clone and install** + +- Clone the repo +``` sh +git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git +# If you fail to clone the submodules due to network failures, run the following commands until they succeed +cd CosyVoice +git submodule update --init --recursive +``` + +- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html +- Create Conda env: + +``` sh +conda create -n cosyvoice python=3.8 +conda activate cosyvoice +pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com + +# If you encounter sox compatibility issues +# ubuntu +sudo apt-get install sox libsox-dev +# centos +sudo yum install sox sox-devel +``` + +**Model download** + +We strongly recommend that you download our pretrained `CosyVoice-300M`, `CosyVoice-300M-SFT`, and `CosyVoice-300M-Instruct` models and the `speech_kantts_ttsfrd` resource. + +If you are an expert in this field and are only interested in training your own CosyVoice model from scratch, you can skip this step. 
+ +``` python +# Download the models with the ModelScope SDK +from modelscope import snapshot_download +snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') +snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') +snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct') +snapshot_download('iic/speech_kantts_ttsfrd', local_dir='pretrained_models/speech_kantts_ttsfrd') +``` + +``` sh +# Download the models with git; make sure git lfs is installed +mkdir -p pretrained_models +git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M +git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT +git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct +git clone https://www.modelscope.cn/iic/speech_kantts_ttsfrd.git pretrained_models/speech_kantts_ttsfrd +``` + +Unzip the `ttsfrd` resource and install the `ttsfrd` package +``` sh +cd pretrained_models/speech_kantts_ttsfrd/ +unzip resource.zip -d . +pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl +``` + +**Basic Usage** + +For zero_shot/cross_lingual inference, please use the `CosyVoice-300M` model. +For sft inference, please use the `CosyVoice-300M-SFT` model. +For instruct inference, please use the `CosyVoice-300M-Instruct` model. +First, add `third_party/AcademiCodec` and `third_party/Matcha-TTS` to your `PYTHONPATH`. 
+ +``` sh +export PYTHONPATH=third_party/AcademiCodec:third_party/Matcha-TTS +``` + +``` python +from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.utils.file_utils import load_wav +import torchaudio + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-SFT') +# sft usage +print(cosyvoice.list_avaliable_spks()) +output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女') +torchaudio.save('sft.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M') +# zero_shot usage +prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) +output = cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k) +torchaudio.save('zero_shot.wav', output['tts_speech'], 22050) +# cross_lingual usage +prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) +output = cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k) +torchaudio.save('cross_lingual.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-Instruct') +# instruct usage +output = cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.') +torchaudio.save('instruct.wav', output['tts_speech'], 22050) +``` + +**Start web demo** + +You can use our web demo page to get familiar with CosyVoice quickly. +We support sft/zero_shot/cross_lingual/instruct inference in web demo. + +Please see the demo website for details. 
+ +``` python +# change speech_tts/CosyVoice-300M-SFT for sft inference, or speech_tts/CosyVoice-300M-Instruct for instruct inference +python3 webui.py --port 50000 --model_dir speech_tts/CosyVoice-300M +``` + +**Advanced Usage** + +For advanced user, we have provided train and inference scripts in `examples/libritts/cosyvoice/run.sh`. +You can get familiar with CosyVoice following this recipie. + +**Build for deployment** + +Optionally, if you want to use grpc for service deployment, +you can run following steps. Otherwise, you can just ignore this step. + +``` sh +cd runtime/python +docker build -t cosyvoice:v1.0 . +# change speech_tts/CosyVoice-300M to speech_tts/CosyVoice-300M-Instruct if you want to use instruct inference +docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir speech_tts/CosyVoice-300M && sleep infinity" +python3 client.py --port 50000 --mode +``` + +## Discussion & Communication + +You can directly discuss on [Github Issues](https://github.com/FunAudioLLM/CosyVoice/issues). + +You can also scan the QR code to join our officla Dingding chat group. + + + +## Acknowledge + +1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR). +2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec). +3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS). +4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). +5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). + +## Disclaimer +The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. 
diff --git a/CosyVoice-300M/configuration.json b/CosyVoice-300M/configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5e812fae901c12933ac69ebf3eb79d0eb49bbab4 --- /dev/null +++ b/CosyVoice-300M/configuration.json @@ -0,0 +1 @@ +{"framework":"Pytorch","task":"text-to-speech"} \ No newline at end of file diff --git a/CosyVoice-300M/cosyvoice.yaml b/CosyVoice-300M/cosyvoice.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc5eee088d053314d2054cc6978e6897387692f1 --- /dev/null +++ b/CosyVoice-300M/cosyvoice.yaml @@ -0,0 +1,197 @@ +# set random seed, so that you may reproduce your result. +__set_seed1: !apply:random.seed [1986] +__set_seed2: !apply:numpy.random.seed [1986] +__set_seed3: !apply:torch.manual_seed [1986] +__set_seed4: !apply:torch.cuda.manual_seed_all [1986] + +# fixed params +sample_rate: 22050 +text_encoder_input_size: 512 +llm_input_size: 1024 +llm_output_size: 1024 +spk_embed_dim: 192 + +# model params +# for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. +# for system/third_party class/function, we do not require this. 
+llm: !new:cosyvoice.llm.llm.TransformerLM + text_encoder_input_size: !ref + llm_input_size: !ref + llm_output_size: !ref + text_token_size: 51866 + speech_token_size: 4096 + length_normalized_loss: True + lsm_weight: 0 + spk_embed_dim: !ref + text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder + input_size: !ref + output_size: 1024 + attention_heads: 16 + linear_units: 4096 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + use_cnn_module: False + macaron_style: False + use_dynamic_chunk: False + use_dynamic_left_chunk: False + static_chunk_size: 1 + llm: !new:cosyvoice.transformer.encoder.TransformerEncoder + input_size: !ref + output_size: !ref + attention_heads: 16 + linear_units: 4096 + num_blocks: 14 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + input_layer: 'linear_legacy' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + static_chunk_size: 1 + +flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec + input_size: 512 + output_size: 80 + spk_embed_dim: !ref + output_type: 'mel' + vocab_size: 4096 + input_frame_rate: 50 + only_mask_loss: True + encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + input_size: 512 + use_cnn_module: False + macaron_style: False + length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator + channels: 80 + sampling_ratios: [1, 1, 1, 1] + decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM + in_channels: 240 + n_spks: 1 + spk_emb_dim: 80 + cfm_params: 
!new:omegaconf.DictConfig + content: + sigma_min: 1e-06 + solver: 'euler' + t_scheduler: 'cosine' + training_cfg_rate: 0.2 + inference_cfg_rate: 0.7 + reg_loss_type: 'l1' + estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder + in_channels: 320 + out_channels: 80 + channels: [256, 256] + dropout: 0 + attention_head_dim: 64 + n_blocks: 4 + num_mid_blocks: 12 + num_heads: 8 + act_fn: 'gelu' + +hift: !new:cosyvoice.hifigan.generator.HiFTGenerator + in_channels: 80 + base_channels: 512 + nb_harmonics: 8 + sampling_rate: !ref + nsf_alpha: 0.1 + nsf_sigma: 0.003 + nsf_voiced_threshold: 10 + upsample_rates: [8, 8] + upsample_kernel_sizes: [16, 16] + istft_params: + n_fft: 16 + hop_len: 4 + resblock_kernel_sizes: [3, 7, 11] + resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + source_resblock_kernel_sizes: [7, 11] + source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] + lrelu_slope: 0.1 + audio_limit: 0.99 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + num_class: 1 + in_channels: 80 + cond_channels: 512 + +# processor functions +parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener +get_tokenizer: !name:whisper.tokenizer.get_tokenizer + multilingual: True + num_languages: 100 + language: 'en' + task: 'transcribe' +allowed_special: 'all' +tokenize: !name:cosyvoice.dataset.processor.tokenize + get_tokenizer: !ref + allowed_special: !ref +filter: !name:cosyvoice.dataset.processor.filter + max_length: 40960 + min_length: 0 + token_max_length: 200 + token_min_length: 1 +resample: !name:cosyvoice.dataset.processor.resample + resample_rate: !ref +feat_extractor: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1024 + num_mels: 80 + sampling_rate: !ref + hop_size: 256 + win_size: 1024 + fmin: 0 + fmax: 8000 + center: False +compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank + feat_extractor: !ref +parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding + normalize: True +shuffle: 
!name:cosyvoice.dataset.processor.shuffle + shuffle_size: 1000 +sort: !name:cosyvoice.dataset.processor.sort + sort_size: 500 # sort_size should be less than shuffle_size +batch: !name:cosyvoice.dataset.processor.batch + batch_type: 'dynamic' + max_frames_in_batch: 2000 +padding: !name:cosyvoice.dataset.processor.padding + +# dataset processor pipeline +data_pipeline: [ + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , +] + +# train conf +train_conf: + optim: adam + optim_conf: + lr: 0.001 + scheduler: warmuplr + scheduler_conf: + warmup_steps: 2500 + max_epoch: 200 + grad_clip: 5 + accum_grad: 2 + log_interval: 100 + save_per_step: -1 \ No newline at end of file diff --git a/CosyVoice-ttsfrd/.gitattributes b/CosyVoice-ttsfrd/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..9719c1ba0877231f146937a007218d0eca4d3962 --- /dev/null +++ b/CosyVoice-ttsfrd/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz 
filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +resource.zip filter=lfs diff=lfs merge=lfs -text diff --git a/CosyVoice-ttsfrd/README.md b/CosyVoice-ttsfrd/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1202d928cf01202f767806d03fa47ddeebcf8158 --- /dev/null +++ b/CosyVoice-ttsfrd/README.md @@ -0,0 +1,150 @@ +# CosyVoice +## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 +[[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] + +For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice). 
+ +## Install + +**Clone and install** + +- Clone the repo +``` sh +git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git +# If you failed to clone submodule due to network failures, please run following command until success +cd CosyVoice +git submodule update --init --recursive +``` + +- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html +- Create Conda env: + +``` sh +conda create -n cosyvoice python=3.8 +conda activate cosyvoice +pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com + +# If you encounter sox compatibility issues +# ubuntu +sudo apt-get install sox libsox-dev +# centos +sudo yum install sox sox-devel +``` + +**Model download** + +We strongly recommand that you download our pretrained `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `speech_kantts_ttsfrd` resource. + +If you are expert in this field, and you are only interested in training your own CosyVoice model from scratch, you can skip this step. 
+ +``` python +# SDK模型下载 +from modelscope import snapshot_download +snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') +snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') +snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct') +snapshot_download('iic/speech_kantts_ttsfrd', local_dir='pretrained_models/speech_kantts_ttsfrd') +``` + +``` sh +# git模型下载,请确保已安装git lfs +mkdir -p pretrained_models +git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M +git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT +git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct +git clone https://www.modelscope.cn/iic/speech_kantts_ttsfrd.git pretrained_models/speech_kantts_ttsfrd +``` + +Unzip `ttsfrd` resouce and install `ttsfrd` package +``` sh +cd pretrained_models/speech_kantts_ttsfrd/ +unzip resource.zip -d . +pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl +``` + +**Basic Usage** + +For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model. +For sft inference, please use `CosyVoice-300M-SFT` model. +For instruct inference, please use `CosyVoice-300M-Instruct` model. +First, add `third_party/AcademiCodec` and `third_party/Matcha-TTS` to your `PYTHONPATH`. 
+ +``` sh +export PYTHONPATH=third_party/AcademiCodec:third_party/Matcha-TTS +``` + +``` python +from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.utils.file_utils import load_wav +import torchaudio + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-SFT') +# sft usage +print(cosyvoice.list_avaliable_spks()) +output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女') +torchaudio.save('sft.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M') +# zero_shot usage +prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) +output = cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k) +torchaudio.save('zero_shot.wav', output['tts_speech'], 22050) +# cross_lingual usage +prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) +output = cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k) +torchaudio.save('cross_lingual.wav', output['tts_speech'], 22050) + +cosyvoice = CosyVoice('speech_tts/CosyVoice-300M-Instruct') +# instruct usage +output = cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.') +torchaudio.save('instruct.wav', output['tts_speech'], 22050) +``` + +**Start web demo** + +You can use our web demo page to get familiar with CosyVoice quickly. +We support sft/zero_shot/cross_lingual/instruct inference in web demo. + +Please see the demo website for details. 
+ +``` python +# change speech_tts/CosyVoice-300M-SFT for sft inference, or speech_tts/CosyVoice-300M-Instruct for instruct inference +python3 webui.py --port 50000 --model_dir speech_tts/CosyVoice-300M +``` + +**Advanced Usage** + +For advanced user, we have provided train and inference scripts in `examples/libritts/cosyvoice/run.sh`. +You can get familiar with CosyVoice following this recipie. + +**Build for deployment** + +Optionally, if you want to use grpc for service deployment, +you can run following steps. Otherwise, you can just ignore this step. + +``` sh +cd runtime/python +docker build -t cosyvoice:v1.0 . +# change speech_tts/CosyVoice-300M to speech_tts/CosyVoice-300M-Instruct if you want to use instruct inference +docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python && python3 server.py --port 50000 --max_conc 4 --model_dir speech_tts/CosyVoice-300M && sleep infinity" +python3 client.py --port 50000 --mode +``` + +## Discussion & Communication + +You can directly discuss on [Github Issues](https://github.com/FunAudioLLM/CosyVoice/issues). + +You can also scan the QR code to join our officla Dingding chat group. + + + +## Acknowledge + +1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR). +2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec). +3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS). +4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). +5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). + +## Disclaimer +The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. 
diff --git a/CosyVoice-ttsfrd/configuration.json b/CosyVoice-ttsfrd/configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5e812fae901c12933ac69ebf3eb79d0eb49bbab4 --- /dev/null +++ b/CosyVoice-ttsfrd/configuration.json @@ -0,0 +1 @@ +{"framework":"Pytorch","task":"text-to-speech"} \ No newline at end of file diff --git a/CosyVoice-ttsfrd/resource/festival/Singing.v0_1.dtd b/CosyVoice-ttsfrd/resource/festival/Singing.v0_1.dtd new file mode 100644 index 0000000000000000000000000000000000000000..b0dd8a8881a8a6599c8a4d751122718d11c9d1a0 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/Singing.v0_1.dtd @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + +%ISOlat1; + + + diff --git a/CosyVoice-ttsfrd/resource/festival/apml.scm b/CosyVoice-ttsfrd/resource/festival/apml.scm new file mode 100644 index 0000000000000000000000000000000000000000..85a46868c0f45e882f3c16862fb22d6405759f5d --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/apml.scm @@ -0,0 +1,551 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 2002 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. 
The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Rob Clark +;;; Date: July 2002 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Sets up the current voice to synthesise from APML. +;; +;; + +(require 'apml_f2bf0lr) +(require 'apml_kaldurtreeZ) + +;; Default pitch settings (if unspecified in current voice.) + +(defvar apml_default_pitch_mean 170 ) +(defvar apml_default_pitch_standard_deviation 34 ) + +;; apml sythesis wrappers. + +(define (apml_client_synth apml) + "(apml_client_synth apml) +Synthesise apml and return waveform(s) to client." + (utt.send.wave.client (apml_synth apml))) + +(define (apml_synth apml) +"(apml_synth xml) +Synthesis an apml string." +(let ((tmpfile (make_tmp_filename)) + utt) + (string_to_file tmpfile apml) + (set! utt (apml_file_synth tmpfile)) + (delete-file tmpfile) + utt)) + +(define (apml_file_synth filename) + "(apml_file_synth filename) +Synthesis an apml file." + (let ((utt (Utterance Tokens nil))) + (utt.load utt filename) + (utt.synth utt))) + +(define (string_to_file file s) +"(string_to_file file string) + Write string to file." +(let ((fd)) + (set! 
fd (fopen file "wb")) + (format fd "%s" s) + (fclose fd))) + + +;;; +;;; Phrasing. +;;; + +;; phrasing CART. +; +; It has been decided that by default, only punctuation should affect +; phrasing (and subsequently pauses) +; +(set! apml_phrase_tree + ' + ((lisp_apml_punc in ("?" "." ":")) ; big punctuation + ((BB)) + ((lisp_apml_punc in ("'" "\"" "," ";")) ; else little punctuation + ((B)) + ((lisp_apml_last_word is 1) + ((BB)) ; need a BB at the end! + ((NB)))))) ; else nothing + +;; feature functions for phrasing +(define (apml_punc word) + (item.feat (item.relation.parent word 'Token) 'punc)) + +(define (apml_last_word word) + (if (item.next word) + "0" "1")) + + +;;; +;;; Pauses +;;; + +;; feature functions for pauses +(define (apml_is_pause word) + (if (item.relation (item.relation.parent word 'Token) 'Pause) + t + nil)) + +(define (apml_pause word) + (if (item.relation word 'Pause) + (item.feat (item.relation.parent (item.relation.parent word 'Token) 'Pause) "sec") + 0)) + +(define (Apml_Pauses utt) + "(Pauses UTT) +Predict pause insertion for apml." + (let ((words (utt.relation.items utt 'Word)) lastword tpname) + (if words + (begin + (insert_initial_pause utt) ;; always have a start pause + (set! lastword (car (last words))) + (mapcar + (lambda (w) + (let ((pbreak (item.feat w "pbreak")) + (emph (item.feat w "R:Token.parent.EMPH"))) + (cond + ((apml_is_pause w) + (insert_pause utt w)) + ((or (string-equal "B" pbreak) + (string-equal "BB" pbreak)) + (insert_pause utt w)) + ((equal? w lastword) + (insert_pause utt w))))) + words) + ;; The embarassing bit. 
Remove any words labelled as punc or fpunc + (mapcar + (lambda (w) + (let ((pos (item.feat w "pos"))) + (if (or (string-equal "punc" pos) + (string-equal "fpunc" pos)) + (let ((pbreak (item.feat w "pbreak")) + (wp (item.relation w 'Phrase))) + (if (and (string-matches pbreak "BB?") + (item.relation.prev w 'Word)) + (item.set_feat + (item.relation.prev w 'Word) "pbreak" pbreak)) + (item.relation.remove w 'Word) + ;; can't refer to w as we've just deleted it + (item.relation.remove wp 'Phrase))))) + words))) + utt)) + + + +;;; +;;; Intonation. +;;; + +;; Accent prediction (well transfer really). +;; +;; We treat L+H* L-H% on a single syllable as a special case. + +(set! apml_accent_cart + ' + ((lisp_apml_accent is "Hstar") + ((H*)) + ((lisp_apml_accent is "Lstar") + ((L*)) + ((lisp_apml_LHLH is "LHLH") + ((L+H*L-H%)) + ((lisp_apml_accent is "LplusHstar") + ((L+H*)) + ((lisp_apml_accent is "LstarplusH") + ((L*+H)) + ((NONE)))))))) + +(set! apml_boundary_cart + ' + ((lisp_apml_boundary is "LL") + ((L-L%)) + ((lisp_apml_LHLH is "LHLH") + ((NONE)) ; this is dealt with by the accent feature + ((lisp_apml_boundary is "LH") + ((L-H%)) + ((lisp_apml_boundary is "HH") + ((H-H%)) + ((lisp_apml_boundary is "HL") + ((H-L%)) + ((NONE)))))))) + +;; feature functions. 
+(define (apml_accent syl) + (let ((token (item.relation.parent (item.relation.parent syl 'SylStructure) 'Token))) + (if (and (eq (item.feat syl 'stress) 1) + (item.relation.parent token 'Emphasis)) + (item.feat (item.relation.parent token 'Emphasis) 'x-pitchaccent) + 0))) + +(define (apml_boundary syl) + (let ((token (item.relation.parent (item.relation.parent syl 'SylStructure) 'Token))) + (if (and (> (item.feat syl 'syl_break) 0) + (item.relation.parent token 'Boundary)) + (item.feat (item.relation.parent token 'Boundary) 'type) + 0))) + +(define (apml_LHLH syl) + (let ((accent (apml_accent syl)) + (boundary (apml_boundary syl))) + (if (and (string-equal accent "LplusHstar") + (string-equal boundary "LH")) + "LHLH" + 0))) + + +(define (apml_seg_is_LHLH_vowel seg) + (if (and (string-equal (apml_LHLH (item.relation.parent seg 'SylStructure)) + "LHLH") + (string-equal (item.feat seg 'ph_vc) "+")) + "LHLH" + 0)) + + +;;;; feature functions: + +(define (apml_tgtype syl) + (let ((l (apml_boundl (item.relation.parent syl 'SylStructure))) + (r (apml_boundr (item.relation.parent syl 'SylStructure)))) + (if (eq (item.feat syl 'accented) 0) + 0 ; this is a quirk related to the way the models were trained + (cond + ((eq l 0) + 1) + ((eq r 1) + 3) + (t 2))))) + + +(define (apml_iecount syl) + (if (eq (item.feat syl 'accented) 0) + 0 ; this is a quirk related to the way the models were trained + (+ (item.feat syl 'asyl_in) 1))) + +;; suport functions. +(define (apml_boundl word) +"(apml_boundl word) +Number of boundaries in this performative to the left of this word." + (let ((w (item.prev word)) + (c 0)) + (while (and w (apml_same_p w word)) + (if (item.relation.parent (item.relation.parent w 'Token) 'Boundary) + (set! c (+ c 1))) + (set! w (item.prev w))) + c)) + +(define (apml_boundr word) +"(apml_boundr word) +Number of boundaries in this performative to the right of this word." 
+ (let ((w word) + (c 0)) + (while (and w (apml_same_p w word)) + (if (item.relation.parent (item.relation.parent w 'Token) 'Boundary) + (set! c (+ c 1))) + (set! w (item.next w))) + c)) + +(define (apml_same_p w1 w2) +"(apml_same_p w1 w2) + Are these two words in the same performative?" +(let ((p1 (item.relation.parent (item.relation.parent w1 'Token) 'SemStructure)) + (p2 (item.relation.parent (item.relation.parent w1 'Token) 'SemStructure))) + (if (and (item.parent p1) (item.parent p2)) ; not true if theme/rheme omitted. + (equal? (item.parent p1) (item.parent p2)) + (equal? p1 p2)))) + +;;; +;;; segment timings +;;; + +(define (apml_seg_times utt) + "(apml_seg_times utt) +Output the segment timings for an apml utterance." + (let ((segs (utt.relation.items utt 'Segment))) + (mapcar + (lambda (x) + (format t "%s %s\n" (item.name x) (item.feat x 'end))) + segs) + t)) + +;;; +;;; Additional functions for f0model. +;;; + + +(define (find_hstar_left syl) +"(find_hstar_left syl) +If the closest accent or boundary to the left is H* return how many syllables away it is. Returns 0 if nearest accent is not H*" +(let ((count 0)) + ;; if this syllable has a pitch event + (if (or (not (string-equal (item.feat syl 'tobi_accent) "NONE")) + (not (string-equal (item.feat syl 'tobi_endtone) "NONE"))) + 0) + (while (and syl + (string-equal (item.feat syl 'tobi_accent) "NONE") + (string-equal (item.feat syl 'tobi_endtone) "NONE")) + (set! count (+ count 1)) + (set! syl (item.prev syl))) + (cond + ;; run out of syllables before finding accent + ((null syl) + 0) + ((string-equal (item.feat syl 'tobi_accent) "H*") + count) + (t 0)))) + +(define (find_ll_right syl) +"(find_ll_right syl) +If the closest accent or boundary to the right is L-L% return how many syllables away it is. Returns 0 if nearest is not L-L%." 
+(let ((count 0)) + ;; if this syllable has a pitch event + (if (or (not (string-equal (item.feat syl 'tobi_accent) "NONE")) + (not (string-equal (item.feat syl 'tobi_endtone) "NONE"))) + 0) + (while (and syl + (string-equal (item.feat syl 'tobi_accent) "NONE") + (string-equal (item.feat syl 'tobi_endtone) "NONE")) + (set! count (+ count 1)) + (set! syl (item.next syl))) + (cond + ;; run out of syllables before finding boundary + ((null syl) + 0) + ((string-equal (item.feat syl 'tobi_endtone) "L-L%") + count) + (t 0)))) + +(define (l_spread syl) +"(l_spread syl) +Proportion of pitch lowering required due to L- spreading backwards." +(let ((l (find_hstar_left syl)) + (r (find_ll_right syl))) + (cond + ((or (eq l 0) + (eq r 0)) + 0) + (t + (/ r (- (+ l r) 1)))))) + + +;;; +;;; Debuging and other useful stuff. +;;; + + + +(define (apml_print_semstruct utt) +"(apml_print_semstruct utt) +Pretty print APML semantic structure." + (let ((i (utt.relation.first utt 'SemStructure))) + (while (not (null i)) + (apml_pss_item 0 i) + (apml_pss_daughters 1 (item.daughters i)) + (set! i (item.next i))))) + +(define (apml_pss_daughters depth list) + (mapcar + (lambda (x) + (apml_pss_item depth x) + (apml_pss_daughters (+ depth 1) (item.daughters x)) + ) + list)) + + +(define (apml_pss_item depth item) + (let ((c 0)) + (while (< c depth) + (format t " ") + (set! c (+ c 1))) + (format t "%s\n" (item.name item)))) + + +(define (apml_print_words utt) +"(apml_print_words utt) + Pretty print APML words with associated accents." 
+ (mapcar + (lambda (x) + (format t "%s (" (item.name x)) + (apml_pww_accent x) + (apml_pww_boundary x) + (apml_pww_pause x) + (format t ")\n")) + (utt.relation.items utt 'Word)) + t) + +(define (apml_pww_accent item) + (let ((p (item.relation.parent (item.relation.parent item 'Token) 'Emphasis))) + (if p (apml_ppw_list (item.features p))))) + +(define (apml_pww_boundary item) + (let ((p (item.relation.parent (item.relation.parent item 'Token) 'Boundary))) + (if p (apml_ppw_list (item.features p))))) + +(define (apml_pww_pause item) + (let ((p (item.relation.parent (item.relation.parent item 'Token) 'Pause))) + (if p (apml_ppw_list (item.features p))))) + +(define (apml_ppw_list l) + (mapcar + (lambda (x) + (format t " %s" x)) + (flatten l))) + + +(define (apml_print_sylstructure utt filename) +"(apml_print_sylstructure utt filename) +Pretty print APML syllable structure. Filename t for stdout" + (let (fd) + (if (not (eq? filename t)) + (set! fd (fopen filename "wb")) + (set! fd t)) + (mapcar + (lambda (x) + (format fd "%s\n" (item.name x)) + (apml_psyl fd x)) + (utt.relation.items utt 'Word)) + t)) + +(define (apml_psyl fd word) + (mapcar + (lambda (x) + (apml_psegs fd x) + (if (eq (item.feat x 'stress) 1) + (format fd " (1)")) + (if (item.relation.daughter1 x 'Intonation) + (begin + (let ((ie (item.relation.daughter1 x 'Intonation))) + (format fd " [") + (while ie + (format fd "%s" (item.name ie)) + (set! ie (item.next ie)) + (if ie (format t " "))) + (format fd "]")))) + (format fd "\n")) + (item.daughters (item.relation word 'SylStructure)))) + +(define (apml_psegs fd syl) + (let ((segs (item.daughters syl))) + (format fd " ") + (while segs + (format fd "%s" (item.name (car segs))) + (if (cdr segs) + (format fd ".")) + (set! segs (cdr segs))))) + + +(define (apml_get_lr_params) + (let ((m 0) + (s 0)) + (if (or (equal? (Parameter.get 'Int_Target_Method) Int_Targets_LR) + (equal? (Parameter.get 'Int_Target_Method) Int_Targets_5_LR)) + (begin + (set! 
m (car (cdr (car int_lr_params)))) + (set! s (car (cdr (car (cdr int_lr_params)))))) + (begin ; else branch: fall back to the APML default pitch values + (set! m apml_default_pitch_mean) + (set! s apml_default_pitch_standard_deviation))) + (list m s))) ; two-element result: m first, then s + + + + +(define (apml_initialise) + "(apml_initialise) +Set up the current voice for apml use." + (if (not (string-matches current-voice ".*multisyn.*")) ; nothing if multisyn + (cond + ((or (string-equal (Parameter.get 'Language) "americanenglish") + (string-equal (Parameter.get 'Language) "britishenglish")) + (begin + (format t "Initialising APML for English.\n") + ;; Phrasing. + (Parameter.set 'Phrase_Method 'cart_tree) + (set! phrase_cart_tree apml_phrase_tree) + ;; Pauses. + ;;(set! duration_cart_tree apml_kal_duration_cart_tree) + ;;(set! duration_ph_info apml_kal_durs) + ;;(Parameter.set 'Pause_Method Apml_Pauses) + ;; Lexicon. + ;;;; We now assume the lexicon you have already set is suitable, + ;;;; You probably want to ensure this is "apmlcmu" or "unilex" + ;;(if (not (member_string "apmlcmu" (lex.list))) + ;; (load (path-append lexdir "apmlcmu/apmlcmulex.scm"))) + ;;(lex.select "apmlcmu") + ;; Add other lex entries here: + ;;(lex.add.entry '("minerals" nil (((m ih n) 1) ((er) 0) ((ax l z) 0)))) + ;;(lex.add.entry '("fibre" nil (((f ay b) 1) ((er) 0)))) + ;;(lex.add.entry '("dont" v (((d ow n t) 1)))) + ;;(lex.add.entry '("pectoris" nil (((p eh k) 2) ((t ao r) 1) ((ih s) 0)))) + ;;(lex.add.entry '("sideeffects" nil (((s ay d) 1) ((ax f) 0) ((eh k t s) 2)))) + + ;; Intonation events. + (set! int_accent_cart_tree apml_accent_cart) + (set! int_tone_cart_tree apml_boundary_cart) + (Parameter.set 'Int_Method Intonation_Tree) + ;; Intonation f0 contour. + (set! f0_lr_start apml_f2b_f0_lr_start) + (set! f0_lr_left apml_f2b_f0_lr_left) + (set! f0_lr_mid apml_f2b_f0_lr_mid) + (set! f0_lr_right apml_f2b_f0_lr_right) + (set! f0_lr_end apml_f2b_f0_lr_end) + (set! 
int_lr_params + (list (list 'target_f0_mean (car (apml_get_lr_params))) ; evaluated before int_lr_params and Int_Target_Method are reassigned, so it reads the values in force on entry + (list 'target_f0_std (car (cdr (apml_get_lr_params)))) + (list 'model_f0_mean 170) + (list 'model_f0_std 40))) + (Parameter.set 'Int_Target_Method Int_Targets_5_LR) ; English installs all five f0 models (start/left/mid/right/end) + nil)) + ((string-equal (Parameter.get 'Language) "italian") + (begin + (format t "Initialising APML for Italian.\n") + ;; Phrasing. + (Parameter.set 'Phrase_Method 'cart_tree) + (set! phrase_cart_tree apml_phrase_tree) + ;; Intonation events. + (set! int_accent_cart_tree apml_accent_cart) + (set! int_tone_cart_tree apml_boundary_cart) + (Parameter.set 'Int_Method Intonation_Tree) + ;; Intonation f0 contour. + (set! f0_lr_start apml_f2b_f0_lr_start) + (set! f0_lr_mid apml_f2b_f0_lr_mid) + (set! f0_lr_end apml_f2b_f0_lr_end) + (set! int_lr_params + (list (list 'target_f0_mean (car (apml_get_lr_params))) + (list 'target_f0_std (car (cdr (apml_get_lr_params)))) + (list 'model_f0_mean 170) + (list 'model_f0_std 34))) + (Parameter.set 'Int_Target_Method Int_Targets_LR) ; Italian sets only start/mid/end; f0_lr_left and f0_lr_right are not assigned in this branch + nil)) + (t nil)))) ; any other language: nothing is configured + +(provide 'apml) diff --git a/CosyVoice-ttsfrd/resource/festival/apml_f2bf0lr.scm b/CosyVoice-ttsfrd/resource/festival/apml_f2bf0lr.scm new file mode 100644 index 0000000000000000000000000000000000000000..3d312a8ff8d7d20eba6b043d12cd3c880d7b66ee --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/apml_f2bf0lr.scm @@ -0,0 +1,530 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 2002 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Rob Clark +;;; Date: July 2002 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; APML.f0 trees. +;; +;; + +(set! 
apml_f2b_f0_lr_start +'( +( Intercept 163.9871 ) +( pp.lisp_apml_tgtype -3.1750 (1) ) +( p.lisp_apml_tgtype 5.0332 (1) ) +( lisp_apml_tgtype 0.0000 (1) ) +( n.lisp_apml_tgtype 17.7799 (1) ) +( nn.lisp_apml_tgtype 13.6845 (1) ) +( pp.lisp_apml_tgtype 0.0000 (2) ) +( p.lisp_apml_tgtype 0.0000 (2) ) +( lisp_apml_tgtype 0.0000 (2) ) +( n.lisp_apml_tgtype 0.0000 (2) ) +( nn.lisp_apml_tgtype 0.0000 (2) ) +( pp.lisp_apml_tgtype 0.0000 (3) ) +( p.lisp_apml_tgtype 0.0000 (3) ) +( lisp_apml_tgtype -9.7245 (3) ) +( n.lisp_apml_tgtype 0.0000 (3) ) +( nn.lisp_apml_tgtype -2.4009 (3) ) +( pp.lisp_apml_iecount 0.0000 ) +( p.lisp_apml_iecount -0.4484 ) +( lisp_apml_iecount 0.0000 ) +( n.lisp_apml_iecount -2.0165 ) +( nn.lisp_apml_iecount 0.0000 ) +( pp.tobi_accent 0.0000 (H*) ) +( p.tobi_accent 11.1239 (H*) ) +( tobi_accent 21.5164 (H*) ) +( n.tobi_accent -2.5990 (H*) ) +( nn.tobi_accent -6.5307 (H*) ) +( pp.tobi_accent 0.0000 (L*) ) +( p.tobi_accent -10.0000 (L*) ) +( tobi_accent -5.0000 (L*) ) +( n.tobi_accent -10.6798 (L*) ) +( nn.tobi_accent -5.6561 (L*) ) +( pp.tobi_accent 5.3577 (L*+H) ) +( p.tobi_accent 60.0000 (L*+H) ) +( tobi_accent -5.0000 (L*+H) ) +( n.tobi_accent 0.0000 (L*+H) ) +( nn.tobi_accent 0.0000 (L*+H) ) +( pp.tobi_accent 0.0000 (L+H*) ) +( p.tobi_accent 11.1200 (L+H*) ) +( tobi_accent 21.5200 (L+H*) ) +( n.tobi_accent -2.6000 (L+H*) ) +( nn.tobi_accent -6.5300 (L+H*) ) +( pp.tobi_endtone 0.0000 (L-L%) ) +( p.tobi_endtone -0.6164 (L-L%) ) +( tobi_endtone -50 (L-L%) ) +( n.tobi_endtone -10.8729 (L-L%) ) +( nn.tobi_endtone -7.6522 (L-L%) ) +( pp.tobi_endtone 0.7583 (L-H%) ) +( p.tobi_endtone 0.0000 (L-H%) ) +( tobi_endtone -20.0000 (L-H%) ) +( n.tobi_endtone -11.8935 (L-H%) ) +( nn.tobi_endtone -7.2012 (L-H%) ) +( pp.tobi_endtone 0.0000 (H-L%) ) +( p.tobi_endtone 0.0000 (H-L%) ) +( tobi_endtone 4.0790 (H-L%) ) +( n.tobi_endtone -19.3463 (H-L%) ) +( nn.tobi_endtone -29.3615 (H-L%) ) +( pp.tobi_endtone 0.0000 (H-H%) ) +( p.tobi_endtone 0.0000 (H-H%) ) +( 
tobi_endtone 0.0000 (H-H%) ) +( n.tobi_endtone 0.0000 (H-H%) ) +( nn.tobi_endtone 0.0000 (H-H%) ) +( pp.tobi_endtone 0.0000 (L-) ) +( p.tobi_endtone -15.1702 (L-) ) +( tobi_endtone 0.0000 (L-) ) +( n.tobi_endtone -14.5562 (L-) ) +( nn.tobi_endtone 0.0000 (L-) ) +( pp.tobi_endtone -13.5046 (H-) ) +( p.tobi_endtone 0.0000 (H-) ) +( tobi_endtone 6.3377 (H-) ) +( n.tobi_endtone -6.8631 (H-) ) +( nn.tobi_endtone 0.0000 (H-) ) +( p.tobi_accent 60.0000 (L+H*L-H%) ) +( tobi_accent -60.0000 (L+H*L-H%) ) +( n.tobi_accent 0.0000 (L+H*L-H%) ) +( pp.syl_break 0.0000 ) +( p.syl_break 0.0000 ) +( syl_break 0.6417 ) +( n.syl_break 1.3532 ) +( nn.syl_break 1.0724 ) +( pp.stress 0.0000 ) +( p.stress -0.6193 ) +( stress 2.4121 ) +( n.stress 0.0000 ) +( nn.stress 2.5478 ) +( syl_in -1.4373 ) +( syl_out 0.4181 ) +( ssyl_in 0.0000 ) +( ssyl_out 0.6125 ) +( asyl_in 0.0000 ) +( asyl_out 0.9906 ) +( last_accent 0.0000 ) +( next_accent -0.3700 ) +( sub_phrases 0.0000 ) +( lisp_l_spread -60.0000 ) +)) + +(set! apml_f2b_f0_lr_left +'( +( Intercept 162.1173 ) +( pp.lisp_apml_tgtype -1.5875 (1) ) +( p.lisp_apml_tgtype 4.8101 (1) ) +( lisp_apml_tgtype 12.8265 (1) ) +( n.lisp_apml_tgtype 16.3027 (1) ) +( nn.lisp_apml_tgtype 13.3225 (1) ) +( pp.lisp_apml_tgtype 0.0000 (2) ) +( p.lisp_apml_tgtype 1.7434 (2) ) +( lisp_apml_tgtype 6.7783 (2) ) +( n.lisp_apml_tgtype 0.6679 (2) ) +( nn.lisp_apml_tgtype 0.0000 (2) ) +( pp.lisp_apml_tgtype 1.6494 (3) ) +( p.lisp_apml_tgtype 1.2861 (3) ) +( lisp_apml_tgtype -2.0724 (3) ) +( n.lisp_apml_tgtype 0.0000 (3) ) +( nn.lisp_apml_tgtype -1.2004 (3) ) +( pp.lisp_apml_iecount 0.0000 ) +( p.lisp_apml_iecount -0.5857 ) +( lisp_apml_iecount 0.0000 ) +( n.lisp_apml_iecount -2.3543 ) +( nn.lisp_apml_iecount 0.0000 ) +( pp.tobi_accent 0.0000 (H*) ) +( p.tobi_accent 8.5867 (H*) ) +( tobi_accent 21.2169 (H*) ) +( n.tobi_accent -1.2995 (H*) ) +( nn.tobi_accent -6.5056 (H*) ) +( pp.tobi_accent 0.0000 (L*) ) +( p.tobi_accent -7.5000 (L*) ) +( tobi_accent -25.0000 (L*) ) +( 
n.tobi_accent -8.3939 (L*) ) +( nn.tobi_accent -4.5688 (L*) ) +( pp.tobi_accent 2.6789 (L*+H) ) +( p.tobi_accent 45.0000 (L*+H) ) +( tobi_accent -17.5000 (L*+H) ) +( n.tobi_accent -1.3600 (L*+H) ) +( nn.tobi_accent 0.0000 (L*+H) ) +( pp.tobi_accent 0.0000 (L+H*) ) +( p.tobi_accent 8.5850 (L+H*) ) +( tobi_accent 21.2200 (L+H*) ) +( n.tobi_accent -1.3000 (L+H*) ) +( nn.tobi_accent -6.5050 (L+H*) ) +( pp.tobi_endtone 1.8117 (L-L%) ) +( p.tobi_endtone -0.1681 (L-L%) ) +( tobi_endtone -70 (L-L%) ) +( n.tobi_endtone -8.9334 (L-L%) ) +( nn.tobi_endtone -8.4034 (L-L%) ) +( pp.tobi_endtone 1.2099 (L-H%) ) +( p.tobi_endtone 1.1220 (L-H%) ) +( tobi_endtone -10.0000 (L-H%) ) +( n.tobi_endtone -5.9467 (L-H%) ) +( nn.tobi_endtone -6.9072 (L-H%) ) +( pp.tobi_endtone 0.0000 (H-L%) ) +( p.tobi_endtone 0.0000 (H-L%) ) +( tobi_endtone 2.0395 (H-L%) ) +( n.tobi_endtone -12.3940 (H-L%) ) +( nn.tobi_endtone -24.2593 (H-L%) ) +( pp.tobi_endtone 0.0000 (H-H%) ) +( p.tobi_endtone 0.0000 (H-H%) ) +( tobi_endtone 0.0000 (H-H%) ) +( n.tobi_endtone 0.0000 (H-H%) ) +( nn.tobi_endtone 16.1076 (H-H%) ) +( pp.tobi_endtone -1.8913 (L-) ) +( p.tobi_endtone -15.5650 (L-) ) +( tobi_endtone -18.3620 (L-) ) +( n.tobi_endtone -9.8322 (L-) ) +( nn.tobi_endtone -1.8182 (L-) ) +( pp.tobi_endtone -13.4429 (H-) ) +( p.tobi_endtone 0.0000 (H-) ) +( tobi_endtone 1.9053 (H-) ) +( n.tobi_endtone -3.4315 (H-) ) +( nn.tobi_endtone 0.0000 (H-) ) +( p.tobi_accent 0.0000 (L+H*L-H%) ) +( tobi_accent 10.0000 (L+H*L-H%) ) +( n.tobi_accent 0.0000 (L+H*L-H%) ) +( pp.syl_break 0.3501 ) +( p.syl_break -0.8121 ) +( syl_break 0.3209 ) +( n.syl_break 0.7486 ) +( nn.syl_break 0.8182 ) +( pp.stress -0.9778 ) +( p.stress -0.3096 ) +( stress 2.7752 ) +( n.stress 0.9976 ) +( nn.stress 2.7343 ) +( syl_in -1.9845 ) +( syl_out 0.7142 ) +( ssyl_in 1.0376 ) +( ssyl_out 0.3062 ) +( asyl_in 0.0000 ) +( asyl_out 0.4953 ) +( last_accent 0.0000 ) +( next_accent 0.1084 ) +( sub_phrases 0.0000 ) +( lisp_l_spread -60.0000 ) +)) + +(set! 
apml_f2b_f0_lr_mid +'( +( Intercept 160.2474 ) +( pp.lisp_apml_tgtype 0.0000 (1) ) +( p.lisp_apml_tgtype 4.5869 (1) ) +( lisp_apml_tgtype 25.6530 (1) ) +( n.lisp_apml_tgtype 14.8255 (1) ) +( nn.lisp_apml_tgtype 12.9605 (1) ) +( pp.lisp_apml_tgtype 0.0000 (2) ) +( p.lisp_apml_tgtype 3.4867 (2) ) +( lisp_apml_tgtype 13.5566 (2) ) +( n.lisp_apml_tgtype 1.3359 (2) ) +( nn.lisp_apml_tgtype 0.0000 (2) ) +( pp.lisp_apml_tgtype 3.2989 (3) ) +( p.lisp_apml_tgtype 2.5723 (3) ) +( lisp_apml_tgtype 5.5798 (3) ) +( n.lisp_apml_tgtype 0.0000 (3) ) +( nn.lisp_apml_tgtype 0.0000 (3) ) +( pp.lisp_apml_iecount 0.0000 ) +( p.lisp_apml_iecount -0.7231 ) +( lisp_apml_iecount 0.0000 ) +( n.lisp_apml_iecount -2.6922 ) +( nn.lisp_apml_iecount 0.0000 ) +( pp.tobi_accent 0.0000 (H*) ) +( p.tobi_accent 6.0496 (H*) ) +( tobi_accent 20.9174 (H*) ) +( n.tobi_accent 0.0000 (H*) ) +( nn.tobi_accent -6.4804 (H*) ) +( pp.tobi_accent 0.0000 (L*) ) +( p.tobi_accent -5.0000 (L*) ) +( tobi_accent -45.0000 (L*) ) +( n.tobi_accent -6.1079 (L*) ) +( nn.tobi_accent -3.4815 (L*) ) +( pp.tobi_accent 0.0000 (L*+H) ) +( p.tobi_accent 30.0000 (L*+H) ) +( tobi_accent -30.0000 (L*+H) ) +( n.tobi_accent -2.7200 (L*+H) ) +( nn.tobi_accent 0.0000 (L*+H) ) +( pp.tobi_accent 0.0000 (L+H*) ) +( p.tobi_accent 6.0500 (L+H*) ) +( tobi_accent 20.9200 (L+H*) ) +( n.tobi_accent 0.0000 (L+H*) ) +( nn.tobi_accent -6.4800 (L+H*) ) +( pp.tobi_endtone 3.6235 (L-L%) ) +( p.tobi_endtone 0.2801 (L-L%) ) +( tobi_endtone -80 (L-L%) ) +( n.tobi_endtone -6.9938 (L-L%) ) +( nn.tobi_endtone -9.1546 (L-L%) ) +( pp.tobi_endtone 1.6616 (L-H%) ) +( p.tobi_endtone 2.2441 (L-H%) ) +( tobi_endtone 0.0000 (L-H%) ) +( n.tobi_endtone 0.0000 (L-H%) ) +( nn.tobi_endtone -6.6132 (L-H%) ) +( pp.tobi_endtone 0.0000 (H-L%) ) +( p.tobi_endtone 0.0000 (H-L%) ) +( tobi_endtone 0.0000 (H-L%) ) +( n.tobi_endtone -5.4416 (H-L%) ) +( nn.tobi_endtone -19.1570 (H-L%) ) +( pp.tobi_endtone 0.0000 (H-H%) ) +( p.tobi_endtone 0.0000 (H-H%) ) +( tobi_endtone 0.0000 
(H-H%) ) +( n.tobi_endtone 0.0000 (H-H%) ) +( nn.tobi_endtone 32.2151 (H-H%) ) +( pp.tobi_endtone -3.7825 (L-) ) +( p.tobi_endtone -15.9598 (L-) ) +( tobi_endtone -36.7241 (L-) ) +( n.tobi_endtone -5.1082 (L-) ) +( nn.tobi_endtone -3.6363 (L-) ) +( pp.tobi_endtone -13.3813 (H-) ) +( p.tobi_endtone 0.0000 (H-) ) +( tobi_endtone -2.5270 (H-) ) +( n.tobi_endtone 0.0000 (H-) ) +( nn.tobi_endtone 0.0000 (H-) ) +( p.tobi_accent 0.0000 (L+H*L-H%) ) +( tobi_accent 40.0000 (L+H*L-H%) ) +( n.tobi_accent 0.0000 (L+H*L-H%) ) +( pp.syl_break 0.7003 ) +( p.syl_break -1.6241 ) +( syl_break 0.0000 ) +( n.syl_break 0.1439 ) +( nn.syl_break 0.5640 ) +( pp.stress -1.9556 ) +( p.stress 0.0000 ) +( stress 3.1383 ) +( n.stress 1.9952 ) +( nn.stress 2.9208 ) +( syl_in -2.5317 ) +( syl_out 1.0103 ) +( ssyl_in 2.0751 ) +( ssyl_out 0.0000 ) +( asyl_in 0.0000 ) +( asyl_out 0.0000 ) +( last_accent 0.0000 ) +( next_accent 0.5869 ) +( sub_phrases 0.0000 ) +( lisp_l_spread -60.0000 ) +)) + +(set! apml_f2b_f0_lr_right +'( +( Intercept 162.6687 ) +( pp.lisp_apml_tgtype -4.0459 (1) ) +( p.lisp_apml_tgtype 3.0601 (1) ) +( lisp_apml_tgtype 27.8166 (1) ) +( n.lisp_apml_tgtype 7.4127 (1) ) +( nn.lisp_apml_tgtype 11.3458 (1) ) +( pp.lisp_apml_tgtype -3.8091 (2) ) +( p.lisp_apml_tgtype 1.7434 (2) ) +( lisp_apml_tgtype 17.1672 (2) ) +( n.lisp_apml_tgtype 0.6679 (2) ) +( nn.lisp_apml_tgtype 0.0000 (2) ) +( pp.lisp_apml_tgtype 1.6494 (3) ) +( p.lisp_apml_tgtype 1.2861 (3) ) +( lisp_apml_tgtype 9.5674 (3) ) +( n.lisp_apml_tgtype -3.1085 (3) ) +( nn.lisp_apml_tgtype 0.0000 (3) ) +( pp.lisp_apml_iecount 0.0000 ) +( p.lisp_apml_iecount -0.7829 ) +( lisp_apml_iecount -0.5447 ) +( n.lisp_apml_iecount -1.3461 ) +( nn.lisp_apml_iecount -0.7178 ) +( pp.tobi_accent 0.7904 (H*) ) +( p.tobi_accent 3.0248 (H*) ) +( tobi_accent 14.1116 (H*) ) +( n.tobi_accent 0.0000 (H*) ) +( nn.tobi_accent -3.2402 (H*) ) +( pp.tobi_accent 0.0000 (L*) ) +( p.tobi_accent -2.5000 (L*) ) +( tobi_accent -32.5000 (L*) ) +( n.tobi_accent 
-3.0539 (L*) ) +( nn.tobi_accent -1.7408 (L*) ) +( pp.tobi_accent 0.0000 (L*+H) ) +( p.tobi_accent 17.5000 (L*+H) ) +( tobi_accent -9.0000 (L*+H) ) +( n.tobi_accent -2.8025 (L*+H) ) +( nn.tobi_accent -0.5455 (L*+H) ) +( pp.tobi_accent 0.7900 (L+H*) ) +( p.tobi_accent 3.0250 (L+H*) ) +( tobi_accent 14.1150 (L+H*) ) +( n.tobi_accent 0.0000 (L+H*) ) +( nn.tobi_accent -3.2400 (L+H*) ) +( pp.tobi_endtone 5.7534 (L-L%) ) +( p.tobi_endtone 0.1401 (L-L%) ) +( tobi_endtone -65 (L-L%) ) +( n.tobi_endtone -11.1795 (L-L%) ) +( nn.tobi_endtone -7.8158 (L-L%) ) +( pp.tobi_endtone 4.4276 (L-H%) ) +( p.tobi_endtone 1.1220 (L-H%) ) +( tobi_endtone 20.0000 (L-H%) ) +( n.tobi_endtone -6.8995 (L-H%) ) +( nn.tobi_endtone -6.1219 (L-H%) ) +( pp.tobi_endtone 2.4327 (H-L%) ) +( p.tobi_endtone 0.0000 (H-L%) ) +( tobi_endtone -7.5781 (H-L%) ) +( n.tobi_endtone -2.7208 (H-L%) ) +( nn.tobi_endtone -14.4838 (H-L%) ) +( pp.tobi_endtone 0.0000 (H-H%) ) +( p.tobi_endtone 0.0000 (H-H%) ) +( tobi_endtone 0.0000 (H-H%) ) +( n.tobi_endtone 0.0000 (H-H%) ) +( nn.tobi_endtone 16.1076 (H-H%) ) +( pp.tobi_endtone -1.8913 (L-) ) +( p.tobi_endtone -15.5651 (L-) ) +( tobi_endtone -40.2021 (L-) ) +( n.tobi_endtone -2.5541 (L-) ) +( nn.tobi_endtone -2.2224 (L-) ) +( pp.tobi_endtone -6.6906 (H-) ) +( p.tobi_endtone -3.5483 (H-) ) +( tobi_endtone -1.2635 (H-) ) +( n.tobi_endtone 0.0000 (H-) ) +( nn.tobi_endtone 0.0000 (H-) ) +( p.tobi_accent 0.0000 (L+H*L-H%) ) +( tobi_accent -40.0000 (L+H*L-H%) ) +( n.tobi_accent 0.0000 (L+H*L-H%) ) +( pp.syl_break 0.3501 ) +( p.syl_break -1.0003 ) +( syl_break -1.5536 ) +( n.syl_break 0.0720 ) +( nn.syl_break 0.5989 ) +( pp.stress -0.9778 ) +( p.stress -0.8046 ) +( stress 1.2124 ) +( n.stress 3.9715 ) +( nn.stress 2.3914 ) +( syl_in -2.3468 ) +( syl_out 0.9792 ) +( ssyl_in 2.0463 ) +( ssyl_out 0.0000 ) +( asyl_in -0.1460 ) +( asyl_out 0.0000 ) +( last_accent -1.0992 ) +( next_accent 0.2935 ) +( sub_phrases 0.0000 ) +( lisp_l_spread -60.0000 ) +)) + +(set! 
apml_f2b_f0_lr_end +'( +( Intercept 165.0901 ) +( pp.lisp_apml_tgtype -8.0918 (1) ) +( p.lisp_apml_tgtype 1.5332 (1) ) +( lisp_apml_tgtype 29.9802 (1) ) +( n.lisp_apml_tgtype 0.0000 (1) ) +( nn.lisp_apml_tgtype 9.7312 (1) ) +( pp.lisp_apml_tgtype -7.6181 (2) ) +( p.lisp_apml_tgtype 0.0000 (2) ) +( lisp_apml_tgtype 20.7778 (2) ) +( n.lisp_apml_tgtype 0.0000 (2) ) +( nn.lisp_apml_tgtype 0.0000 (2) ) +( pp.lisp_apml_tgtype 0.0000 (3) ) +( p.lisp_apml_tgtype 0.0000 (3) ) +( lisp_apml_tgtype 13.5550 (3) ) +( n.lisp_apml_tgtype -6.2170 (3) ) +( nn.lisp_apml_tgtype 0.0000 (3) ) +( pp.lisp_apml_iecount 0.0000 ) +( p.lisp_apml_iecount -0.8428 ) +( lisp_apml_iecount -1.0894 ) +( n.lisp_apml_iecount 0.0000 ) +( nn.lisp_apml_iecount -1.4355 ) +( pp.tobi_accent 1.5807 (H*) ) +( p.tobi_accent 0.0000 (H*) ) +( tobi_accent 7.3057 (H*) ) +( n.tobi_accent 0.0000 (H*) ) +( nn.tobi_accent 0.0000 (H*) ) +( pp.tobi_accent 0.0000 (L*) ) +( p.tobi_accent 0.0000 (L*) ) +( tobi_accent -20.0000 (L*) ) +( n.tobi_accent 0.0000 (L*) ) +( nn.tobi_accent 0.0000 (L*) ) +( pp.tobi_accent 0.0000 (L*+H) ) +( p.tobi_accent 5.0000 (L*+H) ) +( tobi_accent 12.0000 (L*+H) ) +( n.tobi_accent -2.8850 (L*+H) ) +( nn.tobi_accent -1.0910 (L*+H) ) +( pp.tobi_accent 1.5800 (L+H*) ) +( p.tobi_accent 0.0000 (L+H*) ) +( tobi_accent 7.3100 (L+H*) ) +( n.tobi_accent 0.0000 (L+H*) ) +( nn.tobi_accent 0.0000 (L+H*) ) +( pp.tobi_endtone 7.8833 (L-L%) ) +( p.tobi_endtone 0.0000 (L-L%) ) +( tobi_endtone -80 (L-L%) ) +( n.tobi_endtone -35 (L-L%) ) +( nn.tobi_endtone -6.4769 (L-L%) ) +( pp.tobi_endtone 7.1936 (L-H%) ) +( p.tobi_endtone 0.0000 (L-H%) ) +( tobi_endtone 40.0000 (L-H%) ) +( n.tobi_endtone -13.7990 (L-H%) ) +( nn.tobi_endtone -5.6305 (L-H%) ) +( pp.tobi_endtone 4.8654 (H-L%) ) +( p.tobi_endtone 0.0000 (H-L%) ) +( tobi_endtone -15.1561 (H-L%) ) +( n.tobi_endtone 0.0000 (H-L%) ) +( nn.tobi_endtone -9.8107 (H-L%) ) +( pp.tobi_endtone 0.0000 (H-H%) ) +( p.tobi_endtone 0.0000 (H-H%) ) +( tobi_endtone 0.0000 (H-H%) ) 
+( n.tobi_endtone 0.0000 (H-H%) ) +( nn.tobi_endtone 0.0000 (H-H%) ) +( pp.tobi_endtone 0.0000 (L-) ) +( p.tobi_endtone -15.1705 (L-) ) +( tobi_endtone -43.6801 (L-) ) +( n.tobi_endtone 0.0000 (L-) ) +( nn.tobi_endtone -0.8085 (L-) ) +( pp.tobi_endtone 0.0000 (H-) ) +( p.tobi_endtone -7.0967 (H-) ) +( tobi_endtone 0.0000 (H-) ) +( n.tobi_endtone 0.0000 (H-) ) +( nn.tobi_endtone 0.0000 (H-) ) +( p.tobi_accent 0.0000 (L+H*L-H%) ) +( tobi_accent 60.0000 (L+H*L-H%) ) +( n.tobi_accent -60.0000 (L+H*L-H%) ) +( pp.syl_break 0.0000 ) +( p.syl_break -0.3765 ) +( syl_break -3.1072 ) +( n.syl_break 0.0000 ) +( nn.syl_break 0.6338 ) +( pp.stress 0.0000 ) +( p.stress -1.6093 ) +( stress -0.7136 ) +( n.stress 5.9479 ) +( nn.stress 1.8619 ) +( syl_in -2.1619 ) +( syl_out 0.9481 ) +( ssyl_in 2.0175 ) +( ssyl_out 0.0000 ) +( asyl_in -0.2919 ) +( asyl_out 0.0000 ) +( last_accent -2.1984 ) +( next_accent 0.0000 ) +( sub_phrases 0.0000 ) +( lisp_l_spread -60.0000 ) +)) + diff --git a/CosyVoice-ttsfrd/resource/festival/apml_kaldurtreeZ.scm b/CosyVoice-ttsfrd/resource/festival/apml_kaldurtreeZ.scm new file mode 100644 index 0000000000000000000000000000000000000000..5a3d44e3e80ad6998d8506f825186ba1f8c15d2d --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/apml_kaldurtreeZ.scm @@ -0,0 +1,996 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. 
The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; A tree to predict z-score durations built from f2b +;;; doesn't use actual phonemes so it can have better generalizations +;;; +;;; Basically copied from ked +;;; + +(set! 
apml_kal_durs +'( + (uh 0.067 0.025) + (hh 0.061 0.028) + (ao 0.138 0.046) + (hv 0.053 0.020) + (v 0.051 0.019) + (ih 0.058 0.023) + (el 0.111 0.043) + (ey 0.132 0.042) + (em 0.080 0.033) + (jh 0.094 0.024) + (w 0.054 0.023) + (uw 0.107 0.044) + (ae 0.120 0.036) + (en 0.117 0.056) + (k 0.089 0.034) + (y 0.048 0.025) + (axr 0.147 0.035) +; (l 0.056 0.026) + (l 0.066 0.026) + (ng 0.064 0.024) + (zh 0.071 0.030) + (z 0.079 0.034) + (brth 0.246 0.046) + (m 0.069 0.028) + (iy 0.097 0.041) + (n 0.059 0.025) + (ah 0.087 0.031) + (er 0.086 0.010) + (b 0.069 0.024) + (pau 0.200 0.1) + (aw 0.166 0.053) + (p 0.088 0.030) + (ch 0.115 0.025) + (ow 0.134 0.039) + (dh 0.031 0.016) + (nx 0.049 0.100) + (d 0.048 0.021) + (ax 0.046 0.024) + (h# 0.060 0.083) + (r 0.053 0.031) + (eh 0.095 0.036) + (ay 0.137 0.047) + (oy 0.183 0.050) + (f 0.095 0.033) + (sh 0.108 0.031) + (s 0.102 0.037) + (g 0.064 0.021) + (dx 0.031 0.016) + (th 0.093 0.050) + (aa 0.094 0.037) + (t 0.070 0.020) +) +) + +(set! apml_kal_duration_cart_tree +' +((name is pau) + ((emph_sil is +) + ((0.0 -0.5)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 0.2) + ((0.0 0.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 0.4) + ((0.0 2.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 0.6) + ((0.0 4.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 0.8) + ((0.0 6.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 1.0) + ((0.0 8.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 1.5) + ((0.0 13.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 2.0) + ((0.0 18.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 2.5) + ((0.0 23.0)) + ((p.R:SylStructure.parent.parent.lisp_apml_pause = 3.0) + ((0.0 28.0)) + ((p.R:SylStructure.parent.parent.pbreak is BB) + ((0.0 2.0)) + ((0.0 0.0))))))))))))) + ((R:SylStructure.parent.accented is 0) + ((n.ph_ctype is 0) + ((p.ph_vlng is 0) + ((R:SylStructure.parent.syl_codasize < 1.5) + ((p.ph_ctype is n) + ((ph_ctype is f) + ((0.559208 
-0.783163)) + ((1.05215 -0.222704))) + ((ph_ctype is s) + ((R:SylStructure.parent.syl_break is 2) + ((0.589948 0.764459)) + ((R:SylStructure.parent.asyl_in < 0.7) + ((1.06385 0.567944)) + ((0.691943 0.0530272)))) + ((ph_vlng is l) + ((pp.ph_vfront is 1) + ((1.06991 0.766486)) + ((R:SylStructure.parent.syl_break is 1) + ((0.69665 0.279248)) + ((0.670353 0.0567774)))) + ((p.ph_ctype is s) + ((seg_onsetcoda is coda) + ((0.828638 -0.038356)) + ((ph_ctype is f) + ((0.7631 -0.545853)) + ((0.49329 -0.765994)))) + ((R:SylStructure.parent.parent.gpos is det) + ((R:SylStructure.parent.last_accent < 0.3) + ((R:SylStructure.parent.sub_phrases < 1) + ((0.811686 0.160195)) + ((0.799015 0.713958))) + ((0.731599 -0.215472))) + ((ph_ctype is r) + ((0.673487 0.092772)) + ((R:SylStructure.parent.asyl_in < 1) + ((0.745273 0.00132813)) + ((0.75457 -0.334898))))))))) + ((pos_in_syl < 0.5) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.902446 -0.041618)) + ((R:SylStructure.parent.sub_phrases < 2.3) + ((0.900629 0.262952)) + ((1.18474 0.594794)))) + ((seg_onset_stop is 0) + ((R:SylStructure.parent.position_type is mid) + ((0.512323 -0.760444)) + ((R:SylStructure.parent.syl_out < 6.8) + ((pp.ph_vlng is a) + ((0.640575 -0.450449)) + ((ph_ctype is f) + ((R:SylStructure.parent.sub_phrases < 1.3) + ((0.862876 -0.296956)) + ((R:SylStructure.parent.syl_out < 2.4) + ((0.803215 0.0422868)) + ((0.877856 -0.154465)))) + ((R:SylStructure.parent.syl_out < 3.6) + ((R:SylStructure.parent.syl_out < 1.2) + ((0.567081 -0.264199)) + ((0.598043 -0.541738))) + ((0.676843 -0.166623))))) + ((0.691678 -0.57173)))) + ((R:SylStructure.parent.parent.gpos is cc) + ((1.15995 0.313289)) + ((pp.ph_vfront is 1) + ((0.555993 0.0695819)) + ((R:SylStructure.parent.asyl_in < 1.2) + ((R:SylStructure.parent.sub_phrases < 2.7) + ((0.721635 -0.367088)) + ((0.71919 -0.194887))) + ((0.547052 -0.0637491))))))) + ((ph_ctype is s) + 
((R:SylStructure.parent.syl_break is 0) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((0.650007 -0.333421)) + ((0.846301 -0.165383))) + ((0.527756 -0.516332))) + ((R:SylStructure.parent.syl_break is 0) + ((p.ph_ctype is s) + ((0.504414 -0.779112)) + ((0.812498 -0.337611))) + ((pos_in_syl < 1.4) + ((0.513041 -0.745807)) + ((p.ph_ctype is s) + ((0.350582 -1.04907)) + ((0.362 -0.914974)))))))) + ((R:SylStructure.parent.syl_break is 0) + ((ph_ctype is n) + ((R:SylStructure.parent.position_type is initial) + ((pos_in_syl < 1.2) + ((0.580485 0.172658)) + ((0.630973 -0.101423))) + ((0.577937 -0.360092))) + ((R:SylStructure.parent.syl_out < 2.9) + ((R:SylStructure.parent.syl_out < 1.1) + ((R:SylStructure.parent.position_type is initial) + ((0.896092 0.764189)) + ((R:SylStructure.parent.sub_phrases < 3.6) + ((ph_ctype is s) + ((0.877362 0.555132)) + ((0.604511 0.369882))) + ((0.799982 0.666966)))) + ((seg_onsetcoda is coda) + ((p.ph_vlng is a) + ((R:SylStructure.parent.last_accent < 0.4) + ((0.800736 0.240634)) + ((0.720606 0.486176))) + ((1.18173 0.573811))) + ((0.607147 0.194468)))) + ((ph_ctype is r) + ((0.88377 0.499383)) + ((R:SylStructure.parent.last_accent < 0.5) + ((R:SylStructure.parent.position_type is initial) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((0.62798 0.0737318)) + ((0.787334 0.331014))) + ((ph_ctype is s) + ((0.808368 0.0929299)) + ((0.527948 -0.0443271)))) + ((seg_coda_fric is 0) + ((p.ph_vlng is a) + ((0.679745 0.517681)) + ((R:SylStructure.parent.sub_phrases < 1.1) + ((0.759979 0.128316)) + ((0.775233 0.361383)))) + ((R:SylStructure.parent.last_accent < 1.3) + ((0.696255 0.054136)) + ((0.632425 0.246742)))))))) + ((pos_in_syl < 0.3) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.847602 0.621547)) + ((ph_ctype is s) + ((0.880645 0.501679)) + ((R:SylStructure.parent.sub_phrases < 3.3) + ((R:SylStructure.parent.sub_phrases < 0.3) + ((0.901014 -0.042049)) + ((0.657493 0.183226))) + ((0.680126 0.284799))))) + 
((ph_ctype is s) + ((p.ph_vlng is s) + ((0.670033 -0.820934)) + ((0.863306 -0.348735))) + ((ph_ctype is n) + ((R:SylStructure.parent.asyl_in < 1.2) + ((0.656966 -0.40092)) + ((0.530966 -0.639366))) + ((seg_coda_fric is 0) + ((1.04153 0.364857)) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.syl_out < 3.4) + ((0.81503 -0.00768613)) + ((0.602665 -0.197753))) + ((0.601844 -0.394632))))))))) + ((n.ph_ctype is f) + ((pos_in_syl < 1.5) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((pos_in_syl < 0.1) + ((1.63863 0.938841)) + ((R:SylStructure.parent.position_type is initial) + ((0.897722 -0.0796637)) + ((nn.ph_vheight is 0) + ((0.781081 0.480026)) + ((0.779711 0.127175))))) + ((ph_ctype is r) + ((p.ph_ctype is s) + ((0.581329 -0.708767)) + ((0.564366 -0.236212))) + ((ph_vlng is a) + ((p.ph_ctype is r) + ((0.70992 -0.273389)) + ((R:SylStructure.parent.parent.gpos is in) + ((0.764696 0.0581338)) + ((nn.ph_vheight is 0) + ((0.977737 0.721904)) + ((R:SylStructure.parent.sub_phrases < 2.2) + ((pp.ph_vfront is 0) + ((0.586708 0.0161206)) + ((0.619949 0.227372))) + ((0.707285 0.445569)))))) + ((ph_ctype is n) + ((R:SylStructure.parent.syl_break is 1) + ((nn.ph_vfront is 2) + ((0.430295 -0.120097)) + ((0.741371 0.219042))) + ((0.587492 0.321245))) + ((p.ph_ctype is n) + ((0.871586 0.134075)) + ((p.ph_ctype is r) + ((0.490751 -0.466418)) + ((R:SylStructure.parent.syl_codasize < 1.3) + ((R:SylStructure.parent.sub_phrases < 2.2) + ((p.ph_ctype is s) + ((0.407452 -0.425925)) + ((0.644771 -0.542809))) + ((0.688772 -0.201899))) + ((ph_vheight is 1) + ((nn.ph_vheight is 0) + ((0.692018 0.209018)) + ((0.751345 -0.178136))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((R:SylStructure.parent.asyl_in < 1.5) + ((0.599633 -0.235593)) + ((0.60042 0.126118))) + ((p.ph_vlng is a) + ((0.7148 -0.174812)) + ((R:SylStructure.parent.parent.gpos is content) + ((0.761296 -0.231509)) + ((0.813081 -0.536405))))))))))))) + ((ph_ctype is n) + ((0.898844 0.163343)) + 
((p.ph_vlng is s) + ((seg_coda_fric is 0) + ((0.752921 -0.45528)) + ((0.890079 -0.0998025))) + ((ph_ctype is f) + ((0.729376 -0.930547)) + ((ph_ctype is s) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((0.745052 -0.634119)) + ((0.521502 -0.760176))) + ((R:SylStructure.parent.syl_break is 1) + ((0.766575 -0.121355)) + ((0.795616 -0.557509)))))))) + ((p.ph_vlng is 0) + ((p.ph_ctype is r) + ((ph_vlng is 0) + ((0.733659 -0.402734)) + ((R:SylStructure.parent.sub_phrases < 1.5) + ((ph_vlng is s) + ((0.326176 -0.988478)) + ((n.ph_ctype is s) + ((0.276471 -0.802536)) + ((0.438283 -0.900628)))) + ((nn.ph_vheight is 0) + ((ph_vheight is 2) + ((0.521 -0.768992)) + ((0.615436 -0.574918))) + ((ph_vheight is 1) + ((0.387376 -0.756359)) + ((pos_in_syl < 0.3) + ((0.417235 -0.808937)) + ((0.384043 -0.93315))))))) + ((ph_vlng is a) + ((ph_ctype is 0) + ((n.ph_ctype is s) + ((p.ph_ctype is f) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.415908 -0.428493)) + ((pos_in_syl < 0.1) + ((0.790441 0.0211071)) + ((0.452465 -0.254485)))) + ((p.ph_ctype is s) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.582447 -0.389966)) + ((0.757648 0.185781))) + ((R:SylStructure.parent.sub_phrases < 1.4) + ((0.628965 0.422551)) + ((0.713613 0.145576))))) + ((seg_onset_stop is 0) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((pp.ph_vfront is 1) + ((0.412363 -0.62319)) + ((R:SylStructure.parent.syl_out < 3.6) + ((0.729259 -0.317324)) + ((0.441633 -0.591051)))) + ((R:SylStructure.parent.syl_break is 1) + ((R:SylStructure.parent.sub_phrases < 2.7) + ((0.457728 -0.405607)) + ((0.532411 -0.313148))) + ((R:SylStructure.parent.last_accent < 0.3) + ((1.14175 0.159416)) + ((0.616396 -0.254651))))) + ((R:SylStructure.parent.position_type is initial) + ((0.264181 -0.799896)) + ((0.439801 -0.551309))))) + ((R:SylStructure.parent.position_type is final) + ((0.552027 -0.707084)) + ((0.585661 -0.901874)))) + ((ph_ctype is s) + ((pos_in_syl < 1.2) + 
((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((pp.ph_vfront is 1) + ((0.607449 0.196466)) + ((0.599662 0.00382414))) + ((0.64109 -0.12859))) + ((pp.ph_vfront is 1) + ((0.720484 -0.219339)) + ((0.688707 -0.516734)))) + ((ph_vlng is s) + ((n.ph_ctype is s) + ((R:SylStructure.parent.parent.gpos is content) + ((R:SylStructure.parent.position_type is single) + ((0.659206 0.159445)) + ((R:SylStructure.parent.parent.word_numsyls < 3.5) + ((R:SylStructure.parent.sub_phrases < 2) + ((0.447186 -0.419103)) + ((0.631822 -0.0928561))) + ((0.451623 -0.576116)))) + ((ph_vheight is 3) + ((0.578626 -0.64583)) + ((0.56636 -0.4665)))) + ((R:SylStructure.parent.parent.gpos is in) + ((0.771516 -0.217292)) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.688571 -0.304382)) + ((R:SylStructure.parent.parent.gpos is content) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((n.ph_ctype is n) + ((0.556085 -0.572203)) + ((0.820173 -0.240338))) + ((R:SylStructure.parent.parent.word_numsyls < 2.2) + ((0.595398 -0.588171)) + ((0.524737 -0.95797)))) + ((R:SylStructure.parent.sub_phrases < 3.9) + ((0.371492 -0.959427)) + ((0.440479 -0.845747))))))) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((p.ph_ctype is f) + ((0.524088 -0.482247)) + ((nn.ph_vheight is 1) + ((0.587666 -0.632362)) + ((ph_vlng is l) + ((R:SylStructure.parent.position_type is final) + ((0.513286 -0.713117)) + ((0.604613 -0.924308))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((0.577997 -0.891342)) + ((0.659804 -1.15252)))))) + ((pp.ph_vlng is s) + ((ph_ctype is f) + ((0.813383 -0.599624)) + ((0.984027 -0.0771909))) + ((p.ph_ctype is f) + ((R:SylStructure.parent.parent.gpos is in) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((0.313572 -1.03242)) + ((0.525854 -0.542799))) + ((R:SylStructure.parent.syl_out < 2.8) + ((0.613007 -0.423979)) + ((0.570258 -0.766379)))) + ((R:SylStructure.parent.syl_break is 1) + ((R:SylStructure.parent.parent.gpos is to) + ((0.364585 
-0.792895)) + ((ph_vlng is l) + ((0.69143 -0.276816)) + ((0.65673 -0.523721)))) + ((R:SylStructure.parent.syl_out < 3.6) + ((R:SylStructure.parent.position_type is initial) + ((0.682096 -0.488102)) + ((0.406364 -0.731758))) + ((0.584694 -0.822229))))))))))) + ((n.ph_ctype is r) + ((R:SylStructure.parent.position_type is initial) + ((p.ph_vlng is a) + ((0.797058 1.02334)) + ((ph_ctype is s) + ((1.0548 0.536277)) + ((0.817253 0.138201)))) + ((R:SylStructure.parent.sub_phrases < 1.1) + ((R:SylStructure.parent.syl_out < 3.3) + ((0.884574 -0.23471)) + ((0.772063 -0.525292))) + ((nn.ph_vfront is 1) + ((1.25254 0.417485)) + ((0.955557 -0.0781996))))) + ((pp.ph_vfront is 0) + ((ph_ctype is f) + ((n.ph_ctype is s) + ((R:SylStructure.parent.parent.gpos is content) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((0.583506 -0.56941)) + ((0.525949 -0.289362))) + ((0.749316 -0.0921038))) + ((p.ph_vlng is s) + ((0.734234 0.139463)) + ((0.680119 -0.0708717)))) + ((ph_vlng is s) + ((ph_vheight is 1) + ((0.908712 -0.618971)) + ((0.55344 -0.840495))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 1.2) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.838715 0.00913392)) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((ph_vheight is 2) + ((0.555513 -0.512523)) + ((R:SylStructure.parent.position_type is initial) + ((0.758711 0.121704)) + ((0.737555 -0.25637)))) + ((R:SylStructure.parent.syl_out < 3.1) + ((n.ph_ctype is s) + ((0.611756 -0.474522)) + ((1.05437 -0.247206))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((R:SylStructure.parent.position_type is final) + ((0.567761 -0.597866)) + ((0.785599 -0.407765))) + ((0.575598 -0.741256)))))) + ((ph_ctype is s) + ((n.ph_ctype is s) + ((0.661069 -1.08426)) + ((0.783184 -0.39789))) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((R:SylStructure.parent.sub_phrases < 2.6) + ((0.511323 -0.666011)) + ((0.691878 -0.499492))) + ((ph_ctype is r) + ((0.482131 -0.253186)) + 
((0.852955 -0.372832)))))) + ((0.854447 -0.0936489))))) + ((R:SylStructure.parent.position_type is final) + ((0.685939 -0.249982)) + ((R:SylStructure.parent.syl_out < 3.2) + ((0.989843 0.18086)) + ((0.686805 -0.0402908))))))))) + ((R:SylStructure.parent.syl_out < 2.4) + ((R:SylStructure.parent.syl_out < 0.2) + ((seg_onsetcoda is coda) + ((ph_ctype is s) + ((R:SylStructure.parent.syl_break is 4) + ((pp.ph_vlng is 0) + ((0.959737 1.63203)) + ((1.20714 0.994933))) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.syl_break is 2) + ((0.864809 0.214457)) + ((0.874278 0.730381))) + ((pp.ph_vfront is 0) + ((seg_coda_fric is 0) + ((1.20844 -0.336221)) + ((1.01357 0.468302))) + ((0.658106 -0.799121))))) + ((n.ph_ctype is f) + ((ph_ctype is f) + ((1.26332 0.0300613)) + ((ph_vlng is d) + ((1.02719 1.1649)) + ((ph_ctype is 0) + ((R:SylStructure.parent.asyl_in < 1.2) + ((1.14048 2.2668)) + ((ph_vheight is 1) + ((1.15528 1.50375)) + ((1.42406 2.07927)))) + ((R:SylStructure.parent.sub_phrases < 1.1) + ((0.955892 1.10243)) + ((R:SylStructure.parent.syl_break is 2) + ((1.32682 1.8432)) + ((1.27582 1.59853))))))) + ((n.ph_ctype is 0) + ((ph_ctype is n) + ((R:SylStructure.parent.syl_break is 2) + ((1.45399 1.12927)) + ((1.05543 0.442376))) + ((R:SylStructure.parent.syl_break is 4) + ((R:SylStructure.parent.position_type is final) + ((ph_ctype is f) + ((1.46434 1.76508)) + ((0.978055 0.7486))) + ((1.2395 2.30826))) + ((ph_ctype is 0) + ((0.935325 1.69917)) + ((nn.ph_vfront is 1) + ((1.20456 1.31128)) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((nn.ph_vheight is 0) + ((1.16907 0.212421)) + ((0.952091 0.653094))) + ((p.ph_ctype is 0) + ((1.05502 1.25802)) + ((0.818731 0.777568)))))))) + ((ph_ctype is f) + ((p.ph_ctype is 0) + ((1.03918 0.163941)) + ((0.737545 -0.167063))) + ((R:SylStructure.parent.position_type is final) + ((n.ph_ctype is n) + ((R:SylStructure.parent.last_accent < 0.5) + ((R:SylStructure.parent.sub_phrases < 2.8) + ((0.826207 -0.000859005)) + ((0.871119 
0.273433))) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((1.17405 1.05694)) + ((0.858394 0.244916)))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((p.ph_ctype is 0) + ((1.14092 1.21187)) + ((R:SylStructure.parent.syl_break is 2) + ((1.02653 0.59865)) + ((0.94248 1.1634)))) + ((seg_coda_fric is 0) + ((1.07441 0.292935)) + ((1.15736 0.92574))))) + ((ph_vlng is s) + ((R:SylStructure.parent.syl_break is 2) + ((1.34638 1.23484)) + ((0.951514 2.02008))) + ((ph_ctype is 0) + ((p.ph_ctype is r) + ((0.806106 0.697089)) + ((R:SylStructure.parent.syl_break is 2) + ((1.10891 0.992197)) + ((1.04657 1.51093)))) + ((1.18165 0.520952))))))))) + ((p.ph_vlng is 0) + ((pos_in_syl < 0.7) + ((R:SylStructure.parent.position_type is final) + ((ph_ctype is r) + ((0.966357 0.185827)) + ((ph_ctype is s) + ((0.647163 0.0332298)) + ((0.692972 -0.534917)))) + ((ph_ctype is s) + ((0.881521 0.575107)) + ((p.ph_ctype is f) + ((0.8223 -0.111275)) + ((R:SylStructure.parent.last_accent < 0.3) + ((0.969188 0.09447)) + ((0.894438 0.381947)))))) + ((p.ph_ctype is f) + ((0.479748 -0.490108)) + ((0.813125 -0.201268)))) + ((ph_ctype is s) + ((0.908566 1.20397)) + ((R:SylStructure.parent.last_accent < 1.2) + ((0.88078 0.636568)) + ((0.978087 1.07763)))))) + ((pos_in_syl < 1.3) + ((R:SylStructure.parent.syl_break is 0) + ((pos_in_syl < 0.1) + ((R:SylStructure.parent.position_type is initial) + ((p.ph_ctype is n) + ((0.801651 -0.0163359)) + ((ph_ctype is s) + ((n.ph_ctype is r) + ((0.893307 1.07253)) + ((p.ph_vlng is 0) + ((0.92651 0.525806)) + ((0.652444 0.952792)))) + ((p.ph_vlng is 0) + ((seg_onsetcoda is coda) + ((0.820151 0.469117)) + ((p.ph_ctype is f) + ((0.747972 -0.0716448)) + ((ph_ctype is f) + ((0.770882 0.457137)) + ((0.840905 0.102492))))) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.667824 0.697337)) + ((0.737967 0.375114)))))) + ((ph_vheight is 1) + ((0.624353 0.410671)) + ((R:SylStructure.parent.asyl_in < 0.8) + ((0.647905 -0.331055)) + ((p.ph_ctype is s) + ((0.629039 
-0.240616)) + ((0.749277 -0.0191273)))))) + ((ph_vheight is 3) + ((p.ph_ctype is s) + ((0.626922 0.556537)) + ((0.789357 0.153892))) + ((seg_onsetcoda is coda) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.parent.word_numsyls < 3.4) + ((0.744714 0.123242)) + ((0.742039 0.295753))) + ((seg_coda_fric is 0) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((ph_vheight is 1) + ((0.549715 -0.341018)) + ((0.573641 -0.00893114))) + ((nn.ph_vfront is 2) + ((0.67099 -0.744625)) + ((0.664438 -0.302803)))) + ((p.ph_vlng is 0) + ((0.630028 0.113815)) + ((0.632794 -0.128733))))) + ((ph_ctype is r) + ((0.367169 -0.854509)) + ((0.94334 -0.216179)))))) + ((n.ph_ctype is f) + ((ph_vlng is 0) + ((1.3089 0.46195)) + ((R:SylStructure.parent.syl_codasize < 1.3) + ((1.07673 0.657169)) + ((pp.ph_vlng is 0) + ((0.972319 1.08222)) + ((1.00038 1.46257))))) + ((p.ph_vlng is l) + ((1.03617 0.785204)) + ((p.ph_vlng is a) + ((R:SylStructure.parent.position_type is final) + ((1.00681 0.321168)) + ((0.928115 0.950834))) + ((ph_vlng is 0) + ((pos_in_syl < 0.1) + ((R:SylStructure.parent.position_type is final) + ((0.863682 -0.167374)) + ((nn.ph_vheight is 0) + ((p.ph_ctype is f) + ((0.773591 -0.00374425)) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.951802 0.228448)) + ((1.02282 0.504252)))) + ((1.09721 0.736476)))) + ((R:SylStructure.parent.position_type is final) + ((1.04302 0.0590974)) + ((0.589208 -0.431535)))) + ((n.ph_ctype is 0) + ((1.27879 1.00642)) + ((ph_vlng is s) + ((R:SylStructure.parent.asyl_in < 1.4) + ((0.935787 0.481652)) + ((0.9887 0.749861))) + ((R:SylStructure.parent.syl_out < 1.1) + ((R:SylStructure.parent.position_type is final) + ((0.921307 0.0696307)) + ((0.83675 0.552212))) + ((0.810076 -0.0479225)))))))))) + ((ph_ctype is s) + ((n.ph_ctype is s) + ((0.706959 -1.0609)) + ((p.ph_ctype is n) + ((0.850614 -0.59933)) + ((n.ph_ctype is r) + ((0.665947 0.00698725)) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.position_type is initial) + ((0.762889 -0.0649044)) + 
((0.723956 -0.248899))) + ((R:SylStructure.parent.sub_phrases < 1.4) + ((0.632957 -0.601987)) + ((0.889114 -0.302401))))))) + ((ph_ctype is f) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.865267 0.164636)) + ((0.581827 -0.0989051))) + ((nn.ph_vfront is 2) + ((0.684459 -0.316836)) + ((0.778854 -0.0961191)))) + ((R:SylStructure.parent.syl_out < 1.1) + ((p.ph_ctype is s) + ((0.837964 -0.429437)) + ((0.875304 -0.0652743))) + ((0.611071 -0.635089)))) + ((p.ph_ctype is r) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.762012 0.0139361)) + ((0.567983 -0.454845))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((ph_ctype is l) + ((1.18845 0.809091)) + ((R:SylStructure.parent.position_type is initial) + ((ph_ctype is n) + ((0.773548 -0.277092)) + ((1.01586 0.281001))) + ((p.ph_ctype is 0) + ((1.06831 0.699145)) + ((0.924189 0.241873))))) + ((R:SylStructure.parent.syl_break is 0) + ((ph_ctype is n) + ((0.592321 -0.470784)) + ((0.778688 -0.072112))) + ((n.ph_ctype is s) + ((1.08848 0.0733489)) + ((1.25674 0.608371)))))))))) + ((pos_in_syl < 0.7) + ((p.ph_vlng is 0) + ((R:SylStructure.parent.position_type is mid) + ((ph_ctype is 0) + ((ph_vheight is 2) + ((0.456225 -0.293282)) + ((0.561529 -0.0816115))) + ((0.6537 -0.504024))) + ((ph_ctype is s) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((1.31586 0.98395)) + ((R:SylStructure.parent.position_type is single) + ((0.816869 0.634789)) + ((R:SylStructure.parent.syl_out < 4.4) + ((1.05578 0.479029)) + ((R:SylStructure.parent.asyl_in < 0.4) + ((1.11813 0.143214)) + ((0.87178 0.406834)))))) + ((n.ph_ctype is n) + ((R:SylStructure.parent.last_accent < 0.6) + ((0.838154 -0.415599)) + ((0.924024 0.110288))) + ((seg_onsetcoda is coda) + ((nn.ph_vfront is 2) + ((0.670096 0.0314187)) + ((n.ph_ctype is f) + ((1.00363 0.693893)) + ((R:SylStructure.parent.syl_out < 6) + ((0.772363 0.215675)) + ((0.920313 0.574068))))) + 
((R:SylStructure.parent.position_type is final) + ((0.673837 -0.458142)) + ((R:SylStructure.parent.sub_phrases < 2.8) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.894817 0.304628)) + ((ph_ctype is n) + ((0.787302 -0.23094)) + ((R:SylStructure.parent.asyl_in < 1.2) + ((ph_ctype is f) + ((R:SylStructure.parent.last_accent < 0.5) + ((1.12278 0.326954)) + ((0.802236 -0.100616))) + ((0.791255 -0.0919132))) + ((0.95233 0.219053))))) + ((R:SylStructure.parent.position_type is initial) + ((ph_ctype is f) + ((1.0616 0.216118)) + ((0.703216 -0.00834086))) + ((ph_ctype is f) + ((1.22277 0.761763)) + ((0.904811 0.332721)))))))))) + ((ph_vheight is 0) + ((p.ph_vlng is s) + ((0.873379 0.217178)) + ((n.ph_ctype is r) + ((0.723915 1.29451)) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((R:SylStructure.parent.sub_phrases < 4) + ((seg_coda_fric is 0) + ((p.ph_vlng is l) + ((0.849154 0.945261)) + ((0.633261 0.687498))) + ((0.728546 0.403076))) + ((0.850962 1.00255))) + ((0.957999 1.09113))) + ((0.85771 0.209045))))) + ((ph_vheight is 2) + ((0.803401 -0.0544067)) + ((0.681353 0.256045))))) + ((n.ph_ctype is f) + ((ph_ctype is s) + ((p.ph_vlng is 0) + ((0.479307 -0.9673)) + ((0.700477 -0.351397))) + ((ph_ctype is f) + ((0.73467 -0.6233)) + ((R:SylStructure.parent.syl_break is 0) + ((p.ph_ctype is s) + ((0.56282 0.266234)) + ((p.ph_ctype is r) + ((0.446203 -0.302281)) + ((R:SylStructure.parent.sub_phrases < 2.7) + ((ph_ctype is 0) + ((0.572016 -0.0102436)) + ((0.497358 -0.274514))) + ((0.545477 0.0482177))))) + ((ph_vlng is s) + ((0.805269 0.888495)) + ((ph_ctype is n) + ((0.869854 0.653018)) + ((R:SylStructure.parent.sub_phrases < 2.2) + ((0.735031 0.0612886)) + ((0.771859 0.346637)))))))) + ((R:SylStructure.parent.syl_codasize < 1.4) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((R:SylStructure.parent.position_type is initial) + ((0.743458 0.0411808)) + ((1.13068 0.613305))) + ((pos_in_syl < 1.2) + 
((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((1.11481 0.175467)) + ((0.937893 -0.276407))) + ((0.74264 -0.550878)))) + ((pos_in_syl < 3.4) + ((seg_onsetcoda is coda) + ((ph_ctype is r) + ((n.ph_ctype is s) + ((0.714319 -0.240328)) + ((p.ph_ctype is 0) + ((0.976987 0.330352)) + ((1.1781 -0.0816682)))) + ((ph_ctype is l) + ((n.ph_ctype is 0) + ((1.39137 0.383533)) + ((0.725585 -0.324515))) + ((ph_vheight is 3) + ((ph_vlng is d) + ((0.802626 -0.62487)) + ((n.ph_ctype is r) + ((0.661091 -0.513869)) + ((R:SylStructure.parent.position_type is initial) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((0.482285 0.207874)) + ((0.401601 -0.0204711))) + ((0.733755 0.397372))))) + ((n.ph_ctype is r) + ((p.ph_ctype is 0) + ((pos_in_syl < 1.2) + ((0.666325 0.271734)) + ((nn.ph_vheight is 0) + ((0.642401 -0.261466)) + ((0.783684 -0.00956571)))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.692225 -0.381895)) + ((0.741921 -0.0898767)))) + ((nn.ph_vfront is 2) + ((ph_ctype is s) + ((0.697527 -1.12626)) + ((n.ph_ctype is s) + ((ph_vlng is 0) + ((R:SylStructure.parent.sub_phrases < 2.4) + ((0.498719 -0.906926)) + ((0.635342 -0.625651))) + ((0.45886 -0.385089))) + ((0.848596 -0.359702)))) + ((p.ph_vlng is a) + ((p.ph_ctype is 0) + ((0.947278 0.216904)) + ((0.637933 -0.394349))) + ((p.ph_ctype is r) + ((R:SylStructure.parent.syl_break is 0) + ((0.529903 -0.860573)) + ((0.581378 -0.510488))) + ((ph_vlng is 0) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((seg_onset_stop is 0) + ((R:SylStructure.parent.syl_break is 0) + ((p.ph_vlng is d) + ((0.768363 0.0108428)) + ((ph_ctype is s) + ((0.835756 -0.035054)) + ((ph_ctype is f) + ((p.ph_vlng is s) + ((0.602016 -0.179727)) + ((0.640126 -0.297341))) + ((0.674628 -0.542602))))) + ((ph_ctype is s) + ((0.662261 -0.60496)) + ((0.662088 -0.432058)))) + ((R:SylStructure.parent.syl_out < 4.4) + ((0.582448 -0.389079)) + ((ph_ctype is s) + ((0.60413 -0.73564)) + ((0.567153 -0.605444))))) + 
((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.761115 -0.827377)) + ((ph_ctype is n) + ((0.855183 -0.275338)) + ((R:SylStructure.parent.syl_break is 0) + ((0.788288 -0.802801)) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((0.686134 -0.371234)) + ((0.840184 -0.772883))))))) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.syl_break is 0) + ((n.ph_ctype is n) + ((0.423592 -0.655006)) + ((R:SylStructure.parent.syl_out < 4.4) + ((0.595269 -0.303751)) + ((0.478433 -0.456882)))) + ((0.688133 -0.133182))) + ((seg_onset_stop is 0) + ((1.27464 0.114442)) + ((0.406837 -0.167545)))))))))))) + ((ph_ctype is r) + ((0.462874 -0.87695)) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.645442 -0.640572)) + ((0.673717 -0.321322))))) + ((0.61008 -0.925472)))))))) +;; RMSE 0.8085 Correlation is 0.5899 Mean (abs) Error 0.6024 (0.5393) + + +)) + +(provide 'apml_kaldurtreeZ) diff --git a/CosyVoice-ttsfrd/resource/festival/cart_aux.scm b/CosyVoice-ttsfrd/resource/festival/cart_aux.scm new file mode 100644 index 0000000000000000000000000000000000000000..b641a3aad7dadcb6e0b04e7bc8b96bd0d625d274 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/cart_aux.scm @@ -0,0 +1,200 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996-2011 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. 
Any modifications must be clearly marked as such. ;;
+;;; 3. Original authors' names are not deleted. ;;
+;;; 4. The authors' names are not used to endorse or promote products ;;
+;;; derived from this software without specific prior written ;;
+;;; permission. ;;
+;;; ;;
+;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;;
+;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;;
+;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;;
+;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;;
+;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;;
+;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;;
+;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;;
+;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;;
+;;; THIS SOFTWARE. ;;
+;;; ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;; Some functions for manipulating decision trees
+;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (cart_prune_tree_thresh tree threshold default)
+"(cart_prune_tree_thresh TREE THRESHOLD DEFAULT)
+Prune the classification tree TREE so that all leaf nodes whose best
+class has a probability less than THRESHOLD are changed to return
+DEFAULT instead. This may be used when different mistakes carry
+different penalties, so that some control over the default prediction
+is needed."
+  (cond
+   ((cdr tree) ;; a question node: keep the question, prune both subtrees
+    (list
+     (car tree)
+     (cart_prune_tree_thresh (car (cdr tree)) threshold default)
+     (cart_prune_tree_thresh (car (cdr (cdr tree))) threshold default)))
+   ((< (cart_class_probability (car tree)) threshold)
+    (list (list (list threshold default) default))) ;; NOTE(review): entry is (THRESHOLD DEFAULT) while cart_class_probability reads (class prob) -- confirm order
+   (t ;; leave as is
+    tree)))
+
+(define (cart_class_probability class)
+  "(cart_class_probability CLASS)
+Returns the probability of the best class in the cart leaf node CLASS.
+If CLASS simply has a value and no probabilities, the probability
+is assumed to be 1.0."
+  (let ((val 0.0))
+    (set! val (assoc (car (last class)) class)) ;; entry for the best class (last element of the leaf)
+    (if val
+        (car (cdr val))
+        1.0)))
+
+(define (cart_class_prune_merge tree)
+  "(cart_class_prune_merge tree)
+Prune all sub trees which are pure. That is they all predict the
+same class. This can happen when some other pruning technique
+has modified a sub-tree, now making it pure."
+  (let ((pure (cart_tree_pure tree)))
+    (cond
+     (pure pure) ;; whole subtree collapses to the leaf returned by cart_tree_pure
+     ((cdr tree);; a question
+      (list
+       (car tree)
+       (cart_class_prune_merge (car (cdr tree)))
+       (cart_class_prune_merge (car (cdr (cdr tree))))))
+     (t;; a leaf, leave as is
+      tree))))
+
+(define (cart_tree_pure tree)
+  "(cart_tree_pure tree)
+Returns a leaf node if all leaves in this tree are judged to predict
+the same class, and nil otherwise."
+  (cond
+   ((cdr tree)
+    (let ((left (cart_tree_pure (car (cdr tree))))
+          (right (cart_tree_pure (car (cdr (cdr tree))))))
+      (cond
+       ((not left) nil)
+       ((not right) nil)
+       ((equal? (car (last left)) (car (last right))) ;; NOTE(review): left/right are whole leaf nodes, so this compares complete leaf contents -- confirm intended
+        left)
+       (t
+        nil))))
+   (t ;; it's a leaf, so of course it's pure
+    tree)))
+
+(define (cart_simplify_tree tree map)
+  "(cart_simplify_tree TREE MAP)
+Simplify a CART tree by reducing probability density functions to
+simple single classifications (no probabilities). This removes valuable
+information from the tree but makes them smaller, easier to read by humans
+and faster to read by machines. Also the classes may be mapped by the assoc
+list MAP. The bright ones amongst you will note this could be
+better and merge 'is' operators into 'in' operators in some situations
+especially if you are ignoring actual probability distributions."
+  (cond
+   ((cdr tree)
+    (list
+     (car tree)
+     (cart_simplify_tree (car (cdr tree)) map)
+     (cart_simplify_tree (car (cdr (cdr tree))) map)))
+   (t
+    (let ((class (car (last (car tree))))) ;; best class is the last element of the leaf
+      (if (assoc class map)
+          (list (cdr (assoc class map)))
+          (list (last (car tree))))))))
+
+(define (cart_simplify_tree2 tree)
+  "(cart_simplify_tree2 TREE)
+Simplify a CART tree by reducing probability density functions to
+only non-zero probabilities."
+  (cond
+   ((cdr tree)
+    (list
+     (car tree)
+     (cart_simplify_tree2 (car (cdr tree)))
+     (cart_simplify_tree2 (car (cdr (cdr tree))))))
+   (t
+    (list
+     (cart_remove_zero_probs (car tree))))))
+
+(define (cart_remove_zero_probs pdf)
+  "(cart_remove_zero_probs pdf)
+Removes zero probability classes in pdf; the last element of the list
+is the best class (as in a cart leaf node) and is always kept."
+  (cond
+   ((null (cdr pdf)) pdf) ;; only the final element is left: stop
+   ((equal? 0 (car (cdr (car pdf))))
+    (cart_remove_zero_probs (cdr pdf)))
+   (t
+    (cons
+     (car pdf)
+     (cart_remove_zero_probs (cdr pdf))))))
+
+(define (cart_interpret_debug i tree)
+  "(cart_interpret_debug i tree)
+For comparing output between different implementations (flite vs festival).
+Prints each question, the feature value of item i and the branch taken; returns the leaf reached."
+  (cond
+   ((cdr tree) ;; question
+    (format t "%s %s %s\n" (car (car tree)) (upcase (cadr (car tree)))
+            (car (cddr (car tree))))
+    (let ((a (item.feat i (car (car tree))))) ;; modified: was (set! a ...), which assigned a global
+      (format t "%s\n" a)
+      (cond
+       ((string-equal "is" (cadr (car tree)))
+        (if (string-equal a (car (cddr (car tree))))
+            (begin
+              (format t " YES\n")
+              (cart_interpret_debug i (car (cdr tree))))
+            (begin
+              (format t " NO\n")
+              (cart_interpret_debug i (car (cddr tree))))))
+       ((string-equal "<" (cadr (car tree)))
+        (if (< (parse-number a) (parse-number (car (cddr (car tree)))))
+            (begin
+              (format t " YES\n")
+              (cart_interpret_debug i (car (cdr tree))))
+            (begin
+              (format t " NO\n")
+              (cart_interpret_debug i (car (cddr tree))))))
+       (t
+        (format t "unknown q type %l\n" (car tree))))))
+   (t ;; leaf
+    (car tree)
+    )))
+
+;;;
+;;; Prediction tree for OLS trees
+;;; applies OLS coefficients from appropriate leaf of tree
+;;;
+(define (ols_tree_predict i tree)
+  ;; Sums the intercept and one (feature value * coefficient) term per remaining leaf entry
+  (let ((p (wagon i tree)))
+    (apply +
+     (cons
+      (cadr (car (car p))) ;; Intercept
+      (mapcar
+       (lambda (fp)
+         ;; get feature value and multiply by coefficient
+         (* (parse-number (item.feat i (car fp))) (cadr fp)))
+       (cdr (car p)))))))
+
+(provide 'cart_aux)
+
diff --git a/CosyVoice-ttsfrd/resource/festival/clunits.scm b/CosyVoice-ttsfrd/resource/festival/clunits.scm
new file mode 100644
index 0000000000000000000000000000000000000000..9ad181ad2894225abb35bf82c2cb52602200d7b2
--- /dev/null
+++ b/CosyVoice-ttsfrd/resource/festival/clunits.scm
@@ -0,0 +1,287 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; ;;
+;;; Carnegie Mellon University and ;;
+;;; Centre for Speech Technology Research ;;
+;;; University of Edinburgh, UK ;;
+;;; Copyright (c) 1998-2001 ;;
+;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH, CARNEGIE MELLON UNIVERSITY AND THE ;; +;;; CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO ;; +;;; THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY ;; +;;; AND FITNESS, IN NO EVENT SHALL THE UNIVERSITY OF EDINBURGH, CARNEGIE ;; +;;; MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, ;; +;;; INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ;; +;;; RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION ;; +;;; OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;; +;;; OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
;;
+;;; ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;; Cluster Unit selection support (Black and Taylor Eurospeech '97)
+;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;; Run-time support, selection and synthesis and some debugging functions
+;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(require_module 'clunits)
+
+(defvar cluster_synth_pre_hooks nil)
+(defvar cluster_synth_post_hooks nil)
+
+(defvar clunits_time time) ;; some old voices might use this
+
+(defSynthType Cluster ;; pre-hooks, select units, fetch units, join them, post-hooks
+  (apply_hooks cluster_synth_pre_hooks utt)
+  (Clunits_Select utt)
+  (Clunits_Get_Units utt)
+  (Clunits_Join_Units utt)
+  (apply_hooks cluster_synth_post_hooks utt)
+  utt
+)
+
+(define (Clunits_Join_Units utt)
+  "(Clunits_Join_Units utt)
+Join the selected and fetched units into a waveform, using the join_method looked up in clunits_params."
+  (let ((join_method (get_param 'join_method clunits_params 'simple)))
+    ;; Choice of function to put them together; any unlisted method uses Clunits_Simple_Wave
+    (cond
+     ((string-equal join_method 'windowed)
+      (Clunits_Windowed_Wave utt)
+      (clunits::fix_segs_durs utt))
+     ((string-equal join_method 'smoothedjoin)
+      (Clunits_SmoothedJoin_Wave utt)
+      (clunits::fix_segs_durs utt))
+     ((string-equal join_method 'none)
+      t)
+     ((string-equal join_method 'modified_lpc)
+      (defvar UniSyn_module_hooks nil)
+      (Param.def "unisyn.window_name" "hanning")
+      (Param.def "unisyn.window_factor" 1.0)
+      (Parameter.def 'us_sigpr 'lpc)
+      (mapcar
+       (lambda (u s) ;; walks the Unit and Segment relations in parallel
+         (item.set_feat s "source_end" (item.feat u "end")))
+       (utt.relation.items utt 'Unit)
+       (utt.relation.items utt 'Segment))
+      (us_unit_concat utt)
+      (if (not (member 'f0 (utt.relationnames utt)))
+          (targets_to_f0 utt))
+      (if (utt.relation.last utt 'Segment)
+          (set! pm_end (+ (item.feat (utt.relation.last utt 'Segment) "end")
+                          0.02))
+          (set! pm_end 0.02)) ;; NOTE(review): pm_end is not bound in this function, so set! assigns an outer variable
+      (us_f0_to_pitchmarks utt 'f0 'TargetCoef pm_end)
+      (us_mapping utt 'segment_single)
+      (us_generate_wave utt (Parameter.get 'us_sigpr)
+                        'analysis_period))
+     ((string-equal join_method 'smoothed_lpc)
+; (format t "smoothed_lpc\n")
+      (defvar UniSyn_module_hooks nil)
+      (Param.def "unisyn.window_name" "hanning")
+      (Param.def "unisyn.window_factor" 1.0)
+      (Parameter.def 'us_sigpr 'lpc)
+      (mapcar
+       (lambda (u s)
+         (item.set_feat s "source_end" (item.feat u "end"))
+         (item.set_feat s "unit_duration"
+                        (- (item.feat u "seg_end") (item.feat u "seg_start")))
+         )
+       (utt.relation.items utt 'Unit)
+       (utt.relation.items utt 'Segment))
+      (us_unit_concat utt)
+      (mapcar
+       (lambda (u s) ;; copies num_frames from each unit to its segment after concatenation
+         (item.set_feat s "num_frames" (item.feat u "num_frames")))
+       (utt.relation.items utt 'Unit)
+       (utt.relation.items utt 'Segment))
+      (if (not (member 'f0 (utt.relationnames utt)))
+          (targets_to_f0 utt))
+      (if (utt.relation.last utt 'Segment)
+          (set! pm_end (+ (item.feat (utt.relation.last utt 'Segment) "end")
+                          0.02))
+          (set! pm_end 0.02)) ;; NOTE(review): same outer pm_end as in the modified_lpc branch
+      (us_f0_to_pitchmarks utt 'f0 'TargetCoef pm_end)
+      (cl_mapping utt clunits_params)
+      (us_generate_wave utt (Parameter.get 'us_sigpr)
+                        'analysis_period))
+     (t
+      (Clunits_Simple_Wave utt)))
+    utt
+    )
+)
+
+(define (clunits::units_selected utt filename)
+  "(clunits::units_selected utt filename)
+Output, for each unit in the given utterance, its end time, name, fileid and unit_start/middle/end.
+Results saved in given file name, or stdout if filename is \"-\"."
+  (let ((fd (if (string-equal filename "-")
+                t
+                (fopen filename "w")))
+        (end 0) ;; running end time: sum of unit durations so far
+        (sample_rate
+         (cadr (assoc 'sample_rate (wave.info (utt.wave utt))))))
+    (format fd "#\n")
+    (mapcar
+     (lambda (s)
+       (let ((dur (/ (- (item.feat s "samp_end")
+                        (item.feat s "samp_start"))
+                     sample_rate))
+             (start (/ (item.feat s "samp_start") sample_rate))) ;; NOTE(review): start is not used below
+         (set! end (+ end dur))
+         (format fd "%f 125 %s ; %s %10s %f %f %f\n"
+                 end
+                 (string-before (item.name s) "_")
+                 (item.name s)
+                 (item.feat s "fileid")
+                 (item.feat s "unit_start")
+                 (item.feat s "unit_middle")
+                 (item.feat s "unit_end"))
+         ))
+     (utt.relation.items utt 'Unit))
+    (if (not (string-equal filename "-")) ;; only close what was opened with fopen
+        (fclose fd))
+    t))
+
+(define (clunits::units_segs utt filename)
+  "(clunits::units_segs utt filename)
+Saves the unit selections (alone) for display."
+  (let ((fd (if (string-equal filename "-")
+                t
+                (fopen filename "w")))
+        (end 0) ;; running end time: sum of unit durations so far
+        (sample_rate
+         (cadr (assoc 'sample_rate (wave.info (utt.wave utt))))))
+    (format fd "#\n")
+    (mapcar
+     (lambda (s)
+       (let ((dur (/ (- (item.feat s "samp_end")
+                        (item.feat s "samp_start"))
+                     sample_rate))
+             (start (/ (item.feat s "samp_start") sample_rate))) ;; NOTE(review): start is not used below
+         (set! end (+ end dur))
+         (format fd "%f 125 %s \n"
+                 end
+                 (string-before (item.name s) "_")
+; (item.name s)
+                 )
+         ))
+     (utt.relation.items utt 'Unit))
+    (if (not (string-equal filename "-")) ;; only close what was opened with fopen
+        (fclose fd))
+    t))
+
+(define (clunits::fix_segs_durs utt)
+  "(clunits::fix_segs_durs utt)
+Takes the actual unit times and places them back on the segs."
+  (let ((end 0) ;; running end time: sum of unit durations so far
+        (sample_rate
+         (cadr (assoc 'sample_rate (wave.info (utt.wave utt))))))
+    (mapcar
+     (lambda (u s)
+       (let ((dur (/ (- (item.feat u "samp_end")
+                        (item.feat u "samp_start"))
+                     sample_rate))
+             (seg_start (/ (- (item.feat u "samp_seg_start")
+                              (item.feat u "samp_start"))
+                           sample_rate)))
+         (if (item.prev s) ;; "p.end" is presumably the previous segment's end -- it is shifted by seg_start
+             (item.set_feat (item.prev s) "end"
+                            (+ (item.feat s "p.end") seg_start)))
+         (set! end (+ end dur))
+         (item.set_feat s "end" end)))
+     (utt.relation.items utt 'Unit)
+     (utt.relation.items utt 'Segment)
+     )
+    utt))
+
+(define (clunits::display utt)
+  "(clunits::display utt)
+Display utterance with emulabel. Note this saves files in
+scratch/wav/ and scratch/lab/."
+  (let ((id "cl01"))
+    (utt.save.wave utt (format nil "scratch/wav/%s.wav" id))
+    (utt.save.segs utt (format nil "scratch/lab/%s.lab" id))
+    (system "cd scratch; emulabel ../etc/emu_lab cl01 &") ;; NOTE(review): "cl01" is hard-coded here rather than taken from id
+    t))
+
+; (define (clunits::debug_resynth_units utt)
+; "(clunits::debug_resynth_units utt)
+; Check each of the units in utt against the related label
+; files and re-synth with any given new boundaries. Note this is
+; will only work if the segment still overlaps with its original and
+; also note that with a rebuild of the clunits db a complete different
+; set of units may be selected for this utterance."
+; (let ()
+; (mapcar
+; (lambda (unit)
+; (clunits::check_unit_boundaries unit))
+; (utt.relation.items utt 'Unit))
+; ;; This can't be done like this ...
+; (Clunits_Get_Units utt) ;; get unit signal/track stuff
+; (Clunits_Join_Units utt) ;; make a complete waveform
+; (apply_hooks cluster_synth_post_hooks utt)
+; utt)
+; )
+
+(define (clunits::join_parameters utt)
+  "(clunits::join_parameters utt)
+Join selected parameters (rather than the signal), used in F0 and
+Articulatory selection."
+  (let ((params nil)
+        (num_channels 0)
+        (num_frames 0 ))
+
+    (mapcar
+     (lambda (unit) ;; first pass: total the frames; num_channels ends up as the last unit's value
+       (set! num_frames
+             (+ num_frames
+                (track.num_frames (item.feat unit "coefs"))))
+       (set! num_channels (track.num_channels (item.feat unit "coefs")))
+       (format t "coounting %d %d\n" num_frames num_channels)
+       )
+     (utt.relation.items utt 'Unit))
+
+    (set! params (track.resize nil 0 num_channels))
+
+    (mapcar
+     (lambda (unit) ;; second pass: insert each unit's "coefs" track at the current end of params
+       (set! frames 0) ;; NOTE(review): frames is not bound or read in this function
+       (format t "inserting \n")
+       (format t "%l %l %l %l %l\n"
+               params (track.num_frames params)
+               (item.feat unit "coefs") 0
+               (track.num_frames (item.feat unit "coefs")))
+       (track.insert
+        params (track.num_frames params)
+        (item.feat unit "coefs") 0
+        (track.num_frames (item.feat unit "coefs")))
+       )
+     (utt.relation.items utt 'Unit))
+
+    (utt.relation.create utt "AllCoefs")
+    (set! coefs_item (utt.relation.append utt "AllCoefs")) ;; NOTE(review): coefs_item is not bound in this function, so set! assigns an outer variable
+    (item.set_feat coefs_item "name" "AllCoefs")
+    (item.set_feat coefs_item "AllCoefs" params)
+
+    utt
+))
+
+
+(provide 'clunits)
diff --git a/CosyVoice-ttsfrd/resource/festival/clunits_build.scm b/CosyVoice-ttsfrd/resource/festival/clunits_build.scm
new file mode 100644
index 0000000000000000000000000000000000000000..39c13219ec78f80cc9c685d03cf92fdff1f8e7b2
--- /dev/null
+++ b/CosyVoice-ttsfrd/resource/festival/clunits_build.scm
@@ -0,0 +1,479 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; ;;
+;;; Carnegie Mellon University and ;;
+;;; Centre for Speech Technology Research ;;
+;;; University of Edinburgh, UK ;;
+;;; Copyright (c) 1998-2005 ;;
+;;; All Rights Reserved. ;;
+;;; ;;
+;;; Permission is hereby granted, free of charge, to use and distribute ;;
+;;; this software and its documentation without restriction, including ;;
+;;; without limitation the rights to use, copy, modify, merge, publish, ;;
+;;; distribute, sublicense, and/or sell copies of this work, and to ;;
+;;; permit persons to whom this work is furnished to do so, subject to ;;
+;;; the following conditions: ;;
+;;; 1. The code must retain the above copyright notice, this list of ;;
+;;; conditions and the following disclaimer. ;;
+;;; 2. Any modifications must be clearly marked as such. ;;
+;;; 3. Original authors' names are not deleted. ;;
+;;; 4. The authors' names are not used to endorse or promote products ;;
+;;; derived from this software without specific prior written ;;
+;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH, CARNEGIE MELLON UNIVERSITY AND THE ;; +;;; CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO ;; +;;; THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY ;; +;;; AND FITNESS, IN NO EVENT SHALL THE UNIVERSITY OF EDINBURGH, CARNEGIE ;; +;;; MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, ;; +;;; INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ;; +;;; RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION ;; +;;; OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;; +;;; OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Cluster Unit selection support (Black and Taylor Eurospeech '97) +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; clunits build support +;;; +;;; There are five stages to this +;;; Load in all utterances +;;; Load in their coefficients +;;; Collect together the units of the same type +;;; build distance tables from them +;;; dump features for them +;;; + +(require_module 'clunits) ;; C++ modules support +(require 'clunits) ;; run time scheme support + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (do_all) + (let () + + (format t "Loading utterances and sorting types\n") + (set! utterances (acost:db_utts_load clunits_params)) + (set! 
unittypes (acost:find_same_types utterances clunits_params))
+    (acost:name_units unittypes)
+
+    (format t "Dumping features for clustering\n")
+    (acost:dump_features unittypes utterances clunits_params)
+
+    (format t "Loading coefficients\n")
+    (acost:utts_load_coeffs utterances)
+    ;; If you are short of diskspace try this
+    (acost:disttabs_and_clusters unittypes clunits_params)
+
+    ;; or if you have lots of diskspace try
+;    (format t "Building distance tables\n")
+;    (acost:build_disttabs unittypes clunits_params)
+
+;    ;; Build the cluster trees (requires disttabs and features)
+;    (format t "Building cluster trees\n")
+;    (acost:find_clusters (mapcar car unittypes) clunits_params)
+
+    ;; Tidy up and put things together
+    (acost:collect_trees (mapcar car unittypes) clunits_params)
+
+    (format t "Saving unit catalogue\n")
+    (acost:save_catalogue utterances clunits_params)
+
+    )
+)
+
+;;; Run only the first stage of do_all: load the utterances, group
+;;; their units by type and number them.  Nothing is dumped or
+;;; clustered.  Sets the globals utterances and unittypes; returns t.
+(define (do_init)
+  (set! utterances (acost:db_utts_load clunits_params))
+  (set! unittypes (acost:find_same_types utterances clunits_params))
+  (acost:name_units unittypes)
+  t)
+
+(define (acost:disttabs_and_clusters unittypes clunits_params)
+  "(acost:disttabs_and_clusters unittypes clunits_params)
+Because it uses so much diskspace, build each distance table individually
+and then its cluster tree, removing the table before moving on to the
+next unit type.  Returns t."
+  (mapcar
+   (lambda (uu)
+     ;; uu is one (NAME ITEM ...) entry; the table builder gets the whole
+     ;; entry, the tree builder only the type name.
+     (acost:build_disttabs (list uu) clunits_params)
+     (acost:find_clusters (list (car uu)) clunits_params)
+     ;; The table has been consumed by the wagon run started from
+     ;; acost:find_clusters, so it can be removed straight away.
+     ;; NOTE(review): the default directory here is "disttabs/" while
+     ;; build_tree defaults disttabs_dir to "festival/disttabs/" --
+     ;; confirm which one acost:build_disttabs writes to when
+     ;; disttabs_dir is not set in clunits_params.
+     (delete-file
+      (format nil "%s/%s/%s%s"
+              (get_param 'db_dir clunits_params "./")
+              (get_param 'disttabs_dir clunits_params "disttabs/")
+              (car uu)
+              (get_param 'disttabs_ext clunits_params ".disttab")))
+     )
+   unittypes)
+  t)
+
+(define (acost:db_utts_load params)
+  "(acost:db_utts_load params)
+Load in all utterances identified in database."
+  (let ((files (car (cdr (assoc 'files params)))))
+    (set! acost:all_utts
+          (mapcar
+           (lambda (fname)
+             (set!
utt_seg (Utterance Text fname))
+             (utt.load utt_seg
+                       (string-append
+                        (get_param 'db_dir params "./")
+                        (get_param 'utts_dir params "festival/utts/")
+                        fname
+                        (get_param 'utts_ext params ".utt")))
+             utt_seg)
+           files))))
+
+(define (acost:utts_load_coeffs utterances)
+  "(acost:utts_load_coeffs utterances)
+Load the acoustic coefficients for each utterance.  The settings are
+taken from the global clunits_params, not from an argument.  Returns t."
+  (mapcar
+   (lambda (utt) (acost:utt.load_coeffs utt clunits_params))
+   utterances)
+  t)
+
+(define (acost:find_same_types utterances params)
+  "(acost:find_same_types utterances params)
+Find all the items of the same type and collect them into lists of
+that type.  Items come from the relation named by clunit_relation and
+are typed by the feature named by clunit_name_feat; items whose type is
+ignore are skipped.  Each kept item gets a clunit_name feature.  The
+result, a list of (NAME ITEM ...) entries after pruning by
+acost:prune_unittypes, is also left in the global acost:unittypes
+(unpruned)."
+  (let ((clunit_name_feat (get_param 'clunit_name_feat params "name"))
+        (clunit_relation (get_param 'clunit_relation params "Segment")))
+    (set! acost:unittypes nil)
+    (mapcar
+     (lambda (u)
+       (mapcar
+        (lambda (s)
+          (let ((cname (item.feat s clunit_name_feat)))
+            (if (not (string-equal "ignore" cname))
+                (begin
+                  (item.set_feat s "clunit_name" (item.feat s clunit_name_feat))
+                  ;; p is the existing (NAME ITEM ...) entry for this
+                  ;; type, if any; new items are pushed onto its front.
+                  (let ((p (assoc (item.feat s "clunit_name") acost:unittypes)))
+                    (if p
+                        (set-cdr! p (cons s (cdr p)))
+                        (set! acost:unittypes
+                              (cons
+                               (list (item.feat s "clunit_name") s)
+                               acost:unittypes))))))))
+        (utt.relation.items u clunit_relation)))
+     utterances)
+    (acost:prune_unittypes acost:unittypes params)))
+
+(define (acost:prune_unittypes unittypes params)
+  "(acost:prune_unittypes unittypes params)
+If unit types are complex (the name of the first type contains an _)
+then keep only the unittype sets that have more than
+unittype_prune_threshold units (typically 3); sets with exactly that
+many units are dropped too.  A threshold of 0, the default, keeps
+every set.  Simple unit types are returned unchanged."
+  ;; Only the first type name is inspected to decide whether the
+  ;; types are complex.
+  (if (string-matches (car (car unittypes)) ".*_.*")
+      (let ((ut nil) (pt (get_param 'unittype_prune_threshold params 0)))
+        (while unittypes
+         (if (or (eq? pt 0)
+                 (> (length (cdr (car unittypes))) pt))
+             (set! ut (cons (car unittypes) ut)))
+         (set! unittypes (cdr unittypes)))
+        ;; ut was built by consing, so reverse to restore input order
+        (reverse ut))
+      unittypes))
+
+(define (acost:name_units unittypes)
+  "(acost:name_units unittypes)
+Names each unit with a unique id and number the occurrences of each type."
+  (let ((idnum 0) (tynum 0))
+    (mapcar
+     (lambda (s)
+       (set! tynum 0)
+       (mapcar
+        (lambda (si)
+          (item.set_feat si "unitid" idnum)
+          (set! idnum (+ 1 idnum))
+          (item.set_feat si "occurid" tynum)
+          (set! tynum (+ 1 tynum)))
+        (cdr s))
+       (format t "units \"%s\" %d\n" (car s) tynum))
+     unittypes)
+    (format t "total units %d\n" idnum)
+    idnum))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Generating feature files
+
+(define (acost:dump_features unittypes utterances params)
+  "(acost:dump_features unittypes utterances params)
+Dump the desired features for each unittype in turn, one feature file
+per type (see acost:dump_features_utype).  This would be easier if
+utterances weren't required for feature functions.  Returns t."
+  (mapcar
+   (lambda (utype)
+     ;; utype is a (NAME ITEM ...) entry
+     (acost:dump_features_utype
+      (car utype)
+      (cdr utype)
+      utterances
+      params))
+   unittypes)
+  t)
+
+(define (acost:dump_features_utype utype uitems utterances params)
+  "(acost:dump_features_utype utype uitems utterances params)
+Dump features for all items of type utype.  For every item in uitems
+the features listed under feats in params are written, space
+separated and one item per line, to the file named by db_dir,
+feats_dir, utype and feats_ext.  utterances is not used here."
+  (let ((fd (fopen
+             (string-append
+              (get_param 'db_dir params "./")
+              (get_param 'feats_dir params "festival/feats/")
+              utype
+              (get_param 'feats_ext params ".feats"))
+             "w"))
+        (feats (car (cdr (assoc 'feats params)))))
+    (format t "Dumping features for %s\n" utype)
+    (mapcar
+     (lambda (s)
+       (mapcar
+        (lambda (f)
+          ;; Modified from the distributed version: fval is bound with
+          ;; let rather than assigned with set!, so it no longer leaks
+          ;; into the global environment.
+          ;; The value falls back to "0" when item.feat signals an error.
+          (let ((fval (unwind-protect (item.feat s f) "0")))
+            ;; Empty or single-space values are printed with %l rather
+            ;; than %s (presumably so they show up quoted and the
+            ;; field count of the line is kept -- confirm).
+            (if (or (string-equal "" fval)
+                    (string-equal " " fval))
+                (format fd "%l " fval)
+                (format fd "%s " fval))))
+        feats)
+       (format fd "\n"))
+     uitems)
+    (fclose fd)))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Tree building functions
+
+(defvar wagon-balance-size 0)
+
+(define (acost:find_clusters unittypes clunits_params)
+"(acost:find_clusters unittypes clunits_params)
+Use wagon to find the best clusters: build_tree is run for each type
+name in unittypes.  Returns t."
+  (mapcar
+   (lambda (unittype)
+     (build_tree unittype clunits_params))
+   unittypes)
+  t)
+
+(define (build_tree unittype clunits_params)
+"Build tree with Wagon for this unittype."
+  (let ((command
+         ;; The arguments below fill the %s slots strictly in order:
+         ;; program, -desc, -data, -balance, -distmatrix, -stop,
+         ;; -output and finally any extra wagon options.
+         (format nil "%s -desc %s -data '%s' -balance %s -distmatrix '%s' -stop %s -output '%s' %s"
+                 (get_param 'wagon_progname clunits_params "wagon")
+                 ;; -desc: use DESC.UNITTYPE when that file exists,
+                 ;; otherwise the shared DESC file.
+                 (if (probe_file
+                      (string-append
+                       (get_param 'db_dir clunits_params "./")
+                       (get_param 'wagon_field_desc clunits_params "wagon")
+                       "." unittype))
+                     ;; So there can be unittype specific desc files
+                     (string-append
+                      (get_param 'db_dir clunits_params "./")
+                      (get_param 'wagon_field_desc clunits_params "wagon")
+                      "." unittype)
+                     (string-append
+                      (get_param 'db_dir clunits_params "./")
+                      (get_param 'wagon_field_desc clunits_params "wagon")))
+                 ;; -data: the feature file written by
+                 ;; acost:dump_features_utype for this type
+                 (string-append
+                  (get_param 'db_dir clunits_params "./")
+                  (get_param 'feats_dir clunits_params "festival/feats/")
+                  unittype
+                  (get_param 'feats_ext clunits_params ".feats"))
+                 ;; -balance
+                 (get_param 'wagon_balance_size clunits_params 0)
+                 ;; -distmatrix: the distance table for this type
+                 ;; NOTE(review): acost:disttabs_and_clusters defaults
+                 ;; disttabs_dir to "disttabs/", not "festival/disttabs/".
+                 (string-append
+                  (get_param 'db_dir clunits_params "./")
+                  (get_param 'disttabs_dir clunits_params "festival/disttabs/")
+                  unittype
+                  (get_param 'disttabs_ext clunits_params ".disttab"))
+                 ;; -stop
+                 (get_param 'wagon_cluster_size clunits_params 10)
+                 ;; -output: the tree file for this type
+                 (string-append
+                  (get_param 'db_dir clunits_params "./")
+                  (get_param 'trees_dir clunits_params "festival/trees/")
+                  unittype
+                  (get_param 'trees_ext clunits_params ".tree"))
+                 (get_param 'wagon_other_params clunits_params "")
+                 )))
+    ;; Echo the command line before running it through the shell.
+    (format t "%s\n" command)
+    (system command)))
+
+(defvar clunits_tree_minimum_leafs 0)
+(define (acost:collect_trees unittypes params)
+"(acost:collect_trees unittypes params)
+Collect the trees of the named unittypes into one file as an assoc
+list assigned to clunits_selection_trees.  The file is named by db_dir,
+trees_dir, index_name and trees_ext.  Trees are optionally pruned
+(cluster_prune_limit) and merged (cluster_merge) on the way; a tree is
+only written when it holds more than clunits_tree_minimum_leafs units."
+  (let ((fd (fopen
+             (string-append
+              (get_param 'db_dir params "./")
+              (get_param 'trees_dir params "festival/trees/")
+              (get_param 'index_name params "all.")
+              (get_param 'trees_ext params ".tree"))
+             "wb"))
+        ;; directory prefix of the per-type tree files
+        (tree_pref
+         (string-append
+          (get_param 'db_dir params "./")
+          (get_param 'trees_dir params "festival/trees/")))
+        (cluster_prune_limit (get_param 'cluster_prune_limit params 0))
+        (cluster_merge (get_param 'cluster_merge params 0)))
+    (format fd ";; Autogenerated list of selection
trees\n")
+    ;; Record every (NAME VALUE) parameter as a comment in the output.
+    (mapcar
+     (lambda (fp)
+       (format fd ";; %l %l\n" (car fp) (car (cdr fp))))
+     params)
+    (format fd "(set! clunits_selection_trees '(\n")
+    (mapcar
+     (lambda (unit)
+       ;; Modified from the distributed version: the tree file is
+       ;; located with the trees_ext parameter, as build_tree does when
+       ;; it writes the file, instead of a hard-coded ".tree".
+       ;; NOTE(review): tree is deliberately left as a set! global;
+       ;; temp_tree_convert below is called without arguments and
+       ;; presumably reads it -- confirm.
+       (set! tree (car (load (string-append
+                              tree_pref unit
+                              (get_param 'trees_ext params ".tree"))
+                             t)))
+       (if (> cluster_prune_limit 0)
+           (set! tree (cluster_tree_prune tree cluster_prune_limit)))
+       (if (> cluster_merge 0)
+           (set! tree (tree_merge_leafs tree cluster_merge)))
+       (if (boundp 'temp_tree_convert)
+           (set! tree (temp_tree_convert)))
+       ;; Trees with too few units are left out of the collection.
+       (if (> (tree_num_units tree) clunits_tree_minimum_leafs)
+           (pprintf (list unit tree) fd)))
+     unittypes)
+    (format fd "))\n")
+    (fclose fd)))
+
+(define (cluster_tree_prune_in_line prune_limit)
+"(cluster_tree_prune_in_line PRUNE_LIMIT)
+Prune number of units in each cluster in each tree *by* prune_limit,
+if negative, or *to* prune_limit, if positive.  The pruned trees
+replace the global sucs_select_trees."
+  (set! sucs_select_trees
+        (mapcar
+         ;; Modified: the parameter was named t, shadowing the truth
+         ;; constant inside the lambda.
+         (lambda (tree)
+           (cluster_tree_prune tree prune_limit))
+         sucs_select_trees)))
+
+(define (tree_merge_leafs tree depth)
+  "(tree_merge_leafs tree depth)
+Merge the leafs of the tree at given depth: any subtree with at least
+two but fewer than DEPTH leafs is collapsed into a single leaf by
+tree_collect_leafs.  This allows the trees to be pruned then the
+single leafs joined together into larger clusters (so the viterbi part
+has something to do)."
+  (let ((num_leafs (tree_num_leafs tree)))
+    (cond
+     ((< num_leafs 2) tree)  ;; already at the foot
+     ((< num_leafs depth)
+      (tree_collect_leafs tree))
+     (t
+      ;; keep the question and merge within both branches
+      (list
+       (car tree)
+       (tree_merge_leafs (car (cdr tree)) depth)
+       (tree_merge_leafs (car (cdr (cdr tree))) depth))))))
+
+(define (tree_num_leafs tree)
+  "(tree_num_leafs tree)
+Number of leafs of given tree.  A node with a non-nil cdr is a
+question with two subtrees; anything else counts as one leaf."
+  (cond
+   ((cdr tree)
+    (+
+     (tree_num_leafs (car (cdr tree)))
+     (tree_num_leafs (car (cdr (cdr tree))))))
+   (t
+    1)))
+
+(define (tree_num_units tree)
+  "(tree_num_units tree)
+Number of units in the leafs of given tree."
+  (cond
+   ((cdr tree)
+    (+
+     (tree_num_units (car (cdr tree)))
+     (tree_num_units (car (cdr (cdr tree))))))
+   (t
+    (length (caar tree))
+    )))
+
+(define (tree_collect_leafs tree)
+  "(tree_collect_leafs tree)
+Flatten TREE into a single leaf whose unit list is the units of all
+its leafs, left branch first, and whose second element is 10.0.  A
+tree that is already a leaf is returned unchanged."
+  (if (cdr tree)
+      ;; a question node: flatten both branches and join their units
+      (let ((left (tree_collect_leafs (cadr tree)))
+            (right (tree_collect_leafs (cadr (cdr tree)))))
+        (list (list (append (caar left) (caar right)) 10.0)))
+      tree))
+
+(define (cluster_tree_prune tree prune_limit)
+"(cluster_tree_prune TREE PRUNE_LIMIT)
+Prune the unit lists in the leaves of the (CART) TREE.  A positive
+PRUNE_LIMIT cuts each leaf down to PRUNE_LIMIT units, a negative one
+removes that many units from each leaf.  The units dropped are the
+worst ones as chosen by remove_n_worst.  Maybe later this should have
+guards on minimum number of units that must remain in the tree and a
+per unit type limit."
+  (if (cdr tree)
+      ;; a question: keep it and prune both branches
+      (list
+       (car tree)
+       (cluster_tree_prune (cadr tree) prune_limit)
+       (cluster_tree_prune (cadr (cdr tree)) prune_limit))
+      ;; a tree leaf, of the form ((UNITS SECOND))
+      (let ((units (caar tree)))
+        (list
+         (list
+          (remove_n_worst
+           units
+           (if (< prune_limit 0)
+               (* -1 prune_limit)
+               (- (length units) prune_limit)))
+          (cadr (car tree)))))))
+
+(define (remove_n_worst lll togo)
+"(remove_n_worst lll togo)
+Return lll without its togo worst items.  When togo is zero or
+negative lll is returned as it is."
+  (if (or (< togo 0) (equal? 0 togo))
+      lll
+      ;; drop the current worst item, then remove one fewer
+      (remove_n_worst
+       (remove (worst_unit (cdr lll) (car lll)) lll)
+       (- togo 1))))
+
+(define (worst_unit lll worst_so_far)
+"(worst_unit lll worst_so_far)
+Returns the unit with the worst score, i.e. the largest second
+element, among worst_so_far and the units in lll.  On equal scores the
+earlier unit is kept."
+  (let ((worst worst_so_far)
+        (remaining lll))
+    (while remaining
+     (if (< (cadr worst) (cadr (car remaining)))
+         (set! worst (car remaining)))
+     (set! remaining (cdr remaining)))
+    worst))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Save the unit catalogue for use in the run-time index
+
+(define (acost:save_catalogue utterances clunits_params)
+  "(acost:save_catalogue utterances clunits_params)
+Save the catalogue with named units with times."
+ (let ((fd (fopen + (string-append + (get_param 'db_dir clunits_params "./") + (get_param 'catalogue_dir clunits_params "trees/") + (get_param 'index_name clunits_params "catalogue.") + ".catalogue") + "wb")) + (num_units 0) + ) + (format fd "EST_File index\n") + (format fd "DataType ascii\n") + (format fd "NumEntries %d\n" + (apply + + (mapcar (lambda (u) + (length (utt.relation.items u 'Segment))) utterances))) + (format fd "IndexName %s\n" (get_param 'index_name clunits_params "cluser")) + (format fd "EST_Header_End\n") + (mapcar + (lambda (u) + (mapcar + (lambda (s) + (format fd "%s_%s %s %f %f %f\n" + (item.feat s "clunit_name") + (item.feat s 'occurid) + (utt.feat u 'fileid) + (item.feat s 'segment_start) + (item.feat s 'segment_mid) + (item.feat s 'segment_end))) + (utt.relation.items u 'Segment))) + utterances) + (fclose fd))) + +(provide 'clunits_build.scm) diff --git a/CosyVoice-ttsfrd/resource/festival/cmusphinx2_phones.scm b/CosyVoice-ttsfrd/resource/festival/cmusphinx2_phones.scm new file mode 100644 index 0000000000000000000000000000000000000000..49c6597e557c0e2d8315a1fb594c67e9a16c4eb7 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/cmusphinx2_phones.scm @@ -0,0 +1,119 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;;; +;;; Carnegie Mellon University ;;; +;;; and Alan W Black and Kevin Lenzo ;;; +;;; Copyright (c) 1998-2000 ;;; +;;; All Rights Reserved. ;;; +;;; ;;; +;;; Permission is hereby granted, free of charge, to use and distribute ;;; +;;; this software and its documentation without restriction, including ;;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;;; +;;; permit persons to whom this work is furnished to do so, subject to ;;; +;;; the following conditions: ;;; +;;; 1. The code must retain the above copyright notice, this list of ;;; +;;; conditions and the following disclaimer. ;;; +;;; 2. 
Any modifications must be clearly marked as such. ;;; +;;; 3. Original authors' names are not deleted. ;;; +;;; 4. The authors' names are not used to endorse or promote products ;;; +;;; derived from this software without specific prior written ;;; +;;; permission. ;;; +;;; ;;; +;;; CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK ;;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;;; +;;; SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE ;;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;;; +;;; THIS SOFTWARE. ;;; +;;; ;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; A definition of the cmusphinx2 phone set used in the BU RADIO FM +;;; corpus, some people call this the darpa set. This one +;;; has the closures removed +;;; + +(defPhoneSet + cmusphinx2 + ;;; Phone Features + (;; vowel or consonant + (vc + -) + ;; vowel length: short long dipthong schwa + (vlng s l d a 0) + ;; vowel height: high mid low + (vheight 1 2 3 0) + ;; vowel frontness: front mid back + (vfront 1 2 3 0) + ;; lip rounding + (vrnd + - 0) + ;; consonant type: stop fricative affricate nasal lateral approximant + (ctype s f a n l r 0) + ;; place of articulation: labial alveolar palatal labio-dental + ;; dental velar glottal + (cplace l a p b d v g 0) + ;; consonant voicing + (cvox + - 0) + ) + ;; Phone set members + ( + + ;; Note these features were set by awb so they are wrong !!! 
+ +; phone vc vl vh vf vr ct cp cv + (AA + l 3 3 - 0 0 0) ;; father + (AE + s 3 1 - 0 0 0) ;; fat + (AH + s 2 2 - 0 0 0) ;; but + (AO + l 3 3 + 0 0 0) ;; lawn + (AW + d 3 2 - 0 0 0) ;; how + (AX + a 2 2 - 0 0 0) ;; about + (AXR + a 2 2 - r a +) + (AY + d 3 2 - 0 0 0) ;; hide + (B - 0 0 0 0 s l +) + (CH - 0 0 0 0 a p -) + (D - 0 0 0 0 s a +) + (DH - 0 0 0 0 f d +) + (DX - 0 0 0 0 s a +) + (EH + s 2 1 - 0 0 0) ;; get + (ER + a 2 2 - r 0 0) + (EY + d 2 1 - 0 0 0) ;; gate + (F - 0 0 0 0 f b -) + (G - 0 0 0 0 s v +) + (HH - 0 0 0 0 f g -) + (IH + s 1 1 - 0 0 0) ;; bit + (IY + l 1 1 - 0 0 0) ;; beet + (JH - 0 0 0 0 a p +) + (K - 0 0 0 0 s v -) + (L - 0 0 0 0 l a +) + (M - 0 0 0 0 n l +) + (N - 0 0 0 0 n a +) + (NG - 0 0 0 0 n v +) + (OW + d 2 3 + 0 0 0) ;; lone + (OY + d 2 3 + 0 0 0) ;; toy + (P - 0 0 0 0 s l -) + (R - 0 0 0 0 r a +) + (S - 0 0 0 0 f a -) + (SH - 0 0 0 0 f p -) + (T - 0 0 0 0 s a -) + (TH - 0 0 0 0 f d -) + (UH + s 1 3 + 0 0 0) ;; full + (UW + l 1 3 + 0 0 0) ;; fool + (V - 0 0 0 0 f b +) + (W - 0 0 0 0 r l +) + (Y - 0 0 0 0 r p +) + (Z - 0 0 0 0 f a +) + (ZH - 0 0 0 0 f p +) + (SIL - 0 0 0 0 0 0 -) ; added + ) +) + +(PhoneSet.silences '(SIL)) + +(provide 'cmusphinx2_phones) + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/cslush.scm b/CosyVoice-ttsfrd/resource/festival/cslush.scm new file mode 100644 index 0000000000000000000000000000000000000000..6864917d6cc81b9c1620eb5738881f168cddd974 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/cslush.scm @@ -0,0 +1,79 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Functions specific to using Festival in cslush part of the OGI toolkit +;;; The OGI toolkit is a complete dialog building system with speech +;;; recognition and synthesis (Festival) it is available for free for +;;; research purposes from +;;; http://www.cse.ogi.edu/CSLU/toolkit/toolkit.html +;;; +;;; Note this cslush interface requires you to compile festival +;;; with tcl (7.6) +;;; +;;; The functions replace the C++ level functions Jacques H. 
de Villiers +;;; from CSLU wrote for the previous version +;;; + +(if (not (member 'tcl *modules*)) + (error "cslush: can't load cslush, TCL not supported in this installation of Festival.")) + +(define (cslush.getwave utt) +"(cslush.getwave UTT) +Extract wave memory info, pass this to wave import in CSLUsh." + (format nil "%s %s %s" + (utt.wave.info utt 'data_addr) + (utt.wave.info utt 'num_samples) + (utt.wave.info utt 'sample_rate))) + +(define (cslush.getphone utt) +"(cslush.getphone UTT) +Return segment names a single string of phones, for use to pass to +TCL." + (let ((phones "")) + (mapcar + (lambda (s) + (if (string-equal phones "") + (set! phones (format nil "%s" (utt.streamitem.feat utt s 'name))) + (set! phones (format nil "%s %s" + phones (utt.streamitem.feat utt s 'name))))) + (utt.stream utt 'Segment)) + phones)) + +(define (cslush TCLCOMMAND) +"(cslush TCLCOMMAND) +Pass TCLCOMMAND to TCL interpreter, returns what TCL returns as a +string." + (tcl_eval TCLCOMMAND)) + + +(provide 'cslush) diff --git a/CosyVoice-ttsfrd/resource/festival/cstr.scm b/CosyVoice-ttsfrd/resource/festival/cstr.scm new file mode 100644 index 0000000000000000000000000000000000000000..ffc01d876fb30bf9973fdd854a8ff1e1b689b4cf --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/cstr.scm @@ -0,0 +1,121 @@ + + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;; DO NOT EDIT THIS FILE ON PAIN OF MORE PAIN. + ;;; + ;;; The master copy of this file is in ../../speech_tools/lib/siod/cstr.scm + ;;; and is copied here at build time. + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + + + + + + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; CSTR siod extensions. + +;(defvar Parameter nil +; "Parameter +; An assoc-list of parameters and values for various parts of the speech +; synthesis system. 
This is used by the functions Parameter.set +; Parameter.def and Parameter.get as well as internal C++ functions.") + +(defvar Param (feats.make) + "Param + A feature set for arbitrary parameters for modules.") + +(define (Param.set name val) +"(Param.set NAME VAL) + Set parameter NAME to VAL (deleting any previous setting)" + (feats.set Param name val)) + +(define (Parameter.set name val) +"(Parameter.set NAME VAL) + Set parameter NAME to VAL (deleting any previous setting). This is + an old function and you should use Param.set instead." + (Param.set name val) + val + ) + +(define (Parameter.def name val) +"(Parameter.def NAME VAL) + Set parameter NAME to VAL if not already set. This is an OLD function + you shold use Param.def instead." + (Param.def name val) + ) + +(define (Param.def name val) +"(Param.def NAME VAL) + Set parameter NAME to VAL if not already set" + (if (not (feats.present Param name)) + (feats.set Param name val))) + +(define (Parameter.get name) +"(Parameter.get NAME) + Get parameter NAME's value (nil if unset). This is an OLD function + and may not exist in later versions (or change functionality). This + function (unlike Param.get) may return sylbols (rather than strings + if the val doesn't contain whitespace (to allow older functions to + still work." + (let ((val (Param.get name))) + (if (and (eq? 'string (typeof val)) + (not (string-matches val ".*[ \t\r\n].*"))) + (intern val) + val)) + ) + +(define (Param.get name) +"(Param.get NAME) + Get parameter NAME's value (nil if unset)" + (feats.get Param name)) + +(define (get_param name params default) + "(get_param name params default) +Get named parameters in params returning default if its not present." 
+ (let ((pair (assoc name params))) + (if pair + (car (cdr pair)) + default))) + +(provide 'cstr) diff --git a/CosyVoice-ttsfrd/resource/festival/darpa_phones.scm b/CosyVoice-ttsfrd/resource/festival/darpa_phones.scm new file mode 100644 index 0000000000000000000000000000000000000000..184c8bfdd614e26d83c5c95f15ce6eaf65c49bb5 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/darpa_phones.scm @@ -0,0 +1,115 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1999 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Alan W Black +;;; Date: April 1999 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; (yet another) darpa definition +;;; + +(require 'phoneset) + +(set! darpa_fs (cadr +(defPhoneSet + darpa + (Features + (vowel (syllabic + -) + (length long short diphthong schwa) + (height high mid low) + (front front mid back) + (round + -)) + (consonant + (syllabic + -) + (manner stop affricate fricative approximant nasal) + (place alveolar dental labial palatal velar) + (voicing + -)) + (silence + (syllabic -))) + (Phones + ;; type syl length height front round + (aa vowel + long low back -) + (ae vowel + short low front -) + (ah vowel + short mid mid -) + (ao vowel + long low front +) + (aw vowel + diphthong low mid -) + (ax vowel + schwa mid mid -) + (axr vowel + schwa mid mid -) + (ay vowel + diphthong low mid -) + (eh vowel + short mid front -) + (ey vowel + diphthong mid front -) + (ih vowel + short high front -) + (iy vowel + long high front -) + (ow vowel + diphthong mid back +) + (oy vowel + diphthong mid back +) + (uh vowel + short high back +) + (uw vowel + long high back +) + ;; type syl manner place voicing + (b consonant - stop labial +) + (ch consonant - affricate alveolar -) + (d consonant - stop alveolar +) + (dh consonant - 
fricative dental +) + (dx consonant - stop alveolar +) + (el consonant + approximant alveolar +) + (em consonant + nasal labial +) + (en consonant + stop alveolar +) + (er consonant + approximant alveolar +) + (f consonant - fricative labial -) + (g consonant - stop velar +) + (hh consonant - fricative velar -) + (jh consonant - affricate alveolar +) + (k consonant - stop velar -) + (l consonant - approximant alveolar +) + (m consonant - nasal labial +) + (n consonant - nasal alveolar +) + (nx consonant - nasal alveolar +) + (ng consonant - nasal velar +) + (p consonant - stop labial -) + (r consonant - approximant alveolar +) + (s consonant - fricative alveolar -) + (sh consonant - fricative palatal -) + (t consonant - stop alveolar -) + (th consonant - fricative dental -) + (v consonant - fricative labial +) + (w consonant - approximant velar +) + (y consonant - approximant palatal +) + (z consonant - fricative alveolar +) + (zh consonant - fricative palatal +) + (pau silence -) +; (sil silence -) + )))) + +(provide 'darpa_phones) + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/display.scm b/CosyVoice-ttsfrd/resource/festival/display.scm new file mode 100644 index 0000000000000000000000000000000000000000..b190c05186bdb0181db4201493cb1e217227ec5c --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/display.scm @@ -0,0 +1,69 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Alan W Black +;;; Date: December 1996 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; An xwaves display function for utterances +;;; +;;; Requires Xwaves to be running, saves labels etc and sends +;;; messages to Xwaves to display the utterance. +;;; +;;; This can be a model for other display functions. +;;; + +(define (display utt) +"(display utt) +Display an utterance's waveform, F0 and segment labels in Xwaves. 
+Xwaves must be running on the current machine, with a labeller for +this to work." + (let ((tmpname (make_tmp_filename))) + (utt.save.wave utt (string-append tmpname ".wav")) + (utt.save.segs utt (string-append tmpname ".lab")) + (utt.save.f0 utt (string-append tmpname ".f0")) + (system (format nil "send_xwaves make file %s name %s height 150" + (string-append tmpname ".f0") tmpname)) + (system (format nil "send_xwaves make name %s file %s height 200" + tmpname (string-append tmpname ".wav"))) + (system (format nil "send_xwaves send make file %s name %s color 125" + (string-append tmpname ".lab") tmpname)) + (system (format nil "send_xwaves send activate name %s fields 1" + tmpname)) + (system (format nil "send_xwaves %s align file %s" + tmpname (string-append tmpname ".wav")))) + ) + +(provide 'display) + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/duration.scm b/CosyVoice-ttsfrd/resource/festival/duration.scm new file mode 100644 index 0000000000000000000000000000000000000000..7e074d76367bf08325c2064474abe7038c3bfb5d --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/duration.scm @@ -0,0 +1,196 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. 
Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Basic Duration module which will call appropriate duration +;;; (C++) modules based on set parameter +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; These modules should predict intonation events/labels +;;; based on information in the phrase and word streams + +(define (Duration utt) +"(Duration utt) +Predict segmental durations using Duration_Method defined in Parameters. +Four methods are currently available: averages, Klatt rules, CART tree +based, and fixed duration." 
+ (let ((rval (apply_method 'Duration_Method utt))) + (cond + (rval rval) ;; new style + ;; 1.1.1 voices still use other names + ((eq 'Averages (Parameter.get 'Duration_Method)) + (Duration_Averages utt)) + ((eq 'Klatt (Parameter.get 'Duration_Method)) + (Duration_Klatt utt)) + ((eq 'Tree_ZScores (Parameter.get 'Duration_Method)) + (Duration_Tree_ZScores utt)) + ((eq 'Tree (Parameter.get 'Duration_Method)) + (Duration_Tree utt)) + (t + (Duration_Default utt))))) + +(define (Duration_LogZScores utt) +"(Duration_LogZScores utt) +Predicts duration to segments using the CART tree in duration_logzscore_tree +and duration_logzscore_tree_silence which produces a zscore of the log +duration. The variable duration_logzscore_ph_info contains (log) means +and std for each phone in the set." + (let ((silence (car (car (cdr (assoc 'silences (PhoneSet.description)))))) + ldurinfo dur) ;; modified: dur bound locally instead of leaking a global via set! + (mapcar + (lambda (s) + (if (string-equal silence (item.name s)) + (set! ldurinfo + (wagon s duration_logzscore_tree_silence)) + (set! ldurinfo + (wagon s duration_logzscore_tree))) + (set! dur (exp (duration_unzscore + (item.name s) + (car (last ldurinfo)) + duration_logzscore_ph_info))) + (set! dur (* dur (duration_find_stretch s))) + (item.set_feat + s "end" (+ dur (item.feat s "start_segment")))) + (utt.relation.items utt 'Segment)) + utt)) + +(define (duration_unzscore phname zscore table) +"(duration_unzscore phname zscore table) +Look up phname in table and convert zscore back to absolute domain." + (let ((phinfo (assoc phname table)) + mean std) + (if phinfo + (begin + (set! mean (car (cdr phinfo))) + (set! std (car (cdr (cdr phinfo))))) + (begin + (format t "Duration: unzscore no info for %s\n" phname) + (set! mean 0.100) + (set! std 0.25))) + (+ mean (* zscore std)))) + +(define (duration_find_stretch seg) +"(duration_find_stretch seg) +Find any relevant duration stretch." 
+ (let ((global (Parameter.get 'Duration_Stretch)) + (local (item.feat + seg "R:SylStructure.parent.parent.R:Token.parent.dur_stretch"))) + (if (or (not global) + (equal? global 0.0)) + (set! global 1.0)) + (if (string-equal local 0.0) + (set! local 1.0)) + (* global local))) + +;; These provide lisp level functions, some of which have +;; been converted in C++ (in festival/src/modules/base/ff.cc) +(define (onset_has_ctype seg type) + ;; "1" if onset contains ctype + (let ((syl (item.relation.parent seg 'SylStructure))) + (if (not syl) + "0" ;; a silence + (let ((segs (item.relation.daughters syl 'SylStructure)) + (v "0")) + (while (and segs + (not (string-equal + "+" + (item.feat (car segs) "ph_vc")))) + (if (string-equal + type + (item.feat (car segs) "ph_ctype")) + (set! v "1")) + (set! segs (cdr segs))) + v)))) + +(define (coda_has_ctype seg type) + ;; "1" if coda contains ctype + (let ((syl (item.relation.parent seg 'SylStructure))) + (if (not syl) + "0" ;; a silence + (let ((segs (reverse (item.relation.daughters + syl 'SylStructure))) + (v "0")) + (while (and segs + (not (string-equal + "+" + (item.feat (car segs) "ph_vc")))) + (if (string-equal + type + (item.feat (car segs) "ph_ctype")) + (set! v "1")) + (set! 
segs (cdr segs))) + v)))) + +(define (onset_stop seg) + (onset_has_ctype seg "s")) +(define (onset_fric seg) + (onset_has_ctype seg "f")) +(define (onset_nasal seg) + (onset_has_ctype seg "n")) +(define (onset_glide seg) + (let ((l (onset_has_ctype seg "l"))) + (if (string-equal l "0") + (onset_has_ctype seg "r") + "1"))) +(define (coda_stop seg) + (coda_has_ctype seg "s")) +(define (coda_fric seg) + (coda_has_ctype seg "f")) +(define (coda_nasal seg) + (coda_has_ctype seg "n")) +(define (coda_glide seg) + (let ((l (coda_has_ctype seg "l"))) + (if (string-equal l "0") + (coda_has_ctype seg "r") + "1"))) + +(define (Unisyn_Duration utt) + "(Unisyn_Duration utt) +Predicts Segment durations in some specified way but holds the +result in a way necessary for other Unisyn code." + (let ((end 0)) + (mapcar + (lambda (s) + (item.get_utt s) + (let ((dur (wagon_predict s duration_cart_tree))) + (set! dur (* (Parameter.get 'Duration_Stretch) dur)) + (set! end (+ dur end)) + (item.set_feat s "target_dur" dur) + (item.set_function s "start" "unisyn_start") + (item.set_feat s "end" end) + (item.set_feat s "dur" dur) + )) + (utt.relation.items utt 'Segment)) + utt)) + +(provide 'duration) diff --git a/CosyVoice-ttsfrd/resource/festival/email-mode.scm b/CosyVoice-ttsfrd/resource/festival/email-mode.scm new file mode 100644 index 0000000000000000000000000000000000000000..4f8450f5b8ddfc41694304b6697c064fb3566f22 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/email-mode.scm @@ -0,0 +1,89 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; An example tts text mode for reading email messages, this includes +;;; support for extracting the interesting headers from the message +;;; and for dealing with quoted text. Its all very primitive and +;;; will easily be confused but its here just as an example +;;; + +(define (email_init_func) + "(email_init_func) +Called on starting email text mode." + (voice_rab_diphone) + (set! email_previous_t2w_func token_to_words) + (set! 
english_token_to_words email_token_to_words) + (set! token_to_words english_token_to_words) + (set! email_in_quote nil)) + +(define (email_exit_func) + "(email_exit_func) +Called on exit email text mode." + (set! english_token_to_words email_previous_t2w_func) + (set! token_to_words english_token_to_words)) + +(define (email_token_to_words token name) + "(email_token_to_words token name) +Email specific token to word rules." + (cond + ((string-matches name "<.*@.*>") + (append + (email_previous_t2w_func token + (string-after (string-before name "@") "<")) + (cons + "at" + (email_previous_t2w_func token + (string-before (string-after name "@") ">"))))) + ((and (string-matches name ">") + (string-matches (item.feat token "whitespace") + "[ \t\n]*\n *")) + (voice_cmu_us_awb_cg) + nil ;; return nothing to say + ) + (t ;; for all other cases + (if (string-matches (item.feat token "whitespace") + ".*\n[ \n]*") + (voice_rab_diphone)) + (email_previous_t2w_func token name)))) + +(set! tts_text_modes + (cons + (list + 'email ;; mode name + (list ;; email mode params + (list 'init_func email_init_func) + (list 'exit_func email_exit_func) + '(filter "email_filter"))) + tts_text_modes)) + +(provide 'email-mode) diff --git a/CosyVoice-ttsfrd/resource/festival/engmorph.scm b/CosyVoice-ttsfrd/resource/festival/engmorph.scm new file mode 100644 index 0000000000000000000000000000000000000000..46b7c427dcdc9f9571d0d32d62bc8fd35c7105a7 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/engmorph.scm @@ -0,0 +1,151 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Alan W Black +;;; Date: December 1997 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; THIS IS EXPERIMENTAL AND DOES *NOT* WORK +;;; +;;; Koskenniemi-style context rewrite rules for English Morphographemics +;;; Basically splits words into their (potential) morphemes. +;;; +;;; Based (roughly) on the rules in "Computational Morphology" +;;; Ritchie et al. MIT Press 1992. 
+;;; +;;; This is not a Scheme file and can't be loaded and evaluated +;;; It is designed for use with the wfst tools in the speech tools +;;; e.g. wfst_build -type kk -o engmorph.wfst -detmin engmorph.scm +;;; + +(KKrules + engmorph + (Alphabets + ;; Input Alphabet + (a b c d e f g h i j k l m n o p q r s t u v w x y z #) + ;; Output Alphabet + (a b c d e f g h i j k l m n o p q r s t u v w x y z + #) + ) + (Sets + (LET a b c d e f g h i j k l m n o p q r s t u v w x y z) + ) + (Rules + ;; The basic rules + ( a => nil --- nil) + ( b => nil --- nil) + ( c => nil --- nil) + ( d => nil --- nil) + ( e => nil --- nil) + ( f => nil --- nil) + ( g => nil --- nil) + ( h => nil --- nil) + ( i => nil --- nil) + ( j => nil --- nil) + ( k => nil --- nil) + ( l => nil --- nil) + ( m => nil --- nil) + ( n => nil --- nil) + ( o => nil --- nil) + ( p => nil --- nil) + ( q => nil --- nil) + ( r => nil --- nil) + ( s => nil --- nil) + ( t => nil --- nil) + ( u => nil --- nil) + ( v => nil --- nil) + ( w => nil --- nil) + ( x => nil --- nil) + ( y => nil --- nil) + ( z => nil --- nil) + ( # => nil --- nil) +; ( _epsilon_/+ => (or LET _epsilon_/e ) --- (LET)) + ( _epsilon_/+ => (or LET _epsilon_/e) --- nil) + + ;; The rules that do interesting things + + ;; Epenthesis + ;; churches -> church+s + ;; boxes -> box+s + (e/+ <=> (or (s h) (or s x z) (i/y) (c h)) + --- + (s)) + ;; Gemination + (b/+ <=> ( (or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) b ) + --- + ((or a e i o u))) + (d/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) d ) + --- + ((or a e i o u))) + (f/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) f ) + --- + ((or a e i o u))) + (g/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) g ) + --- + ((or a e i o u))) + (m/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) m ) + --- + ((or a e i o u))) + (p/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) p ) + --- + ((or a e i o 
u))) + (s/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) s ) + --- + ((or a e i o u))) + (t/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) t ) + --- + ((or a e i o u))) + (z/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) z ) + --- + ((or a e i o u))) + (n/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) n ) + --- + ((or a e i o u))) + (l/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) l ) + --- + ((or a e i o u))) + (r/+ <=> ((or b c d f g h j k l m n p q r s t v w z) (or a e i o u y) r ) + --- + ((or a e i o u))) + ;; tries->try+s + ( i/y <=> ((or b c d f g h j k l m n p q r s t v w x z)) + --- + ((or ( e/+ s ) + ( _epsilon_/+ (or a d e f h i l m n o p s w y))))) + ;; Elision + ;; moved -> move+ed + (_epsilon_/e <=> + ((or a e i o u ) (or b c d f g j k l m n p q r s t v x z)) + --- + ( _epsilon_/+ (or a e i o u ))) + + ) +) diff --git a/CosyVoice-ttsfrd/resource/festival/engmorphsyn.scm b/CosyVoice-ttsfrd/resource/festival/engmorphsyn.scm new file mode 100644 index 0000000000000000000000000000000000000000..d6e237f1cb8e45f4ab897409f05bd259646b26fd --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/engmorphsyn.scm @@ -0,0 +1,170 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. 
The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Alan W Black +;;; Date: December 1997 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; THIS IS EXPERIMENTAL AND DOES *NOT* WORK +;;; +;;; +;;; An English morpho-syntax finite-state grammar +;;; This is used for morphological decomposition of unknown words +;;; specifically (only) words that are not found in the lexicon. +;;; The idea is that when an unknown word is found an attempt is made +;;; to see if it contains any well known morphological inflections or +;;; derivations; if so a better use of LTS can be made on the root, if +;;; none are found this +;;; +;;; +;;; Based on "Analysis of Unknown Words through Morphological +;;; Decomposition", Black, van de Plassche, Williams, European ACL 91. +;;; with the anyword matcher from a question by Lauri Karttunen after +;;; the talk. 
+;;; +;;; The suffixes and finite-state morph-syntax grammar is based +;;; (very roughly) on the rules in "Computational Morphology" +;;; Ritchie et al. MIT Press 1992. +;;; +;;; Can be compiled with +;;; wfst_build -type rg -o engmorphsyn.wfst -detmin engmorphsyn.scm +;;; +;;; The result can be combined with the morphographemic rules +;;; with +;;; wfst_build -type compose engmorph.wfst engmorphsyn.wfst -detmin -o engstemmer.wfst +;;; +;;; echo "# b o x e/+ s #" | wfst_run -wfst engstemmer.wfst -recog +;;; state 0 #/# -> 1 +;;; state 1 b/b -> 3 +;;; state 3 o/o -> 17 +;;; state 17 x/x -> 14 +;;; state 14 e/+ -> 36 +;;; state 36 s/s -> 34 +;;; state 34 #/# -> 16 +;;; OK. +;;; echo "# b o x e s #" | wfst_run -wfst engstemmer.wfst -recog +;;; state 0 #/# -> 1 +;;; state 1 b/b -> 3 +;;; state 3 o/o -> 17 +;;; state 17 x/x -> 14 +;;; state 14 e/e -> 22 +;;; state 22 s/s -> -1 + +(RegularGrammar + engsuffixmorphosyntax + ;; Sets + ( + (V a e i o u y) + (C b c d f g h j k l m n p q r s t v w x y z) + ) + ;; Rules + + ( + ;; A word *must* have a suffix to be recognized + (Word -> # Syls Suffix ) + (Word -> # Syls End ) + + ;; This matches any string of characters that contains at least one vowel + (Syls -> Syl Syls ) + (Syls -> Syl ) + (Syl -> Cs V Cs ) + (Cs -> C Cs ) + (Cs -> ) + + (Suffix -> VerbSuffix ) + (Suffix -> NounSuffix ) + (Suffix -> AdjSuffix ) + (VerbSuffix -> VerbFinal End ) + (VerbSuffix -> VerbtoNoun NounSuffix ) + (VerbSuffix -> VerbtoNoun End ) + (VerbSuffix -> VerbtoAdj AdjSuffix ) + (VerbSuffix -> VerbtoAdj End ) + (NounSuffix -> NounFinal End ) + (NounSuffix -> NountoNoun NounSuffix ) + (NounSuffix -> NountoNoun End ) + (NounSuffix -> NountoAdj AdjSuffix ) + (NounSuffix -> NountoAdj End ) + (NounSuffix -> NountoVerb VerbSuffix ) + (NounSuffix -> NountoVerb End ) + (AdjSuffix -> AdjFinal End ) + (AdjSuffix -> AdjtoAdj AdjSuffix) + (AdjSuffix -> AdjtoAdj End) + (AdjSuffix -> AdjtoAdv End) ;; isn't any Adv to anything + + (End -> # ) ;; word boundary 
symbol *always* present + + (VerbFinal -> + e d) + (VerbFinal -> + i n g) + (VerbFinal -> + s) + + (VerbtoNoun -> + e r) + (VerbtoNoun -> + e s s) + (VerbtoNoun -> + a t i o n) + (VerbtoNoun -> + i n g) + (VerbtoNoun -> + m e n t) + + (VerbtoAdj -> + a b l e) + + (NounFinal -> + s) + + (NountoNoun -> + i s m) + (NountoNoun -> + i s t) + (NountoNoun -> + s h i p) + + (NountoAdj -> + l i k e) + (NountoAdj -> + l e s s) + (NountoAdj -> + i s h) + (NountoAdj -> + o u s) + + (NountoVerb -> + i f y) + (NountoVerb -> + i s e) + (NountoVerb -> + i z e) + + (AdjFinal -> + e r) + (AdjFinal -> + e s t) + + (AdjtoAdj -> + i s h) + (AdjtoAdv -> + l y) + (AdjtoNoun -> + n e s s) + (AdjtoVerb -> + i s e) + (AdjtoVerb -> + i z e) + +) +) + + + + + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/f2bdurtreeZ.scm b/CosyVoice-ttsfrd/resource/festival/f2bdurtreeZ.scm new file mode 100644 index 0000000000000000000000000000000000000000..407943a97c3d408cb74989c85cc35808ba279ac0 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/f2bdurtreeZ.scm @@ -0,0 +1,869 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. 
The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; First attempt at a tree to learn durations. Although +;;; it was trained from F2B and the radio phone set, it should +;;; work for others that are declared with the same phone +;;; features +;;; + +;; in ancient items (not on independent data) +;; RMSE 0.821086 Correlation is 0.573693 Mean (abs) Error 0.612327 (0.547034) + +;; on independent test data +;; RMSE 0.8054 Correlation is 0.5327 Mean (abs) Error 0.6073 (0.5290) + +(set! 
f2b_duration_cart_tree +' +((name is #) + ((emph_sil is +) + ((0.0 -0.5)) + ((R:Segment.p.R:SylStructure.parent.parent.pbreak is BB) + ((0.0 2.0)) + ((0.0 0.0)))) +((R:SylStructure.parent.accented is 0) + ((R:Segment.p.ph_ctype is 0) + ((R:Segment.n.ph_cplace is 0) + ((ph_ctype is n) + ((R:SylStructure.parent.position_type is initial) + ((ph_cplace is a) + ((0.675606 -0.068741)) + ((0.674321 0.204279))) + ((ph_cplace is l) + ((0.688993 -0.124997)) + ((R:SylStructure.parent.syl_out < 10) + ((0.610881 -0.394451)) + ((0.664504 -0.603196))))) + ((ph_ctype is r) + ((lisp_onset_glide is 0) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 0) + ((0.949991 0.619256)) + ((1.05066 0.979668))) + ((0.858728 0.457972))) + ((R:SylStructure.parent.position_type is single) + ((syl_initial is 0) + ((ph_ctype is s) + ((0.692981 -0.788933)) + ((0.834878 -0.116988))) + ((R:SylStructure.parent.syl_out < 9.4) + ((0.777932 0.357818)) + ((0.852909 0.115478)))) + ((R:Segment.n.ph_vrnd is +) + ((ph_ctype is s) + ((0.81305 0.87399)) + ((0.65978 0.418928))) + ((R:SylStructure.parent.position_type is final) + ((R:SylStructure.parent.parent.word_numsyls < 2.3) + ((0.71613 -0.2888)) + ((0.642029 0.0624649))) + ((R:Segment.nn.ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((R:SylStructure.parent.position_type is initial) + ((0.854092 0.384456)) + ((0.769274 0.10705))) + ((lisp_coda_stop is 0) + ((0.571763 0.0755348)) + ((0.632928 -0.11117)))) + ((lisp_coda_stop is 0) + ((R:SylStructure.parent.syl_out < 8.6) + ((0.555092 0.30006)) + ((0.552673 -0.0263481))) + ((0.903186 0.519185)))) + ((R:Segment.nn.ph_cplace is p) + ((0.563915 0.204967)) + ((R:Segment.nn.ph_cvox is -) + ((ph_ctype is s) + ((0.67653 0.227681)) + ((0.550623 0.435079))) + ((R:SylStructure.parent.position_type is initial) + ((0.93428 0.732003)) + ((0.84114 0.423214))))))))))) + ((R:Segment.n.ph_ctype is s) + ((ph_ctype is s) + ((0.693376 -1.02719)) + 
((R:Segment.n.ph_cplace is v) + ((ph_ctype is r) + ((0.539799 -0.344524)) + ((0.858576 0.154275))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 1.2) + ((lisp_onset_glide is 0) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((ph_ctype is n) + ((R:Segment.nn.ph_cplace is a) + ((0.64604 -0.643797)) + ((0.739746 -0.450649))) + ((ph_ctype is f) + ((0.657043 -0.462107)) + ((0.798438 -0.19569)))) + ((R:SylStructure.parent.syl_out < 8.4) + ((lisp_coda_stop is 0) + ((0.766789 -0.0484781)) + ((0.717203 -0.322113))) + ((R:SylStructure.parent.position_type is single) + ((0.508168 -0.412874)) + ((0.703458 -0.291121))))) + ((0.574827 -0.65022))) + ((0.801765 -0.120813))))) + ((ph_ctype is n) + ((R:Segment.n.ph_ctype is f) + ((R:Segment.n.ph_cplace is b) + ((0.797652 0.623764)) + ((R:Segment.n.ph_cplace is a) + ((R:Segment.n.seg_onsetcoda is coda) + ((0.675567 0.288251)) + ((0.854197 0.626272))) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.660394 -0.225466)) + ((0.65275 0.0487195))))) + ((R:Segment.n.ph_ctype is n) + ((0.685613 -0.512227)) + ((0.736366 -0.104066)))) + ((R:Segment.n.ph_ctype is r) + ((R:SylStructure.parent.position_type is initial) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.1) + ((0.98185 0.152471)) + ((0.851907 0.788208))) + ((ph_ctype is f) + ((0.76106 0.406474)) + ((R:Segment.n.ph_cplace is a) + ((1.01348 -0.0422549)) + ((0.786777 -0.714839))))) + ((ph_cplace is b) + ((R:SylStructure.parent.syl_out < 10.4) + ((0.799025 0.0992277)) + ((0.851068 -0.115896))) + ((R:Segment.n.ph_cplace is p) + ((0.669855 -0.655488)) + ((ph_ctype is r) + ((R:Segment.n.ph_cplace is a) + ((1.00772 0.130892)) + ((0.635981 -0.35826))) + ((R:Segment.n.ph_ctype is l) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((0.746089 -0.286007)) + ((0.89158 0.154432))) + ((R:Segment.n.ph_cplace is b) + ((1.04971 -0.0449782)) + ((R:SylStructure.parent.syl_out < 9.8) + ((R:Segment.n.ph_ctype is f) + ((R:Segment.n.seg_onsetcoda is coda) + 
((1.4144 0.143658)) + ((0.781116 -0.281483))) + ((ph_vlng is 0) + ((0.755959 -0.33462)) + ((0.81024 -0.615287)))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((0.7426 -0.24342)) + ((R:Segment.n.ph_ctype is f) + ((R:Segment.n.ph_cplace is a) + ((R:SylStructure.parent.position_type is single) + ((0.578639 -0.322097)) + ((0.55826 -0.663238))) + ((0.616575 -0.713688))) + ((0.759572 -0.314116)))))))))))))) + ((R:Segment.n.ph_ctype is f) + ((ph_ctype is 0) + ((R:Segment.p.ph_ctype is r) + ((R:SylStructure.parent.parent.word_numsyls < 2.2) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.733193 -0.180968)) + ((0.563111 -0.467934))) + ((0.426244 -0.758137))) + ((ph_vlng is a) + ((R:Segment.n.ph_cplace is b) + ((R:Segment.nn.ph_cvox is +) + ((0.680234 0.059855)) + ((R:SylStructure.parent.position_type is single) + ((0.980851 0.443893)) + ((0.715307 0.112865)))) + ((R:Segment.p.ph_cplace is a) + ((0.851224 0.695863)) + ((R:Segment.nn.ph_cvox is -) + ((0.75892 0.195772)) + ((0.630633 0.478738))))) + ((R:Segment.n.seg_onsetcoda is coda) + ((R:Segment.n.ph_cplace is b) + ((R:Segment.nn.ph_cplace is 0) + ((0.815979 -0.477579)) + ((0.851491 -0.168622))) + ((R:SylStructure.parent.position_type is single) + ((R:Segment.nn.ph_cvox is +) + ((1.14265 0.717697)) + ((0.814726 0.291482))) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 0) + ((0.512322 -0.0749096)) + ((0.488216 0.112774))))) + ((R:SylStructure.parent.position_type is final) + ((0.693071 -0.200708)) + ((R:Segment.p.ph_cvox is +) + ((0.489147 -0.378728)) + ((0.695396 -0.525028))))))) + ((ph_vlng is s) + ((0.464234 -0.162706)) + ((R:Segment.p.ph_cvox is +) + ((R:SylStructure.parent.parent.word_numsyls < 2.2) + ((0.566845 -0.616918)) + ((0.92747 -0.26777))) + ((0.632833 -0.858295))))) + ((R:Segment.n.ph_vrnd is 0) + ((R:Segment.p.ph_ctype is r) + ((ph_vlng is 0) + ((0.845308 -0.23426)) + ((R:SylStructure.parent.syl_out < 4.8) + ((R:Segment.n.ph_ctype is n) + ((0.484602 -0.850587)) + 
((0.535398 -0.586652))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((ph_vlng is a) + ((0.368898 -0.799533)) + ((lisp_coda_stop is 0) + ((0.387923 -1.11431)) + ((0.407377 -0.859849)))) + ((R:Segment.n.ph_cplace is a) + ((ph_vlng is a) + ((0.382367 -0.787669)) + ((0.522121 -0.687376))) + ((0.361185 -0.853639)))))) + ((ph_vlng is a) + ((ph_ctype is 0) + ((R:Segment.n.ph_ctype is s) + ((R:Segment.p.ph_cvox is +) + ((R:Segment.p.ph_cplace is d) + ((0.502849 -0.232866)) + ((R:SylStructure.parent.position_type is initial) + ((0.641714 -0.0545426)) + ((R:SylStructure.parent.parent.word_numsyls < 2.6) + ((0.613913 0.373746)) + ((R:Segment.n.ph_cplace is v) + ((0.581158 0.310101)) + ((0.628758 -0.068165)))))) + ((R:SylStructure.parent.position_type is mid) + ((0.459281 -0.553794)) + ((0.728208 -0.138806)))) + ((R:Segment.p.ph_cplace is v) + ((0.32179 -0.728364)) + ((R:Segment.p.ph_cplace is l) + ((0.562971 -0.550272)) + ((R:SylStructure.parent.position_type is initial) + ((0.937298 -0.0246324)) + ((R:Segment.p.ph_cvox is +) + ((R:Segment.n.ph_ctype is n) + ((R:Segment.n.ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.434029 -0.404793)) + ((1.05548 -0.103717))) + ((0.408372 -0.556145))) + ((0.712335 -0.118776))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((0.379593 -0.658075)) + ((0.549207 -0.494876)))))))) + ((R:SylStructure.parent.position_type is final) + ((0.597124 -0.649729)) + ((0.628822 -1.03743)))) + ((ph_ctype is s) + ((R:Segment.n.ph_ctype is r) + ((R:SylStructure.parent.syl_out < 8.4) + ((0.760328 0.31651)) + ((0.738363 -0.0177161))) + ((R:Segment.n.ph_ctype is l) + ((0.649328 -0.108791)) + ((0.594945 -0.712753)))) + ((ph_vlng is s) + ((R:Segment.n.ph_ctype is s) + ((R:Segment.n.ph_cplace is v) + ((R:Segment.nn.ph_cplace is a) + ((0.583211 0.0724331)) + ((0.434605 -0.229857))) + ((R:Segment.p.ph_cplace is a) + ((R:SylStructure.parent.position_type is single) + ((0.785502 -0.00061573)) + ((0.544995 
-0.432984))) + ((R:Segment.nn.ph_cplace is 0) + ((0.507071 -0.715041)) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.506404 -0.573733)) + ((0.62466 -0.3356)))))) + ((R:Segment.p.ph_cplace is l) + ((0.571756 -0.819693)) + ((lisp_coda_stop is 0) + ((R:SylStructure.parent.position_type is initial) + ((0.906891 -0.352911)) + ((R:Segment.n.ph_ctype is r) + ((0.620335 -0.445714)) + ((R:SylStructure.parent.parent.word_numsyls < 2.5) + ((R:Segment.p.ph_cvox is +) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.484057 -0.781483)) + ((0.653917 -0.615429))) + ((0.754814 -0.531845))) + ((0.493988 -0.881596))))) + ((0.792979 -0.32648))))) + ((R:Segment.p.ph_cvox is +) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((lisp_coda_stop is 0) + ((0.913526 -0.195111)) + ((0.56564 -0.64867))) + ((R:SylStructure.parent.position_type is single) + ((R:Segment.n.ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.790882 -0.488954)) + ((0.780221 -0.185138))) + ((0.487794 -0.691338))) + ((R:Segment.p.ph_ctype is n) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.595729 -0.771698)) + ((0.57908 -1.06592))) + ((R:Segment.pp.ph_vfront is 0) + ((0.591417 -0.784735)) + ((0.486298 -0.436971)))))) + ((ph_vlng is 0) + ((0.629869 -0.960652)) + ((R:Segment.n.ph_ctype is r) + ((R:Segment.nn.ph_cplace is 0) + ((0.591783 -0.671576)) + ((R:Segment.nn.ph_cvox is +) + ((0.365135 -0.822844)) + ((0.428573 -0.988434)))) + ((lisp_coda_stop is 0) + ((R:Segment.p.ph_cplace is a) + ((R:Segment.n.ph_cplace is a) + ((0.428189 -0.730057)) + ((0.337443 -0.861764))) + ((0.57354 -0.494602))) + ((0.497606 -0.414451)))))))))) + ((ph_vlng is l) + ((R:Segment.pp.ph_vfront is 1) + ((0.937199 0.833877)) + ((R:SylStructure.parent.syl_out < 12.7) + ((0.729202 0.344121)) + ((0.71086 0.101855)))) + ((syl_initial is 0) + ((R:Segment.p.ph_ctype is r) + ((R:Segment.nn.ph_cplace is a) + ((0.844815 0.175273)) + ((0.662523 -0.297527))) + ((ph_vlng is 0) 
+ ((R:Segment.p.ph_ctype is s) + ((R:SylStructure.parent.syl_out < 14.6) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.665332 -0.610529)) + ((0.42276 -0.848942))) + ((0.427946 -0.980726))) + ((R:SylStructure.parent.position_type is single) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((0.523367 -0.825038)) + ((0.635654 -0.535303))) + ((R:SylStructure.parent.position_type is final) + ((0.515996 -0.707614)) + ((ph_cplace is a) + ((lisp_coda_stop is 0) + ((0.689738 0.0446601)) + ((0.698347 -0.268593))) + ((R:Segment.nn.ph_cplace is a) + ((0.706504 -0.659172)) + ((0.775589 -0.201769))))))) + ((0.79472 -0.0539192)))) + ((ph_ctype is s) + ((R:SylStructure.parent.position_type is single) + ((R:Segment.p.ph_ctype is f) + ((0.641302 0.532411)) + ((R:Segment.n.ph_vrnd is +) + ((0.800655 0.325651)) + ((0.894711 0.0487864)))) + ((R:SylStructure.parent.position_type is initial) + ((R:Segment.nn.ph_cplace is a) + ((0.618082 -0.0190591)) + ((0.733637 0.156329))) + ((ph_cplace is a) + ((R:SylStructure.parent.parent.word_numsyls < 2.3) + ((0.372869 -0.0827845)) + ((0.494988 0.0882778))) + ((0.593526 -0.335404))))) + ((R:Segment.p.ph_cvox is +) + ((R:Segment.p.ph_ctype is n) + ((R:SylStructure.parent.syl_out < 5.4) + ((1.0207 -0.152517)) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.711277 -0.513467)) + ((0.509207 -0.726794)))) + ((ph_cplace is g) + ((0.545188 -0.568352)) + ((R:Segment.p.ph_cplace is a) + ((ph_ctype is n) + ((0.61149 -0.325094)) + ((R:SylStructure.parent.position_type is single) + ((R:Segment.p.ph_ctype is r) + ((0.525282 0.395446)) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((0.85778 0.0760293)) + ((0.704055 0.290369)))) + ((R:Segment.pp.ph_vfront is 0) + ((0.590093 0.136983)) + ((0.734563 -0.0570759))))) + ((R:Segment.pp.ph_vfront is 2) + ((0.519485 -0.477174)) + ((0.707546 -0.13584)))))) + ((R:SylStructure.parent.position_type is single) + ((R:Segment.p.ph_ctype is f) + ((0.797877 0.00462775)) + 
((R:Segment.pp.ph_vfront is 1) + ((0.852184 -0.259914)) + ((0.65313 -0.492506)))) + ((R:SylStructure.parent.position_type is initial) + ((0.662516 -0.45585)) + ((lisp_onset_glide is 0) + ((0.652534 -0.652428)) + ((0.482818 -0.885728)))))))))))) + ((syl_initial is 0) + ((ph_cplace is 0) + ((R:SylStructure.parent.position_type is single) + ((R:Segment.n.ph_ctype is f) + ((R:Segment.p.ph_cplace is a) + ((R:Segment.n.ph_cplace is a) + ((R:Segment.pp.ph_vfront is 0) + ((1.06157 1.30945)) + ((1.12041 1.85843))) + ((1.05622 0.921414))) + ((R:Segment.nn.ph_cvox is -) + ((1.03073 0.916168)) + ((1.06857 0.452851)))) + ((R:Segment.p.ph_ctype is r) + ((R:Segment.n.ph_cplace is v) + ((1.22144 0.672433)) + ((R:Segment.p.ph_cplace is l) + ((0.859749 -0.315152)) + ((R:Segment.nn.ph_cvox is -) + ((0.89862 0.131037)) + ((0.760033 -0.121252))))) + ((R:SylStructure.parent.syl_out < 8.8) + ((R:SylStructure.parent.syl_out < 0.8) + ((1.06821 1.63716)) + ((R:Segment.n.ph_cplace is a) + ((R:Segment.p.ph_cvox is +) + ((1.04477 0.581686)) + ((R:Segment.nn.ph_cvox is +) + ((0.769059 0.301576)) + ((0.953428 0.0764058)))) + ((R:Segment.p.ph_cplace is a) + ((1.01367 0.507761)) + ((1.2827 0.945031))))) + ((R:Segment.n.ph_cplace is l) + ((0.618397 -0.0873608)) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 0) + ((R:Segment.p.ph_cvox is +) + ((0.817182 0.477262)) + ((0.792181 -0.0592145))) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((R:SylStructure.parent.syl_out < 16) + ((0.995411 0.497843)) + ((0.784087 0.152266))) + ((1.11816 0.716352)))))))) + ((R:Segment.n.ph_ctype is f) + ((R:SylStructure.parent.position_type is final) + ((1.35724 1.06028)) + ((R:Segment.p.ph_ctype is r) + ((R:SylStructure.parent.syl_out < 8.6) + ((0.511716 -0.0833005)) + ((0.492142 -0.30212))) + ((R:Segment.n.ph_cplace is b) + ((0.53059 0.00266551)) + ((R:SylStructure.parent.parent.word_numsyls < 2.3) + ((ph_vlng is l) + ((0.433396 0.821463)) + ((0.66915 0.415614))) + ((0.501369 0.154721)))))) + 
((R:SylStructure.parent.position_type is final) + ((R:Segment.n.ph_ctype is s) + ((1.03896 0.524706)) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((1.15147 0.428386)) + ((R:Segment.p.ph_cplace is a) + ((0.919929 0.0314637)) + ((0.716168 -0.366629))))) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 4) + ((0.816778 0.408786)) + ((lisp_onset_glide is 0) + ((R:Segment.p.ph_ctype is n) + ((R:Segment.n.ph_ctype is s) + ((0.532911 -0.153851)) + ((0.633518 -0.762353))) + ((R:Segment.p.ph_cvox is -) + ((R:Segment.p.ph_cplace is g) + ((0.618376 -0.593197)) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((R:Segment.pp.ph_vfront is 0) + ((R:Segment.n.ph_ctype is n) + ((0.554085 -0.058903)) + ((R:Segment.p.ph_cplace is a) + ((0.59842 -0.174458)) + ((0.585539 -0.349335)))) + ((0.500857 -0.416613))) + ((R:SylStructure.parent.syl_out < 7) + ((0.616683 -0.00213272)) + ((0.631444 -0.141773))))) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 0) + ((0.5198 -0.151901)) + ((ph_vlng is s) + ((0.677428 0.203522)) + ((0.780789 0.375429)))))) + ((R:Segment.nn.ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.594604 -0.27832)) + ((0.736114 -0.422756))) + ((R:Segment.p.ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.512186 -0.732785)) + ((0.550759 -0.506471))) + ((0.47297 -0.791841))))))))) + ((R:Segment.p.ph_ctype is 0) + ((R:SylStructure.parent.position_type is final) + ((lisp_coda_stop is 0) + ((ph_ctype is f) + ((R:Segment.nn.ph_cplace is 0) + ((1.00978 0.366105)) + ((0.80682 -0.0827529))) + ((R:Segment.n.ph_cplace is a) + ((R:Segment.nn.ph_cvox is -) + ((1.07097 1.77503)) + ((1.14864 1.14754))) + ((R:Segment.n.ph_vrnd is -) + ((0.883474 0.286471)) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((1.22264 0.884142)) + ((1.03401 0.658192)))))) + ((ph_cplace is a) + ((R:SylStructure.parent.syl_out < 6.4) + ((R:SylStructure.parent.syl_out < 0.6) + ((1.07956 0.602849)) + ((1.12301 0.0555897))) + 
((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.898888 -0.17527)) + ((0.940932 0.274301)))) + ((1.10093 -0.68098)))) + ((R:Segment.n.ph_ctype is s) + ((ph_cplace is v) + ((0.639932 -1.33353)) + ((R:SylStructure.parent.position_type is single) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 0) + ((lisp_coda_stop is 0) + ((0.822882 -0.131692)) + ((0.971957 -0.385365))) + ((R:Segment.nn.ph_cvox is -) + ((1.06611 0.183678)) + ((lisp_coda_stop is 0) + ((0.967183 0.0925019)) + ((0.876026 -0.230108))))) + ((ph_ctype is f) + ((R:SylStructure.parent.syl_out < 13) + ((0.589198 -0.655594)) + ((0.476651 -0.926625))) + ((R:SylStructure.parent.syl_out < 5) + ((0.682936 -0.227662)) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((R:Segment.nn.ph_cplace is a) + ((0.447309 -0.700998)) + ((0.626113 -0.468853))) + ((0.657893 -0.383607))))))) + ((ph_ctype is r) + ((R:Segment.nn.ph_cvox is -) + ((1.15158 1.15233)) + ((R:Segment.n.ph_vrnd is -) + ((1.05554 0.533749)) + ((0.955478 0.0841894)))) + ((ph_ctype is l) + ((R:Segment.n.ph_ctype is 0) + ((R:Segment.nn.ph_cplace is a) + ((0.766431 0.28943)) + ((1.48633 1.09574))) + ((R:SylStructure.parent.position_type is single) + ((1.01777 0.474653)) + ((0.545859 -0.402743)))) + ((R:SylStructure.parent.syl_out < 4.8) + ((R:Segment.n.ph_vc is +) + ((ph_ctype is n) + ((0.776645 -0.433859)) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.776179 0.23435)) + ((R:SylStructure.parent.parent.word_numsyls < 2.2) + ((0.744272 -0.0859672)) + ((0.782605 0.115647)))) + ((0.626541 -0.167615)))) + ((R:Segment.n.seg_onsetcoda is coda) + ((1.28499 0.864144)) + ((ph_cplace is a) + ((0.926103 0.0435837)) + ((0.839172 -0.189514))))) + ((R:Segment.n.ph_ctype is n) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.1) + ((0.973489 -0.203415)) + ((0.777589 -0.849733))) + ((ph_ctype is n) + ((R:SylStructure.parent.position_type is initial) + ((R:Segment.n.ph_vc is +) + 
((0.743482 -0.53384)) + ((0.619309 -0.0987861))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((1.15555 0.0786295)) + ((1.06689 0.681662)))) + ((R:Segment.n.ph_ctype is r) + ((R:SylStructure.parent.syl_out < 8.9) + ((0.752079 -0.237421)) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.664182 -0.041521)) + ((0.772712 0.103499)))) + ((R:Segment.n.seg_onsetcoda is coda) + ((R:SylStructure.parent.position_type is mid) + ((R:SylStructure.parent.parent.word_numsyls < 3.3) + ((0.715944 -0.275113)) + ((0.675729 0.202848))) + ((R:Segment.n.ph_vrnd is -) + ((R:SylStructure.parent.syl_out < 8.3) + ((ph_ctype is s) + ((0.82747 -0.116723)) + ((0.689586 -0.303909))) + ((R:SylStructure.parent.syl_out < 17.7) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.659686 -0.621268)) + ((ph_cplace is a) + ((0.861741 -0.285324)) + ((0.507102 -0.444082)))) + ((0.850664 -0.269084)))) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.878643 -0.255833)) + ((0.98882 0.115252))))) + ((ph_cplace is a) + ((R:SylStructure.parent.syl_out < 13) + ((0.850625 -0.289333)) + ((0.788154 -0.44844))) + ((0.70482 -0.630276)))))))))))) + ((R:Segment.p.ph_ctype is l) + ((R:SylStructure.parent.position_type is single) + ((0.873748 -0.21639)) + ((lisp_coda_stop is 0) + ((0.71002 0.428132)) + ((0.703501 0.015833)))) + ((ph_vlng is 0) + ((R:Segment.p.ph_ctype is r) + ((R:SylStructure.parent.position_type is initial) + ((0.907151 -0.494409)) + ((ph_ctype is s) + ((0.782539 -0.398555)) + ((R:Segment.p.ph_cplace is 0) + ((0.767435 -0.298857)) + ((0.767046 0.151217))))) + ((ph_cplace is a) + ((R:Segment.n.ph_ctype is r) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.689367 0.0195991)) + ((0.64446 -0.256648))) + ((R:Segment.n.ph_vc is +) + ((ph_ctype is s) + ((R:Segment.nn.ph_cvox is +) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.59482 -0.214443)) + ((0.745691 0.0292177))) + ((0.523103 -0.391245))) + ((R:Segment.p.ph_cvox is +) + 
((R:Segment.p.ph_cplace is a) + ((0.524304 -0.428306)) + ((0.605117 -0.165604))) + ((R:Segment.p.ph_ctype is f) + ((0.491251 -0.455353)) + ((lisp_coda_stop is 0) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((0.175021 -1.02136)) + ((0.264113 -0.976809))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((0.704803 -0.716976)) + ((0.300317 -0.924727))))))) + ((ph_ctype is f) + ((R:SylStructure.parent.syl_out < 13) + ((R:Segment.n.ph_ctype is s) + ((0.731994 -0.711044)) + ((0.768008 -0.415076))) + ((0.691821 -0.803284))) + ((R:Segment.nn.ph_cplace is 0) + ((R:Segment.n.ph_cplace is a) + ((0.569567 -0.993506)) + ((0.689849 -0.761696))) + ((0.386818 -1.14744)))))) + ((R:Segment.p.seg_onsetcoda is coda) + ((R:Segment.p.ph_cplace is a) + ((0.746337 -0.866206)) + ((0.532751 -1.22185))) + ((ph_cplace is l) + ((0.74942 -0.820648)) + ((0.685988 -0.298146)))))) + ((0.812766 0.17291)))))) + ((R:SylStructure.parent.position_type is mid) + ((ph_ctype is r) + ((0.577775 -0.54714)) + ((R:Segment.n.ph_ctype is f) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.370448 0.00076407)) + ((0.460385 0.20631))) + ((R:Segment.p.ph_cvox is -) + ((ph_vlng is 0) + ((0.615959 -0.57434)) + ((0.50852 -0.197814))) + ((R:Segment.n.ph_ctype is 0) + ((1.34281 0.477163)) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.59975 -0.1342)) + ((0.640294 -0.32653))))))) + ((R:Segment.n.ph_ctype is f) + ((R:SylStructure.parent.position_type is initial) + ((0.758739 0.311943)) + ((R:Segment.n.seg_onsetcoda is coda) + ((R:Segment.p.ph_ctype is f) + ((1.28746 1.99771)) + ((R:Segment.pp.ph_vfront is 1) + ((1.42474 1.76925)) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((0.979414 1.37583)) + ((1.00321 1.06671))))) + ((1.15222 0.852004)))) + ((R:Segment.p.ph_ctype is 0) + ((R:Segment.n.ph_ctype is s) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.664807 -0.0880262)) + ((0.573589 0.217234))) + ((ph_ctype is s) + ((ph_cplace is l) + 
((0.800348 0.66579)) + ((ph_cplace is a) + ((0.859133 1.46854)) + ((R:SylStructure.parent.position_type is single) + ((0.692229 1.23671)) + ((0.552426 0.923928))))) + ((R:SylStructure.parent.syl_out < 9.2) + ((R:SylStructure.parent.position_type is single) + ((R:SylStructure.parent.syl_out < 3.6) + ((1.01673 1.26824)) + ((0.848274 0.92375))) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 1) + ((R:Segment.nn.ph_cplace is a) + ((0.788163 0.818855)) + ((0.822028 1.01227))) + ((0.8365 0.483313)))) + ((lisp_coda_stop is 0) + ((R:Segment.nn.ph_cvox is +) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.807795 0.670829)) + ((0.773774 0.435486))) + ((0.849529 0.103561))) + ((0.858848 0.763836)))))) + ((R:Segment.n.ph_vrnd is -) + ((ph_vlng is 0) + ((R:SylStructure.parent.position_type is final) + ((ph_cplace is a) + ((R:Segment.nn.ph_cvox is -) + ((0.691915 -0.42124)) + ((R:Segment.p.ph_cplace is a) + ((0.773696 0.354001)) + ((0.65495 -0.14321)))) + ((0.610433 -0.479739))) + ((R:Segment.p.ph_ctype is r) + ((R:SylStructure.parent.R:Syllable.n.syl_break is 0) + ((0.560921 0.384674)) + ((0.895267 0.746476))) + ((R:Segment.p.ph_ctype is l) + ((0.704694 0.568012)) + ((R:Segment.p.ph_cplace is b) + ((1.34739 0.539049)) + ((R:Segment.p.ph_ctype is s) + ((R:SylStructure.parent.syl_out < 12.9) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.807285 0.151429)) + ((0.988033 0.383763))) + ((0.878655 0.102291))) + ((ph_ctype is n) + ((0.759582 -0.315096)) + ((R:SylStructure.parent.syl_out < 8.8) + ((R:Segment.pp.ph_vfront is 0) + ((0.846546 0.000647117)) + ((R:Segment.pp.ph_vfront is 1) + ((0.586216 0.150701)) + ((0.793898 0.379041)))) + ((lisp_coda_stop is 0) + ((ph_ctype is f) + ((0.74736 -0.31103)) + ((0.715751 -0.00576581))) + ((0.914486 0.17528)))))))))) + ((1.24204 0.908819))) + ((ph_ctype is s) + ((ph_cplace is a) + ((0.864408 1.35528)) + ((R:Segment.n.seg_onsetcoda is coda) + ((0.85602 0.344576)) + ((0.869622 0.659223)))) + ((R:Segment.nn.ph_cvox 
is -) + ((R:Segment.n.ph_ctype is s) + ((R:Segment.nn.ph_cplace is 0) + ((0.942964 1.27475)) + ((0.978218 0.650268))) + ((R:SylStructure.parent.syl_out < 3.9) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((1.32463 1.05026)) + ((0.896966 0.417727))) + ((R:Segment.p.ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 0) + ((0.776698 0.195369)) + ((0.969518 0.432394))) + ((0.799096 -0.0203318))))) + ((ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.680861 -0.315846)) + ((R:SylStructure.parent.R:Syllable.nn.syl_break is 1) + ((0.954393 0.0965487)) + ((0.884928 0.372884)))) + ((lisp_coda_stop is 0) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((R:SylStructure.parent.position_type is final) + ((1.03696 0.565834)) + ((0.906661 0.277961))) + ((R:SylStructure.parent.position_type is final) + ((0.778429 -0.0967381)) + ((0.863993 0.314023)))) + ((R:Segment.p.ph_cplace is a) + ((R:SylStructure.parent.R:Syllable.p.stress is 0) + ((0.898898 0.571009)) + ((0.830278 0.787486))) + ((1.1101 0.333888))))))))))))) +;; RMSE 0.7726 Correlation is 0.5943 Mean (abs) Error 0.5752 (0.5160) + +)) + +(provide 'f2bdurtreeZ) diff --git a/CosyVoice-ttsfrd/resource/festival/f2bf0lr.scm b/CosyVoice-ttsfrd/resource/festival/f2bf0lr.scm new file mode 100644 index 0000000000000000000000000000000000000000..6a066719ff42f98780e88056c0651433a14c983e --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/f2bf0lr.scm @@ -0,0 +1,314 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; First attempt at a linear regression model to predict F0 values. +;;; This is an attempt to reimplement the work in Black and +;;; Hunt ICSLP96, though this model probably isn't as good. 
+;;; + +;;;start +;;; R2 = 0.251, F(74, 12711) = 57.5, Prob>F = 0.000 +;;; RMSE = 27.877 +;;;mid +;;; R2 = 0.332, F(74, 12711) = 85.6, Prob>F = 0.000 +;;; RMSE = 28.293 +;;;end +;;; R2 = 0.292, F(74, 12711) = 70.8, Prob>F = 0.000 +;;; RMSE = 27.139 + +;;; emph_syl: returns 2.0 when the syllable's tobi_accent is not "NONE" and +;;; the EMPH feature of its parent token is "1"; returns 0.0 otherwise. +;;; (Modified from the CSTR original: nested ifs rewritten as one cond.) +(define (emph_syl syl) + (cond + ((string-equal (item.feat syl "tobi_accent") "NONE") + 0.0) ;; no accent on the syllable: never emphasised + ((string-equal + (item.feat syl "R:SylStructure.parent.R:Token.parent.EMPH") + "1") + 2.0) ;; accented syllable inside a token marked EMPH + (t + 0.0))) + +(set! f2b_f0_lr_start +'( +( Intercept 160.584956 ) +( R:SylStructure.parent.R:Token.parent.EMPH 10.0 ) +( pp.tobi_accent 10.081770 (H*) ) +( pp.tobi_accent 3.358613 (!H*) ) +( pp.tobi_accent 4.144342 (*? X*? H*!H* * L+H* L+!H*) ) +( pp.tobi_accent -1.111794 (L*) ) +( pp.tobi_accent 19.646313 (L*+H L*+!H) ) +( p.tobi_accent 32.081029 (H*) ) +( p.tobi_accent 18.090033 (!H*) ) +( p.tobi_accent 23.255280 (*? X*? H*!H* * L+H* L+!H*) ) +( p.tobi_accent -9.623577 (L*) ) +( p.tobi_accent 26.517095 (L*+H L*+!H) ) +( tobi_accent 5.221081 (H*) ) +( tobi_accent 10.159194 (!H*) ) +( tobi_accent 3.645511 (*? X*? H*!H* * L+H* L+!H*) ) +( tobi_accent -5.720030 (L*) ) +( tobi_accent -6.355773 (L*+H L*+!H) ) +( n.tobi_accent -5.691933 (H*) ) +( n.tobi_accent 8.265606 (!H*) ) +( n.tobi_accent 0.861427 (*? X*? H*!H* * L+H* L+!H*) ) +( n.tobi_accent 1.270504 (L*) ) +( n.tobi_accent 3.499418 (L*+H L*+!H) ) +( nn.tobi_accent -3.785701 (H*) ) +( nn.tobi_accent 7.013446 (!H*) ) +( nn.tobi_accent 2.637494 (*? X*? H*!H* * L+H* L+!H*) ) +( nn.tobi_accent -0.392176 (L*) ) +( nn.tobi_accent -2.957502 (L*+H L*+!H) ) +( pp.tobi_endtone -3.531153 (L-L%) ) +( pp.tobi_endtone 0.131156 (L-) ) +( pp.tobi_endtone 2.729199 (H-L% !H-L% -X?) ) +( pp.tobi_endtone 8.258756 (L-H%) ) +( pp.tobi_endtone 5.836487 (H-) ) +( pp.tobi_endtone 11.213440 (!H- H-H%) ) +( R:Syllable.p.tobi_endtone -28.081359 (L-L%) ) +( R:Syllable.p.tobi_endtone -20.553145 (L-) ) +( R:Syllable.p.tobi_endtone -5.442577 (H-L% !H-L% -X?) 
) +( R:Syllable.p.tobi_endtone -6.585836 (L-H%) ) +( R:Syllable.p.tobi_endtone 8.537044 (H-) ) +( R:Syllable.p.tobi_endtone 4.243342 (!H- H-H%) ) +( tobi_endtone -9.333926 (L-L%) ) +( tobi_endtone -0.346711 (L-) ) +( tobi_endtone -0.507352 (H-L% !H-L% -X?) ) +( tobi_endtone -0.937483 (L-H%) ) +( tobi_endtone 9.472265 (H-) ) +( tobi_endtone 14.256898 (!H- H-H%) ) +( n.tobi_endtone -13.084253 (L-L%) ) +( n.tobi_endtone -1.060688 (L-) ) +( n.tobi_endtone -7.947205 (H-L% !H-L% -X?) ) +( n.tobi_endtone -5.471592 (L-H%) ) +( n.tobi_endtone -0.095669 (H-) ) +( n.tobi_endtone 4.933708 (!H- H-H%) ) +( nn.tobi_endtone -14.993470 (L-L%) ) +( nn.tobi_endtone -3.784284 (L-) ) +( nn.tobi_endtone -15.505132 (H-L% !H-L% -X?) ) +( nn.tobi_endtone -11.352400 (L-H%) ) +( nn.tobi_endtone -5.551627 (H-) ) +( nn.tobi_endtone -0.661581 (!H- H-H%) ) +( pp.old_syl_break -3.367677 ) +( p.old_syl_break 0.641755 ) +( old_syl_break -0.659002 ) +( n.old_syl_break 1.217358 ) +( nn.old_syl_break 2.974502 ) +( pp.stress 1.588098 ) +( p.stress 3.693430 ) +( stress 2.009843 ) +( n.stress 1.645560 ) +( nn.stress 1.926870 ) +( syl_in 1.048362 ) +( syl_out 0.315553 ) +( ssyl_in -2.096079 ) +( ssyl_out 0.303531 ) +( asyl_in -4.257915 ) +( asyl_out -2.422424 ) +( last_accent -0.397647 ) +( next_accent -0.418613 ) +( sub_phrases -5.472055 ) +)) + +(set! f2b_f0_lr_mid +'( +( Intercept 169.183377 ) +( R:SylStructure.parent.R:Token.parent.EMPH 10.0 ) +( pp.tobi_accent 4.923247 (H*) ) +( pp.tobi_accent 0.955474 (!H*) ) +( pp.tobi_accent 1.193597 (*? X*? H*!H* * L+H* L+!H*) ) +( pp.tobi_accent 1.501383 (L*) ) +( pp.tobi_accent 7.992120 (L*+H L*+!H) ) +( p.tobi_accent 16.603350 (H*) ) +( p.tobi_accent 11.665814 (!H*) ) +( p.tobi_accent 13.063298 (*? X*? H*!H* * L+H* L+!H*) ) +( p.tobi_accent -2.288798 (L*) ) +( p.tobi_accent 29.168430 (L*+H L*+!H) ) +( tobi_accent 34.517868 (H*) ) +( tobi_accent 22.349656 (!H*) ) +( tobi_accent 23.551548 (*? X*? 
H*!H* * L+H* L+!H*) ) +( tobi_accent -14.117284 (L*) ) +( tobi_accent -5.978760 (L*+H L*+!H) ) +( n.tobi_accent -1.914945 (H*) ) +( n.tobi_accent 5.249441 (!H*) ) +( n.tobi_accent -1.929947 (*? X*? H*!H* * L+H* L+!H*) ) +( n.tobi_accent -3.287877 (L*) ) +( n.tobi_accent -4.980375 (L*+H L*+!H) ) +( nn.tobi_accent -6.147251 (H*) ) +( nn.tobi_accent 8.408949 (!H*) ) +( nn.tobi_accent 3.193500 (*? X*? H*!H* * L+H* L+!H*) ) +( nn.tobi_accent 1.323099 (L*) ) +( nn.tobi_accent 9.148058 (L*+H L*+!H) ) +( pp.tobi_endtone 4.255273 (L-L%) ) +( pp.tobi_endtone -1.033377 (L-) ) +( pp.tobi_endtone 11.992045 (H-L% !H-L% -X?) ) +( pp.tobi_endtone 6.989573 (L-H%) ) +( pp.tobi_endtone 2.598854 (H-) ) +( pp.tobi_endtone 12.178307 (!H- H-H%) ) +( R:Syllable.p.tobi_endtone -4.397973 (L-L%) ) +( R:Syllable.p.tobi_endtone -6.157077 (L-) ) +( R:Syllable.p.tobi_endtone 5.530608 (H-L% !H-L% -X?) ) +( R:Syllable.p.tobi_endtone 6.938086 (L-H%) ) +( R:Syllable.p.tobi_endtone 6.162763 (H-) ) +( R:Syllable.p.tobi_endtone 8.035727 (!H- H-H%) ) +( tobi_endtone -19.357902 (L-L%) ) +( tobi_endtone -13.877759 (L-) ) +( tobi_endtone -6.176061 (H-L% !H-L% -X?) ) +( tobi_endtone -7.328882 (L-H%) ) +( tobi_endtone 12.694193 (H-) ) +( tobi_endtone 30.923398 (!H- H-H%) ) +( n.tobi_endtone -17.727785 (L-L%) ) +( n.tobi_endtone -2.539592 (L-) ) +( n.tobi_endtone -8.126830 (H-L% !H-L% -X?) ) +( n.tobi_endtone -8.701685 (L-H%) ) +( n.tobi_endtone -1.006439 (H-) ) +( n.tobi_endtone 6.834498 (!H- H-H%) ) +( nn.tobi_endtone -15.407530 (L-L%) ) +( nn.tobi_endtone -2.974196 (L-) ) +( nn.tobi_endtone -12.287673 (H-L% !H-L% -X?) 
) +( nn.tobi_endtone -7.621437 (L-H%) ) +( nn.tobi_endtone -0.458837 (H-) ) +( nn.tobi_endtone 3.170632 (!H- H-H%) ) +( pp.old_syl_break -4.196950 ) +( p.old_syl_break -5.176929 ) +( old_syl_break 0.047922 ) +( n.old_syl_break 2.153968 ) +( nn.old_syl_break 2.577074 ) +( pp.stress -2.368192 ) +( p.stress 1.080493 ) +( stress 1.135556 ) +( n.stress 2.447219 ) +( nn.stress 1.318122 ) +( syl_in 0.291663 ) +( syl_out -0.411814 ) +( ssyl_in -1.643456 ) +( ssyl_out 0.580589 ) +( asyl_in -5.649243 ) +( asyl_out 0.489823 ) +( last_accent 0.216634 ) +( next_accent 0.244134 ) +( sub_phrases -5.758156 ) +)) + + +(set! f2b_f0_lr_end +'( +( Intercept 169.570381 ) +( R:SylStructure.parent.R:Token.parent.EMPH 10.0 ) +( pp.tobi_accent 3.594771 (H*) ) +( pp.tobi_accent 0.432519 (!H*) ) +( pp.tobi_accent 0.235664 (*? X*? H*!H* * L+H* L+!H*) ) +( pp.tobi_accent 1.513892 (L*) ) +( pp.tobi_accent 2.474823 (L*+H L*+!H) ) +( p.tobi_accent 11.214208 (H*) ) +( p.tobi_accent 9.619350 (!H*) ) +( p.tobi_accent 9.084690 (*? X*? H*!H* * L+H* L+!H*) ) +( p.tobi_accent 0.519202 (L*) ) +( p.tobi_accent 26.593112 (L*+H L*+!H) ) +( tobi_accent 25.217589 (H*) ) +( tobi_accent 13.759851 (!H*) ) +( tobi_accent 17.635192 (*? X*? H*!H* * L+H* L+!H*) ) +( tobi_accent -12.149974 (L*) ) +( tobi_accent 13.345913 (L*+H L*+!H) ) +( n.tobi_accent 4.944848 (H*) ) +( n.tobi_accent 7.398383 (!H*) ) +( n.tobi_accent 1.683011 (*? X*? H*!H* * L+H* L+!H*) ) +( n.tobi_accent -6.516900 (L*) ) +( n.tobi_accent -6.768201 (L*+H L*+!H) ) +( nn.tobi_accent -4.335797 (H*) ) +( nn.tobi_accent 5.656462 (!H*) ) +( nn.tobi_accent 0.263288 (*? X*? H*!H* * L+H* L+!H*) ) +( nn.tobi_accent 1.022002 (L*) ) +( nn.tobi_accent 6.702368 (L*+H L*+!H) ) +( pp.tobi_endtone 10.274958 (L-L%) ) +( pp.tobi_endtone 3.129947 (L-) ) +( pp.tobi_endtone 15.476240 (H-L% !H-L% -X?) 
) +( pp.tobi_endtone 10.446935 (L-H%) ) +( pp.tobi_endtone 6.104384 (H-) ) +( pp.tobi_endtone 14.182688 (!H- H-H%) ) +( R:Syllable.p.tobi_endtone 1.767454 (L-L%) ) +( R:Syllable.p.tobi_endtone -1.040077 (L-) ) +( R:Syllable.p.tobi_endtone 18.438093 (H-L% !H-L% -X?) ) +( R:Syllable.p.tobi_endtone 8.750018 (L-H%) ) +( R:Syllable.p.tobi_endtone 5.000340 (H-) ) +( R:Syllable.p.tobi_endtone 10.913437 (!H- H-H%) ) +( tobi_endtone -12.637935 (L-L%) ) +( tobi_endtone -13.597961 (L-) ) +( tobi_endtone -6.501965 (H-L% !H-L% -X?) ) +( tobi_endtone 8.747483 (L-H%) ) +( tobi_endtone 15.165833 (H-) ) +( tobi_endtone 50.190326 (!H- H-H%) ) +( n.tobi_endtone -16.965781 (L-L%) ) +( n.tobi_endtone -5.222475 (L-) ) +( n.tobi_endtone -7.358555 (H-L% !H-L% -X?) ) +( n.tobi_endtone -7.833168 (L-H%) ) +( n.tobi_endtone 4.701087 (H-) ) +( n.tobi_endtone 10.349902 (!H- H-H%) ) +( nn.tobi_endtone -15.369483 (L-L%) ) +( nn.tobi_endtone -2.207161 (L-) ) +( nn.tobi_endtone -9.363835 (H-L% !H-L% -X?) ) +( nn.tobi_endtone -7.052374 (L-H%) ) +( nn.tobi_endtone 2.207854 (H-) ) +( nn.tobi_endtone 5.271546 (!H- H-H%) ) +( pp.old_syl_break -4.745862 ) +( p.old_syl_break -5.685178 ) +( old_syl_break -2.633291 ) +( n.old_syl_break 1.678340 ) +( nn.old_syl_break 2.274729 ) +( pp.stress -2.747198 ) +( p.stress 0.306724 ) +( stress -0.565613 ) +( n.stress 2.838327 ) +( nn.stress 1.285244 ) +( syl_in 0.169955 ) +( syl_out -1.045661 ) +( ssyl_in -1.487774 ) +( ssyl_out 0.752405 ) +( asyl_in -5.081677 ) +( asyl_out 3.016218 ) +( last_accent 0.312900 ) +( next_accent 0.837992 ) +( sub_phrases -5.397805 ) + +)) + +;; groups +;; tobi_accent_1 25.217589 (H*) ) +;; tobi_accent_2 13.759851 (!H*) ) +;; tobi_accent_3 17.635192 (*? X*? H*!H* * L+H* L+!H*) ) +;; tobi_accent_4 -12.149974 (L*) ) +;; tobi_accent_5 13.345913 (L*+H L*+!H) ) + +;; tobi_endtone_1 10.274958 (L-L%) ) +;; tobi_endtone_2 3.129947 (L-) ) +;; tobi_endtone_3 15.476240 (H-L% !H-L% -X?) 
) +;; tobi_endtone_4 10.446935 (L-H%) ) +;; tobi_endtone_5 6.104384 (H-) ) +;; tobi_endtone_6 14.182688 (!H- H-H%) ) + +(provide 'f2bf0lr) + diff --git a/CosyVoice-ttsfrd/resource/festival/festdoc.scm b/CosyVoice-ttsfrd/resource/festival/festdoc.scm new file mode 100644 index 0000000000000000000000000000000000000000..13bc5ddd4b261dd39c43ad9f8e0bfcaa987bdca9 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/festdoc.scm @@ -0,0 +1,178 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Alan W Black +;;; Date: August 1996 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Save documentation strings as texinfo files +;;; +;;; Finds all functions with documentation, and all variables with +;;; documentation, sorts and dumps the information in doc/festfunc.texi +;;; and doc/festvars.texi +;;; (make-doc below additionally writes festfeat.texi for features) +;;; +;;; The makefile in the doc directory runs the compiled festival binary and +;;; causes these files to be created from the currently defined functions +;;; and variables +;;; +;;; Also provides functions to extract the manual section named in a +;;; documentation string and send a URL to Netscape to display it +;;; + +(define (make-doc) +"(make-doc) +Find function and variable document strings and save them in texinfo +format to respective files." 
+ (format t "Making function, feature and variable lists\n") + + ;; Need to ensure all library files are actually loaded if they contain + ;; functions/variables which have to be put in the manual + (require 'display) + (require 'mbrola) + (require 'tilt) + + ;; One texinfo table per kind of doclist; file names are relative, so the + ;; files land in whatever directory festival is run from. + (make-a-doc "festfunc.texi" 'function) + (make-a-doc "festfeat.texi" 'features) + (make-a-doc "festvars.texi" 'vars)) + +(define (make-a-doc outfile doclist) +"(make-a-doc FILENAME DOCLIST) +Make a texinfo document in FILENAME as a texinfo table, items are +from DOCLIST. DOCLIST names which doclist to use, it may be +one of 'function, 'features or 'vars." + (let ((outfp (fopen outfile "wb"))) + (format outfp "@table @code\n") + ;; Yes I am so lazy I'm not willing to write a sort function in Scheme + ;; (sort-and-dump-docstrings is not defined in this file -- presumably a + ;; built-in that writes the sorted entries to the open file; confirm) + (sort-and-dump-docstrings doclist outfp) + (format outfp "@end table\n") + (fclose outfp))) + +;;; +;;; Documentation string may refer to a section in the manual +;;; If it does then we can automatically go to that section in the +;;; menu using Netscape. +;;; + +(defvar manual-browser "netscape" +"manual-browser +The Unix program name of your Netscape Navigator browser. +[see Getting some help]") + +(defvar manual-url + (format nil "http://www.cstr.ed.ac.uk/projects/festival/manual-%s.%s.%s/" + (car festival_version_number) + (car (cdr festival_version_number)) + (car (cdr (cdr festival_version_number)))) +"manual-url +The default URL for the Festival Manual in html format. You may +reset this to a file://.../... type URL on you're local machine. +[see Getting some help]") + +;;; Paul got this idea from VM, the email system for emacs and +;;; I found out how to do this from their code, thanks Kyle +;;; NOTE(review): the URL is spliced unescaped into a command line handed to +;;; (system ...), so callers must only pass trusted URLs. + +(define (send-url-to-netscape url) +"(send-url-to-netscape URL) +Send given URL to netscape for display. This is primarily used to +display parts of the manual referenced in documentation strings." 
+ (system + (string-append + manual-browser + " -remote \"openURL( " + url + " )\" "))) + +(define (lastline string) +"(lastline STRING) +Returns the part of the string which between the last newline and the +end of string." + (let ((ns (string-after string "\n"))) + (if (string-equal ns "") + string + (lastline ns)))) + +(define (manual-sym symbol) +"(manual-sym SYMBOL) +Display the section in the manual that SYMBOL's docstring has +identified as the most relevant. The section is named on the +last line of a documentation string with no newlines within it +prefixed by \"[see \" with a \"]\" just immediately before the end +of the documentation string. The manual section name is translated to +the section in the HTML version of the manual and a URL is +and sent to Netscape for display. [see Getting some help]" +(let ((section (string-before (string-after + (lastline (eval (list 'doc symbol))) + "[see ") + "]"))) + (cond + ((string-equal section "") + (eval (list 'doc symbol))) ;; nothing there + (t + (manual section))))) + +(define (manual section) +"(manual SECTION) +Display SECTION in the manual. SECTION is a string identifying +a manual section (it could be an initial substring. If SECTION +is nil or unspecifed then the Manual table of contents is displayed. +This uses netscape to display the manual page so you must have that +(use variable manual-browser to identify it) and the variable +manual-url pointing to a copy of the manual. [see Getting some help]" +(let ((tmpfile (make_tmp_filename)) + (manual-section)) + (cond + ((string-matches section "\"") + (string-append "Invalid section reference containing quote: " + section "\n")) + ((not section) + (send-url-to-netscape (string-append manual-url "festival_toc.html"))) + (t ;; find section in manual + (get_url (string-append manual-url "festival_toc.html") tmpfile) + (system + (string-append + "grep -i \"^
  • .*$//' > \"" + tmpfile ".out\"")) + (set! manual-section (load (string-append tmpfile ".out") t)) + (cond + ((not manual-section) + (string-append "No section called: " section)) + (t + (send-url-to-netscape (string-append manual-url (car manual-section))) + (delete-file tmpfile) + (delete-file (string-append tmpfile ".out")) + "Sent manual reference url to netscape.")))))) + +(provide 'festdoc) + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/festival.el b/CosyVoice-ttsfrd/resource/festival/festival.el new file mode 100644 index 0000000000000000000000000000000000000000..c1899f63a3b98c47faf0988348b70aa87a9fa114 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/festival.el @@ -0,0 +1,282 @@ +;;; +;;; File: festival.el +;;; Emacs Lisp +;;; +;;; Alan W Black CSTR (awb@cstr.ed.ac.uk) June 1996 +;;; +;;; Provide an emacs mode for interfacing to the festival speech +;;; synthesizer system +;;; +;;; I've looked at many examples from the emacs Lisp directory +;;; copying relevant bits from here and there, so this can only +;;; reasonably inherit the GNU licence (GPL) +;;; +;;; Setup: +;;; In your .emacs add the following 2 lines to get a Say menu: +;;; +;;; (autoload 'say-minor-mode "festival" "Menu for using Festival." t) +;;; (say-minor-mode t) +;;; (setq auto-mode-alist +;;; (append '(("\\.festivalrc$" . 
scheme-mode)) auto-mode-alist)) +;;; +;;; The following gives you pretty colors in emacs-19 if you are into +;;; such things +;;; ;;; Some colors for scheme mode +;;; (hilit-set-mode-patterns +;;; '(scheme-mode) +;;; '( +;;; (";.*" nil comment) +;;; (hilit-string-find ?\\ string) +;;; ("^\\s *(def\\s +" "\\()\\|nil\\)" defun) +;;; ("^\\s *(defvar\\s +\\S +" nil decl) +;;; ("^\\s *(set\\s +\\S +" nil decl) +;;; ("^\\s *(defconst\\s +\\S +" nil define) +;;; ("^\\s *(\\(provide\\|require\\).*$" nil include) +;;; ("(\\(let\\*?\\|cond\\|if\\|or\\|and\\|map\\(car\\|concat\\)\\|prog[n1*]?\\|while\\|lambda\\|function\\|Parameter\\|set\\([qf]\\|car\\|cdr\\)?\\|nconc\\|eval-when-compile\\|condition-case\\|unwind-protect\\|catch\\|throw\\|error\\)[ \t\n]" 1 keyword))) +;;; +;;; +;;;-------------------------------------------------------------------- +;;; Copyright (C) Alan W Black 1996 +;;; This code is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY. No author or distributor accepts +;;; responsibility to anyone for the consequences of using this code +;;; or for whether it serves any particular purpose or works at all, +;;; unless explicitly stated in a written agreement. +;;; +;;; Everyone is granted permission to copy, modify and redistribute +;;; this code, but only under the conditions described in the GNU +;;; Emacs General Public License. A copy of this license is +;;; distrubuted with GNU Emacs so you can know your rights and +;;; responsibilities. It should be in a file named COPYING. Among +;;; other things, the copyright notice and this notice must be +;;; preserved on all copies. 
+;;;-------------------------------------------------------------------- +;;; + +(defvar festival-program-name "festival") + +(defvar festival-process nil) + +(defvar festival-tmp-file + (format "/tmp/festival-emacs-tmp-%s" (user-real-login-name)) + "Filename to save input for Festival.") + +(defun festival-fast () + (interactive) + (festival-send-command '(Parameter.set 'Duration.Stretch 0.8))) +(defun festival-slow () + (interactive) + (festival-send-command '(Parameter.set 'Duration.Stretch 1.2))) +(defun festival-ndur () + (interactive) + (festival-send-command '(Parameter.set 'Duration.Stretch 1.0))) +(defun festival-intro () + (interactive) + (festival-send-command '(intro))) + +(defun festival-gsw () + (interactive) + (festival-send-command '(voice_gsw_diphone))) +(defun festival-rab () + (interactive) + (festival-send-command '(voice_rab_diphone))) +(defun festival-ked () + (interactive) + (festival-send-command '(voice_ked_diphone))) +(defun festival-kal () + (interactive) + (festival-send-command '(voice_kal_diphone))) +(defun festival-don () + (interactive) + (festival-send-command '(voice_don_diphone))) +(defun festival-welsh () + (interactive) + (festival-send-command '(voice_welsh_hl))) +(defun festival-spanish () + (interactive) + (festival-send-command '(voice_spanish_el))) + +(defun festival-say-string (string) + "Send string to festival and have it said" + (interactive "sSay: ") + (festival-start-process) + (process-send-string festival-process + (concat "(SayText " (format "%S" string) ") +"))) + +(defun festival-send-command (cmd) + "Send CMD, a Lisp form printed with %S, to the festival process (started if needed)." + (interactive "xFestival command: ") + (festival-start-process) + (process-send-string festival-process (format "%S +" cmd))) + +(defun festival-process-status () + (interactive) + (if festival-process + (message (format "Festival process status: %s" + (process-status festival-process))) + (message (format "Festival process status: NONE")))) + +(defun festival-start-process () + "Check status of process 
and start it if necessary" + (interactive ) + (let ((process-connection-type t)) + (if (and festival-process + (eq (process-status festival-process) 'run)) + 't + ;;(festival-kill-festival t) + (message "Starting new synthesizer process...") + (sit-for 0) + (setq festival-process + (start-process "festival" (get-buffer-create "*festival*") + festival-program-name))) + )) + +(defun festival-kill-process () + "Kill festival sub-process if it is still alive, then forget it." + (interactive) + (if (and festival-process (memq (process-status festival-process) '(run stop))) + (kill-process festival-process)) + (setq festival-process nil) + (message "Festival process killed")) + +(defun festival-send-string (string) + "Send given STRING verbatim to the festival process (started if needed)." + (interactive "sSend to Festival: ") + (festival-start-process) + (process-send-string festival-process string)) + +(defun festival-say-region (reg-start reg-end) + "Send given region to festival for saying. This saves the region +as a file in /tmp and then tells festival to say that file. The +major mode is *not* passed as text mode name to Festival." + (interactive "r") + (write-region reg-start reg-end festival-tmp-file) + (festival-send-command (list 'tts festival-tmp-file nil))) + +(defun festival-say-buffer () + "Send the whole buffer to festival for saying. This saves the buffer +as a file in /tmp and then tells festival to say that file. The +major-mode is passed as a text mode to Festival." 
+ (interactive) + (write-region (point-min) (point-max) festival-tmp-file) + ;; Because there may by sgml-like sub-files mentioned + ;; ensure festival tracks the buffer's default-directory + (festival-send-command (list 'cd (expand-file-name default-directory))) + (if (equal "-mode" (substring (format "%S" major-mode) -5 nil)) + (if (equal "sgml" (substring (format "%S" major-mode) 0 -5)) + (festival-send-command + (list 'tts festival-tmp-file "sable")) + (festival-send-command + (list 'tts festival-tmp-file + (substring (format "%S" major-mode) 0 -5)))) + (festival-send-command (list 'tts festival-tmp-file nil)))) + +;; +;; say-minor-mode provides a menu offering various speech synthesis commands +;; +(defvar say-minor-mode nil) + +(defun say-minor-mode (arg) + "Toggle say minor mode. +With arg, turn say-minor-mode on iff arg is positive." + (interactive "P") + (setq say-minor-mode + (if (if (null arg) (not say-minor-mode) + (> (prefix-numeric-value arg) 0)) + t)) + (force-mode-line-update)) + +(setq say-params-menu (make-sparse-keymap "Pitch/Duration")) +(fset 'say-params-menu (symbol-value 'say-params-menu)) +(define-key say-params-menu [say-fast] '("Fast" . festival-fast)) +(define-key say-params-menu [say-slow] '("Slow" . festival-slow)) +(define-key say-params-menu [say-ndur] '("Normal Dur" . festival-ndur)) + +(setq say-lang-menu (make-sparse-keymap "Select language")) +(fset 'say-lang-menu (symbol-value 'say-lang-menu)) +(define-key say-lang-menu [say-lang-spain1] '("Spanish el" . festival-spanish)) +(define-key say-lang-menu [say-lang-welsh1] '("Welsh hl" . festival-welsh)) +(define-key say-lang-menu [say-lang-eng5] '("English gsw" . festival-gsw)) +(define-key say-lang-menu [say-lang-eng4] '("English don" . festival-don)) +(define-key say-lang-menu [say-lang-eng3] '("English rab" . festival-rab)) +(define-key say-lang-menu [say-lang-eng2] '("English ked" . festival-ked)) +(define-key say-lang-menu [say-lang-eng1] '("English kal" . 
festival-kal)) +;(define-key say-params-menu [say-set-dur-stretch] +; '("Set Duration Stretch" . festival-set-dur-stretch)) +;(define-key say-params-menu [say-high] '("High" . festival-high)) +;(define-key say-params-menu [say-low] '("Low" . festival-low)) +;(define-key say-params-menu [say-npit] '("Normal Pitch" . festival-npit)) +;(define-key say-params-menu [say-set-pitch-stretch] +; '("Set Pitch Stretch" . festival-set-pitch-stretch)) + +(setq say-minor-mode-map (make-sparse-keymap)) +(setq say-menu (make-sparse-keymap "SAY")) +(define-key say-minor-mode-map [menu-bar SAY] (cons "Say" say-menu)) +(define-key say-minor-mode-map [menu-bar SAY festival-intro] '("Festival Intro" . festival-intro)) +(define-key say-minor-mode-map [menu-bar SAY festival-process-status] '("Festival status" . festival-process-status)) +(define-key say-minor-mode-map [menu-bar SAY festival-kill-process] '("Kill Festival" . festival-kill-process)) +(define-key say-minor-mode-map [menu-bar SAY festival-start-process] '("(Re)start Festival" . festival-start-process)) +;;(define-key say-menu [separator-process] '("--")) +;;(define-key say-menu [params] '("Pitch/Durations" . say-params-menu)) +(define-key say-menu [separator-buffers] '("--")) +(define-key say-menu [festival-send-command] '("Festival eval command" . festival-send-command)) +(define-key say-menu [say-lang-menu] '("Select language" . say-lang-menu)) +(define-key say-menu [festival-say-buffer] '("Say buffer" . festival-say-buffer)) +(define-key say-menu [festival-say-region] '("Say region" . 
festival-say-region)) + + +(setq minor-mode-map-alist + (cons + (cons 'say-minor-mode say-minor-mode-map) + minor-mode-map-alist)) + +(or (assq 'say-minor-mode minor-mode-alist) + (setq minor-mode-alist + (cons '(say-minor-mode "") minor-mode-alist))) + +;;; +;;; A FESTIVAL inferior mode (copied from prolog.el) +;;; +(defvar inferior-festival-mode-map nil) + +(defun inferior-festival-mode () + "Major mode for interacting with an inferior FESTIVAL process. + +The following commands are available: +\\{inferior-festival-mode-map} + +Entry to this mode calls the value of `festival-mode-hook' with no arguments, +if that value is non-nil. Likewise with the value of `comint-mode-hook'. +`festival-mode-hook' is called after `comint-mode-hook'. + +You can send text to the inferior FESTIVAL from other buffers +using the commands `send-region', `send-string' + +Return at end of buffer sends line as input. +Return not at end copies rest of line to end and sends it. +\\[comint-kill-input] and \\[backward-kill-word] are kill commands, imitating normal Unix input editing. +\\[comint-interrupt-subjob] interrupts the shell or its current subjob if any. +\\[comint-stop-subjob] stops. \\[comint-quit-subjob] sends quit signal." + (interactive) + (require 'comint) + (comint-mode) + (setq major-mode 'inferior-festival-mode + mode-name "Inferior FESTIVAL" + comint-prompt-regexp "^festival> ") + (if inferior-festival-mode-map nil + (setq inferior-festival-mode-map (copy-keymap comint-mode-map)) + (if (fboundp 'festival-mode-commands) (festival-mode-commands inferior-festival-mode-map))) + (use-local-map inferior-festival-mode-map) + (run-hooks 'festival-mode-hook)) + +;;;###autoload +(defun run-festival () + "Run an inferior FESTIVAL process, input and output via buffer *festival*." 
+ (interactive) + (require 'comint) + (switch-to-buffer (make-comint "festival" festival-program-name)) + (inferior-festival-mode)) + +(provide 'festival) diff --git a/CosyVoice-ttsfrd/resource/festival/festival.scm b/CosyVoice-ttsfrd/resource/festival/festival.scm new file mode 100644 index 0000000000000000000000000000000000000000..77b22929e3b661543b979cfe41bed1b45d6b1764 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/festival.scm @@ -0,0 +1,633 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; General Festival Scheme specific functions +;;; Including definitions of various standard variables. + +;; will be set automatically on start-up +(defvar festival_version "unknown" + "festival_version + A string containing the current version number of the system.") + +;; will be set automatically on start-up +(defvar festival_version_number '(x x x) + "festival_version_number + A list of major, minor and subminor version numbers of the current + system. e.g. (1 0 12).") + +(define (apply_method method utt) +"(apply_method METHOD UTT) +Apply the appropriate function to utt defined in parameter." + (let ((method_val (Parameter.get method))) + (cond + ((null method_val) + nil) ;; should be an error, but I'll let you off at present + ((and (symbol? method_val) (symbol-bound? method_val)) + (apply (symbol-value method_val) (list utt))) + ((member (typeof method_val) '(subr closure)) + (apply method_val (list utt))) + (t ;; again is probably an error + nil)))) + +(define (require_module l) + "(require_module l) +Check that certain compile-time modules are included in this installation. +l may be a single atom or list of atoms. Each item in l must appear in +*modules* otherwise an error is throw." 
+ (if (consp l) + (mapcar require_module l) + (if (not (member_string l *modules*)) + (error (format nil "module %s required, but not compiled in this installation\n" l)))) + t) + +;;; Feature Function Functions +(define (utt.features utt relname func_list) +"(utt.features UTT RELATIONNAME FUNCLIST) + Get vectors of feature values for each item in RELATIONNAME in UTT. + [see Features]" + (mapcar + (lambda (s) + (mapcar (lambda (f) (item.feat s f)) func_list)) + (utt.relation.items utt relname))) + +(define (utt.type utt) +"(utt.type UTT) + Returns the type of UTT." + (intern (utt.feat utt 'type))) + +(define (utt.save.segs utt filename) +"(utt.save.segs UTT FILE) + Save segments of UTT in a FILE in xlabel format." + (let ((fd (fopen filename "w"))) + (format fd "#\n") + (mapcar + (lambda (info) + (format fd "%2.4f 100 %s\n" (car info) (car (cdr info)))) + (utt.features utt 'Segment '(segment_end name))) + (fclose fd) + utt)) + +(define (utt.save.words utt filename) +"(utt.save.words UTT FILE) + Save words of UTT in a FILE in xlabel format." + (let ((fd (fopen filename "w"))) + (format fd "#\n") + (mapcar + (lambda (info) + (format fd "%2.4f 100 %s\n" (car info) (car (cdr info)))) + (utt.features utt 'Word '(word_end name))) + (fclose fd) + utt)) + +(define (utt.resynth labfile f0file) +"(utt.resynth LABFILE F0FILE) +Resynthesize an utterance from a label file and F0 file (in any format +supported by the Speech Tool Library). This loads, synthesizes and +plays the utterance." + (let (u f0 f0_item) + (set! u (Utterance SegF0)) ; need some u to start with + (utt.relation.load u 'Segment labfile) + (utt.relation.create u 'f0) + (set! f0 (track.load f0file)) + (set! 
f0_item (utt.relation.append u 'f0)) + (item.set_feat f0_item "name" "f0") + (item.set_feat f0_item "f0" f0) + + ;; emulabel may have flipped pau to H# + (mapcar + (lambda (s) + (cond + ((string-matches (item.name s) "[hH]#") + (item.set_feat s "name" "pau")) + ((string-matches (item.name s) "#.*") + (item.set_feat s "name" (string-after (item.name s) "#"))))) + (utt.relation.items u 'Segment)) + + (Wave_Synth u) + (utt.play u) + u)) + +(define (utt.relation.present utt relation) +"(utt.relation.present UTT RELATIONNAME) +Returns t if UTT caontains a relation called RELATIONNAME, nil otherwise." + (if (member_string relation (utt.relationnames utt)) + t + nil)) + +(define (utt.relation.leafs utt relation) +"(utt.relation.leafs UTT RELATIONNAME) +Returns a list of all the leafs in this relation." + (let ((leafs nil)) + (mapcar + (lambda (i) + (if (not (item.down (item.relation i relation))) + (set! leafs (cons i leafs)))) + (utt.relation.items utt relation)) + (reverse leafs))) + +(define (utt.relation.first utt relation) +"(utt.relation.first UTT RELATIONNAME) +Returns a the first item in this relation." + (utt.relation utt relation)) + +(define (utt.relation.last utt relation) +"(utt.relation.last UTT RELATIONNAME) +Returns a the last item in this relation." + (let ((i (utt.relation.first utt relation))) + (while (item.next i) + (set! i (item.next i))) + i)) + +(define (item.feat.present item feat) + "(item.feat.present item feat) +nil if feat doesn't existing in this item, non-nil otherwise." + (and item (assoc_string feat (item.features item)))) + +(define (item.relation.append_daughter parent relname daughter) +"(item.relation.append_daughter parent relname daughter) +Make add daughter to parent as a new daughter in relname." + (item.append_daughter (item.relation parent relname) daughter)) + +(define (item.relation.insert si relname newsi direction) +"(item.relation.insert si relname newsi direction) +Insert newsi in relation relname with respect to direction. 
If +direction is ommited after is assumed, valid directions are after +before, above and below. Note you should use +item.relation.append_daughter for tree adjoining. newsi maybe +a item itself of a LISP description of one." + (item.insert + (item.relation si relname) + newsi + direction)) + +(define (item.relation.daughters parent relname) + "(item.relation.daughters parent relname) +Return a list of all daughters of parent by relname." + (let ((d1 (item.daughter1 (item.relation parent relname))) + (daughters)) + (while d1 + (set! daughters (cons d1 daughters)) + (set! d1 (item.next d1))) + (reverse daughters))) + +(define (item.daughters p) + "(item.daughters parent) +Return a list of all daughters of parent." + (item.relation.daughters p (item.relation.name p))) + +(define (item.relation.parent si relname) + "(item.relation.parent item relname) +Return the parent of this item in this relation." + (item.parent (item.relation si relname))) + +(define (item.relation.daughter1 si relname) + "(item.relation.daughter1 item relname) +Return the first daughter of this item in this relation." + (item.daughter1 (item.relation si relname))) + +(define (item.relation.daughter2 si relname) + "(item.relation.daughter2 item relname) +Return the second daughter of this item in this relation." + (item.daughter2 (item.relation si relname))) + +(define (item.relation.daughtern si relname) + "(item.relation.daughtern item relname) +Return the final daughter of this item in this relation." + (item.daughtern (item.relation si relname))) + +(define (item.relation.next si relname) + "(item.relation.next item relname) +Return the next item in this relation." + (item.next (item.relation si relname))) + +(define (item.relation.prev si relname) + "(item.relation.prev item relname) +Return the previous item in this relation." 
+ (item.prev (item.relation si relname))) + +(define (item.relation.first si relname) + "(item.relation.first item relname) +Return the most previous item from this item in this relation." + (let ((n (item.relation si relname))) + (while (item.prev n) + (set! n (item.prev n))) + n)) + +(define (item.leafs si) + "(item.leafs item) +Return a list of the leafs of this item in its current relation." + (let ((ls nil) + (pl (item.first_leaf si)) + (ll (item.next_leaf (item.last_leaf si)))) + (while (and pl (not (equal? pl ll))) + (set! ls (cons pl ls)) + (set! pl (item.next_leaf pl))) + (reverse ls))) + +(define (item.relation.leafs si relname) + "(item.relation.leafs item relname) +Return a list of the leafs of this item in this relation." + (item.leafs (item.relation si relname))) + +(define (item.root s) + "(item.root s) +Follow parent link until s has no parent." + (cond + ((item.parent s) + (item.root (item.parent s))) + (t s))) + +(define (item.parent_to s relname) + "(item.parent_to s relname) +Find the first ancestor of s in its current relation that is also in +relname. s is treated as an ancestor of itself so if s is in relname +it is returned. The returned value will be in relation relname +or nil if there isn't one." + (cond + ((null s) s) + ((member_string relname (item.relations s)) + (item.relation s relname)) + (t (item.parent_to (item.parent s) relname)))) + +(define (item.daughter1_to s relname) + "(item.daughter1_to s relname) +Follow daughter1 links of s in its current relation until an item +is found that is also in relname, if s is in relname it is returned. +The return item is returned in relation relname, or nil if there is +nothing in relname." 
+ (cond + ((null s) s) + ((member_string relname (item.relations s)) (item.relation s relname)) + (t (item.daughter1_to (item.daughter1 s) relname)))) + +(define (item.daughtern_to s relname) + "(item.daughtern_to s relname) +Follow daughtern links of s in its current relation until an item +is found that is also in relname, if s is in relname it is returned. +The return item is returned in relation relname, or nil if there is +nothing in relname." + (cond + ((null s) s) + ((member_string relname (item.relations s)) (item.relation s relname)) + (t (item.daughtern_to (item.daughtern s) relname)))) + +(define (item.name s) +"(item.name ITEM) + Returns the name of ITEM. [see Accessing an utterance]" + (item.feat s "name")) + +(define (utt.wave utt) + "(utt.wave UTT) +Get waveform from wave (R:Wave.first.wave)." + (item.feat (utt.relation.first utt "Wave") "wave")) + +(define (utt.wave.rescale . args) + "(utt.wave.rescale UTT FACTOR NORMALIZE) +Modify the gain of the waveform in UTT by FACTOR. If NORMALIZE is +specified and non-nil the waveform is maximized first." + (wave.rescale (utt.wave (nth 0 args)) (nth 1 args) (nth 2 args)) + (nth 0 args)) + +(define (utt.wave.resample utt rate) + "(utt.wave.resample UTT RATE)\ +Resample waveform in UTT to RATE (if it is already at that rate it remains +unchanged)." + (wave.resample (utt.wave utt) rate) + utt) + +(define (utt.import.wave . args) + "(utt.import.wave UTT FILENAME APPEND) +Load waveform in FILENAME into UTT in R:Wave.first.wave. If APPEND +is specified and non-nil append this to the current waveform." + (let ((utt (nth 0 args)) + (filename (nth 1 args)) + (append (nth 2 args))) + (if (and append (utt.relation.present utt 'Wave)) + (wave.append (utt.wave utt) (wave.load filename)) + (begin + (utt.relation.create utt 'Wave) + (item.set_feat + (utt.relation.append utt 'Wave) + "wave" + (wave.load filename)))) + utt)) + +(define (utt.save.wave . 
args) + "(utt.save.wave UTT FILENAME FILETYPE) +Save waveform in UTT in FILENAME with FILETYPE (if specified) or +using global parameter Wavefiletype." + (wave.save + (utt.wave (nth 0 args)) + (nth 1 args) + (nth 2 args)) + (nth 0 args)) + +(define (utt.play utt) + "(utt.play UTT) +Play waveform in utt by current audio method." + (wave.play (utt.wave utt)) + utt) + +(define (utt.save.track utt filename relation feature) + "(utt.save.track utt filename relation feature) +DEPRICATED use trace.save instead." + (format stderr "utt.save.track: DEPRICATED use track.save instead\n") + (track.save + (item.feat + (utt.relation.first utt relation) + feature) + filename) + utt) + +(define (utt.import.track utt filename relation fname) + "(utt.import.track UTT FILENAME RELATION FEATURE_NAME) +Load track in FILENAME into UTT in R:RELATION.first.FEATURE_NAME. +Deletes RELATION if it already exists. (you maybe want to use track.load +directly rather than this legacy function." + (utt.relation.create utt relation) + (item.set_feat + (utt.relation.append utt relation) + fname + (track.load filename)) + utt) + +(define (wagon_predict item tree) +"(wagon_predict ITEM TREE) +Predict with given ITEM and CART tree and return the prediction +(the last item) rather than whole probability distribution." + (car (last (wagon item tree)))) + +(define (phone_is_silence phone) + (member_string + phone + (car (cdr (car (PhoneSet.description '(silences))))))) + +(define (phone_feature phone feat) +"(phone_feature phone feat) +Return the feature for given phone in current phone set, or 0 +if it doesn't exist." + (let ((ph (intern phone))) + (let ((fnames (cadr (assoc 'features (PhoneSet.description)))) + (fvals (cdr (assoc ph (cadr (assoc 'phones (PhoneSet.description))))))) + (while (and fnames (not (string-equal feat (car (car fnames))))) + (set! fvals (cdr fvals)) + (set! 
fnames (cdr fnames))) + (if fnames + (car fvals) + 0)))) + +(defvar server_max_clients 10 + "server_max_clients +In server mode, the maximum number of clients supported at any one +time. When more that this number of clients attach simulaneous +the last ones are denied access. Default value is 10. +[see Server/client API]") + +(defvar server_port 1314 + "server_port +In server mode the inet port number the server will wait for connects +on. The default value is 1314. [see Server/client API]") + +(defvar server_log_file t + "server_log_file +If set to t server log information is printed to standard output +of the server process. If set to nil no output is given. If set +to anything else the value is used as the name of file to which +server log information is appended. Note this value is checked at +server start time, there is no way a client may change this. +[see Server/client API]") + +(defvar server_passwd nil + "server_passwd +If non-nil clients must send this passwd to the server followed by +a newline before they can get a connection. It would be normal +to set this for the particular server task. +[see Server/client API]") + +(defvar server_access_list '(localhost) + "server_access_list +If non-nil this is the exhaustive list of machines and domains +from which clients may access the server. This is a list of REGEXs +that client host must match. Remember to add the backslashes before +the dots. [see Server/client API]") + +(defvar server_deny_list nil + "server_deny_list +If non-nil this is a list of machines which are to be denied access +to the server absolutely, irrespective of any other control features. +The list is a list of REGEXs that are used to matched the client hostname. +This list is checked first, then server_access_list, then passwd. 
+[see Server/client API]")
+;;; Local modifications below (to end of festival.scm): docstring and help-text corrections only.
+(define (def_feature_docstring fname fdoc)
+"(def_feature_docstring FEATURENAME FEATUREDOC)
+As some features are used directly of stream items with no
+accompanying feature function, the features are just values on the feature
+list. This function allows those features to have an accompanying
+documentation string. An existing entry for FEATURENAME is replaced. Returns t."
+  (let ((fff (assoc fname ff_docstrings)))
+    (cond
+     (fff ;; replace what's already there
+      (set-cdr! fff fdoc))
+     (t
+      (set! ff_docstrings (cons (cons fname fdoc) ff_docstrings))))
+    t))
+
+(define (linear_regression item model)
+  "(linear_regression ITEM MODEL)
+Return the value of linear regression MODEL on ITEM. MODEL is a list of
+(FEATURE WEIGHT [MAPLIST]) terms, optionally led by (Intercept N). E.g. ((Intercept 100)
+(tobi_accent 10 (H* !H*))). A term adds WEIGHT if FEATURE's value is in MAPLIST, or value*WEIGHT without one."
+  (let ((intercept (if (equal? 'Intercept (car (car model)))
+                       (car (cdr (car model))) 0))
+        (mm (if (equal? 'Intercept (car (car model)))
+                (cdr model) model)))
+    (apply +
+     (cons intercept
+      (mapcar
+       (lambda (f)
+         (let ((ff (item.feat item (car f))))
+           (if (car (cdr (cdr f)))
+               (if (member_string ff (car (cdr (cdr f))))
+                   (car (cdr f))
+                   0)
+               (* (parse-number ff) (car (cdr f))))))
+       mm)))))
+
+(defvar help
+ "The Festival Speech Synthesizer System: Help
+
+Getting Help
+ (doc '<SYMBOL>) displays help on <SYMBOL>
+ (manual nil) displays manual in local netscape
+ C-c return to top level
+ C-d or (quit) Exit Festival
+(If compiled with editline)
+ M-h displays help on current symbol
+ M-s speaks help on current symbol
+ M-m displays relevant manual page in local netscape
+ TAB Command, symbol and filename completion
+ C-p or up-arrow Previous command
+ C-b or left-arrow Move back one character
+ C-f or right-arrow
+ Move forward one character
+ Normal Emacs commands work for editing command line
+
+Doing stuff
+ (SayText TEXT) Synthesize text, text should be surrounded by
+ double quotes
+ (tts FILENAME nil) Say contents of file, FILENAME should be
+ surrounded by double quotes
+ (voice_rab_diphone) Select voice (British Male)
+ (voice_kal_diphone) Select voice (American Male)
+")
+
+(define (festival_warranty)
+"(festival_warranty)
+ Display Festival's copyright and warranty. [see Copying]"
+  (format t
+   (string-append
+    " The Festival Speech Synthesis System: "
+    festival_version
+"
+ Centre for Speech Technology Research
+ University of Edinburgh, UK
+ Copyright (c) 1996-2014
+ All Rights Reserved.
+
+ Permission is hereby granted, free of charge, to use and distribute
+ this software and its documentation without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of this work, and to
+ permit persons to whom this work is furnished to do so, subject to
+ the following conditions:
+ 1. The code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+ 2. Any modifications must be clearly marked as such.
+ 3. Original authors' names are not deleted.
+ 4. The authors' names are not used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+ THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK
+ DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE
+ FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
+ THIS SOFTWARE.
+")))
+
+(define (intro)
+"(intro)
+ Synthesize an introduction to the Festival Speech Synthesis System."
+  (tts (path-append libdir "../examples/intro.text") nil))
+
+(define (intro-spanish)
+"(intro-spanish)
+ Synthesize an introduction to the Festival Speech Synthesis System
+ in spanish. Spanish voice must already be selected for this."
+  (tts (path-append libdir "../examples/spintro.text") nil))
+
+(define (na_play FILENAME)
+"(na_play FILENAME)
+Play given wavefile: builds a Wave utterance from FILENAME, synthesizes it and plays it."
+  (utt.play (utt.synth (eval (list 'Utterance 'Wave FILENAME)))))
+
+;;; Some autoload commands
+(autoload manual-sym "festdoc" "Show appropriate manual section for symbol.")
+(autoload manual "festdoc" "Show manual section.")
+
+(autoload display "display" "Graphically display utterance.")
+
+(autoload festtest "festtest" "Run tests of Festival.")
+
+(defvar diphone_module_hooks nil
+ "diphone_module_hooks
+ A function or list of functions that will be applied to the utterance
+ at the start of the diphone module. It can be used to map segment
+ names to those that will be used by the diphone database itself.
+ Typical use specifies _ and $ for consonant clusters and syllable
+ boundaries, mapping to dark ll's etc. Reduction and tap type
+ phenomena should probably be done by post lexical rules though the
+ distinction is not a clear one.")
+
+(def_feature_docstring
+ 'Segment.diphone_phone_name
+ "Segment.diphone_phone_name
+ This is produced by the diphone module to contain the desired phone
+ name for the desired diphone. This adds things like _ if part of
+ a consonant or $ to denote syllable boundaries. These are generated
+ on a per voice basis by function(s) specified by diphone_module_hooks.
+ Identification of dark ll's etc. may also be included. Note this is not
+ necessarily the name of the diphone selected as if it is not found
+ some of these characters will be removed and fall back values will be
+ used.")
+
+(def_feature_docstring
+ 'Syllable.stress
+ "Syllable.stress
+ The lexical stress of the syllable as specified from the lexicon entry
+ corresponding to the word related to this syllable.")
+
+;;;
+;;; I tried some tests on the resulting speed both runtime and loadtime
+;;; but compiled files don't seem to make any significant difference
+;;;
+(define (compile_library)
+  "(compile_library)
+Compile all the scheme files in the library directory."
+  (mapcar
+   (lambda (file)
+     (format t "compile ... %s\n" file)
+     (compile-file (string-before file ".scm")))
+   (list
+    "synthesis.scm" "siod.scm" "init.scm" "lexicons.scm"
+    "festival.scm" "gsw_diphone.scm" "intonation.scm" "duration.scm"
+    "pos.scm" "phrase.scm" "don_diphone.scm" "rab_diphone.scm"
+    "voices.scm" "tts.scm" "festdoc.scm" "languages.scm" "token.scm"
+    "mbrola.scm" "display.scm" "postlex.scm" "tokenpos.scm"
+    "festtest.scm" "cslush.scm" "ducs_cluster.scm" "sucs.scm"
+    "web.scm" "cart_aux.scm"
+    "lts_nrl.scm" "lts_nrl_us.scm" "email-mode.scm"
+    "mrpa_phones.scm" "radio_phones.scm" "holmes_phones.scm"
+    "mrpa_durs.scm" "klatt_durs.scm" "gswdurtreeZ.scm"
+    "tobi.scm" "f2bf0lr.scm"))
+  t)
+
+;;; For mlsa resynthesizer
+(defvar mlsa_alpha_param 0.42)
+(defvar mlsa_beta_param 0.0)
+
+(provide 'festival)
diff --git a/CosyVoice-ttsfrd/resource/festival/festtest.scm b/CosyVoice-ttsfrd/resource/festival/festtest.scm
new file mode 100644
index 0000000000000000000000000000000000000000..345c3cc2b59600933adb490050507c429e7eb7ef
--- /dev/null
+++ b/CosyVoice-ttsfrd/resource/festival/festtest.scm
@@ -0,0 +1,72 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; ;;
+;;; Centre for Speech Technology Research ;;
+;;; University of Edinburgh, UK ;;
+;;; Copyright (c) 1996,1997 ;;
+;;; All Rights Reserved. 
;;
+;;; ;;
+;;; Permission is hereby granted, free of charge, to use and distribute ;;
+;;; this software and its documentation without restriction, including ;;
+;;; without limitation the rights to use, copy, modify, merge, publish, ;;
+;;; distribute, sublicense, and/or sell copies of this work, and to ;;
+;;; permit persons to whom this work is furnished to do so, subject to ;;
+;;; the following conditions: ;;
+;;; 1. The code must retain the above copyright notice, this list of ;;
+;;; conditions and the following disclaimer. ;;
+;;; 2. Any modifications must be clearly marked as such. ;;
+;;; 3. Original authors' names are not deleted. ;;
+;;; 4. The authors' names are not used to endorse or promote products ;;
+;;; derived from this software without specific prior written ;;
+;;; permission. ;;
+;;; ;;
+;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;;
+;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;;
+;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;;
+;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;;
+;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;;
+;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;;
+;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;;
+;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;;
+;;; THIS SOFTWARE. ;;
+;;; ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;; Some basic functions used in tests for Festival
+;;;
+
+(define (test_words text)
+"(test_words TEXT)
+Prints TEXT, synthesizes it into the global utt1 and prints the name of each Word. Returns t."
+  (format t "Word test: %s\n " text)
+  (set! utt1 (utt.synth (eval (list 'Utterance 'Text text))))
+  (mapcar
+   (lambda (word) (format t "%s " (car word)))
+   (utt.features utt1 'Word '(name)))
+  (format t "\n")
+  t)
+;;; Local modification: test_segments now returns t, like test_words and test_phrases.
+(define (test_segments text)
+"(test_segments TEXT)
+Prints TEXT, synthesizes it into the global utt1 and prints the name of each Segment. Returns t."
+  (format t "Segment test: %s\n " text)
+  (set! utt1 (utt.synth (eval (list 'Utterance 'Text text))))
+  (mapcar
+   (lambda (seg) (format t "%s " (car seg)))
+   (utt.features utt1 'Segment '(name)))
+  (format t "\n")
+  t)
+
+(define (test_phrases text)
+"(test_phrases TEXT)
+Prints TEXT, synthesizes it into the global utt1 and outputs the words and phrase breaks. Returns t."
+  (format t "Phrase test: %s \n " text)
+  (set! utt1 (utt.synth (eval (list 'Utterance 'Text text))))
+  (mapcar
+   (lambda (phrase)
+     (mapcar (lambda (w) (format t "%s " (car (car w)))) (cdr phrase))
+     (format t "%s\n " (car (car phrase))))
+   (utt.relation_tree utt1 'Phrase))
+  (format t "\n")
+  t)
+
+(provide 'festtest)
diff --git a/CosyVoice-ttsfrd/resource/festival/fringe.scm b/CosyVoice-ttsfrd/resource/festival/fringe.scm
new file mode 100644
index 0000000000000000000000000000000000000000..b60bd1e5106687642dba7f33449a6817ff4e9152
--- /dev/null
+++ b/CosyVoice-ttsfrd/resource/festival/fringe.scm
@@ -0,0 +1,108 @@
+
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;; DO NOT EDIT THIS FILE ON PAIN OF MORE PAIN.
+ ;;;
+ ;;; The master copy of this file is in ../../speech_tools/lib/siod/fringe.scm
+ ;;; and is copied here at build time.
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+
+
+
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; ;;
+;;; Centre for Speech Technology Research ;;
+;;; University of Edinburgh, UK ;;
+;;; Copyright (c) 1996,1997 ;;
+;;; All Rights Reserved. 
;;
+;;; ;;
+;;; Permission is hereby granted, free of charge, to use and distribute ;;
+;;; this software and its documentation without restriction, including ;;
+;;; without limitation the rights to use, copy, modify, merge, publish, ;;
+;;; distribute, sublicense, and/or sell copies of this work, and to ;;
+;;; permit persons to whom this work is furnished to do so, subject to ;;
+;;; the following conditions: ;;
+;;; 1. The code must retain the above copyright notice, this list of ;;
+;;; conditions and the following disclaimer. ;;
+;;; 2. Any modifications must be clearly marked as such. ;;
+;;; 3. Original authors' names are not deleted. ;;
+;;; 4. The authors' names are not used to endorse or promote products ;;
+;;; derived from this software without specific prior written ;;
+;;; permission. ;;
+;;; ;;
+;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;;
+;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;;
+;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;;
+;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;;
+;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;;
+;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;;
+;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;;
+;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;;
+;;; THIS SOFTWARE. ;;
+;;; ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;; Talking to fringe.
+
+(defvar fringe_verbose nil
+ "fringe_verbose
+ If this is set true, all subsequent fringe connections will
+ print a trace of what they are doing.")
+
+;;; Aliases which are better suited to command line use.
+
+(defvar fringe_name "fringe"
+ "fringe_name
+ The last name passed to \[fringe_setup\].")
+
+(defvar fringe_connection nil
+ "fringe_connection
+ A connection to fringe, used by the command line fringe functions.")
+;;; Local modification (NOTE(review): mirror in the speech_tools master copy): connect to NAME, not always "fringe".
+(define (fringe_setup &opt name)
+  "(fringe_setup &opt NAME)
+ Read the server table and connect to the fringe server NAME (default: the
+ value of fringe_name). Stores the connection in fringe_connection and NAME in fringe_name."
+  (fringe_read_server_table)
+  (if (not name) (set! name fringe_name))
+  ;; Use the requested server name; the original always passed "fringe" here.
+  (set! fringe_connection (fringe_server name))
+  (set! fringe_name name))
+
+(define (fringe command)
+  "(fringe COMMAND)
+ Send COMMAND to the fringe server \[fringe_connection\], connecting first if needed.
+ Returns nil for a null or pair result. For command line use, use (fringe_command_string...) in scripts. "
+  (if (not fringe_connection) (fringe_setup))
+  (let ((val (fringe_command_string fringe_connection command)))
+    (if (or (null val) (consp val))
+        nil
+        val)
+    )
+  )
+
+(define (fringel package operation args)
+  "(fringel PACKAGE OPERATION ARGS)
+ Send a command to the fringe server \[fringe_connection\], connecting first if needed.
+ Returns nil for a null or pair result. For command line use, use (fringe_command...) in scripts. "
+
+  (if (not fringe_connection) (fringe_setup))
+  (let ((val (fringe_command fringe_connection package operation args)))
+    (if (or (null val) (consp val))
+        nil
+        val)
+    )
+  )
+
+(provide 'fringe)
diff --git a/CosyVoice-ttsfrd/resource/festival/gswdurtreeZ.scm b/CosyVoice-ttsfrd/resource/festival/gswdurtreeZ.scm
new file mode 100644
index 0000000000000000000000000000000000000000..4968192dce221ec1f557c83bcadc4d41fa035b90
--- /dev/null
+++ b/CosyVoice-ttsfrd/resource/festival/gswdurtreeZ.scm
@@ -0,0 +1,947 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; ;;
+;;; Centre for Speech Technology Research ;;
+;;; University of Edinburgh, UK ;;
+;;; Copyright (c) 1996,1997 ;;
+;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. 
;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; A tree to predict zscore durations built from gsw 450 (timit) +;;; doesn't use actual phonemes so it can have better generalizations +;;; + +;; pre Sue's changes to mrpa_phones (on training data) +;; RMSE 0.79102 Correlation is 0.610184 Mean (abs) Error 0.605081 (0.509517) +;; Post with balance +;; train test split --stop 19 --balance 16 +;; RMSE 0.841861 Correlation is 0.526064 Mean (abs) Error 0.646614 (0.539288) +;; on training data +;; RMSE 0.784032 Correlation is 0.619165 Mean (abs) Error 0.602819 (0.501332) +;; +;; Oct 29th 1997 +;; stepwise (but it's overtrained) +;; RMSE 0.8322 Correlation is 0.5286 Mean (abs) Error 0.6375 (0.5350) +;; +;; May 11th 1998 +;; new architecture, full new train on f2b on test data +;; in zscore domain +;; RMSE 0.8076 Correlation is 0.5307 Mean (abs) Error 0.6113 (0.5278) +;; in absolute domain +;; RMSE 0.0276 Correlation 0.7468 Mean (abs) error 0.0203 (0.0187) +;; +;; May 18th 1998 +;; various corrections f2bdur.bbz.H0.S50.tree no names zscore +;; in zscore domain +;; RMSE 0.8049 Correlation is 0.6003 Mean (abs) Error 0.6008 (0.5357) +;; in absolute domain +;; RMSE 0.0268 Correlation 0.7766 Mean (abs) error 0.0196 (0.0183) + +(set! 
gsw_duration_cart_tree +' +((name is #) + ((emph_sil is +) + ((0.0 -0.5)) + ((p.R:SylStructure.parent.parent.pbreak is BB) + ((0.0 2.0)) + ((0.0 0.0)))) + +((R:SylStructure.parent.accented is 0) + ((n.ph_ctype is 0) + ((p.ph_vlng is 0) + ((R:SylStructure.parent.syl_codasize < 1.5) + ((p.ph_ctype is n) + ((ph_ctype is f) + ((0.559208 -0.783163)) + ((1.05215 -0.222704))) + ((ph_ctype is s) + ((R:SylStructure.parent.syl_break is 2) + ((0.589948 0.764459)) + ((R:SylStructure.parent.asyl_in < 0.7) + ((1.06385 0.567944)) + ((0.691943 0.0530272)))) + ((ph_vlng is l) + ((pp.ph_vfront is 1) + ((1.06991 0.766486)) + ((R:SylStructure.parent.syl_break is 1) + ((0.69665 0.279248)) + ((0.670353 0.0567774)))) + ((p.ph_ctype is s) + ((seg_onsetcoda is coda) + ((0.828638 -0.038356)) + ((ph_ctype is f) + ((0.7631 -0.545853)) + ((0.49329 -0.765994)))) + ((R:SylStructure.parent.parent.gpos is det) + ((R:SylStructure.parent.last_accent < 0.3) + ((R:SylStructure.parent.sub_phrases < 1) + ((0.811686 0.160195)) + ((0.799015 0.713958))) + ((0.731599 -0.215472))) + ((ph_ctype is r) + ((0.673487 0.092772)) + ((R:SylStructure.parent.asyl_in < 1) + ((0.745273 0.00132813)) + ((0.75457 -0.334898))))))))) + ((pos_in_syl < 0.5) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.902446 -0.041618)) + ((R:SylStructure.parent.sub_phrases < 2.3) + ((0.900629 0.262952)) + ((1.18474 0.594794)))) + ((seg_onset_stop is 0) + ((R:SylStructure.parent.position_type is mid) + ((0.512323 -0.760444)) + ((R:SylStructure.parent.syl_out < 6.8) + ((pp.ph_vlng is a) + ((0.640575 -0.450449)) + ((ph_ctype is f) + ((R:SylStructure.parent.sub_phrases < 1.3) + ((0.862876 -0.296956)) + ((R:SylStructure.parent.syl_out < 2.4) + ((0.803215 0.0422868)) + ((0.877856 -0.154465)))) + ((R:SylStructure.parent.syl_out < 3.6) + ((R:SylStructure.parent.syl_out < 1.2) + ((0.567081 -0.264199)) + ((0.598043 -0.541738))) + ((0.676843 -0.166623))))) + ((0.691678 
-0.57173)))) + ((R:SylStructure.parent.parent.gpos is cc) + ((1.15995 0.313289)) + ((pp.ph_vfront is 1) + ((0.555993 0.0695819)) + ((R:SylStructure.parent.asyl_in < 1.2) + ((R:SylStructure.parent.sub_phrases < 2.7) + ((0.721635 -0.367088)) + ((0.71919 -0.194887))) + ((0.547052 -0.0637491))))))) + ((ph_ctype is s) + ((R:SylStructure.parent.syl_break is 0) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((0.650007 -0.333421)) + ((0.846301 -0.165383))) + ((0.527756 -0.516332))) + ((R:SylStructure.parent.syl_break is 0) + ((p.ph_ctype is s) + ((0.504414 -0.779112)) + ((0.812498 -0.337611))) + ((pos_in_syl < 1.4) + ((0.513041 -0.745807)) + ((p.ph_ctype is s) + ((0.350582 -1.04907)) + ((0.362 -0.914974)))))))) + ((R:SylStructure.parent.syl_break is 0) + ((ph_ctype is n) + ((R:SylStructure.parent.position_type is initial) + ((pos_in_syl < 1.2) + ((0.580485 0.172658)) + ((0.630973 -0.101423))) + ((0.577937 -0.360092))) + ((R:SylStructure.parent.syl_out < 2.9) + ((R:SylStructure.parent.syl_out < 1.1) + ((R:SylStructure.parent.position_type is initial) + ((0.896092 0.764189)) + ((R:SylStructure.parent.sub_phrases < 3.6) + ((ph_ctype is s) + ((0.877362 0.555132)) + ((0.604511 0.369882))) + ((0.799982 0.666966)))) + ((seg_onsetcoda is coda) + ((p.ph_vlng is a) + ((R:SylStructure.parent.last_accent < 0.4) + ((0.800736 0.240634)) + ((0.720606 0.486176))) + ((1.18173 0.573811))) + ((0.607147 0.194468)))) + ((ph_ctype is r) + ((0.88377 0.499383)) + ((R:SylStructure.parent.last_accent < 0.5) + ((R:SylStructure.parent.position_type is initial) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((0.62798 0.0737318)) + ((0.787334 0.331014))) + ((ph_ctype is s) + ((0.808368 0.0929299)) + ((0.527948 -0.0443271)))) + ((seg_coda_fric is 0) + ((p.ph_vlng is a) + ((0.679745 0.517681)) + ((R:SylStructure.parent.sub_phrases < 1.1) + ((0.759979 0.128316)) + ((0.775233 0.361383)))) + ((R:SylStructure.parent.last_accent < 1.3) + ((0.696255 0.054136)) + ((0.632425 0.246742)))))))) + 
((pos_in_syl < 0.3) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.847602 0.621547)) + ((ph_ctype is s) + ((0.880645 0.501679)) + ((R:SylStructure.parent.sub_phrases < 3.3) + ((R:SylStructure.parent.sub_phrases < 0.3) + ((0.901014 -0.042049)) + ((0.657493 0.183226))) + ((0.680126 0.284799))))) + ((ph_ctype is s) + ((p.ph_vlng is s) + ((0.670033 -0.820934)) + ((0.863306 -0.348735))) + ((ph_ctype is n) + ((R:SylStructure.parent.asyl_in < 1.2) + ((0.656966 -0.40092)) + ((0.530966 -0.639366))) + ((seg_coda_fric is 0) + ((1.04153 0.364857)) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.syl_out < 3.4) + ((0.81503 -0.00768613)) + ((0.602665 -0.197753))) + ((0.601844 -0.394632))))))))) + ((n.ph_ctype is f) + ((pos_in_syl < 1.5) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((pos_in_syl < 0.1) + ((1.63863 0.938841)) + ((R:SylStructure.parent.position_type is initial) + ((0.897722 -0.0796637)) + ((nn.ph_vheight is 0) + ((0.781081 0.480026)) + ((0.779711 0.127175))))) + ((ph_ctype is r) + ((p.ph_ctype is s) + ((0.581329 -0.708767)) + ((0.564366 -0.236212))) + ((ph_vlng is a) + ((p.ph_ctype is r) + ((0.70992 -0.273389)) + ((R:SylStructure.parent.parent.gpos is in) + ((0.764696 0.0581338)) + ((nn.ph_vheight is 0) + ((0.977737 0.721904)) + ((R:SylStructure.parent.sub_phrases < 2.2) + ((pp.ph_vfront is 0) + ((0.586708 0.0161206)) + ((0.619949 0.227372))) + ((0.707285 0.445569)))))) + ((ph_ctype is n) + ((R:SylStructure.parent.syl_break is 1) + ((nn.ph_vfront is 2) + ((0.430295 -0.120097)) + ((0.741371 0.219042))) + ((0.587492 0.321245))) + ((p.ph_ctype is n) + ((0.871586 0.134075)) + ((p.ph_ctype is r) + ((0.490751 -0.466418)) + ((R:SylStructure.parent.syl_codasize < 1.3) + ((R:SylStructure.parent.sub_phrases < 2.2) + ((p.ph_ctype is s) + ((0.407452 -0.425925)) + ((0.644771 -0.542809))) + ((0.688772 -0.201899))) + ((ph_vheight is 1) + ((nn.ph_vheight is 0) + ((0.692018 0.209018)) + ((0.751345 -0.178136))) + 
((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((R:SylStructure.parent.asyl_in < 1.5) + ((0.599633 -0.235593)) + ((0.60042 0.126118))) + ((p.ph_vlng is a) + ((0.7148 -0.174812)) + ((R:SylStructure.parent.parent.gpos is content) + ((0.761296 -0.231509)) + ((0.813081 -0.536405))))))))))))) + ((ph_ctype is n) + ((0.898844 0.163343)) + ((p.ph_vlng is s) + ((seg_coda_fric is 0) + ((0.752921 -0.45528)) + ((0.890079 -0.0998025))) + ((ph_ctype is f) + ((0.729376 -0.930547)) + ((ph_ctype is s) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((0.745052 -0.634119)) + ((0.521502 -0.760176))) + ((R:SylStructure.parent.syl_break is 1) + ((0.766575 -0.121355)) + ((0.795616 -0.557509)))))))) + ((p.ph_vlng is 0) + ((p.ph_ctype is r) + ((ph_vlng is 0) + ((0.733659 -0.402734)) + ((R:SylStructure.parent.sub_phrases < 1.5) + ((ph_vlng is s) + ((0.326176 -0.988478)) + ((n.ph_ctype is s) + ((0.276471 -0.802536)) + ((0.438283 -0.900628)))) + ((nn.ph_vheight is 0) + ((ph_vheight is 2) + ((0.521 -0.768992)) + ((0.615436 -0.574918))) + ((ph_vheight is 1) + ((0.387376 -0.756359)) + ((pos_in_syl < 0.3) + ((0.417235 -0.808937)) + ((0.384043 -0.93315))))))) + ((ph_vlng is a) + ((ph_ctype is 0) + ((n.ph_ctype is s) + ((p.ph_ctype is f) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.415908 -0.428493)) + ((pos_in_syl < 0.1) + ((0.790441 0.0211071)) + ((0.452465 -0.254485)))) + ((p.ph_ctype is s) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.582447 -0.389966)) + ((0.757648 0.185781))) + ((R:SylStructure.parent.sub_phrases < 1.4) + ((0.628965 0.422551)) + ((0.713613 0.145576))))) + ((seg_onset_stop is 0) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((pp.ph_vfront is 1) + ((0.412363 -0.62319)) + ((R:SylStructure.parent.syl_out < 3.6) + ((0.729259 -0.317324)) + ((0.441633 -0.591051)))) + ((R:SylStructure.parent.syl_break is 1) + ((R:SylStructure.parent.sub_phrases < 2.7) + ((0.457728 -0.405607)) + ((0.532411 -0.313148))) + 
((R:SylStructure.parent.last_accent < 0.3) + ((1.14175 0.159416)) + ((0.616396 -0.254651))))) + ((R:SylStructure.parent.position_type is initial) + ((0.264181 -0.799896)) + ((0.439801 -0.551309))))) + ((R:SylStructure.parent.position_type is final) + ((0.552027 -0.707084)) + ((0.585661 -0.901874)))) + ((ph_ctype is s) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((pp.ph_vfront is 1) + ((0.607449 0.196466)) + ((0.599662 0.00382414))) + ((0.64109 -0.12859))) + ((pp.ph_vfront is 1) + ((0.720484 -0.219339)) + ((0.688707 -0.516734)))) + ((ph_vlng is s) + ((n.ph_ctype is s) + ((R:SylStructure.parent.parent.gpos is content) + ((R:SylStructure.parent.position_type is single) + ((0.659206 0.159445)) + ((R:SylStructure.parent.parent.word_numsyls < 3.5) + ((R:SylStructure.parent.sub_phrases < 2) + ((0.447186 -0.419103)) + ((0.631822 -0.0928561))) + ((0.451623 -0.576116)))) + ((ph_vheight is 3) + ((0.578626 -0.64583)) + ((0.56636 -0.4665)))) + ((R:SylStructure.parent.parent.gpos is in) + ((0.771516 -0.217292)) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.688571 -0.304382)) + ((R:SylStructure.parent.parent.gpos is content) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((n.ph_ctype is n) + ((0.556085 -0.572203)) + ((0.820173 -0.240338))) + ((R:SylStructure.parent.parent.word_numsyls < 2.2) + ((0.595398 -0.588171)) + ((0.524737 -0.95797)))) + ((R:SylStructure.parent.sub_phrases < 3.9) + ((0.371492 -0.959427)) + ((0.440479 -0.845747))))))) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((p.ph_ctype is f) + ((0.524088 -0.482247)) + ((nn.ph_vheight is 1) + ((0.587666 -0.632362)) + ((ph_vlng is l) + ((R:SylStructure.parent.position_type is final) + ((0.513286 -0.713117)) + ((0.604613 -0.924308))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((0.577997 -0.891342)) + ((0.659804 -1.15252)))))) + ((pp.ph_vlng is s) + ((ph_ctype is f) + ((0.813383 -0.599624)) + ((0.984027 -0.0771909))) + ((p.ph_ctype is f) + 
((R:SylStructure.parent.parent.gpos is in) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((0.313572 -1.03242)) + ((0.525854 -0.542799))) + ((R:SylStructure.parent.syl_out < 2.8) + ((0.613007 -0.423979)) + ((0.570258 -0.766379)))) + ((R:SylStructure.parent.syl_break is 1) + ((R:SylStructure.parent.parent.gpos is to) + ((0.364585 -0.792895)) + ((ph_vlng is l) + ((0.69143 -0.276816)) + ((0.65673 -0.523721)))) + ((R:SylStructure.parent.syl_out < 3.6) + ((R:SylStructure.parent.position_type is initial) + ((0.682096 -0.488102)) + ((0.406364 -0.731758))) + ((0.584694 -0.822229))))))))))) + ((n.ph_ctype is r) + ((R:SylStructure.parent.position_type is initial) + ((p.ph_vlng is a) + ((0.797058 1.02334)) + ((ph_ctype is s) + ((1.0548 0.536277)) + ((0.817253 0.138201)))) + ((R:SylStructure.parent.sub_phrases < 1.1) + ((R:SylStructure.parent.syl_out < 3.3) + ((0.884574 -0.23471)) + ((0.772063 -0.525292))) + ((nn.ph_vfront is 1) + ((1.25254 0.417485)) + ((0.955557 -0.0781996))))) + ((pp.ph_vfront is 0) + ((ph_ctype is f) + ((n.ph_ctype is s) + ((R:SylStructure.parent.parent.gpos is content) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 0) + ((0.583506 -0.56941)) + ((0.525949 -0.289362))) + ((0.749316 -0.0921038))) + ((p.ph_vlng is s) + ((0.734234 0.139463)) + ((0.680119 -0.0708717)))) + ((ph_vlng is s) + ((ph_vheight is 1) + ((0.908712 -0.618971)) + ((0.55344 -0.840495))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 1.2) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.838715 0.00913392)) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((ph_vheight is 2) + ((0.555513 -0.512523)) + ((R:SylStructure.parent.position_type is initial) + ((0.758711 0.121704)) + ((0.737555 -0.25637)))) + ((R:SylStructure.parent.syl_out < 3.1) + ((n.ph_ctype is s) + ((0.611756 -0.474522)) + ((1.05437 -0.247206))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((R:SylStructure.parent.position_type is final) + ((0.567761 -0.597866)) + 
((0.785599 -0.407765))) + ((0.575598 -0.741256)))))) + ((ph_ctype is s) + ((n.ph_ctype is s) + ((0.661069 -1.08426)) + ((0.783184 -0.39789))) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((R:SylStructure.parent.sub_phrases < 2.6) + ((0.511323 -0.666011)) + ((0.691878 -0.499492))) + ((ph_ctype is r) + ((0.482131 -0.253186)) + ((0.852955 -0.372832)))))) + ((0.854447 -0.0936489))))) + ((R:SylStructure.parent.position_type is final) + ((0.685939 -0.249982)) + ((R:SylStructure.parent.syl_out < 3.2) + ((0.989843 0.18086)) + ((0.686805 -0.0402908))))))))) + ((R:SylStructure.parent.syl_out < 2.4) + ((R:SylStructure.parent.syl_out < 0.2) + ((seg_onsetcoda is coda) + ((ph_ctype is s) + ((R:SylStructure.parent.syl_break is 4) + ((pp.ph_vlng is 0) + ((0.959737 1.63203)) + ((1.20714 0.994933))) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.syl_break is 2) + ((0.864809 0.214457)) + ((0.874278 0.730381))) + ((pp.ph_vfront is 0) + ((seg_coda_fric is 0) + ((1.20844 -0.336221)) + ((1.01357 0.468302))) + ((0.658106 -0.799121))))) + ((n.ph_ctype is f) + ((ph_ctype is f) + ((1.26332 0.0300613)) + ((ph_vlng is d) + ((1.02719 1.1649)) + ((ph_ctype is 0) + ((R:SylStructure.parent.asyl_in < 1.2) + ((1.14048 2.2668)) + ((ph_vheight is 1) + ((1.15528 1.50375)) + ((1.42406 2.07927)))) + ((R:SylStructure.parent.sub_phrases < 1.1) + ((0.955892 1.10243)) + ((R:SylStructure.parent.syl_break is 2) + ((1.32682 1.8432)) + ((1.27582 1.59853))))))) + ((n.ph_ctype is 0) + ((ph_ctype is n) + ((R:SylStructure.parent.syl_break is 2) + ((1.45399 1.12927)) + ((1.05543 0.442376))) + ((R:SylStructure.parent.syl_break is 4) + ((R:SylStructure.parent.position_type is final) + ((ph_ctype is f) + ((1.46434 1.76508)) + ((0.978055 0.7486))) + ((1.2395 2.30826))) + ((ph_ctype is 0) + ((0.935325 1.69917)) + ((nn.ph_vfront is 1) + ((1.20456 1.31128)) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((nn.ph_vheight is 0) + ((1.16907 0.212421)) + ((0.952091 0.653094))) + ((p.ph_ctype is 0) + 
((1.05502 1.25802)) + ((0.818731 0.777568)))))))) + ((ph_ctype is f) + ((p.ph_ctype is 0) + ((1.03918 0.163941)) + ((0.737545 -0.167063))) + ((R:SylStructure.parent.position_type is final) + ((n.ph_ctype is n) + ((R:SylStructure.parent.last_accent < 0.5) + ((R:SylStructure.parent.sub_phrases < 2.8) + ((0.826207 -0.000859005)) + ((0.871119 0.273433))) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((1.17405 1.05694)) + ((0.858394 0.244916)))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((p.ph_ctype is 0) + ((1.14092 1.21187)) + ((R:SylStructure.parent.syl_break is 2) + ((1.02653 0.59865)) + ((0.94248 1.1634)))) + ((seg_coda_fric is 0) + ((1.07441 0.292935)) + ((1.15736 0.92574))))) + ((ph_vlng is s) + ((R:SylStructure.parent.syl_break is 2) + ((1.34638 1.23484)) + ((0.951514 2.02008))) + ((ph_ctype is 0) + ((p.ph_ctype is r) + ((0.806106 0.697089)) + ((R:SylStructure.parent.syl_break is 2) + ((1.10891 0.992197)) + ((1.04657 1.51093)))) + ((1.18165 0.520952))))))))) + ((p.ph_vlng is 0) + ((pos_in_syl < 0.7) + ((R:SylStructure.parent.position_type is final) + ((ph_ctype is r) + ((0.966357 0.185827)) + ((ph_ctype is s) + ((0.647163 0.0332298)) + ((0.692972 -0.534917)))) + ((ph_ctype is s) + ((0.881521 0.575107)) + ((p.ph_ctype is f) + ((0.8223 -0.111275)) + ((R:SylStructure.parent.last_accent < 0.3) + ((0.969188 0.09447)) + ((0.894438 0.381947)))))) + ((p.ph_ctype is f) + ((0.479748 -0.490108)) + ((0.813125 -0.201268)))) + ((ph_ctype is s) + ((0.908566 1.20397)) + ((R:SylStructure.parent.last_accent < 1.2) + ((0.88078 0.636568)) + ((0.978087 1.07763)))))) + ((pos_in_syl < 1.3) + ((R:SylStructure.parent.syl_break is 0) + ((pos_in_syl < 0.1) + ((R:SylStructure.parent.position_type is initial) + ((p.ph_ctype is n) + ((0.801651 -0.0163359)) + ((ph_ctype is s) + ((n.ph_ctype is r) + ((0.893307 1.07253)) + ((p.ph_vlng is 0) + ((0.92651 0.525806)) + ((0.652444 0.952792)))) + ((p.ph_vlng is 0) + ((seg_onsetcoda is coda) + ((0.820151 0.469117)) + ((p.ph_ctype is f) 
+ ((0.747972 -0.0716448)) + ((ph_ctype is f) + ((0.770882 0.457137)) + ((0.840905 0.102492))))) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.667824 0.697337)) + ((0.737967 0.375114)))))) + ((ph_vheight is 1) + ((0.624353 0.410671)) + ((R:SylStructure.parent.asyl_in < 0.8) + ((0.647905 -0.331055)) + ((p.ph_ctype is s) + ((0.629039 -0.240616)) + ((0.749277 -0.0191273)))))) + ((ph_vheight is 3) + ((p.ph_ctype is s) + ((0.626922 0.556537)) + ((0.789357 0.153892))) + ((seg_onsetcoda is coda) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.parent.word_numsyls < 3.4) + ((0.744714 0.123242)) + ((0.742039 0.295753))) + ((seg_coda_fric is 0) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((ph_vheight is 1) + ((0.549715 -0.341018)) + ((0.573641 -0.00893114))) + ((nn.ph_vfront is 2) + ((0.67099 -0.744625)) + ((0.664438 -0.302803)))) + ((p.ph_vlng is 0) + ((0.630028 0.113815)) + ((0.632794 -0.128733))))) + ((ph_ctype is r) + ((0.367169 -0.854509)) + ((0.94334 -0.216179)))))) + ((n.ph_ctype is f) + ((ph_vlng is 0) + ((1.3089 0.46195)) + ((R:SylStructure.parent.syl_codasize < 1.3) + ((1.07673 0.657169)) + ((pp.ph_vlng is 0) + ((0.972319 1.08222)) + ((1.00038 1.46257))))) + ((p.ph_vlng is l) + ((1.03617 0.785204)) + ((p.ph_vlng is a) + ((R:SylStructure.parent.position_type is final) + ((1.00681 0.321168)) + ((0.928115 0.950834))) + ((ph_vlng is 0) + ((pos_in_syl < 0.1) + ((R:SylStructure.parent.position_type is final) + ((0.863682 -0.167374)) + ((nn.ph_vheight is 0) + ((p.ph_ctype is f) + ((0.773591 -0.00374425)) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.951802 0.228448)) + ((1.02282 0.504252)))) + ((1.09721 0.736476)))) + ((R:SylStructure.parent.position_type is final) + ((1.04302 0.0590974)) + ((0.589208 -0.431535)))) + ((n.ph_ctype is 0) + ((1.27879 1.00642)) + ((ph_vlng is s) + ((R:SylStructure.parent.asyl_in < 1.4) + ((0.935787 0.481652)) + ((0.9887 0.749861))) + ((R:SylStructure.parent.syl_out < 1.1) + ((R:SylStructure.parent.position_type is final) + 
((0.921307 0.0696307)) + ((0.83675 0.552212))) + ((0.810076 -0.0479225)))))))))) + ((ph_ctype is s) + ((n.ph_ctype is s) + ((0.706959 -1.0609)) + ((p.ph_ctype is n) + ((0.850614 -0.59933)) + ((n.ph_ctype is r) + ((0.665947 0.00698725)) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.position_type is initial) + ((0.762889 -0.0649044)) + ((0.723956 -0.248899))) + ((R:SylStructure.parent.sub_phrases < 1.4) + ((0.632957 -0.601987)) + ((0.889114 -0.302401))))))) + ((ph_ctype is f) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.865267 0.164636)) + ((0.581827 -0.0989051))) + ((nn.ph_vfront is 2) + ((0.684459 -0.316836)) + ((0.778854 -0.0961191)))) + ((R:SylStructure.parent.syl_out < 1.1) + ((p.ph_ctype is s) + ((0.837964 -0.429437)) + ((0.875304 -0.0652743))) + ((0.611071 -0.635089)))) + ((p.ph_ctype is r) + ((R:SylStructure.parent.syl_out < 1.1) + ((0.762012 0.0139361)) + ((0.567983 -0.454845))) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((ph_ctype is l) + ((1.18845 0.809091)) + ((R:SylStructure.parent.position_type is initial) + ((ph_ctype is n) + ((0.773548 -0.277092)) + ((1.01586 0.281001))) + ((p.ph_ctype is 0) + ((1.06831 0.699145)) + ((0.924189 0.241873))))) + ((R:SylStructure.parent.syl_break is 0) + ((ph_ctype is n) + ((0.592321 -0.470784)) + ((0.778688 -0.072112))) + ((n.ph_ctype is s) + ((1.08848 0.0733489)) + ((1.25674 0.608371)))))))))) + ((pos_in_syl < 0.7) + ((p.ph_vlng is 0) + ((R:SylStructure.parent.position_type is mid) + ((ph_ctype is 0) + ((ph_vheight is 2) + ((0.456225 -0.293282)) + ((0.561529 -0.0816115))) + ((0.6537 -0.504024))) + ((ph_ctype is s) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((1.31586 0.98395)) + ((R:SylStructure.parent.position_type is single) + ((0.816869 0.634789)) + ((R:SylStructure.parent.syl_out < 4.4) + ((1.05578 0.479029)) + ((R:SylStructure.parent.asyl_in < 0.4) + ((1.11813 0.143214)) + ((0.87178 
0.406834)))))) + ((n.ph_ctype is n) + ((R:SylStructure.parent.last_accent < 0.6) + ((0.838154 -0.415599)) + ((0.924024 0.110288))) + ((seg_onsetcoda is coda) + ((nn.ph_vfront is 2) + ((0.670096 0.0314187)) + ((n.ph_ctype is f) + ((1.00363 0.693893)) + ((R:SylStructure.parent.syl_out < 6) + ((0.772363 0.215675)) + ((0.920313 0.574068))))) + ((R:SylStructure.parent.position_type is final) + ((0.673837 -0.458142)) + ((R:SylStructure.parent.sub_phrases < 2.8) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.894817 0.304628)) + ((ph_ctype is n) + ((0.787302 -0.23094)) + ((R:SylStructure.parent.asyl_in < 1.2) + ((ph_ctype is f) + ((R:SylStructure.parent.last_accent < 0.5) + ((1.12278 0.326954)) + ((0.802236 -0.100616))) + ((0.791255 -0.0919132))) + ((0.95233 0.219053))))) + ((R:SylStructure.parent.position_type is initial) + ((ph_ctype is f) + ((1.0616 0.216118)) + ((0.703216 -0.00834086))) + ((ph_ctype is f) + ((1.22277 0.761763)) + ((0.904811 0.332721)))))))))) + ((ph_vheight is 0) + ((p.ph_vlng is s) + ((0.873379 0.217178)) + ((n.ph_ctype is r) + ((0.723915 1.29451)) + ((n.ph_ctype is 0) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((R:SylStructure.parent.sub_phrases < 4) + ((seg_coda_fric is 0) + ((p.ph_vlng is l) + ((0.849154 0.945261)) + ((0.633261 0.687498))) + ((0.728546 0.403076))) + ((0.850962 1.00255))) + ((0.957999 1.09113))) + ((0.85771 0.209045))))) + ((ph_vheight is 2) + ((0.803401 -0.0544067)) + ((0.681353 0.256045))))) + ((n.ph_ctype is f) + ((ph_ctype is s) + ((p.ph_vlng is 0) + ((0.479307 -0.9673)) + ((0.700477 -0.351397))) + ((ph_ctype is f) + ((0.73467 -0.6233)) + ((R:SylStructure.parent.syl_break is 0) + ((p.ph_ctype is s) + ((0.56282 0.266234)) + ((p.ph_ctype is r) + ((0.446203 -0.302281)) + ((R:SylStructure.parent.sub_phrases < 2.7) + ((ph_ctype is 0) + ((0.572016 -0.0102436)) + ((0.497358 -0.274514))) + ((0.545477 0.0482177))))) + ((ph_vlng is s) + ((0.805269 0.888495)) + ((ph_ctype is n) + ((0.869854 0.653018)) + 
((R:SylStructure.parent.sub_phrases < 2.2) + ((0.735031 0.0612886)) + ((0.771859 0.346637)))))))) + ((R:SylStructure.parent.syl_codasize < 1.4) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.3) + ((R:SylStructure.parent.position_type is initial) + ((0.743458 0.0411808)) + ((1.13068 0.613305))) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 1) + ((1.11481 0.175467)) + ((0.937893 -0.276407))) + ((0.74264 -0.550878)))) + ((pos_in_syl < 3.4) + ((seg_onsetcoda is coda) + ((ph_ctype is r) + ((n.ph_ctype is s) + ((0.714319 -0.240328)) + ((p.ph_ctype is 0) + ((0.976987 0.330352)) + ((1.1781 -0.0816682)))) + ((ph_ctype is l) + ((n.ph_ctype is 0) + ((1.39137 0.383533)) + ((0.725585 -0.324515))) + ((ph_vheight is 3) + ((ph_vlng is d) + ((0.802626 -0.62487)) + ((n.ph_ctype is r) + ((0.661091 -0.513869)) + ((R:SylStructure.parent.position_type is initial) + ((R:SylStructure.parent.parent.word_numsyls < 2.4) + ((0.482285 0.207874)) + ((0.401601 -0.0204711))) + ((0.733755 0.397372))))) + ((n.ph_ctype is r) + ((p.ph_ctype is 0) + ((pos_in_syl < 1.2) + ((0.666325 0.271734)) + ((nn.ph_vheight is 0) + ((0.642401 -0.261466)) + ((0.783684 -0.00956571)))) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.692225 -0.381895)) + ((0.741921 -0.0898767)))) + ((nn.ph_vfront is 2) + ((ph_ctype is s) + ((0.697527 -1.12626)) + ((n.ph_ctype is s) + ((ph_vlng is 0) + ((R:SylStructure.parent.sub_phrases < 2.4) + ((0.498719 -0.906926)) + ((0.635342 -0.625651))) + ((0.45886 -0.385089))) + ((0.848596 -0.359702)))) + ((p.ph_vlng is a) + ((p.ph_ctype is 0) + ((0.947278 0.216904)) + ((0.637933 -0.394349))) + ((p.ph_ctype is r) + ((R:SylStructure.parent.syl_break is 0) + ((0.529903 -0.860573)) + ((0.581378 -0.510488))) + ((ph_vlng is 0) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((seg_onset_stop is 0) + ((R:SylStructure.parent.syl_break is 0) + ((p.ph_vlng is d) + ((0.768363 0.0108428)) + ((ph_ctype is s) + ((0.835756 -0.035054)) + 
((ph_ctype is f) + ((p.ph_vlng is s) + ((0.602016 -0.179727)) + ((0.640126 -0.297341))) + ((0.674628 -0.542602))))) + ((ph_ctype is s) + ((0.662261 -0.60496)) + ((0.662088 -0.432058)))) + ((R:SylStructure.parent.syl_out < 4.4) + ((0.582448 -0.389079)) + ((ph_ctype is s) + ((0.60413 -0.73564)) + ((0.567153 -0.605444))))) + ((R:SylStructure.parent.R:Syllable.p.syl_break is 2) + ((0.761115 -0.827377)) + ((ph_ctype is n) + ((0.855183 -0.275338)) + ((R:SylStructure.parent.syl_break is 0) + ((0.788288 -0.802801)) + ((R:SylStructure.parent.syl_codasize < 2.2) + ((0.686134 -0.371234)) + ((0.840184 -0.772883))))))) + ((pos_in_syl < 1.2) + ((R:SylStructure.parent.syl_break is 0) + ((n.ph_ctype is n) + ((0.423592 -0.655006)) + ((R:SylStructure.parent.syl_out < 4.4) + ((0.595269 -0.303751)) + ((0.478433 -0.456882)))) + ((0.688133 -0.133182))) + ((seg_onset_stop is 0) + ((1.27464 0.114442)) + ((0.406837 -0.167545)))))))))))) + ((ph_ctype is r) + ((0.462874 -0.87695)) + ((R:SylStructure.parent.R:Syllable.n.syl_onsetsize < 0.2) + ((0.645442 -0.640572)) + ((0.673717 -0.321322))))) + ((0.61008 -0.925472)))))))) +;; RMSE 0.8085 Correlation is 0.5899 Mean (abs) Error 0.6024 (0.5393) + + +)) + +(provide 'gswdurtreeZ) diff --git a/CosyVoice-ttsfrd/resource/festival/holmes_phones.scm b/CosyVoice-ttsfrd/resource/festival/holmes_phones.scm new file mode 100644 index 0000000000000000000000000000000000000000..29e38ed5a55b784fe3069c4aada05c07278eb386 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/holmes_phones.scm @@ -0,0 +1,118 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; +;; Centre for Speech Technology Research ;; +;; University of Edinburgh, UK ;; +;; Copyright (c) 1996,1997 ;; +;; All Rights Reserved. 
;; +;; ;; +;; Permission is hereby granted, free of charge, to use and distribute ;; +;; this software and its documentation without restriction, including ;; +;; without limitation the rights to use, copy, modify, merge, publish, ;; +;; distribute, sublicense, and/or sell copies of this work, and to ;; +;; permit persons to whom this work is furnished to do so, subject to ;; +;; the following conditions: ;; +;; 1. The code must retain the above copyright notice, this list of ;; +;; conditions and the following disclaimer. ;; +;; 2. Any modifications must be clearly marked as such. ;; +;; 3. Original authors' names are not deleted. ;; +;; 4. The authors' names are not used to endorse or promote products ;; +;; derived from this software without specific prior written ;; +;; permission. ;; +;; ;; +;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;; THIS SOFTWARE. ;; +;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; A definition of the Holmes phone set used by the Donovan LPC +;; diphone synthesizer, the rest of the synthesis process will +;; typically use mrpa phones and map to these. 
+;; +;; Hmm not sure I've got the right mapping (as usual) + +(defPhoneSet + holmes + ;;; Phone Features + (;; vowel or consonant + (vc + -) + ;; vowel length: short long dipthong schwa + (vlng s l d a 0) + ;; vowel height: high mid low + (vheight 1 2 3 - 0) + ;; vowel frontness: front mid back + (vfront 1 2 3 - 0) + ;; lip rounding + (vrnd + - 0) + ;; consonant type: stop fricative affricative nasal lateral approximant + (ctype s f a n l r 0) + ;; place of articulation: labial alveolar palatal labio-dental + ;; dental velar glottal + (cplace l a p b d v g 0) + ;; consonant voicing + (cvox + - 0) + ) + ;; Phone set members + ( + ;; Note these features were set by awb so they are wrong !!! + (ee + l 1 1 - 0 0 0) ;; beet + (i + s 1 1 - 0 0 0) ;; bit + (ai + d 2 1 - 0 0 0) ;; gate + (e + s 2 1 - 0 0 0) ;; get + (aa + s 3 1 - 0 0 0) ;; fat + (ar + l 3 3 - 0 0 0) ;; father + (aw + l 3 3 + 0 0 0) ;; lawn + (oa + d 2 2 - 0 0 0) ;; lone + (oo + s 1 3 + 0 0 0) ;; full + (uu + l 1 3 + 0 0 0) ;; fool + (o + s 2 3 + 0 0 0) + (er + l 2 2 - 0 0 0) ;; murder + (a + a 2 2 - 0 0 0) ;; about + (u + s 2 3 - 0 0 0) ;; but + (ie + d 3 2 - 0 0 0) ;; hide + (ou + d 3 2 + 0 0 0) ;; how + (oi + d 3 3 + 0 0 0) ;; toy + (eer + d 2 1 - 0 0 0) + (air + d 1 1 - 0 0 0) + (oor + d 3 1 + 0 0 0) +;; (yu + l 2 3 + 0 0 +) ;; you ??? + + (p - 0 0 0 0 s l -) + (b - 0 0 0 0 s l +) + (t - 0 0 0 0 s a -) + (d - 0 0 0 0 s a +) + (k - 0 0 0 0 s v -) + (g - 0 0 0 0 s v +) + (f - 0 0 0 0 f b -) + (v - 0 0 0 0 f b +) + (th - 0 0 0 0 f d -) + (dh - 0 0 0 0 f d +) + (s - 0 0 0 0 f a -) + (z - 0 0 0 0 f a +) + (sh - 0 0 0 0 f p -) + (zh - 0 0 0 0 f p +) + (h - 0 0 0 0 f g -) + (m - 0 0 0 0 n l +) + (n - 0 0 0 0 n a +) + (ng - 0 0 0 0 n v +) + (ch - 0 0 0 0 a p -) + (j - 0 0 0 0 a p +) + (l - 0 0 0 0 l a +) + (w - 0 0 0 0 r l +) + (y - 0 0 0 0 r p +) + (r - 0 0 0 0 r a +) +;; (wh - 0 - - + l l -) ;; ?? 
+;; (wh - 0 - - + l l +) ;; map to w + (# - 0 0 0 0 0 0 -) + ) + ) + +(PhoneSet.silences '(#)) + +(provide 'holmes_phones) diff --git a/CosyVoice-ttsfrd/resource/festival/hts.scm b/CosyVoice-ttsfrd/resource/festival/hts.scm new file mode 100644 index 0000000000000000000000000000000000000000..030bf4f8a52fedcc5b8552d6004a9397712276b3 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/hts.scm @@ -0,0 +1,522 @@ +;; ---------------------------------------------------------------- ;; +;; Nagoya Institute of Technology and ;; +;; Carnegie Mellon University ;; +;; Copyright (c) 2002 ;; +;; All Rights Reserved. ;; +;; ;; +;; Permission is hereby granted, free of charge, to use and ;; +;; distribute this software and its documentation without ;; +;; restriction, including without limitation the rights to use, ;; +;; copy, modify, merge, publish, distribute, sublicense, and/or ;; +;; sell copies of this work, and to permit persons to whom this ;; +;; work is furnished to do so, subject to the following conditions: ;; +;; ;; +;; 1. The code must retain the above copyright notice, this list ;; +;; of conditions and the following disclaimer. ;; +;; ;; +;; 2. Any modifications must be clearly marked as such. ;; +;; ;; +;; 3. Original authors' names are not deleted. ;; +;; ;; +;; 4. The authors' names are not used to endorse or promote ;; +;; products derived from this software without specific prior ;; +;; written permission. 
;; +;; ;; +;; NAGOYA INSTITUTE OF TECHNOLOGY, CARNEGIE MELLON UNIVERSITY AND ;; +;; THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH ;; +;; REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF ;; +;; MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL NAGOYA INSTITUTE ;; +;; OF TECHNOLOGY, CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS ;; +;; BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ;; +;; ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR ;; +;; PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER ;; +;; TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR ;; +;; PERFORMANCE OF THIS SOFTWARE. ;; +;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Generic HTS support code and specific features ;; +;; http://hts.ics.nitech.ac.jp ;; +;; Author : Alan W Black ;; +;; Date : August 2002 (and April 2004) ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; +;; Still has language specific features in here, that will have to ;; +;; move out to the voices ;; +;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(defvar hts_synth_pre_hooks nil) +(defvar hts_synth_post_hooks nil) +(defvar hts_engine_params nil) + +(defvar hts_duration_stretch 0) +(defvar hts_f0_mean 0) +(defvar hts_f0_std 1) +(defvar hts_fw_factor 0.42) +(defvar hts_total_length 0.0) +(defvar hts_uv_threshold 0.5) +(defvar hts_use_phone_align 0) + +(defSynthType HTS + (let ((featfile (make_tmp_filename)) + (mcepfile (make_tmp_filename)) + (f0file (make_tmp_filename)) + (wavfile (make_tmp_filename)) + (labfile (make_tmp_filename))) + + (apply_hooks hts_synth_pre_hooks utt) + + (set! 
hts_output_params + (list + (list "-labelfile" featfile) + (list "-om" mcepfile) + (list "-of" f0file) + (list "-or" wavfile) + (list "-od" labfile)) + ) + + (hts_dump_feats utt hts_feats_list featfile) + + (HTS_Synthesize utt) + + (delete-file featfile) + (delete-file mcepfile) + (delete-file f0file) + (delete-file wavfile) + (delete-file labfile) + + (apply_hooks hts_synth_post_hooks utt) + utt) +) + +(define (hts_feats_output ofd s) + "This is bad as it makes decisions about what the feats are" +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; SEGMENT + +; boundary + (format ofd "%10.0f %10.0f " + (* 10000000 (item.feat s "segment_start")) + (* 10000000 (item.feat s "segment_end"))) + +; pp.name + (format ofd "%s" (if (string-equal "0" (item.feat s "p.p.name")) + "x" (item.feat s "p.p.name"))) +; p.name + (format ofd "^%s" (if (string-equal "0" (item.feat s "p.name")) + "x" (item.feat s "p.name"))) +; c.name + (format ofd "-%s" (if (string-equal "0" (item.feat s "name")) + "x" (item.feat s "name"))) +; n.name + (format ofd "+%s" (if (string-equal "0" (item.feat s "n.name")) + "x" (item.feat s "n.name"))) +; nn.name + (format ofd "=%s" (if (string-equal "0" (item.feat s "n.n.name")) + "x" (item.feat s "n.n.name"))) + +; position in syllable (segment) + (format ofd "@") + (format ofd "%s" (if (string-equal "pau" (item.feat s "name")) + "x" (+ 1 (item.feat s "pos_in_syl")))) + (format ofd "_%s" (if (string-equal "pau" (item.feat s "name")) + "x" (- (item.feat s "R:SylStructure.parent.R:Syllable.syl_numphones") + (item.feat s "pos_in_syl")))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; SYLLABLE + +;; previous syllable + +; p.stress + (format ofd "/A:%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "p.R:SylStructure.parent.R:Syllable.stress") + (item.feat s "R:SylStructure.parent.R:Syllable.p.stress"))) +; p.accent + (format ofd "_%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "p.R:SylStructure.parent.R:Syllable.accented") + (item.feat s 
"R:SylStructure.parent.R:Syllable.p.accented"))) +; p.length + (format ofd "_%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "p.R:SylStructure.parent.R:Syllable.syl_numphones") + (item.feat s "R:SylStructure.parent.R:Syllable.p.syl_numphones"))) +;; current syllable + +; c.stress + (format ofd "/B:%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.stress"))) +; c.accent + (format ofd "-%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.accented"))) +; c.length + (format ofd "-%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.syl_numphones"))) + +; position in word (syllable) + (format ofd "@%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (+ 1 (item.feat s "R:SylStructure.parent.R:Syllable.pos_in_word")))) + (format ofd "-%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (- + (item.feat s "R:SylStructure.parent.parent.R:Word.word_numsyls") + (item.feat s "R:SylStructure.parent.R:Syllable.pos_in_word")))) + +; position in phrase (syllable) + (format ofd "&%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (+ 1 + (item.feat s "R:SylStructure.parent.R:Syllable.syl_in")))) + (format ofd "-%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (+ 1 + (item.feat s "R:SylStructure.parent.R:Syllable.syl_out")))) + +; position in phrase (stressed syllable) + (format ofd "#%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (+ 1 + (item.feat s "R:SylStructure.parent.R:Syllable.ssyl_in")))) + (format ofd "-%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (+ 1 + (item.feat s "R:SylStructure.parent.R:Syllable.ssyl_out")))) + +; position in phrase (accented syllable) + (format ofd "$%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (+ 1 + (item.feat s "R:SylStructure.parent.R:Syllable.asyl_in")))) + (format ofd "-%s" + (if 
(string-equal "pau" (item.feat s "name")) + "x" + (+ 1 + (item.feat s "R:SylStructure.parent.R:Syllable.asyl_out")))) + +; distance from stressed syllable + (format ofd "!%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.lisp_distance_to_p_stress"))) + (format ofd "-%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.lisp_distance_to_n_stress"))) + +; distance from accented syllable + (format ofd ";%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.lisp_distance_to_p_accent"))) + (format ofd "-%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.lisp_distance_to_n_accent"))) + +; name of the vowel of current syllable + (format ofd "|%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.R:Syllable.syl_vowel"))) + +;; next syllable + (format ofd "/C:%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "n.R:SylStructure.parent.R:Syllable.stress") + (item.feat s "R:SylStructure.parent.R:Syllable.n.stress"))) +; n.accent + (format ofd "+%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "n.R:SylStructure.parent.R:Syllable.accented") + (item.feat s "R:SylStructure.parent.R:Syllable.n.accented"))) +; n.length + (format ofd "+%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "n.R:SylStructure.parent.R:Syllable.syl_numphones") + (item.feat s "R:SylStructure.parent.R:Syllable.n.syl_numphones"))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WORD + +;;;;;;;;;;;;;;;;;; +;; previous word + +; p.gpos + (format ofd "/D:%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "p.R:SylStructure.parent.parent.R:Word.gpos") + (item.feat s "R:SylStructure.parent.parent.R:Word.p.gpos"))) +; p.lenght (syllable) + (format ofd "_%s" + (if (string-equal "pau" (item.feat s "name")) + 
(item.feat s "p.R:SylStructure.parent.parent.R:Word.word_numsyls") + (item.feat s "R:SylStructure.parent.parent.R:Word.p.word_numsyls"))) + +;;;;;;;;;;;;;;;;; +;; current word + +; c.gpos + (format ofd "/E:%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Word.gpos"))) +; c.lenght (syllable) + (format ofd "+%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Word.word_numsyls"))) + +; position in phrase (word) + (format ofd "@%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (+ 1 (item.feat s "R:SylStructure.parent.parent.R:Word.pos_in_phrase")))) + (format ofd "+%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Word.words_out"))) + +; position in phrase (content word) + (format ofd "&%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Word.content_words_in"))) + (format ofd "+%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Word.content_words_out"))) + +; distance from content word in phrase + (format ofd "#%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Word.lisp_distance_to_p_content"))) + (format ofd "+%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Word.lisp_distance_to_n_content"))) + +;;;;;;;;;;;;;; +;; next word + +; n.gpos + (format ofd "/F:%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "n.R:SylStructure.parent.parent.R:Word.gpos") + (item.feat s "R:SylStructure.parent.parent.R:Word.n.gpos"))) +; n.lenghte (syllable) + (format ofd "_%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "n.R:SylStructure.parent.parent.R:Word.word_numsyls") + (item.feat s "R:SylStructure.parent.parent.R:Word.n.word_numsyls"))) + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PHRASE + +;;;;;;;;;;;;;;;;;;;; +;; previous phrase + +; length of previous phrase (syllable) + (format ofd "/G:%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "p.R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_syls_in_phrase") + (item.feat s "R:SylStructure.parent.parent.R:Phrase.parent.p.lisp_num_syls_in_phrase"))) + +; length of previous phrase (word) + (format ofd "_%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "p.R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_words_in_phrase") + (item.feat s "R:SylStructure.parent.parent.R:Phrase.parent.p.lisp_num_words_in_phrase"))) + +;;;;;;;;;;;;;;;;;;;; +;; current phrase + +; length of current phrase (syllable) + (format ofd "/H:%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_syls_in_phrase"))) + +; length of current phrase (word) + (format ofd "=%s" + (if (string-equal "pau" (item.feat s "name")) + "x" + (item.feat s "R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_words_in_phrase"))) + +; position in major phrase (phrase) + (format ofd "@%s" + (+ 1 (item.feat s "R:SylStructure.parent.R:Syllable.sub_phrases"))) + (format ofd "=%s" + (- + (item.feat s "lisp_total_phrases") + (item.feat s "R:SylStructure.parent.R:Syllable.sub_phrases"))) + +; type of tobi endtone of current phrase + (format ofd "|%s" + (item.feat s "R:SylStructure.parent.parent.R:Phrase.parent.daughtern.R:SylStructure.daughtern.tobi_endtone")) + +;;;;;;;;;;;;;;;;;;;; +;; next phrase + +; length of next phrase (syllable) + (format ofd "/I:%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s "n.R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_syls_in_phrase") + (item.feat s "R:SylStructure.parent.parent.R:Phrase.parent.n.lisp_num_syls_in_phrase"))) + +; length of next phrase (word) + (format ofd "=%s" + (if (string-equal "pau" (item.feat s "name")) + (item.feat s 
"n.R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_words_in_phrase") + (item.feat s "R:SylStructure.parent.parent.R:Phrase.parent.n.lisp_num_words_in_phrase"))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; UTTERANCE + +; length (syllable) + (format ofd "/J:%s" (item.feat s "lisp_total_syls")) + +; length (word) + (format ofd "+%s" (item.feat s "lisp_total_words")) + +; length (phrase) + (format ofd "-%s" (item.feat s "lisp_total_phrases")) + + (format ofd "\n") + +) + +(define (hts_dump_feats utt feats ofile) + (let ((ofd (fopen ofile "w"))) + (mapcar + (lambda (s) + (hts_feats_output ofd s)) + (utt.relation.items utt 'Segment)) + (fclose ofd) + )) + + +;; +;; Extra features +;; From Segment items refer by +;; +;; R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_syls_in_phrase +;; R:SylStructure.parent.parent.R:Phrase.parent.lisp_num_words_in_phrase +;; lisp_total_words +;; lisp_total_syls +;; lisp_total_phrases +;; +;; The last three will act on any item + +(define (distance_to_p_content i) + (let ((c 0) (rc 0 ) (w (item.relation.prev i "Phrase"))) + (while w + (set! c (+ 1 c)) + (if (string-equal "1" (item.feat w "contentp")) + (begin + (set! rc c) + (set! w nil)) + (set! w (item.prev w))) + ) + rc)) + +(define (distance_to_n_content i) + (let ((c 0) (rc 0) (w (item.relation.next i "Phrase"))) + (while w + (set! c (+ 1 c)) + (if (string-equal "1" (item.feat w "contentp")) + (begin + (set! rc c) + (set! w nil)) + (set! w (item.next w))) + ) + rc)) + +(define (distance_to_p_accent i) + (let ((c 0) (rc 0 ) (w (item.relation.prev i "Syllable"))) + (while (and w (member_string (item.feat w "syl_break") '("0" "1"))) + (set! c (+ 1 c)) + (if (string-equal "1" (item.feat w "accented")) + (begin + (set! rc c) + (set! w nil)) + (set! w (item.prev w))) + ) + rc)) + +(define (distance_to_n_accent i) + (let ((c 0) (rc 0 ) (w (item.relation.next i "Syllable"))) + (while (and w (member_string (item.feat w "p.syl_break") '("0" "1"))) + (set! 
c (+ 1 c)) + (if (string-equal "1" (item.feat w "accented")) + (begin + (set! rc c) + (set! w nil)) + (set! w (item.next w))) + ) + rc)) + +(define (distance_to_p_stress i) + (let ((c 0) (rc 0 ) (w (item.relation.prev i "Syllable"))) + (while (and w (member_string (item.feat w "syl_break") '("0" "1"))) + (set! c (+ 1 c)) + (if (string-equal "1" (item.feat w "stress")) + (begin + (set! rc c) + (set! w nil)) + (set! w (item.prev w))) + ) + rc)) + +(define (distance_to_n_stress i) + (let ((c 0) (rc 0 ) (w (item.relation.next i "Syllable"))) + (while (and w (member_string (item.feat w "p.syl_break") '("0" "1"))) + (set! c (+ 1 c)) + (if (string-equal "1" (item.feat w "stress")) + (begin + (set! rc c) + (set! w nil)) + (set! w (item.next w))) + ) + rc)) + +(define (num_syls_in_phrase i) + (apply + + + (mapcar + (lambda (w) + (length (item.relation.daughters w 'SylStructure))) + (item.relation.daughters i 'Phrase)))) + +(define (num_words_in_phrase i) + (length (item.relation.daughters i 'Phrase))) + +(define (total_words w) + (length + (utt.relation.items (item.get_utt w) 'Word))) + +(define (total_syls s) + (length + (utt.relation.items (item.get_utt s) 'Syllable))) + +(define (total_phrases s) + (length + (utt.relation_tree (item.get_utt s) 'Phrase))) + +(provide 'hts) diff --git a/CosyVoice-ttsfrd/resource/festival/init.scm b/CosyVoice-ttsfrd/resource/festival/init.scm new file mode 100644 index 0000000000000000000000000000000000000000..90bccb7e7bf74ec6132112ce475e70ea75cc1ede --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/init.scm @@ -0,0 +1,157 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Initialisation file -- loaded before anything else +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Basic siod library (need this before load_library or require works) +(load (path-append libdir "siod.scm")) + +(defvar home-directory (or (getenv "HOME") "/") + "home-directory + Place looked at for .festivalrc etc.") + +;;; User startup initialization, can be used to override load-path +;;; to allow alternate basic modules to be loaded. 
+(if (probe_file (path-append home-directory ".siodvarsrc")) + (load (path-append home-directory ".siodvarsrc"))) + +(if (probe_file (path-append home-directory ".festivalvarsrc")) + (load (path-append home-directory ".festivalvarsrc"))) + +;;; A chance to set various variables to a local setting e.g. +;;; lexdir, voices_dir audio etc etc. +(if (probe_file (path-append libdir "sitevars.scm")) + (load (path-append libdir "sitevars.scm"))) + +;;; CSTR siod extensions +(require 'cstr) + +;;; Festival specific definitions +(require 'festival) + +;;; Dealing with module descriptions +(require 'module_description) + +;;; Web related definitions +(require 'web) + +;;; Utterance types and support +(require 'synthesis) + +;;; Some default parameters +(Parameter.def 'Wavefiletype 'riff) + +;;; Set default audio method +(cond + ((member 'nas *modules*) + (Parameter.def 'Audio_Method 'netaudio)) + ((member 'esd *modules*) + (Parameter.def 'Audio_Method 'esdaudio)) + ((member 'sun16audio *modules*) + (Parameter.def 'Audio_Method 'sun16audio)) + ((member 'freebsd16audio *modules*) + (Parameter.def 'Audio_Method 'freebsd16audio)) + ((member 'linux16audio *modules*) + (Parameter.def 'Audio_Method 'linux16audio)) + ((member 'irixaudio *modules*) + (Parameter.def 'Audio_Method 'irixaudio)) + ((member 'macosxaudio *modules*) + (Parameter.def 'Audio_Method 'macosxaudio)) + ((member 'win32audio *modules*) + (Parameter.def 'Audio_Method 'win32audio)) + ((member 'os2audio *modules*) + (Parameter.def 'Audio_Method 'os2audio)) + ((member 'mplayeraudio *modules*) + (Parameter.def 'Audio_Method 'mplayeraudio)) + (t ;; can't find direct support so guess that /dev/audio for 8k ulaw exists + (Parameter.def 'Audio_Method 'sunaudio))) +;;; If you have an external program to play audio add its definition +;;; in siteinit.scm + +;;; The audio spooler doesn't work under Windows so redefine audio_mode +(if (member 'mplayeraudio *modules*) + (define (audio_mode param) param) +) + +;;; Intonation 
+(require 'intonation) + +;;; Duration +(require 'duration) + +;;; A large lexicon +(require 'lexicons) +(require 'pauses) + +;;; Part of speech prediction +(require 'pos) + +;;; Phrasing (dependent on pos) +(require 'phrase) + +;;; POstlexical rules +(require 'postlex) + +;;; Different voices +(require 'voices) ;; sets voice_default +(require 'languages) + +;;; Some higher level functions +(require 'token) +(require 'tts) + +;;; +;;; Local site initialization, if the file exists load it +;;; +(if (probe_file (path-append libdir "siteinit.scm")) + (load (path-append libdir "siteinit.scm"))) + +;;; User initialization, if a user has a personal customization +;;; file loaded it +(if (probe_file (path-append home-directory ".siodrc")) + (load (path-append home-directory ".siodrc"))) + +(if (probe_file (path-append home-directory ".festivalrc")) + (load (path-append home-directory ".festivalrc"))) + +;;; Default voice (have to do something cute so autoloads still work) +(eval (list voice_default)) + +(provide 'init) + + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/intonation.scm b/CosyVoice-ttsfrd/resource/festival/intonation.scm new file mode 100644 index 0000000000000000000000000000000000000000..8062e03a8f63f13c13a555bc4a251e05aab7947f --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/intonation.scm @@ -0,0 +1,187 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Basic Intonation modules. 
These call appropriate sub-modules +;;; depending on the chosen intonation methods +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; These modules should predict intonation events/labels +;;; based on information in the phrase and word streams + +; to detect prespecified accents (feature "accent" in 'Word relation) +; AS 5/29/00 + +(define (tobi_accent_prespecified utt) + (let ((tobi_found nil) + (words (utt.relation.items utt 'Word))) + + (while (and words (not tobi_found)) +; feature "accent" might be prespecified on words or tokens, AS 05/29/00 + (if (item.feat.present (car words) 'accent) + (set! tobi_found t) +; if Token relation exists, check tokens as well + (if (not (null (item.parent (item.relation (car words) 'Token)))) + (if (item.feat.present (item.parent (item.relation (car words) 'Token)) 'accent) + (set! tobi_found t) + (set! words (cdr words))) + (set! words (cdr words))))) + tobi_found)) + +(set! int_accent_cart_tree_no_accent +'((NONE))) + +(define (Intonation utt) +"(Intonation utt) +Select between different intonation modules depending on the Parameter +Int_Method. Currently offers three types: Simple, hats on each content +word; ToBI, a tree method for predicting ToBI accents; and Default a +really bad method with a simple downward sloping F0. This is the first +of a two-stage intonation prediction process. This adds accent-like +features to syllables, the second, Int_Targets generates the F0 contour +itself. [see Intonation]" + +; AS 5/29/00: Hack to avoid prediction of further accent labels +; on utterance chunks that have already been annotated with +; accent labels +; use CART that doesn't assign any labels when using Intonation_Tree + +(if (tobi_accent_prespecified utt) + (progn + (set! int_accent_cart_tree_save int_accent_cart_tree) + (set! int_accent_cart_tree int_accent_cart_tree_no_accent) + (Intonation_Tree utt) + (set! 
int_accent_cart_tree int_accent_cart_tree_save)) + + (let ((rval (apply_method 'Int_Method utt))) + (Parameter.get 'Int_Method) + (cond + (rval rval) ;; new style + ((eq 'Simple (Parameter.get 'Int_Method)) + (Intonation_Simple utt)) + ((eq 'ToBI (Parameter.get 'Int_Method)) + (format t "Using Intonation_Tree") + (Intonation_Tree utt)) + ((eq 'General (Parameter.get 'Int_Method)) + (Intonation_Simple utt)) ;; yes this is a duplication + (t + (Intonation_Default utt)))))) + + +;;; These modules should create an actual F0 contour based on the +;;; the existing intonational events/labels etc +;;; Specifically this is called after durations have been predicted + +(define (Int_Targets utt) +"(Int_Targets utt) +The second stage in F0 prediction. This generates F0 targets +related to segments using one of three methods, a simple hat, +linear regression based on ToBI markings, and a simple declining +slope. This second part deals with actual F0 values and durations, +while the previous section only deals with accent (and boundary tone) +assignment. [see Intonation]" + (let ((rval (apply_method 'Int_Target_Method utt))) + (cond + (rval rval) ;; new style + ((eq 'Simple (Parameter.get 'Int_Method)) + (Int_Targets_Simple utt)) + ((eq 'ToBI (Parameter.get 'Int_Method)) + (Int_Targets_LR utt)) + ((eq 'General (Parameter.get 'Int_Method)) + (Int_Targets_General utt)) + (t + (Int_Targets_Default utt))))) + +;;; +;;; A tree that adds accents (H) to all content words +;;; simple but better than nothing at all +;;; +(set! simple_accent_cart_tree + ' + ((R:SylStructure.parent.gpos is content) + ((stress is 1) + ((Accented)) + ((position_type is single) + ((Accented)) + ((NONE)))) + ((NONE)))) + +(defvar duffint_params '((start 130) (end 110)) + "duffint_params +Default parameters for Default (duff) intonation target generation. +This is an assoc list of parameters. 
Two parameters are supported +start specifies the start F0 in Hertz for an utterance, and end specifies +the end.") + +;;; +;;; For simple testing, this function adds fixed duration and +;;; monotone intonation to a set of phones +;;; +(defvar FP_F0 120 +"FP_F0 +In using Fixed_Prosody as used in Phones type utterances and hence +SayPhones, this is the value in Hertz for the monotone F0.") +(defvar FP_duration 100 +"FP_duration +In using Fixed_Prosody as used in Phones type utterances and hence +SayPhones, this is the fixed value in ms for phone durations.") + +(define (Fixed_Prosody utt) +"(Fixed_Prosody UTT) +Add fixed duration and fixed monotone F0 to the segments in UTT. +Uses values of FP_duration and FP_F0 as fixed values." + (let (utt1 + (dur_stretch (Parameter.get 'Duration_Stretch)) + (orig_duffint_params duffint_params)) + (Parameter.set 'Duration_Stretch (/ FP_duration 100.0)) + (set! duffint_params (list (list 'start FP_F0) (list 'end FP_F0))) + + (set! utt1 (Duration_Default utt)) + (set! utt1 (Int_Targets_Default utt1)) + + ;; Reset Parameter values back + (Parameter.set 'Duration_Stretch dur_stretch) + (set! duffint_params orig_duffint_params) + + utt1 + ) +) + +(define (segment_dpitch seg) +"(segment_dpitch SEG) +Returns delta pitch: the seg_pitch of SEG minus the seg_pitch of the previous segment." + (- + (parse-number (item.feat seg 'seg_pitch)) ;; modified: item.feat takes (ITEM FEATNAME); the stray unbound utt argument was dropped + (parse-number (item.feat seg 'R:Segment.p.seg_pitch)))) + +(provide 'intonation) diff --git a/CosyVoice-ttsfrd/resource/festival/java.scm b/CosyVoice-ttsfrd/resource/festival/java.scm new file mode 100644 index 0000000000000000000000000000000000000000..e6f514ea45036efe9a7f0346cb6636a2e62fe114 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/java.scm @@ -0,0 +1,39 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1998 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. 
;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Functions specific to supporting a Java client +;;; + +;; none required yet + +(provide 'java) diff --git a/CosyVoice-ttsfrd/resource/festival/klatt_durs.scm b/CosyVoice-ttsfrd/resource/festival/klatt_durs.scm new file mode 100644 index 0000000000000000000000000000000000000000..8f3864cb792b1f1a7a6969b2e11fa3c90e520463 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/klatt_durs.scm @@ -0,0 +1,85 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Phone duration info for Klatt rules, for mrpa phone set + +(set! duration_klatt_params +'( +(a 230.0 80.0) +(aa 240.0 100.0) +(@ 120.0 60.0) +(@@ 180.0 80.0) +(ai 250.0 150.0) +(au 240.0 100.0) +(b 85.0 60.0) +(ch 70.0 50.0) +(d 75.0 50.0) +(dh 50.0 30.0) +(e 150.0 70.0) +(e@ 270.0 130.0) +(ei 180.0 100.0) +(f 100.0 80.0) +(g 80.0 60.0) +(h 80.0 20.0) +(i 135.0 40.0) +(i@ 230.0 100.0) +(ii 155.0 55) +(jh 70.0 50.0) +(k 80.0 60.0) +(l 80.0 40.0) +(m 70.0 60.0) +(n 60.0 50.0) +(ng 95.0 60.0) +(o 240.0 130.0) +(oi 280.0 150.0) +(oo 240.0 130.0) +(ou 220.0 80.0) +(p 90.0 50.0) +(r 80.0 30.0) +(s 105.0 60.0) +(sh 105.0 80.0) +(t 75.0 50.0) +(th 90.0 60.0) +(u 210.0 70.0) +(u@ 230.0 110.0) +(uh 160.0 60.0) +(uu 230.0 150.0) +(v 60.0 40.0) +(w 80.0 60.0) +(y 80.0 40.0) +(z 75.0 40.0) +(zh 70.0 40.0) +(# 100.0 100.0) +)) + +(provide 'klatt_durs) diff --git a/CosyVoice-ttsfrd/resource/festival/languages.scm b/CosyVoice-ttsfrd/resource/festival/languages.scm new file mode 100644 index 0000000000000000000000000000000000000000..5f4fb41d17db5ed25cf75c81b028cfc97ad115d5 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/languages.scm @@ -0,0 +1,122 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; 
+;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Specification of voices and some major choices of synthesis +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; This should use some sort of database description for voices so +;;; new voices will become automatically available. +;;; + +(define (language_british_english) +"(language_british_english) +Set up language parameters for British English." 
+ (require 'voices) + ;; Will get more elaborate, with different choices of voices in language + + (set! male1 voice_rab_diphone) + (if (symbol-bound? 'voice_don_diphone) + (set! male2 voice_don_diphone)) + (if (symbol-bound? 'voice_gsw_diphone) + (set! male3 voice_gsw_diphone)) + (if (symbol-bound? 'voice_gsw_450) + (set! male4 voice_gsw_450)) + + (male1) + (Parameter.set 'Language 'britishenglish) +) + +(define (language_american_english) +"(language_american_english) +Set up language parameters for American English." + (require 'voices) + (if (symbol-bound? 'voice_kal_diphone) + (set! female1 voice_kal_diphone)) + (if (symbol-bound? 'voice_kal_diphone) + (set! male1 voice_kal_diphone)) + + (male1) + (Parameter.set 'Language 'americanenglish) +) + +(define (language_scots_gaelic) +"(language_scots_gaelic) +Set up language parameters for Scots Gaelic." + (error "Scots Gaelic not yet supported.") + + (Parameter.set 'Language 'scotsgaelic) +) + +(define (language_welsh) +"(language_welsh) +Set up language parameters for Welsh." + + (set! male1 voice_welsh_hl) + + (male1) + (Parameter.set 'Language 'welsh) +) + +(define (language_castillian_spanish) +"(language_castillian_spanish) +Set up language parameters for Castilian Spanish." + + (voice_el_diphone) + (set! male1 voice_el_diphone) + + (Parameter.set 'Language 'spanish) +) + +(define (select_language language) + (cond + ((or (equal? language 'britishenglish) + (equal? language 'english)) ;; we all know its the *real* English + (language_british_english)) + ((equal? language 'americanenglish) + (language_american_english)) + ((equal? language 'scotsgaelic) + (language_scots_gaelic)) + ((equal? language 'welsh) + (language_welsh)) + ((equal? language 'spanish) + (language_castillian_spanish)) + ((equal? 
language 'klingon) + (language_klingon)) + (t + (print "Unsupported language, using English") + (language_british_english)))) + +(defvar language_default language_british_english) + +(provide 'languages) diff --git a/CosyVoice-ttsfrd/resource/festival/lexicons.scm b/CosyVoice-ttsfrd/resource/festival/lexicons.scm new file mode 100644 index 0000000000000000000000000000000000000000..574c8fa40e25be9a0d2ae6b1059c5c05360e3a4e --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/lexicons.scm @@ -0,0 +1,274 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Definition of various lexicons +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; If there exists a sudirectory of the lib-path called dicts then that +;;; is used as the lexicon directory by default. If it doesn't exist +;;; we set lexdir to the directory in CSTR where our lexicons are. +;;; In non-CSTR installations where lexicons are not in lib/dicts, +;;; you should set lexdir in sitevars.scm + +(defvar lexdir + (if (probe_file (path-append libdir "dicts")) + (path-append libdir "dicts/") + ;; else we'll guess we're in the CSTR filespace + (path-as-directory "/projects/festival/lib/dicts/")) + "lexdir + The directory where the lexicon(s) are, by default.") + +(require 'pos) ;; for part of speech mapping + +(define (setup_cstr_lex) +"(setup_cstr_lexicon) +Define and setup the CSTR lexicon. The CSTR lexicon consists +of about 25,000 entries in the mrpa phone set. A large number of +specific local entries are also added to the addenda." 
+ (if (not (member_string "mrpa" (lex.list))) + (begin + (lex.create "mrpa") + (lex.set.compile.file (path-append lexdir "cstrlex.out")) + (lex.set.phoneset "mrpa") + (lex.set.lts.method 'lts_rules) + (lex.set.lts.ruleset 'nrl) + (lex.set.pos.map english_pos_map_wp39_to_wp20) + (mrpa_addenda) + (lex.add.entry + '("previous" nil (((p r ii) 1) ((v ii) 0) ((@ s) 0)))) + (lex.add.entry + '("audio" () (((oo d) 1) ((ii) 0) ((ou) 0)))) + (lex.add.entry + '("modules" () (((m o d) 1) ((uu l s) 0)))) + ))) + +(define (setup_oald_lex) +"(setup_oald_lexicon) +Define and setup the CUVOALD lexicon. This is derived from the +Computer Users Version of the Oxford Advanced Learners' Dictionary +of Current English. This version includes a trained set of letter +to sound rules which have also been used to reduce the actual lexicon +size by over half, for those entries that the lts model gets exactly +the same." + (if (not (member_string "oald" (lex.list))) + (load (path-append lexdir "oald/oaldlex.scm")))) + +(define (setup_cmu_lex) + "(setup_cmu_lex) +Lexicon derived from the CMU lexicon (cmudict-0.4), around 100,000 entries, +in the radio phoneset (sort of darpa-like). Includes letter to sound +rule model trained from this data, and uses the lexical stress predictor +from OALD." + (if (not (member_string "cmu" (lex.list))) + (load (path-append lexdir "cmu/cmulex.scm")))) + +(define (setup_cmumt_lex) + "(setup_cmumt_lex) +Lexicon derived from the CMU lexicon (cmudict-0.4), around 100,000 entries, +in the radio phoneset (sort of darpa-like). Includes letter to sound +rule model trained from this data, and uses the lexical stress predictor +from OALD." + (if (not (member_string "cmumt" (lex.list))) + (load (path-append lexdir "cmu_mt/cmumtlex.scm")))) + +(define (setup_cmu6_lex) + "(setup_cmu6_lex) +Lexicon derived from the CMU lexicon (cmudict-0.6), around 100,000 entries, +in the radio phoneset (sort of darpa-like). 
Includes letter to sound +rule model trained from this data, the format of this lexicon is suitable +for the UniSyn metrical phonology modules. That is the entries are +not syllabified." + (if (not (member_string "cmu6" (lex.list))) + (load (path-append lexdir "cmu6/cmu6lex.scm")))) + +(define (setup_moby_lex) +"(setup_moby_lex) +Define and set up the MOBY lexicon. This is derived from the public +domain version of the Moby (TM) Pronunciator II lexicon. It can be +converted automatically to British English mrpa phoneset which of +course is sub-optimal. It contains around 120,000 entries and has part +of speech information for homographs." + (if (not (member_string "moby" (lex.list))) + (begin + (lex.create "moby") + (lex.set.compile.file (path-append lexdir "mobylex.out")) ;; modified: resolve under lexdir like the other lexicons instead of the developer-local path below + ; (lex.set.compile.file "/home/awb/src/mobypron/mobylex.out") + (lex.set.phoneset "mrpa") + (lex.set.lts.method 'lts_rules) + (lex.set.lts.ruleset 'nrl) + (lex.set.pos.map english_pos_map_wp39_to_wp20) + (lex.add.entry + '("a" dt (((@) 0)))) + (lex.add.entry + '("the" dt (((dh @) 0)))) + (lex.add.entry + '("taylor" n (((t ei) 1) ((l @) 0)))) + (lex.add.entry + '("who" prp ((( h uu ) 0)))) + (mrpa_addenda)))) + +(define (setup_beep_lex) + "(setup_beep_lex) +Lexicon derived from the British English Example Pronunciation dictionary +(BEEP) from Tony Robinson ajr@eng.cam.ac.uk. Around 160,000 entries." 
+ (if (not (member_string "beep" (lex.list))) + (begin + (lex.create "beep") + (lex.set.compile.file (path-append lexdir "beep_lex.out")) + (lex.set.phoneset "mrpa") + (lex.set.lts.method 'lts_rules) + (lex.set.lts.ruleset 'nrl) + (lex.set.pos.map english_pos_map_wp39_to_wp20) + (lex.add.entry + '("taylor" nil (((t ei) 1) ((l @) 0)))) + (mrpa_addenda)))) + +;;; The nrl letter to sound rules produce mrpa phone set so we need +;;; to do some fancy things to make them work for American English +(define (f2b_lts word features) +"(f2b_lts WORD FEATURES) +Letter to sound rule system for f2b (American English), uses the NRL +LTS ruleset and maps the result to the radio phone set." + '("unknown" nil (((ah n) 0) ((n ow n) 1))) +) + +;;; A CART tree for predicting lexical stress for strings of phones +;;; generated by the LTS models. This was actually trained from +;;; OALD as that's the only lexicon with stress and part of speech information +;;; It trained in a phoneset independent way and may be used be either +;;; OALD or CMU models (and probably MOBY and OGI lex too). +;;; On held out data it gets +;;; 07390 378 7768 [7390/7768] 95.134 +;;; 1 512 8207 8719 [8207/8719] 94.128 +;;; 7902 8585 +;;; total 16487 correct 15597.000 94.602% +;;; +(set! 
english_stress_tree +'((sylpos < 1.7) + ((1)) + ((ph_vlng is a) + ((0)) + ((ph_vheight is 1) + ((num2end < 1.5) + ((ph_vfront is 1) + ((ph_vlng is s) ((0)) ((pos is v) ((1)) ((0)))) + ((pos is n) ((0)) ((sylpos < 2.2) ((1)) ((0))))) + ((ph_vlng is l) + ((1)) + ((ph_vfront is 1) + ((num2end < 2.4) + ((0)) + ((pos is a) + ((num2end < 3.3) ((sylpos < 2.3) ((1)) ((0))) ((0))) + ((sylpos < 3.2) + ((num2end < 3.3) ((0)) ((pos is v) ((1)) ((0)))) + ((0))))) + ((0))))) + ((num2end < 1.5) + ((pos is n) + ((0)) + ((sylpos < 2.4) + ((pos is v) + ((1)) + ((ph_vlng is d) + ((ph_vheight is 2) ((ph_vfront is 1) ((1)) ((0))) ((0))) + ((1)))) + ((ph_vlng is d) + ((sylpos < 3.3) + ((pos is v) + ((ph_vheight is 2) ((ph_vfront is 1) ((0)) ((1))) ((0))) + ((0))) + ((0))) + ((ph_vheight is 2) + ((1)) + ((ph_vrnd is +) ((1)) ((ph_vlng is l) ((0)) ((1)))))))) + ((ph_vlng is d) + ((pos is v) + ((sylpos < 2.4) ((1)) ((0))) + ((ph_vfront is 2) + ((pos is n) + ((num2end < 2.4) + ((ph_vrnd is +) + ((0)) + ((sylpos < 2.2) ((1)) ((ph_vheight is 2) ((1)) ((0))))) + ((sylpos < 2.4) ((ph_vheight is 2) ((0)) ((1))) ((0)))) + ((1))) + ((ph_vheight is 2) ((1)) ((ph_vfront is 1) ((0)) ((1)))))) + ((pos is n) + ((num2end < 2.4) + ((ph_vfront is 3) + ((sylpos < 2.3) ((1)) ((ph_vlng is l) ((1)) ((0)))) + ((1))) + ((1))) + ((1))))))))) + +(define (lex_user_unknown_word word feats) + "(lex_user_unknown_word WORD FEATS) +Function called by lexicon when 'function type letter to sound rules +is defined. It is the user's responsibility to defined this function +themselves when they want to deal with unknown words themselves." + (error "lex_user_unknown_word: has not been defined by user")) + +(define (Word utt) +"(Word utt) +Construct (synthesis specific) syllable/segments from Word relation +using current lexicon and specific module." 
+ (let ((rval (apply_method 'Word_Method utt))) + (cond + (rval rval) ;; new style + (t + (Classic_Word utt))))) + +(define (find_oovs vocab oovs) ;; read entries from file VOCAB; for each one with no lexicon entry, write its (lex.lookup entry nil) result to file OOVS, then print a summary + (let ((fd (fopen vocab "r")) + (ofd (fopen oovs "w")) + (e 0) + (oov 0) + (entry)) + + (while (not (equal? (set! entry (readfp fd)) (eof-val))) + (set! e (+ 1 e)) + (if (not (lex.lookup_all entry)) + (begin + (set! oov (+ 1 oov)) + (format ofd "%l\n" (lex.lookup entry nil))))) + (fclose fd) (fclose ofd) ;; modified: release both file handles so the OOV file is flushed and nothing leaks + (format t ";; %d words %d oov %2.2f oov_rate\n" + e oov (/ (* oov 100.0) e)) + ) +) + + +(provide 'lexicons) + + diff --git a/CosyVoice-ttsfrd/resource/festival/lts.scm b/CosyVoice-ttsfrd/resource/festival/lts.scm new file mode 100644 index 0000000000000000000000000000000000000000..23c2dad2f889501ee593881c5c77610be64a8aef --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/lts.scm @@ -0,0 +1,212 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1998 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Functions specific to supporting a trained LTS rules +;;; + +(define (lts_rules_predict word feats) + (let ((dcword (downcase word)) + (syls) (phones)) + (if (string-matches dcword "[a-z]*") + (begin + (set! phones + (cdr (reverse (cdr (reverse (lts_predict dcword)))))) + (set! phones (add_lex_stress word feats phones)) + (set! syls (lex.syllabify.phstress phones)) +;; (set! syls (add_lex_stress word syls)) + ) + (set! syls nil)) + (format t "word %l phones %l\n" word syls) + (list word nil syls))) + +;(define (add_lex_stress word syls) +; (cond +; ((> (length syls) 1) +; (set-car! (cdr (nth (- (length syls) 2) syls)) 1)) +; ((word-is-content word english_guess_pos) +; (set-car! (cdr (car syls)) 1))) +; syls) + +(define (word-is-content word guess_pos) + (cond + ((null guess_pos) + t) + ((member_string word (cdr (car guess_pos))) + nil) + (t + (word-is-content word (cdr guess_pos))))) + +(defvar lts_pos nil) + +(define (lts_predict word rules) + "(lts_predict word rules) +Return list of phones related to word using CART trees." 
+ (let ((utt (make_let_utt (enworden (wordexplode word))))) + (predict_phones utt rules) + (cdr (reverse (cdr (reverse ;; remove #'s + (mapcar + (lambda (p) (intern (item.name p))) + (utt.relation.items utt 'PHONE)))))) + ) +) + +(define (wordexplode lets) + (if (consp lets) + lets + (symbolexplode lets))) + +(define (make_let_utt letters) +"(make_let_utt letters) +Build an utterance with LTS, LETTER and PHONE relations and one LETTER item (with its pos feature set from lts_pos) per element of LETTERS." + (let ((utt (Utterance Text ""))) + (utt.relation.create utt 'LTS) + (utt.relation.create utt 'LETTER) + (utt.relation.create utt 'PHONE) + ;; Create letter stream + (mapcar + (lambda (l) + (let ((lsi (utt.relation.append utt 'LETTER))) + (item.set_feat lsi "pos" lts_pos) + (item.set_name lsi l))) + letters) + utt)) + +(define (predict_phones utt rules) + "(predict_phones utt rules) +Predict phones for the LETTER items of UTT using the CART tree found for each letter in RULES." + (add_new_phone utt (utt.relation.first utt 'LETTER) '#) + (mapcar + (lambda (lsi) + (let ((tree (car (cdr (assoc_string (item.name lsi) rules))))) + (if (not tree) + (format t "failed to find tree for %s\n" (item.name lsi)) + (let ((p (wagon_predict lsi tree))) +; (format t "predict %s %s\n" (item.name lsi) p) + (cond + ((string-matches p ".*-.*-.*-.*") ; a quad one + (add_new_phone utt lsi (string-before p "-")) + (add_new_phone utt lsi (string-before (string-after p "-") "-")) + (add_new_phone utt lsi (string-before (string-after (string-after p "-") "-") "-")) + (add_new_phone utt lsi (string-after (string-after (string-after p "-") "-") "-"))) + ((string-matches p ".*-.*-.*") ; a triple one + (add_new_phone utt lsi (string-before p "-")) + (add_new_phone utt lsi (string-before (string-after p "-") "-")) + (add_new_phone utt lsi (string-after (string-after p "-") "-"))) + ((string-matches p ".*-.*");; a double one + (add_new_phone utt lsi (string-before p "-")) + (add_new_phone utt lsi (string-after p "-"))) + (t + (add_new_phone utt lsi p))))))) + (reverse (cdr (reverse (cdr (utt.relation.items utt 'LETTER)))))) + (add_new_phone utt (utt.relation.last utt 
'LETTER) '#) + utt) + +(define (add_new_phone utt lsi p) + "(add_new_phone utt lsi p) +Add new phone linking to letter, ignoring it if it is _epsilon_." + (if (not (equal? p '_epsilon_)) + (let ((psi (utt.relation.append utt 'PHONE))) + (item.set_name psi p) + (item.relation.append_daughter + (utt.relation.append utt 'LTS lsi) + 'LTS psi) + ))) + +(define (enworden lets) + (cons '# (reverse (cons '# (reverse lets))))) + +;;; Lexical stress assignment +;;; + +(define (add_lex_stress word pos phones tree) + "(add_lex_stress word pos phones tree) +Predict lexical stress by decision tree." + (let ((utt (Utterance Text "")) + (si) + (nphones)) + (utt.relation.create utt 'Letter) + (set! si (utt.relation.append utt 'Letter)) + (item.set_feat si 'pos pos) + (item.set_feat si 'numsyls (count_syls phones)) + (item.set_feat si 'sylpos 1) + (set! nphones (add_lex_stress_syl phones si tree)) +; (format t "%l\n" phones) +; (format t "%l\n" nphones) + nphones)) + +(define (count_syls phones) + (cond + ((null phones) 0) + ((string-matches (car phones) "[aeiou@].*") + (+ 1 (count_syls (cdr phones)))) + (t (count_syls (cdr phones))))) + +(define (add_lex_stress_syl phones si tree) + "(add_lex_stress_syl phones si tree) +Add lexical stressing: append the stress predicted by TREE to each vowel-initial phone in PHONES unless it is 0." + (cond + ((null phones) nil) + ((string-matches (car phones) "[aeiou@].*") + (item.set_feat si 'phone (car phones)) + (item.set_feat si 'name (car phones)) + (item.set_feat si 'num2end + (- (+ 1 (item.feat si 'numsyls)) + (item.feat si 'sylpos))) + (let ((
stress (wagon_predict si tree))) ;; modified: local binding replaces (set! stress ...) on an undeclared global; still predicted before sylpos is advanced + (item.set_feat si 'sylpos + (+ 1 (item.feat si 'sylpos))) + (cons + (if (not (string-equal stress "0")) + (string-append (car phones) stress) + (car phones)) + (add_lex_stress_syl (cdr phones) si tree)))) + (t + (cons + (car phones) + (add_lex_stress_syl (cdr phones) si tree))))) + +;;; Morphological analysis + + +;(define (wfst_stemmer) +; (wfst.load 'stemmer "/home/awb/projects/morpho/engstemmer.wfst") +; (wfst.load 'stemmerL "/home/awb/projects/morpho/engstemmerL.wfst") +; t) + +;(define (stem word) +; (wfst.transduce 'stemmer (enworden (symbolexplode word)))) + +;(define (stemL word) +; (wfst.transduce 'stemmerL (enworden (symbolexplode word)))) + +(provide 'lts) diff --git a/CosyVoice-ttsfrd/resource/festival/lts_build.scm b/CosyVoice-ttsfrd/resource/festival/lts_build.scm new file mode 100644 index 0000000000000000000000000000000000000000..63567d9b9366b368aa8bfbcd2816c9b64fc7f2da --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/lts_build.scm @@ -0,0 +1,723 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1998 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. 
The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Functions for building LTS rules sets from lexicons +;;; +;;; + +(defvar pl-table nil) + +(define (allaligns phones letters) + "(cummulate phones lets) +Aligns all possible ways for these strings." + (cond + ((null letters) + ;; (wrongly) assume there are never less letters than phones + (if phones + (format t "wrong end: %s\n" word)) + nil) + ((null phones) + nil) + (t + (if (< (length phones) (length letters)) + (begin + (cummulate '_epsilon_ (car letters)) + (allaligns phones (cdr letters)))) + (cummulate (car phones) (car letters)) + (allaligns (cdr phones) (cdr letters))))) + +(define (valid-pair phone letter) + "(valid-pair phone letter) +If predefined to be valid." + (let ((entry1 (assoc_string letter pl-table))) + (if entry1 + (assoc_string phone (cdr entry1)) + nil))) + +(define (valid-pair-e phone nphone letter) + "(valid-pair-e phone letter) +Special cases for when epsilon may be inserted before letter." 
+ (let ((ll (assoc_string letter pl-table)) + (pp (intern (string-append phone "-" nphone)))) + (assoc_string pp (cdr ll)))) + +(define (find-aligns phones letters) + "(find-aligns phones letters) +Find all feasible alignments." + (let ((r nil)) + (cond + ((and (null (cdr phones)) (null (cdr letters)) + (equal? (car phones) (car letters)) + (equal? '# (car phones))) + (list (list (cons '# '#)))) ;; valid end match + (t + (if (valid-pair '_epsilon_ (car letters)) + (set! r (mapcar + (lambda (p) + (cons (cons '_epsilon_ (car letters)) p)) + (find-aligns phones (cdr letters))))) + (if (valid-pair (car phones) (car letters)) + (set! r + (append r + (mapcar + (lambda (p) + (cons (cons (car phones) (car letters)) p)) + (find-aligns (cdr phones) (cdr letters)))))) + ;; Hmm, change this to always check doubles + (if (valid-pair-e (car phones) (car (cdr phones)) (car letters)) + (set! r + (append r + (mapcar + (lambda (p) + (cons (cons (intern (format nil "%s-%s" + (car phones) + (car (cdr phones)))) + (car letters)) p)) + (find-aligns (cdr (cdr phones)) + (cdr letters)))))) + r)))) + +(define (findallaligns phones letters) + (let ((a (find-aligns phones letters))) + (if (null a) + (begin + (set! failedaligns (+ 1 failedaligns)) + (format t "failed: %l %l\n" letters phones))) + a)) + +(define (cummulate phone letter) + "(cummulate phone letter) +record the alignment of this phone and letter." + (if (or (equal? phone letter) + (and (not (equal? phone '#)) + (not (equal? letter '#)))) + (let ((entry1 (assoc_string letter pl-table)) + score) + (if (equal? phone '_epsilon_) + (set! score 0.1) + (set! score 1)) + (if entry1 + (let ((entry2 (assoc_string phone (cdr entry1)))) + (if entry2 + (set-cdr! entry2 (+ score (cdr entry2))) + (set-cdr! entry1 (cons (cons phone 1) (cdr entry1))))) + (set! 
pl-table + (cons + (cons letter + (list (cons phone score))) + pl-table))) + t))) + +(define (score-pair phone letter) +"(score-pair phone letter) +Give score for this particular phone letter pair." + (let ((entry1 (assoc_string letter pl-table))) + (if entry1 + (let ((entry2 (assoc_string phone (cdr entry1)))) + (if entry2 + (cdr entry2) + 0)) + 0))) + +(define (cummulate-aligns aligns) + (mapcar + (lambda (a) + (mapcar + (lambda (p) + (cummulate (car p) (cdr p))) + a)) + aligns) + t) + +(define (cummulate-pairs trainfile) + "(cummulate-pairs trainfile) +Build cummulatation table from allowable alignments in trainfile." + (set! failedaligns 0) + (set! allaligns 0) + (if (not pl-table) + (set! pl-table + (mapcar + (lambda (l) + (cons (car l) (mapcar (lambda (c) (cons c 0)) (cdr l)))) + allowables))) + (let ((fd (fopen trainfile "r")) + (c 0) (d 0) + (entry)) + (while (not (equal? (set! entry (readfp fd)) (eof-val))) + (if (equal? c 1000) + (begin + (format t "ENTRY: %d %l\n" (set! d (+ 1000 d)) entry) + (set! c 0))) + (set! word (car entry)) + (cummulate-aligns + (findallaligns + (enworden (car (cdr (cdr entry)))) + (enworden (wordexplode (car entry))))) + (set! allaligns (+ 1 allaligns)) + (format t "aligned %d\n" allaligns) + (set! c (+ 1 c))) + (fclose fd) + (format t "failedaligns %d/%d\n" failedaligns allaligns) + )) + +(define (find_best_alignment phones letters) + "(find_best_alignment phones letters) +Find the alignement containg the most frequent alignment pairs." + ;; hackily do this as a global + (set! fba_best_score 0) + (set! fba_best nil) + (find-best-align phones letters nil 0) + fba_best +) + + +(define (find-best-align phones letters path score) + "(find-best-align phones letters) +Find all feasible alignments." + (cond + ((null letters) + (if (> score fba_best_score) + (begin + (set! fba_best_score score) + (set! 
fba_best (reverse path)))) + nil) + (t + (if (valid-pair '_epsilon_ (car letters)) + (find-best-align phones (cdr letters) + (cons (cons '_epsilon_ (car letters)) path) + (+ score (score-pair '_epsilon_ (car letters))))) + (if (valid-pair (car phones) (car letters)) + (find-best-align (cdr phones) (cdr letters) + (cons (cons (car phones) (car letters))path) + (+ score (score-pair (car phones) (car letters))))) + (if (valid-pair-e (car phones) (car (cdr phones)) (car letters)) + (find-best-align (cdr (cdr phones)) (cdr letters) + (cons (cons (intern (format nil "%s-%s" + (car phones) + (car (cdr phones)))) + (car letters)) + path) + (+ score (score-pair + (intern (format nil "%s-%s" + (car phones) + (car (cdr phones)))) + (car letters)))))))) + +(define (align_and_score phones letters path score) + "(align_and_score phones lets) +Aligns all possible ways for these strings." + (cond + ((null letters) + (if (> score fba_best_score) + (begin + (set! fba_best_score score) + (set! fba_best (reverse path)))) + nil) + (t + (if (< (length phones) (length letters)) + (align_and_score + phones + (cdr letters) + (cons '_epsilon_ path) + (+ score + (score-pair '_epsilon_ (car letters))))) + (align_and_score + (cdr phones) + (cdr letters) + (cons (car phones) path) + (+ score + (score-pair (car phones) (car letters))))))) + +(define (aligndata file ofile) + (let ((fd (fopen file "r")) + (ofd (fopen ofile "w")) + (c 1) + (entry)) + (while (not (equal? (set! entry (readfp fd)) (eof-val))) + (set! lets (enworden (wordexplode (car entry)))) + (set! bp (find_best_alignment + (enworden (car (cdr (cdr entry)))) + lets)) + (if (not bp) + (format t "align failed: %l\n" entry) + (save_info (car (cdr entry)) bp ofd)) + (set! 
c (+ 1 c))) + (fclose fd) + (fclose ofd))) + +(define (enworden lets) + (cons '# (reverse (cons '# (reverse lets))))) + +(define (wordexplode lets) + (if (consp lets) + lets + (symbolexplode lets))) + +(define (save_info pos bp ofd) + "(save_info pos bp ofd) +Cut out one expensive step and 50M of diskspace and just save it +in a simpler format." + (format ofd "( ( ") + (mapcar + (lambda (l) + (if (not (string-equal "#" (cdr l))) + (format ofd "%l " (cdr l)))) + bp) + (format ofd ") %l" pos) + (mapcar + (lambda (l) + (if (not (string-equal "#" (car l))) + (format ofd " %s" (car l)))) + bp) + (format ofd " )\n")) + +(define (normalise-table pl-table) + "(normalise-table pl-table) +Change scores into probabilities." + (mapcar + (lambda (s) + (let ((sum (apply + (mapcar cdr (cdr s))))) + (mapcar + (lambda (p) + (if (equal? sum 0) + (set-cdr! p 0) + (set-cdr! p (/ (cdr p) sum)))) + (cdr s)))) + pl-table) + t) + +(define (save-table pre) + (normalise-table pl-table) + (set! fd (fopen (string-append pre "pl-tablesp.scm") "w")) + (format fd "(set! pl-table '\n") + (pprintf pl-table fd) + (format fd ")\n") + (fclose fd) + t) + +(define (build-feat-file alignfile featfile) +"(build-feat-file alignfile featfile) +Build a feature file from the given align file. The feature +file contain predicted phone, and letter with 3 preceding and +3 succeeding letters." + (let ((fd (fopen alignfile "r")) + (ofd (fopen featfile "w")) + (entry) + (pn) + (sylpos 1)) + (while (not (equal? (set! entry (readfp fd)) (eof-val))) +;; (format t "read: %l\n" entry) + (set! lets (append '(0 0 0 0 #) (wordexplode (car entry)) + '(# 0 0 0 0))) + (set! phones (cdr (cdr entry))) + (set! 
pn 5) + (mapcar + (lambda (p) + (format ofd + "%s %s %s %s %s %s %s %s %s %s %s\n" + p + (nth (- pn 4) lets) + (nth (- pn 3) lets) + (nth (- pn 2) lets) + (nth (- pn 1) lets) + (nth pn lets) + (nth (+ pn 1) lets) + (nth (+ pn 2) lets) + (nth (+ pn 3) lets) + (nth (+ pn 4) lets) + (cond + ((not (consp (car (cdr entry)))) + (car (cdr entry))) + ((not (consp (caar (cdr entry)))) + (caar (cdr entry))) + (t nil)) + ;; sylpos + ;; numsyls + ;; num2end + ) + (set! pn (+ 1 pn))) + phones)) + (fclose fd) + (fclose ofd)) +) + +(define (merge_models name filename allowables) +"(merge_models name filename) +Merge the models into a single list of cart trees as a variable +named by name, in filename." + (require 'cart_aux) + (let (trees fd) + (set! trees nil) + (set! lets (mapcar car allowables)) + (while lets + (if (probe_file (format nil "lts.%s.tree" (car lets))) + (begin + (format t "%s\n" (car lets)) + (set! tree (car (load (format nil "lts.%s.tree" (car lets)) t))) + (set! tree (cart_simplify_tree2 tree nil)) + (set! trees + (cons (list (car lets) tree) trees)))) + (set! lets (cdr lets))) + (set! trees (reverse trees)) + (set! fd (fopen filename "w")) + (format fd ";; LTS rules \n") + (format fd "(set! %s '(\n" name) + (mapcar + (lambda (tree) (pprintf tree fd)) + trees) + (format fd "))\n") + (fclose fd)) +) + +(define (lts_testset file cartmodels) + "(lts_testset file cartmodels) +Test an aligned lexicon file against a set of cart trees. Prints out +The number of letters correct (for each letter), total number of +letters correct and the total number of words correct. cartmodels is +the structure as saved by merge_models." + (let ((fd (fopen file "r")) + (entry) + (wordcount 0) + (correctwords 0) + (phonecount 0) + (correctphones 0)) + (while (not (equal? (set! entry (readfp fd)) (eof-val))) + (let ((letters (enworden (wordexplode (car entry)))) + (phones (enworden (cdr (cdr entry)))) + (pphones)) + (set! wordcount (+ 1 wordcount)) + (set! 
pphones (gen_cartlts letters (car (cdr entry)) cartmodels)) +; (set! pphones +; (or ; unwind-protect +; (gen_vilts letters (car (cdr entry)) +; cartmodels wfstname) +; nil)) + (if (equal? (ph-normalize pphones) (ph-normalize phones)) + (set! correctwords (+ 1 correctwords)) + (or nil + (format t "failed %l %l %l %l\n" (car entry) (car (cdr entry)) phones pphones))) + (count_correct_letters ;; exclude #, cause they're always right + (cdr letters) + (cdr phones) + (cdr pphones)) + (set! phonecount (+ (length (cdr (cdr letters))) phonecount)) + )) + (fclose fd) + (mapcar + (lambda (linfo) + (format t "%s %d correct %d (%2.2f)\n" + (car linfo) (car (cdr linfo)) + (car (cdr (cdr linfo))) + (/ (* (car (cdr (cdr linfo))) 100) (car (cdr linfo)))) + (set! correctphones (+ correctphones (car (cdr (cdr linfo)))))) + correct_letter_table) + (format t "phones %d correct %d (%2.2f)\n" + phonecount correctphones (/ (* correctphones 100) phonecount)) + (format t "words %d correct %d (%2.2f)\n" + wordcount correctwords (/ (* correctwords 100) wordcount)) + (format t "tree model has %d nodes\n" + (apply + (mapcar (lambda (a) (cart_tree_node_count (car (cdr a)))) + cartmodels))) + )) + +(define (cart_tree_node_count tree) + "(tree_node_count tree) +Count the number nodes (questions and leafs) in the given CART tree." + (cond + ((cdr tree) + (+ 1 + (cart_tree_node_count (car (cdr tree))) + (cart_tree_node_count (car (cdr (cdr tree)))))) + (t + 1))) + +(defvar correct_letter_table + (mapcar + (lambda (l) (list l 0 0)) + '(a b c d e f g h i j k l m n o p q r s t u v w x y z)) + "correct_letter_table +List used to cummulate the number of correct (and incorrect) letter to +phone predictions. 
This list will be extended if there are more letters +in your alphabet, though it doesn't take a fairly western european +view of the alphabet, but you can change this yourself is necessary.") + +(define (count_correct_letters lets phs pphs) + "(count_correct_letters lets phs pphs) +Count which letters have the correct phone prediction. Cummulate this +is a per letter table." + (cond + ((or (null phs) (null pphs) (null lets)) + (format t "misaligned entry\n") + nil) + ((and (null (cdr lets)) (null (cdr phs)) (null (cdr pphs))) + nil) ;; omit final # + (t + (let ((letinfo (assoc_string (car lets) correct_letter_table))) + (if (not letinfo) + (set! correct_letter_table + (append correct_letter_table + (list (set! letinfo (list (car lets) 0 0)))))) + (set-car! (cdr letinfo) (+ 1 (car (cdr letinfo)))) ;; total + (if (equal? (car phs) (car pphs)) ;; correct + (set-car! (cdr (cdr letinfo)) (+ 1 (car (cdr (cdr letinfo)))))) + (count_correct_letters (cdr lets) (cdr phs) (cdr pphs)))))) + +(define (ph-normalize ph) + (cond + ((null ph) nil) + ((string-equal "_epsilon_" (car ph)) + (ph-normalize (cdr ph))) + ((string-matches (car ph) ".*-.*") + (cons + (string-before (car ph) "-") + (cons + (string-after (car ph) "-") + (ph-normalize (cdr ph))))) + (t + (cons (car ph) (ph-normalize (cdr ph)))))) + +(define (make_let_utt_p letters pos) +"(make_let_utt letters) +Build an utterances from th4ese letters." + (let ((utt (Utterance Text ""))) + (utt.relation.create utt 'LTS) + (utt.relation.create utt 'LETTER) + (utt.relation.create utt 'PHONE) + ;; Create letter stream + (mapcar + (lambda (l) + (let ((lsi (utt.relation.append utt 'LETTER))) + (item.set_name lsi l) + (item.set_feat lsi "pos" pos))) + letters) + utt)) + +(define (gen_vilts letters pos cartmodels ngram) + "(get_vilts letters pos cartmodels ngram) +Use cart plus ngrams in viterbi search." + (require 'lts) + (let ((utt (make_let_utt_p letters pos))) + (set! 
gen_vit_params + (list + (list 'Relation "LETTER") + (list 'return_feat "phone") + (list 'p_word "#") + (list 'pp_word "0") + (list 'ngramname ngram) +; (list 'wfstname ngram) + (list 'cand_function 'lts_cand_function))) + (Gen_Viterbi utt) + (mapcar + (lambda (lsi) + (intern (item.feat lsi "phone"))) + (utt.relation.items utt 'LETTER)))) + +(define (gen_cartlts letters pos cartmodels) + "(get_cartlts letters cartmodels) +Generate the full list of predicted phones, including +epsilon and unexpanded multi-phones." + (require 'lts) + (let ((utt (make_let_utt_p letters pos))) + (enworden + (mapcar + (lambda (lsi) + (let ((tree (car (cdr (assoc_string (item.name lsi) cartmodels)))) + (p)) + (if (not tree) + (begin + (format t "failed to find tree for %s\n" (item.name lsi)) + nil) + (begin + (set! p (wagon_predict lsi tree)) + (item.set_feat lsi "val" p) + p)))) + (reverse (cdr (reverse (cdr (utt.relation.items utt 'LETTER))))))))) + +(define (reduce_lexicon entryfile exceptionfile lts_function) + "(reduce_lexicon entryfile exceptionfile lts_function) +Look up each word in entryfile using the current lexicon, if the entry +doesn't match save it in the exception file. This is a way of reducing +the lexicon based on a letter to sound model (and lexical stress +model, if appropriate)." + (let ((fd (fopen entryfile "r")) + (ofd (fopen exceptionfile "w")) + (entry) + (wordcount 0) + (correctwords 0)) + (while (not (equal? (set! entry (readfp fd)) (eof-val))) + (if (and (consp entry) + (> (length entry) 1)) + (let ((lts (lts_function (car entry) (car (cdr entry)))) + (encount (lex.entrycount (car entry)))) + (set! wordcount (+ 1 wordcount)) + (if (and (equal? (nth 2 entry) (nth 2 lts)) + (< encount 2)) + (set! 
correctwords (+ 1 correctwords)) + (format ofd "%l\n" entry)) + ))) + (fclose fd) + (fclose ofd) + (format t "words %d correct %d (%2.2f)\n" + wordcount correctwords (/ (* correctwords 100) wordcount)) + )) + +(define (dump-flat-entries infile outfile ltype) + (let ((ifd (fopen infile "r")) + (ofd (fopen outfile "w")) + clength + entry) +; (set! entry (readfp ifd)) +; (if (or (consp entry) (not (string-equal entry "MNCL"))) +; (begin +; (format t "Expected MNCL at start of file: not a compiled lexicon\n") +; (exit))) + (while (not (equal? (set! entry (readfp ifd)) (eof-val))) + (cond + ((not (consp entry)) + t) ;; not an entry + ((string-equal ltype "utf8") + (set! clength (length (utf8explode (car entry))))) + (t + (set! clength (length (car entry))))) + (cond + ((not (consp entry)) + t) ;; not an entry + ((and ;(string-matches (car entry) "...*") + ;(< clength 14) + (not (string-matches (car entry) ".*'.*")) ;; no quotes + (car (cddr entry))) ;; non-nil pronounciation + (begin + (cond + ((string-equal ltype "utf8") + (format ofd + "( %l %l (" + (utf8explode (car entry)) + (cadr entry))) + ((string-equal ltype "asis") + (format ofd + "( \"%s\" %l (" + (car entry) + (cadr entry))) + (t + (format ofd + "( \"%s\" %l (" + (downcase (car entry)) + (cadr entry)))) + (if (consp (car (car (cddr entry)))) + (begin ;; it is syllabified) + (mapcar + (lambda (syl) + (mapcar + (lambda (seg) + (cond + ((string-matches seg "[aeiouAEIOU@].*") + (format ofd "%s " (string-append seg (cadr syl)))) + (t + (format ofd "%s " seg)))) + (car syl))) + (car (cddr entry)))) + (begin ;; it is already flat + (mapcar + (lambda (p) + (format ofd "%s " p)) + (car (cddr entry))) + )) + (format ofd "))\n"))) + (t nil))) + (fclose ifd) + (fclose ofd))) + +(define (dump-lets-phones infile) + "(dump-lets-phones infile) +Dump all the letters to alllets.out and phones to allphones.out for processing. +This expects an external script to sort and uniquify them. 
This is done +in scheme so we can get utf8/non-utf8 to be easy." + (let ((ifd (fopen infile "r")) + (lfd (fopen "alllets.out" "w")) + (apfd (fopen "allphones.out" "w")) + (pfd (fopen "let2phones.out" "w")) + entry) + (while (not (equal? (set! entry (readfp ifd)) (eof-val))) + (mapcar + (lambda (l) + (format lfd "%s\n" l) + (format pfd "%s " l) + (mapcar + (lambda (p) (format pfd "%s " p)) + (car (cddr entry))) + (format pfd "\n")) + (wordexplode (car entry))) + (mapcar + (lambda (p) (format apfd "%s " p)) + (car (cddr entry))) + (format apfd "\n") + ) + (fclose ifd) + (fclose lfd) + (fclose pfd) + (fclose apfd) + t)) + +(define (dump-flat-entries-all infile outfile) + "(dump-flat-entries-all infile outfile) +Do this for *all* entries not just ones with more than three chars." + (let ((ifd (fopen infile "r")) + (ofd (fopen outfile "w")) + entry) + (readfp ifd) ;; skip "MNCL" + (while (not (equal? (set! entry (readfp ifd)) (eof-val))) + (if (consp entry) + (begin + (format ofd + "( \"%s\" %s (" + (downcase (car entry)) + (cadr entry)) + (mapcar + (lambda (syl) + (mapcar + (lambda (seg) + (cond +; ((string-equal seg "ax") +; (format ofd "%s " seg)) + ((string-matches seg "[aeiouAEIOU@].*") + (format ofd "%s " (string-append seg (cadr syl)))) + (t + (format ofd "%s " seg)))) + (car syl))) + (car (cddr entry))) + (format ofd "))\n")))) + (fclose ifd) + (fclose ofd))) + +(provide 'lts_build) + diff --git a/CosyVoice-ttsfrd/resource/festival/mbrola.scm b/CosyVoice-ttsfrd/resource/festival/mbrola.scm new file mode 100644 index 0000000000000000000000000000000000000000..77d1e42c5f2f0d542828f71eff592cd157808252 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/mbrola.scm @@ -0,0 +1,103 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Support for MBROLA as an external module. +;;; + +;;; You might want to set this in your sitevars.scm +(defvar mbrola_progname "/cstr/external/mbrola/mbrola" + "mbrola_progname + The program name for mbrola.") +(defvar mbrola_database "fr1" + "mbrola_database + The name of the MBROLA database to usde during MBROLA Synthesis.") + +(define (MBROLA_Synth utt) + "(MBROLA_Synth UTT) + Synthesize using MBROLA as external module. Basically dump the info + from this utterance. 
Call MBROLA and reload the waveform into utt. + [see MBROLA]" + (let ((filename (make_tmp_filename)) + ) + (save_segments_mbrola utt filename) + (system (string-append mbrola_progname " " + mbrola_database " " + filename " " + filename ".au")) + (utt.import.wave utt (string-append filename ".au")) + (apply_hooks after_synth_hooks utt) + (delete-file filename) + (delete-file (string-append filename ".au")) + utt)) + +(define (save_segments_mbrola utt filename) + "(save_segments_mbrola UTT FILENAME) + Save segment information in MBROLA format in filename. The format is + phone duration (ms) [% position F0 target]*. [see MBROLA]" + (let ((fd (fopen filename "w"))) + (mapcar + (lambda (segment) + (save_seg_mbrola_entry + (item.feat segment 'name) + (item.feat segment 'segment_start) + (item.feat segment 'segment_duration) + (mapcar + (lambda (targ_item) + (list + (item.feat targ_item "pos") + (item.feat targ_item "f0"))) + (item.relation.daughters segment 'Target)) ;; list of targets + fd)) + (utt.relation.items utt 'Segment)) + (fclose fd))) + +(define (save_seg_mbrola_entry name start dur targs fd) + "(save_seg_mbrola_entry ENTRY NAME START DUR TARGS FD) + Entry contains, (name duration num_targs start 1st_targ_pos 1st_targ_val)." 
+ (format fd "%s %d " name (nint (* dur 1000))) + (if targs ;; if there are any targets + (mapcar + (lambda (targ) ;; targ_pos and targ_val + (let ((targ_pos (car targ)) + (targ_val (car (cdr targ)))) + + (format fd "%d %d " + (nint (* 100 (/ (- targ_pos start) dur))) ;; % pos of target + (nint (parse-number targ_val))) ;; target value + )) + targs)) + (terpri fd) + (terpri fd) +) + +(provide 'mbrola) diff --git a/CosyVoice-ttsfrd/resource/festival/mettree.scm b/CosyVoice-ttsfrd/resource/festival/mettree.scm new file mode 100644 index 0000000000000000000000000000000000000000..638ded1090e6551a37ac32cbd49457ecf7678345 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/mettree.scm @@ -0,0 +1,88 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1998 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Some (experimental) data for investigating metrical trees +;;; + +;;; Set up generation of metrical tree, this includes getting +;;; a syntactic parse +;;; +;;; Use as +;;; (set! utt1 (metsynth (Utterance Text "For afternoon tea"))) +;;; (utt.relation_tree utt1 'MetricalTree) + +(require 'scfg) +(set! scfg_grammar (load (path-append libdir "scfg_wsj_wp20.gram") t)) + +(define (mettext utt) + (Initialize utt) + (Text utt) + (Token_POS utt) + (Token utt) + (POS utt) + (print "here1") + (Phrasify utt) + (print "here2") + (ProbParse utt) + (print "here3") + (auto_metrical_tree utt) +) + +(define (metsynth utt) + (mettext utt) + (Wave_Synth utt) +) + +;;; Assumed everything is using Roger diphones + +;;(lex.create "cmu_mettree") +;;;(lex.set.phoneset "radio_phones") +;;(lex.set.phoneset "radio_phones") + +(define (setup_cmu_mettree_lex) + "(setup_cmu_mettreelex) +Lexicon derived from the CMU lexicon (cmudict-0.1), around 100,000 entries, +in the radio phoneset (sort of darpa-like)." 
+ (if (not (member_string "cmu_mettree" (lex.list))) + (begin + (print "making cmu lexicon") + (lex.create "cmu_mettree") + (lex.set.compile.file (path-append lexdir "cmu_mettree_lex.out")) + (lex.set.phoneset "radio") + (require 'lts__us) ;; US English letter to sound rules + (lex.set.lts.method 'lts_rules) + (lex.set.lts.ruleset 'nrl_us)))) + +(provide 'mettree) + + diff --git a/CosyVoice-ttsfrd/resource/festival/module_description.scm b/CosyVoice-ttsfrd/resource/festival/module_description.scm new file mode 100644 index 0000000000000000000000000000000000000000..0cf426f9a9afcd012bed5a49642fee8821b6cfd5 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/module_description.scm @@ -0,0 +1,117 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Handle module descriptions. +;;; + +(defvar *module-descriptions* nil + "*module-descriptions* + An association list recording the description objects for proclaimed + modules.") + +(define (set_module_description mod desc) + "(set_module_description MOD DESC) + Set the description for the module named MOD." + (let ((entry (assoc mod *module-descriptions*))) + (if entry + (set-cdr! entry (cons desc nil)) + (set! *module-descriptions* (cons (cons mod (cons desc nil)) + *module-descriptions*)) + ) + ) + ) + +(define (module_description mod) + "(module_description MOD) + Returns the description record of the module named by symbol MOD" + (let ((entry (assoc mod *module-descriptions*))) + (if entry + (car (cdr entry)) + nil + ) + ) + ) + +(defmac (proclaim form) + "(proclaim NAME &opt DESCRIPTION...) + Anounce the availability of a module NAME. DESCRIPTION + is a description in a fixed format." + (let ((name (car (cdr form))) + (description (cdr form)) + ) + (list 'proclaim-real (list 'quote name) (list 'quote description)) + ) + ) + +(define (proclaim-real name description) + (set! 
*modules* (cons name *modules*)) +; (if description +; (set_module_description name (create_module_description description)) +; ) + ) + +(define (describe_module mod) + "(describe_module MOD) + Describe the module named by the symbol MOD." + + (let ((entry (module_description mod))) + (format t "---------------------\n") + (if entry + (print_module_description entry) + (format t "No description for %l\n" mod) + ) + (format t "---------------------\n") + ) + ) + +(define (describe_all_modules) + "(describe_all_modules) + Print descriptions of all proclaimed modules" + (format t "---------------------\n") + (let ((p *module-descriptions*)) + (while p + (print_module_description (car (cdr (car p)))) + (format t "---------------------\n") + (set! p (cdr p)) + ) + ) + ) + +(proclaim + module_description 1.1 + "CSTR" "Richard Caley " + ( "Handle module descriptions from C++ and from Scheme." + ) + ) + +(provide 'module_description) diff --git a/CosyVoice-ttsfrd/resource/festival/mrpa_allophones.scm b/CosyVoice-ttsfrd/resource/festival/mrpa_allophones.scm new file mode 100644 index 0000000000000000000000000000000000000000..fbabf369c8047c14c6553c5323e8495f56f9619f --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/mrpa_allophones.scm @@ -0,0 +1,111 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; +;; Centre for Speech Technology Research ;; +;; University of Edinburgh, UK ;; +;; Copyright (c) 1996,1997 ;; +;; All Rights Reserved. ;; +;; ;; +;; Permission is hereby granted, free of charge, to use and distribute ;; +;; this software and its documentation without restriction, including ;; +;; without limitation the rights to use, copy, modify, merge, publish, ;; +;; distribute, sublicense, and/or sell copies of this work, and to ;; +;; permit persons to whom this work is furnished to do so, subject to ;; +;; the following conditions: ;; +;; 1. 
The code must retain the above copyright notice, this list of ;; +;; conditions and the following disclaimer. ;; +;; 2. Any modifications must be clearly marked as such. ;; +;; 3. Original authors' names are not deleted. ;; +;; 4. The authors' names are not used to endorse or promote products ;; +;; derived from this software without specific prior written ;; +;; permission. ;; +;; ;; +;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;; THIS SOFTWARE. ;; +;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; A definition of the extended mrpa phone set used for some diphone sets +;; + +(defPhoneSet + mrpa_allophones + ;;; Phone Features + (;; vowel or consonant + (vc + -) + ;; vowel length: short long dipthong schwa + (vlng s l d a 0) + ;; vowel height: high mid low + (vheight 1 2 3 -) + ;; vowel frontness: front mid back + (vfront 1 2 3 -) + ;; lip rounding + (vrnd + -) + ;; consonant type: stop fricative affricative nasal liquid + (ctype s f a n l 0) + ;; place of articulation: labial alveolar palatal labio-dental + ;; dental velar + (cplace l a p b d v 0) + ;; consonant voicing + (cvox + -) + ) + ;; Phone set members + ( + (uh + s 2 3 - 0 0 +) + (e + s 2 1 - 0 0 +) + (a + s 3 1 - 0 0 +) + (o + s 3 3 - 0 0 +) + (i + s 1 1 - 0 0 +) + (u + s 1 3 + 0 0 +) + (ii + l 1 1 - 0 0 +) + (uu + l 2 3 + 0 0 +) + (oo + l 3 2 - 0 0 +) + (aa + l 3 1 - 0 0 +) + (@@ + l 2 2 - 0 0 +) + (ai + d 3 1 - 0 0 +) + (ei + d 2 1 - 0 0 +) + (oi 
+ d 3 3 - 0 0 +) + (au + d 3 3 + 0 0 +) + (ou + d 3 3 + 0 0 +) + (e@ + d 2 1 - 0 0 +) + (i@ + d 1 1 - 0 0 +) + (u@ + d 3 1 - 0 0 +) + (@ + a - - - 0 0 +) + (p - 0 - - + s l -) + (t - 0 - - + s a -) + (k - 0 - - + s p -) + (b - 0 - - + s l +) + (d - 0 - - + s a +) + (g - 0 - - + s p +) + (s - 0 - - + f a -) + (z - 0 - - + f a +) + (sh - 0 - - + f p -) + (zh - 0 - - + f p +) + (f - 0 - - + f b -) + (v - 0 - - + f b +) + (th - 0 - - + f d -) + (dh - 0 - - + f d +) + (ch - 0 - - + a a -) + (jh - 0 - - + a a +) + (h - 0 - - + a v -) + (m - 0 - - + n l +) + (n - 0 - - + n d +) + (ng - 0 - - + n v +) + (l - 0 - - + l d +) + (ll - 0 - - + l d +) + (y - 0 - - + l a +) + (r - 0 - - + l p +) + (w - 0 - - + l l +) + (# - 0 - - - 0 0 -) + ) + ) + +(PhoneSet.silences '(#)) + +(provide 'mrpa_allophones) diff --git a/CosyVoice-ttsfrd/resource/festival/mrpa_durs.scm b/CosyVoice-ttsfrd/resource/festival/mrpa_durs.scm new file mode 100644 index 0000000000000000000000000000000000000000..86b14ca79e63fe01266c18a3e55c6eea50d88755 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/mrpa_durs.scm @@ -0,0 +1,136 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. 
The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; mrpa average phoneme durations from gsw 450 +;;; +(set! phoneme_durations +'( +(u 0.067) +(i@ 0.146) +(h 0.067) +(uu 0.105) +(uh 0.090) +(v 0.053) +(oo 0.145) +(i 0.060) +(jh 0.097) +(ii 0.095) +(w 0.066) +(k 0.088) +(+ 0.036) +(y 0.051) +(l 0.067) +(zh 0.080) +(ng 0.072) +(m 0.070) +(z 0.079) +(## 0.256) +(au 0.162) +(a 0.118) +(n 0.065) +(o 0.102) +(ai 0.156) +(b 0.071) +(ou 0.129) +(ch 0.119) +(p 0.094) +(oi 0.165) +(# 0.040) +(e@ 0.131) +(d 0.052) +(dh 0.032) +(e 0.091) +(r 0.062) +(sh 0.101) +(@@ 0.149) +(ei 0.131) +(f 0.091) +(s 0.093) +(g 0.066) +(u@ 0.120) +(aa 0.173) +(t 0.073) +(th 0.080) +(@ 0.054) +)) + +(set! 
gsw_durs +'( +(# 0.200 0.100) +(h 0.061 0.028) +(i@ 0.141 0.061) +(u 0.067 0.024) +(uu 0.107 0.044) +(uh 0.087 0.025) +(v 0.051 0.019) +(oo 0.138 0.046) +(i 0.058 0.023) +(ii 0.092 0.035) +(w 0.054 0.023) +(jh 0.094 0.024) +(k 0.089 0.034) +(y 0.048 0.025) +(l 0.056 0.026) +(zh 0.077 0.030) +(ng 0.064 0.024) +(m 0.063 0.021) +(z 0.072 0.029) +(a 0.120 0.036) +(au 0.171 0.046) +(n 0.059 0.025) +(ou 0.134 0.039) +(b 0.073 0.021) +(o 0.094 0.037) +(ai 0.137 0.047) +(ch 0.128 0.039) +(oi 0.183 0.050) +(p 0.101 0.032) +(e@ 0.144 0.061) +(d 0.048 0.021) +(dh 0.031 0.016) +(e 0.092 0.035) +(r 0.053 0.025) +(sh 0.108 0.031) +(f 0.095 0.033) +(@@ 0.147 0.035) +(ei 0.130 0.042) +(s 0.102 0.037) +(u@ 0.140 0.057) +(th 0.093 0.050) +(g 0.064 0.021) +(aa 0.155 0.045) +(t 0.070 0.034) +(@ 0.046 0.020) +)) + +(provide 'mrpa_durs) diff --git a/CosyVoice-ttsfrd/resource/festival/mrpa_phones.scm b/CosyVoice-ttsfrd/resource/festival/mrpa_phones.scm new file mode 100644 index 0000000000000000000000000000000000000000..84e2c176df50504131af5fcc94cfbb670c1ab966 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/mrpa_phones.scm @@ -0,0 +1,114 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; +;; Centre for Speech Technology Research ;; +;; University of Edinburgh, UK ;; +;; Copyright (c) 1996,1997 ;; +;; All Rights Reserved. ;; +;; ;; +;; Permission is hereby granted, free of charge, to use and distribute ;; +;; this software and its documentation without restriction, including ;; +;; without limitation the rights to use, copy, modify, merge, publish, ;; +;; distribute, sublicense, and/or sell copies of this work, and to ;; +;; permit persons to whom this work is furnished to do so, subject to ;; +;; the following conditions: ;; +;; 1. The code must retain the above copyright notice, this list of ;; +;; conditions and the following disclaimer. ;; +;; 2. Any modifications must be clearly marked as such. ;; +;; 3. 
Original authors' names are not deleted. ;; +;; 4. The authors' names are not used to endorse or promote products ;; +;; derived from this software without specific prior written ;; +;; permission. ;; +;; ;; +;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;; THIS SOFTWARE. ;; +;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; A definition of the mrpa phone set +;; + +(defPhoneSet + mrpa + ;;; Phone Features + (;; vowel or consonant + (vc + -) + ;; vowel length: short long dipthong schwa + (vlng s l d a 0) + ;; vowel height: high mid low + (vheight 1 2 3 0) + ;; vowel frontness: front mid back + (vfront 1 2 3 0) + ;; lip rounding + (vrnd + - 0) + ;; consonant type: stop fricative affricate nasal lateral approximant + (ctype s f a n l r 0) + ;; place of articulation: labial alveolar palatal labio-dental + ;; dental velar glottal + (cplace l a p b d v g 0) + ;; consonant voicing + (cvox + - 0) + ) + ;; Phone set members + ( + (uh + s 2 3 - 0 0 0) + (e + s 2 1 - 0 0 0) + (a + s 3 1 - 0 0 0) + (o + s 2 3 + 0 0 0) + (i + s 1 1 - 0 0 0) + (u + s 1 3 + 0 0 0) + (ii + l 1 1 - 0 0 0) + (uu + l 1 3 + 0 0 0) + (oo + l 3 3 + 0 0 0) + (aa + l 3 3 - 0 0 0) + (@@ + l 2 2 - 0 0 0) + (ai + d 3 2 - 0 0 0) + (ei + d 2 1 - 0 0 0) + (oi + d 3 3 + 0 0 0) + (au + d 3 2 + 0 0 0) + (ou + d 2 2 - 0 0 0) + (e@ + d 2 1 - 0 0 0) + (i@ + d 1 1 - 0 0 0) + (u@ + d 3 1 + 0 0 0) + (@ + a 2 2 - 0 0 0) + (p - 0 0 0 0 s l -) + (t - 0 0 0 0 s a -) + 
(k - 0 0 0 0 s v -) + (b - 0 0 0 0 s l +) + (d - 0 0 0 0 s a +) + (g - 0 0 0 0 s v +) + (s - 0 0 0 0 f a -) + (z - 0 0 0 0 f a +) + (sh - 0 0 0 0 f p -) + (zh - 0 0 0 0 f p +) + (f - 0 0 0 0 f b -) + (v - 0 0 0 0 f b +) + (th - 0 0 0 0 f d -) + (dh - 0 0 0 0 f d +) + (ch - 0 0 0 0 a p -) + (jh - 0 0 0 0 a p +) + (h - 0 0 0 0 f g -) + (m - 0 0 0 0 n l +) + (n - 0 0 0 0 n a +) + (ng - 0 0 0 0 n v +) + (l - 0 0 0 0 l a +) + (y - 0 0 0 0 r p +) + (r - 0 0 0 0 r a +) + (w - 0 0 0 0 r l +) + (# - 0 0 0 0 0 0 -) + ) + ) + +(PhoneSet.silences '(#)) + +(provide 'mrpa_phones) + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/ogimarkup-mode.scm b/CosyVoice-ttsfrd/resource/festival/ogimarkup-mode.scm new file mode 100644 index 0000000000000000000000000000000000000000..2bca41a1a49d62b30ee39ca9e0418c6831caabce --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/ogimarkup-mode.scm @@ -0,0 +1,191 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; An example tts text mode for reading OGI's CSLU toolkit mark up +;;; +;;; Note not all tokens do something in festival but all are removed +;;; from the actual text +;;;
+
+(defvar ogimarkup_eou_tree ;; decision tree installed as eou_tree by ogimarkup_init_func
+'((n.name matches "<.*") ;; the next token opens a tag
+  ((1))
+((n.whitespace matches ".*\n.*\n\\(.\\|\n\\)*") ;; A significant break (2 nls)
+  ((1))
+  ((punc in ("?" ":" "!"))
+   ((1))
+   ((punc is ".")
+    ;; This is to distinguish abbreviations vs periods
+    ;; These are heuristics
+    ((name matches "\\(.*\\..*\\|[A-Z][A-Za-z]?[A-Za-z]?\\|etc\\)") ;; an abbreviation
+     ((n.whitespace is " ")
+      ((0)) ;; if abbrev single space isn't enough for break
+      ((n.name matches "[A-Z].*")
+       ((1))
+       ((0))))
+     ((n.whitespace is " ") ;; if it doesn't look like an abbreviation
+      ((n.name matches "[A-Z].*") ;; single space and non-cap is no break
+       ((1))
+       ((0)))
+      ((1))))
+    ((0)))))))
+
+(define (ogimarkup_init_func)
+  "Called on starting ogimarkup text mode: saves the current token_to_words and eou_tree, then installs the ogimarkup ones."
+  (set! ogimarkup_in_tag nil)
+  (set! ogimarkup_tagtokens "")
+  (set! ogimarkup_previous_t2w_func token_to_words)
+  (set! english_token_to_words ogimarkup_token_to_words)
+  (set! token_to_words ogimarkup_token_to_words)
+  (set! ogimarkup_previous_eou_tree eou_tree)
+  (set! eou_tree ogimarkup_eou_tree))
+
+(define (ogimarkup_exit_func)
+  "Called on exit ogimarkup text mode: resets Duration_Stretch to 1.0 and restores the saved token_to_words and eou_tree."
+  (Parameter.set 'Duration_Stretch 1.0)
+  (set! token_to_words ogimarkup_previous_t2w_func)
+  (set! english_token_to_words ogimarkup_previous_t2w_func) ;; NOTE(review): receives the saved token_to_words; the previous english_token_to_words is never saved
+  (set! eou_tree ogimarkup_previous_eou_tree))
+
+(define (ogimarkup_token_to_words token name)
+  "(ogimarkup_token_to_words TOKEN NAME)
+OGI markup specific token to word rules.  Tags may have optional
+argument e.g. <slow> or <phone 5551234> which means the tag may be over
+a number of tokens.  Tag tokens yield no words, except spell and phone which return the words for their argument."
+  (let (tag (arg nil) (rval nil))
+    (cond
+     ((string-matches name "<.*") ;; this token opens a tag
+      (set! ogimarkup_tagtokens "")
+      (set! tag (string-after name "<"))
+      (if (string-matches tag ".*>$")
+          (set! tag (string-before tag ">"))
+          (if (string-matches (set! arg (item.feat token "n.name")) ;; tag not closed here: the next token is its argument
+                              ".*>$")
+              (set! arg (string-before arg ">"))))
+      (set! ogimarkup_in_tag tag)
+      (cond
+       ((string-equal tag "slow")
+        (Parameter.set 'Duration_Stretch 1.3))
+       ((string-equal tag "SLOW")
+        (Parameter.set 'Duration_Stretch 2.0))
+       ((string-equal tag "normal")
+        (Parameter.set 'Duration_Stretch 1.0))
+       ((string-matches tag "FAST")
+        (Parameter.set 'Duration_Stretch 0.5))
+       ((string-matches tag "fast")
+        (Parameter.set 'Duration_Stretch 0.8))
+       ((string-matches tag "spell")
+        ;; This ain't really right as we'll get an utterance break here
+        (set! rval (symbolexplode arg)))
+       ((string-matches tag "phone")
+        ;; This ain't really right as we'll get an utterance break here
+        (item.set_feat token "token_pos" "digits") ;; canonical phone number
+        (set! rval (ogimarkup_previous_t2w_func token arg)))
+       ((string-matches tag "male")
+        (if (and (member 'OGIresLPC *modules*)
+                 (symbol-bound? 'voice_aec_diphone))
+            (voice_aec_diphone)
+            (voice_kal_diphone)))
+       ((string-matches tag "Male")
+        (if (and (member 'OGIresLPC *modules*)
+                 (symbol-bound? 'voice_mwm_diphone))
+            (voice_mwm_diphone)
+            (voice_cmu_us_rms_cg)))
+       ((string-matches tag "MALE")
+        (if (and (member 'OGIresLPC *modules*)
+                 (symbol-bound? 'voice_jph_diphone))
+            (voice_jph_diphone)
+            (voice_rab_diphone)))
+       ((string-matches tag "FT")
+        t) ;; do nothing until the end of this tag
+       ((string-matches (downcase tag) "female")
+        ;; only one female voice so map female Female FEMALE to it
+        (if (and (member 'OGIresLPC *modules*)
+                 (symbol-bound? 'voice_tll_diphone))
+            (voice_tll_diphone)
+            (voice_cmu_us_slt_arctic_hts))))
+      (if (string-matches name ".*>$")
+          (set! ogimarkup_in_tag nil))
+      rval ;; mostly nil
+      )
+     ((string-matches name ".*>$") ;; this token closes a tag opened on an earlier token
+      (set! ogimarkup_tagtokens
+            (string-append
+             ogimarkup_tagtokens
+             (ogimarkup_get_token_string token t))) ;; delete final >
+      (if (string-equal ogimarkup_in_tag "FT")
+          (ogimarkup_festival_eval ogimarkup_tagtokens))
+      (set! ogimarkup_in_tag nil) ;; end of tag
+      nil)
+     (ogimarkup_in_tag
+      (set! ogimarkup_tagtokens
+            (string-append
+             ogimarkup_tagtokens
+             (ogimarkup_get_token_string token nil)))
+      nil) ;; still in tag
+     (t ;; for all other cases
+      (ogimarkup_previous_t2w_func token name)))))
+
+(set! tts_text_modes
+   (cons
+    (list
+      'ogimarkup ;; mode name
+      (list ;; ogimarkup mode params
+       (list 'init_func ogimarkup_init_func)
+       (list 'exit_func ogimarkup_exit_func)))
+    tts_text_modes))
+
+(define (ogimarkup_get_token_string token delend)
+  "(ogimarkup_get_token_string TOKEN DELEND)
+return string for token including whitespace and punctuation.  If DELEND
+is true remove > from the name."
+  (string-append
+   (item.feat token "whitespace")
+   (item.feat token "prepunctuation")
+   (if delend
+       (string-before
+        (item.feat token "name") ">")
+       (item.feat token "name"))
+   (if (string-equal "0" (item.feat token "punc")) ;; a punc feature of "0" is emitted as the empty string
+       ""
+       (item.feat token "punc"))))
+
+(define (ogimarkup_festival_eval tagtokens)
+"(ogimarkup_festival_eval TAGTOKENS)
+Take a string of the tokens within the tag and read an s-expression from
+it and then evaluate it."
+  ;; MODIFIED: dropped the unused local com; the read expression is bound directly.
+  ;; NOTE(review): this evaluates Scheme taken from the text being read, so only use this mode on trusted input.
+  (let ((command (read-from-string tagtokens))) (eval command)))
+
+(provide 'ogimarkup-mode)
diff --git a/CosyVoice-ttsfrd/resource/festival/pauses.scm b/CosyVoice-ttsfrd/resource/festival/pauses.scm new file mode 100644 index 0000000000000000000000000000000000000000..18af2a9cf30a36db91999ae264103aab8c95a604 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/pauses.scm @@ -0,0 +1,242 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Predicting pause insertion
+
+(define (Pauses utt)
+  "(Pauses UTT)
+Insert pauses where required: use Pause_Method if it yields a result, else Classic_Pauses.  Returns UTT."
+  (let ((rval (apply_method 'Pause_Method utt)))
+    (cond
+     (rval rval) ;; new style
+     (t (Classic_Pauses utt))))
+  (Pause_optional_deleting_B_X utt)
+  utt) ;; MODIFIED: return the utterance like Classic_Pauses/Unisyn_Pauses (previously returned nil)
+
+(define (Classic_Pauses utt)
+  "(Classic_Pauses UTT)
+Predict pause insertion.  Returns UTT."
+  (let ((words (utt.relation.items utt 'Word)) lastword sil seg) ;; MODIFIED: sil/seg are now locals (were leaked globals); unused tpname dropped
+    (if words
+        (begin
+          (insert_initial_pause utt) ;; always have a start pause
+          (set! lastword (car (last words)))
+          (mapcar
+           (lambda (w)
+             (let ((pbreak (item.feat w "pbreak"))
+                   (emph (item.feat w "R:Token.parent.EMPH")))
+               (cond
+                ((or (string-equal "B" pbreak)
+                     (string-equal "BB" pbreak))
+                 (insert_pause utt w))
+;               ((string-equal emph "1")
+;                (insert_pause utt w))
+                ((equal? w lastword)
+                 (insert_pause utt w)))))
+           words)
+          ;; The embarrassing bit.  Remove any words labelled as punc or fpunc
+          (mapcar
+           (lambda (w)
+             (let ((pos (item.feat w "pos")))
+               (if (or (string-equal "punc" pos)
+                       (string-equal "fpunc" pos))
+                   (let ((pbreak (item.feat w "pbreak"))
+                         (wp (item.relation w 'Phrase)))
+                     (if (and (string-matches pbreak "BB?")
+                              (item.relation.prev w 'Word))
+                         (item.set_feat
+                          (item.relation.prev w 'Word) "pbreak" pbreak))
+                     (item.relation.remove w 'Word)
+                     ;; can't refer to w as we've just deleted it
+                     (item.relation.remove wp 'Phrase)))))
+           words)
+          ;; 12/01/2006 V.Strom: Even more embarrassing: Delete all silences
+          ;; that are followed by a silence.  These silence sequences
+          ;; emerge if 'punc of phrase-final words consists of more than one
+          ;; character, e.g. period+quote.  That in turn causes problems in
+          ;; build_utts: the 2nd silence ends up with no features but its name,
+          ;; because there is no corresponding 2nd silence in the phone
+          ;; segmentation to align with.
+          ;; This should be fixed in the functions below, but it is easier for
+          ;; me to clean up at the end:
+          (set! sil (car (car (cdr (car (PhoneSet.description '(silences)))))))
+          (set! seg (item.next (utt.relation.first utt 'Segment)))
+          (while seg
+            (if (and (equal? sil (item.name seg))
+                     (equal? sil (item.name (item.prev seg))))
+                (item.delete (item.prev seg)))
+            (set! seg (item.next seg)))))
+    utt))
+
+(define (insert_pause utt word)
+"(insert_pause UTT WORDITEM)
+Insert a silence segment after the last segment in WORDITEM in UTT."
+  (let ((lastseg (find_last_seg word))
+        (silence (car (car (cdr (car (PhoneSet.description '(silences))))))))
+    (if lastseg
+        (item.relation.insert
+         lastseg 'Segment (list silence) 'after))))
+
+(define (insert_initial_pause utt)
+"(insert_initial_pause UTT)
+Always have an initial silence if the utterance is non-empty.
+Insert a silence segment before the first segment in UTT."
+  (let ((firstseg (car (utt.relation.items utt 'Segment)))
+        (silence (car (car (cdr (car (PhoneSet.description '(silences))))))))
+    (if firstseg
+        (item.relation.insert
+         firstseg 'Segment (list silence) 'before))))
+
+(define (insert_final_pause utt)
+"(insert_final_pause UTT)
+Always have a final silence if the utterance is non-empty."
+  (let ((lastseg (utt.relation.last utt 'Segment))
+        (silence (car (car (cdr (car (PhoneSet.description '(silences))))))))
+    (set! silence (format nil "%l" silence)) ; to make the symbol a string
+    ;(format t "silence is %l\n" silence)
+    ;(format t "lastseg is %l\n" (item.name lastseg))
+    (if lastseg
+        (if (not (equal? (item.name lastseg) silence)) ;; only add one when the last segment is not already the silence
+            (begin
+              (format t "iserted final pause %s\n" silence)
+              (item.relation.insert lastseg 'Segment (list silence) 'after))))))
+
+
+(define (find_last_seg word)
+;;; Find the segment that is immediately at this end of this word
+;;; If this word is punctuation it might not have any segments
+;;; so we have to check back until we find a word with a segment in it
+  (cond
+   ((null word)
+    nil) ;; there are no segs (don't think this can happen)
+   (t
+    (let ((lsyl (item.relation.daughtern word 'SylStructure)))
+      (if lsyl
+          (item.relation.daughtern lsyl 'SylStructure)
+          (find_last_seg (item.relation.prev word 'Word)))))))
+
+(define (Unisyn_Pauses utt)
+  "(Unisyn_Pauses UTT)
+Predict pause insertion in a Unisyn utterance structure.  Returns UTT."
+  (let ((words (utt.relation.items utt 'Word)) lastword) ;; MODIFIED: unused local tpname dropped
+    (if words
+        (begin
+          (us_insert_initial_pause utt) ;; always have a start pause
+          (set! lastword (car (last words)))
+          (mapcar
+           (lambda (w)
+             (let ((pbreak (item.feat w "pbreak"))
+                   (emph (item.feat w "R:Token.parent.EMPH")))
+               (cond
+                ((or (string-equal "B" pbreak)
+                     (string-equal "BB" pbreak))
+                 (us_insert_pause utt w))
+;               ((string-equal emph "1")
+;                (us_insert_pause utt w))
+                ((equal? w lastword)
+                 (us_insert_pause utt w)))))
+           words)
+          ;; The embarrassing bit.  Remove any words labelled as punc or fpunc
+          (mapcar
+           (lambda (w)
+             (let ((pos (item.feat w "pos")))
+               (if (or (string-equal "punc" pos)
+                       (string-equal "fpunc" pos))
+                   (let ((pbreak (item.feat w "pbreak"))
+                         (wp (item.relation w 'Phrase)))
+                     (if (and (string-matches pbreak "BB?")
+                              (item.relation.prev w 'Word))
+                         (item.set_feat
+                          (item.relation.prev w 'Word) "pbreak" pbreak))
+                     (item.relation.remove w 'Word)
+                     ;; can't refer to w as we've just deleted it
+                     (item.relation.remove wp 'Phrase)))))
+           words)))
+    utt))
+
+(define (us_insert_pause utt word)
+"(us_insert_pause UTT WORDITEM)
+Insert a silence segment after the last segment in WORDITEM in UTT."
+  (let ((lastseg (us_find_last_seg word))
+        (silence "pau"))
+    (if lastseg
+        (item.relation.insert
+         lastseg 'Segment (list silence) 'after))))
+
+(define (us_insert_initial_pause utt)
+"(us_insert_initial_pause UTT)
+Always have an initial silence if the utterance is non-empty.
+Insert a silence segment before the first segment in UTT."
+  (let ((firstseg (utt.relation.first utt 'Segment))
+        (silence "pau"))
+    (if firstseg
+        (item.relation.insert
+         firstseg 'Segment (list silence) 'before))))
+
+(define (us_find_last_seg word)
+;;; Find the segment that is immediately at this end of this word
+;;; If this word is punctuation it might not have any segments
+;;; so we have to check back until we find a word with a segment in it
+  (cond
+   ((null word)
+    nil) ;; there are no segs (don't think this can happen)
+   (t
+    (if (item.daughtern_to (item.relation word 'WordStructure) 'Syllable)
+        (item.daughtern_to
+         (item.relation
+          (item.daughtern_to (item.relation word 'WordStructure) 'Syllable)
+          'SylStructure)
+         'Segment)
+        (us_find_last_seg (item.relation.prev word 'Word))))))
+
+(define (Pause_optional_deleting_B_X utt)
+"(Pause_optional_deleting_B_X utt)
+
+Delete all phone symbols starting with 'B_' from the Segment relation
+(a B_150 e.g. is a 150ms pause) if symbol 'Pause_delete_B_X is defined.
+"
+; The B_X never occur in the phone segmentation but are predicted by
+; some pause methods, in particular the default I used to produce the
+; .utt files for the 2009 test sentences for the Blizzard challenge.
+; Some participants complained about them and I had to fix it quickly.
+  (if (symbol-bound? 'Pause_delete_B_X)
+      (let (seg next_seg) ;; MODIFIED: next_seg is now a local (was a leaked global)
+        (set! seg (item.next (utt.relation.first utt 'Segment)))
+        (while seg
+          (set! next_seg (item.next seg)) ;; fetch the successor before seg may be deleted
+          ;(format t "segment %l\n" (item.name seg))
+          (if (string-matches (item.name seg) "B_[0-9]*")
+              (item.delete seg))
+          (set! seg next_seg)))))
+
+(provide 'pauses)
diff --git a/CosyVoice-ttsfrd/resource/festival/phoneset.scm b/CosyVoice-ttsfrd/resource/festival/phoneset.scm new file mode 100644 index 0000000000000000000000000000000000000000..19d9b847c6d6b9f303675a87ffdc2187a925c8e4 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/phoneset.scm @@ -0,0 +1,134 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1999 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Author: Alan W Black +;;; Date: April 1999 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Support code for phone set definitions +;;;
+
+(defmac (defPhoneSet form) ;; (defPhoneSet NAME FEATTYPES PHONES): quotes the three arguments and calls defPhoneSet_real
+  (list 'defPhoneSet_real
+        (list 'quote (cadr form))
+        (list 'quote (car (cddr form)))
+        (list 'quote (cadr (cddr form)))))
+
+(define (defPhoneSet_real name featdefs phones)
+  "(defPhoneSet NAME FEATTYPES PHONES)
+Define a phone set with given name, feature types and
+list of phones.  This also selects name as the current phoneset.  Returns (NAME INFO)."
+  (let (info)
+    (if (not (eq? 'Features (car featdefs))) ;; the new format is marked by a leading Features symbol
+        (begin
+          ;; Old format that has the same number of phone features for
+          ;; all phones
+          (set! info
+                (mapcar
+                 (lambda (ph)
+                   (let ((fvs
+                          (mapcar
+                           list
+                           (mapcar car featdefs)
+                           (cdr ph))))
+                     (ps_check_fvals
+                      (cons (car ph) (cons (list 'type t) fvs))
+                      (cons t fvs)) ;; NOTE(review): built from this phone's own fvs, so each value is checked against itself; confirm whether (cons t featdefs) was intended
+                     (list (car ph) fvs)))
+                 phones)))
+        ;; else
+        ;; New format where types are specified so phones may have
+        ;; different features
+        (set! info
+              (mapcar
+               (lambda (ph)
+                 (let ((fvs
+                        (cons
+                         (list 'type (cadr ph))
+                         (mapcar
+                          list
+                          (mapcar car (cdr (assoc (cadr ph) (cdr featdefs))))
+                          (cddr ph)))))
+                   (ps_check_fvals
+                    (cons (car ph) fvs)
+                    (assoc (cadr ph) (cdr featdefs)))
+                   (list (car ph) fvs)))
+               (cdr phones))))
+    (Param.set
+     (string-append "phonesets." name)
+     info)
+    (PhoneSet.select name)
+    (list name info)))
+
+(define (ps_check_fvals fvs featdefs)
+  "(ps_check_fvals FVS FEATDEFS)
+Check that feature values in a phone definition are in the defined
+set of possibles.  FVS is (PHONE TYPEPAIR (FEAT VAL)...); signals an error on the first bad pair."
+  (mapcar
+   (lambda (fp)
+     (let ((def (cdr (assoc (car fp) (cdr featdefs))))) ;; allowed values for this feature, or nil if it is not declared
+       (cond
+        ((not def)
+         (error "Phoneset definition: phone has no defined type" fvs))
+        ((not (member_string (car (cdr fp)) def))
+         (error
+          (format nil "Phoneset definition: phone feature %l is undefined" fp) fvs)))))
+   (cdr (cdr fvs)))) ;; skip the phone name and the type pair
+
+(define (PhoneSet.select name)
+  "(PhoneSet.select NAME)
+Select named phoneset as current; signals an error if no phoneset NAME has been defined."
+  (if (feats.present Param (string-append "phonesets." name))
+      (Param.set "phoneset" (Param.get (string-append "phonesets." name)))
+      (error "no phoneset defined: " name)))
+
+(define (PhoneSet.description name)
+  "(PhoneSet.description NAME)
+Return (lisp) representation of current phoneset.  NAME is not used by this definition."
+  (feats.tolisp (Param.get "phoneset")))
+
+(define (PhoneSet.list)
+  "(PhoneSet.list)
+List of the names of the currently defined phonesets."
+  ;; This isn't a particularly efficient way to get the answer
+  (mapcar car (feats.tolisp (Param.get "phonesets"))))
+
+(define (PhoneSet.silences sils)
+  "(PhoneSet.silences SILLIST)
+Define the silence phones for the currently selected phoneset."
+  (Param.set "phoneset.silences" sils))
+
+(provide 'phoneset)
+
+
+
+
diff --git a/CosyVoice-ttsfrd/resource/festival/phrase.scm b/CosyVoice-ttsfrd/resource/festival/phrase.scm new file mode 100644 index 0000000000000000000000000000000000000000..d35c8877c0ecb184d06daf94d21e34adb4286daa --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/phrase.scm @@ -0,0 +1,171 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Phrase boundary prediction. +;;; +;;; Two methods supported, if POS is enabled we use ngrams for that +;;; otherwise we use a CART tree +;;; +;;; Models trained from the IBM/Lancaster Spoken English Corpus and +;;; Boston University's FM Radio Corpus. + +;;; +;;; Here's a very simple CART tree for predicting phrase breaks +;;; based on punctuation only +;;; +(set! simple_phrase_cart_tree +' +((lisp_token_end_punc in ("?" "." ":")) + ((BB)) + ((lisp_token_end_punc in ("'" "\"" "," ";")) + ((B)) + ((n.name is 0) ;; end of utterance + ((BB)) + ((NB)))))) + +(define (token_end_punc word) + "(token_end_punc UTT WORD) +If punctuation at end of related Token and if WORD is last word +in Token return punc, otherwise 0." + (if (item.relation.next word "Token") + "0" + (item.feat word "R:Token.parent.punc"))) + +;;; This is a simple CART tree used after boundaries are predicted +;;; by the probabilistic method to get two levels of break +(set! english_phrase_type_tree +'((pbreak is NB) + ((num_break is 1) + ((mB)) + ((R:Token.parent.EMPH is 1) + ((NB)) + ((n.R:Token.parent.EMPH is 1) + ((NB)) + ((NB))))) + ((pbreak is BB) + ((BB)) + ((pbreak is mB) + ((mB)) + ((name in ("." "!" "?"));; only (potentially) change Bs to BBs + ((BB)) + ((B))))))) + +(set! 
f2b_phrase_cart_tree +' +((gpos is punc) + (((1 0.00238095) (3 0) (4 0.997619) B)) + (((4 0.00238095) (3 0) (1 0.997619) NB)))) + +;;; For more detailed prediction of phrase breaks we use POS and +;;; probability distribution of breaks +;;; These models were trained using data from the Lancaster/IBM +;;; Spoken English Corpus + +(require 'pos) ;; for part of speech map + +(defvar pbreak_ngram_dir libdir + "pbreak_ngram_dir + The directory containing the ngram models for predicting phrase + breaks. By default this is the standard library directory.") + +(defvar english_phr_break_params + (list + ;; The name and filename off the ngram with the a priori ngram model + ;; for predicting phrase breaks in the Phrasify module. This model should + ;; predict probability distributions for B and NB given some context of + ;; part of speech tags. + (list 'pos_ngram_name 'english_break_pos_ngram) + (list 'pos_ngram_filename + (path-append pbreak_ngram_dir "sec.ts20.quad.ngrambin")) + ;; The name and filename of the ngram containing the a posteriori ngram + ;; for predicting phrase breaks in the Phrasify module. This module should + ;; predict probability distributions for B and NB given previous B and + ;; NBs. + (list 'break_ngram_name 'english_break_ngram) + (list 'break_ngram_filename + (path-append pbreak_ngram_dir "sec.B.hept.ngrambin")) + ;; A weighting factor for breaks in the break/non-break ngram. + (list 'gram_scale_s 0.05) + ;; When Phrase_Method is prob_models, this tree, if set is used to + ;; potentially predict phrase type. At least some prob_models only + ;; predict B or NB, this tree may be used to change some Bs into + ;; BBs. If it is nil, the pbreak value predicted by prob_models + ;; remains the same. + (list 'phrase_type_tree english_phrase_type_tree) + ;; A list of tags used in identifying breaks. Typically B and NB (and + ;; BB). 
This should be the alphabet of the ngram identified in + ;; break_ngram_name + (list 'break_tags '(B NB)) + (list 'pos_map english_pos_map_wp39_to_wp20) + ) + "english_phr_break_params +Parameters for English phrase break statistical model.") + +(defvar phr_break_params nil + "phr_break_params +Parameters for phrase break statistical model. This is typcal set by +a voice selection function to the parameters for a particular model.") + +;;; +;;; Declaration of some features +;;; + +(def_feature_docstring + 'Word.pbreak + "Word.pbreak + Result from statistical phrasing module, may be B or NB denoting + phrase break or non-phrase break after the word.") + +(def_feature_docstring + 'Word.pbreak_score + "Word.pbreak_score + Log likelihood score from statistical phrasing module, for pbreak + value.") + +(def_feature_docstring + 'Word.blevel + "Word.blevel + A crude translation of phrase break into ToBI like phrase level. + Values may be 0,1,2,3,4.") + +(define (Phrasify utt) +"(Phrasify utt) +Construct phrasify over Words module." + (let ((rval (apply_method 'Phrasify_Method utt))) + (cond + (rval rval) ;; new style + (t + (Classic_Phrasify utt))))) + + +(provide 'phrase) diff --git a/CosyVoice-ttsfrd/resource/festival/pos.scm b/CosyVoice-ttsfrd/resource/festival/pos.scm new file mode 100644 index 0000000000000000000000000000000000000000..0ace3615ba414cb1b0b9f8ddb2d8a88966441579 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/pos.scm @@ -0,0 +1,229 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; A part of speech tagger +;;; + +(set! 
english_guess_pos + '((in of for in on that with by at from as if that against about + before because if under after over into while without + through new between among until per up down) + (to to) + (det the a an no some this that each another those every all any + these both neither no many) + (md will may would can could should must ought might) + (cc and but or plus yet nor) + (wp who what where how when) + (pps her his their its our their its mine) + (aux is am are was were has have had be) + (punc "." "," ":" ";" "\"" "'" "(" "?" ")" "!" "[" "]" "{" "}") + )) + +(defvar guess_pos english_guess_pos + "guess_pos + An assoc-list of simple part of speech tag to list of words in that + class. This basically only contains closed class words all other + words may be assumed to be content words. This was built from information + in the f2b database and is used by the ffeature gpos.") + +;;; A more elaborate part of speech tagger using ngrams works but +;;; at present requires a large list of a priori probabilities +;;; to work. If that file exists on your system we'll use it otherwise +;;; POS is guessed by the lexicon + +;;; These models were build from the Penn TreeBank, WSJ corpus + +(defvar pos_model_dir lexdir + "pos_model_dir + The directory contains the various models for the POS module. By + default this is the same directory as lexdir. The directory should + contain two models: a part of speech lexicon with reverse log probabilities + and an ngram model for the same part of speech tag set.") + +(defvar pos_p_start_tag "punc" + "pos_p_start_tag + This variable's value is the tag most likely to appear before + the start of a sentence. It is used when looking for pos context + before an utterance. Typically it should be some type of punctuation + tag.") + +(defvar pos_pp_start_tag "n" + "pos_pp_start_tag + This variable's value is the tag most likely to appear before + pos_p_start_tag and any position preceding that. It is typically + some type of noun tag. 
This is used to provide pos context for + early words in an utterance.") + +(defvar pos_supported nil + "pos_supported + If set to non-nil use part of speech prediction, if nil just get + pos information from the lexicon.") + +(defvar pos_ngram_name nil + "pos_ngram_name + The name of a loaded ngram containing the a posteriori ngram model for + predicting part of speech. The a priori model is held as a + lexicon call poslex.") + +(defvar pos_map nil + "pos_map + If set this should be a reverse assoc-list mapping on part of speech + tag set to another. It is used after using the defined POS models to + map the pos feature on each word to a new tagset.") + +;;; +;;; All the names here don't really allow multiple versions +;;; they should be prefixed with english_ +;;; + +(if (probe_file (path-append pos_model_dir "wsj.wp39.poslexR")) + (begin + (lex.create "english_poslex") + (lex.set.compile.file + (path-append pos_model_dir "wsj.wp39.poslexR")) + (lex.set.phoneset "mrpa") + (lex.set.lts.method nil) + (set! pos_lex_name "english_poslex") + (set! pos_p_start_tag "punc") + (set! pos_pp_start_tag "nn") + ;; wp39 + (lex.add.entry '("_OOV_" ((nnp -2.9144) (jj -2.7357) (nn -3.5787) + (nns -3.4933) (vbn -3.2486) (vbg -2.9419) + (vb -3.5471) (vbd -3.7896) (vbz -3.7820) + (rb -4.1940) (vbp -3.2755) (nnps -2.1605)) + ())) + (lex.add.entry '("_number_" + ((cd -0.35202) (jj -4.1083) (nns -6.4488) (nnp -7.3595)) + () )) + (lex.add.entry '("," ((punc -0.88488)) () )) + (lex.add.entry '("." 
((punc -1.1104)) () )) + (lex.add.entry '(":" ((punc -4.4236)) () )) + (lex.add.entry '("``" ((punc -2.7867)) () )) + (lex.add.entry '("`" ((punc -2.7867)) () )) + (lex.add.entry '("'" ((punc -2.7867)) () )) + (lex.add.entry '("\"" ((punc -2.7867)) () )) + (lex.add.entry '("[" ((punc -2.7867)) () )) + (lex.add.entry '("]" ((punc -2.7867)) () )) + (lex.add.entry '("{" ((punc -2.7867)) () )) + (lex.add.entry '("}" ((punc -2.7867)) () )) + ;; wp17 +;; (lex.add.entry '("_OOV_" ((n -3.4109) (j -2.7892) (v -3.7426)) ())) +; (lex.add.entry '("_OOV_" ((n -1.968) (j -2.351) (v -2.287)) ())) +; (lex.add.entry '("_number_" ((j -0.35202)) ())) +; (lex.add.entry '("," ((punc -0.88359)) () )) +; (lex.add.entry '("." ((punc -1.1101)) () )) +; (lex.add.entry '(":" ((punc -4.4236)) () )) +; (lex.add.entry '("``" ((punc -2.7867)) () )) +; (lex.add.entry '("`" ((punc -2.7867)) () )) +; (lex.add.entry '("'" ((punc -2.7867)) () )) +; (lex.add.entry '("\"" ((punc -2.7867)) () )) + ;; wp22 +; (lex.add.entry '("_OOV_" ((n -3.4109) (j -2.7892) (v -3.7426)) ())) +; (lex.add.entry '("_number_" ((cd -0.35202) (j -4.1908) (n -7.3890)) ())) +; (lex.add.entry '("," ((punc -0.88359)) () )) +; (lex.add.entry '("." ((punc -1.1101)) () )) +; (lex.add.entry '(":" ((punc -4.4236)) () )) +; (lex.add.entry '("``" ((punc -2.7867)) () )) + ;; wp18 +; (lex.add.entry '("_OOV_" ((n -3.4109) (j -2.7892) (v -3.7426)) ())) +; (lex.add.entry '("_number_" ((j -0.35202)) ())) +; (lex.add.entry '("`" ((punc -6.539) ) () )) +; (lex.add.entry '("``" ((punc -2.399) ) () )) +; (lex.add.entry '("," ((punc -0.480) ) () )) +; (lex.add.entry '("." ((fpunc -0.012) ) () )) +; (lex.add.entry '(":" ((punc -4.100) ) () )) + + (ngram.load 'english_pos_ngram + (path-append pos_model_dir "wsj.wp39.tri.ngrambin")) +; (ngram.load 'english_pos_ngram +; (path-append pos_model_dir "wsj.wp45.tri.ngram")) + (set! pos_supported t) + ) + (set! 
pos_supported nil)) + +(setq english_pos_map_wp39_to_wp20 + '( + (( vbd vb vbn vbz vbp vbg ) v) + (( nn nnp nns nnps fw sym ls ) n) + (( dt ) dt) + (( punc fpunc ) punc) + (( in ) in) + (( jj jjr jjs 1 2 ) j) + (( prp ) prp) + (( rb rp rbr rbs ) r) + (( cc ) cc) + (( of ) of) + (( to ) to) + (( cd ) cd) + (( md ) md) + (( pos ) pos) + (( wdt ) wdt) + (( wp ) wp) + (( wrb ) wrb) + (( ex ) ex) + (( uh ) uh) + (( pdt ) pdt) + )) + +(defvar pos_map nil + "pos_map +A reverse assoc list of predicted pos tags to some other tag set. Note +using this changes the pos tag loosing the actual predicted value. Rather +than map here you may find it more appropriate to map tags sets locally +in the modules that use them (e.g. phrasing and lexicons).") + +;;(setq pos_map_remap +;; '( +;; (( fpunc ) punc) +;; (( of ) in))) + +(def_feature_docstring 'Word.pos + "Word.pos + Part of speech tag value returned by the POS tagger module.") + +(def_feature_docstring 'Word.pos_score + "Word.pos_score + Part of speech tag log likelihood from Viterbi search.") + +(define (POS utt) +"(POS utt) +Apply part of speech tagging (and possible parsing too) to Word +relation." + (let ((rval (apply_method 'POS_Method utt))) + (cond + (rval rval) ;; new style + (t + (Classic_POS utt))))) + + +(provide 'pos) diff --git a/CosyVoice-ttsfrd/resource/festival/postlex.scm b/CosyVoice-ttsfrd/resource/festival/postlex.scm new file mode 100644 index 0000000000000000000000000000000000000000..7fb038bad9437d00bba9e7e7e95b7b3964002e89 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/postlex.scm @@ -0,0 +1,587 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Postlexical rules +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Modifed for CSTR HTS Voice Library ;; +;; Author : Junichi Yamagishi (jyamagis@inf.ed.ac.uk) ;; +;; Date : Sept 2008 ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +(define (PostLex utt) +"(PostLex utt) +Apply post lexical rules to segment stream. 
These may be almost +arbitrary rules as specified by the particular voice, through the +postlex_hooks variable. A number of standard post lexical rule +sets are provided including reduction, posessives etc. These +rules are also used to mark standard segments with their cluster +information used in creating diphone names." +(let ((rval (apply_method 'PostLex_Method utt))) + (cond + (rval rval) ;; new style + (t ;; should only really need this one + (apply_hooks postlex_rules_hooks utt))) + utt +)) + +(define (Classic_PostLex utt) + "(Classic_PostLex utt) +Apply post lexical rules (both builtin and those specified in +postlex_rules_hooks)." + (Builtin_PostLex utt) ;; haven't translated all the rules yet + (apply_hooks postlex_rules_hooks utt) + utt +) + +(defvar postlex_rules_hooks nil +"postlex_rules_hooks +A function or list of functions which encode post lexical rules. +This will be voice specific, though some rules will be shared across +languages.") + +;;; Mapping of full vowels to reduced vowels, this should be part +;;; of the phoneset definitions +(defvar postlex_vowel_reduce_table + '((mrpa + ((uh @) (i @) (a @) (e @) (u @) (o @) (oo @))) + (radio + ((ah ax el en em) + (ih ax) +; (er axr ax) +; (iy ih) +; (ey ax) + (aa ax) + (ae ax) + (eh ax)))) +"postlex_vowel_reduce_table +Mapping of vowels to their reduced form. This in an assoc list of +phoneset name to an assoc list of full vowel to reduced form.") + +(defvar postlex_vowel_reduce_cart_tree nil +"postlex_vowel_reduce_cart_tree +CART tree for vowel reduction.") + +(defvar postlex_vowel_reduce_cart_tree_hand + '((stress is 0) + ((p.syl_break < 2) + ((syl_break < 2) + ((1)) + ((0))) + ((0))) + ((0))) +"postlex_vowel_reduce_cart_tree_hand +A CART tree for vowel reduction. 
This is hand-written.") + +(defvar postlex_vowel_reduce_cart_data +' +((R:SylStructure.parent.gpos is cc) + (((0 0.993548) (1 0.00645161) 0)) + ((p.R:SylStructure.parent.gpos is md) + (((0 0.903226) (1 0.0967742) 0)) + ((p.R:SylStructure.parent.gpos is det) + ((n.R:SylStructure.parent.gpos is content) + ((last_accent < 2.5) + ((next_accent < 2.5) + ((next_accent < 1.2) + ((n.syl_break is 4) + (((0 0.967213) (1 0.0327869) 0)) + ((syl_break is 4) + (((0 0.952381) (1 0.047619) 0)) + ((n.syl_break is 4) + (((0 0.953488) (1 0.0465116) 0)) + ((position_type is single) + (((0 0.947368) (1 0.0526316) 0)) + ((accented is 0) + ((n.accented is 0) + (((0 0.857143) (1 0.142857) 0)) + (((0 0.415385) (1 0.584615) 1))) + (((0 0.974359) (1 0.025641) 0))))))) + (((0 0.968254) (1 0.031746) 0))) + (((0 0.969697) (1 0.030303) 0))) + (((0 0.976744) (1 0.0232558) 0))) + (((0 0.990291) (1 0.00970874) 0))) + ((next_accent < 108.5) + ((p.R:SylStructure.parent.gpos is pps) + (((0 0.828947) (1 0.171053) 0)) + ((R:SylStructure.parent.gpos is det) + ((accented is 0) + (((0 0.0599572) (1 0.940043) 1)) + (((0 0.949367) (1 0.0506329) 0))) + ((p.R:SylStructure.parent.gpos is cc) + (((0 0.880952) (1 0.119048) 0)) + ((p.R:SylStructure.parent.gpos is wp) + (((0 0.875) (1 0.125) 0)) + ((p.R:SylStructure.parent.gpos is in) + ((n.syl_break is 4) + (((0 0.961538) (1 0.0384615) 0)) + ((next_accent < 2.5) + ((syl_break is 4) + (((0 0.95122) (1 0.0487805) 0)) + ((next_accent < 1.2) + ((accented is 0) + ((n.stress is 0) + (((0 0.788462) (1 0.211538) 0)) + ((R:SylStructure.parent.R:Word.p.gpos is content) + (((0 0.863636) (1 0.136364) 0)) + ((position_type is single) + (((0 0.729167) (1 0.270833) 0)) + (((0 0.4) (1 0.6) 1))))) + (((0 0.983871) (1 0.016129) 0))) + (((0 0.96) (1 0.04) 0)))) + (((0 0.963636) (1 0.0363636) 0)))) + ((position_type is single) + ((syl_break is 4) + (((0 0.993865) (1 0.00613497) 0)) + ((p.R:SylStructure.parent.gpos is to) + (((0 0.984375) (1 0.015625) 0)) + ((syl_break is 1) + 
((accented is 0) + ((n.R:SylStructure.parent.gpos is in) + (((0 0.869565) (1 0.130435) 0)) + ((R:SylStructure.parent.gpos is content) + (((0 0.861789) (1 0.138211) 0)) + ((p.R:SylStructure.parent.gpos is content) + ((p.syl_break is 4) + (((0 0.858065) (1 0.141935) 0)) + ((R:SylStructure.parent.gpos is in) + ((p.syl_break is 1) + ((n.R:SylStructure.parent.gpos is det) + (((0 0.659574) (1 0.340426) 0)) + ((p.stress is 0) + (((0 0.422222) (1 0.577778) 1)) + (((0 0.582278) (1 0.417722) 0)))) + ((n.accented is 0) + ((n.R:SylStructure.parent.gpos is content) + (((0 0.65) (1 0.35) 0)) + ((p.stress is 0) + (((0 0.464286) (1 0.535714) 1)) + (((0 0.538462) (1 0.461538) 0)))) + (((0 0.803279) (1 0.196721) 0)))) + ((n.R:SylStructure.parent.gpos is det) + (((0 0.952381) (1 0.047619) 0)) + ((n.syl_break is 4) + (((0 0.833333) (1 0.166667) 0)) + ((p.stress is 0) + ((p.syl_break is 1) + ((n.syl_break is 1) + (((0 0.740741) (1 0.259259) 0)) + ((R:SylStructure.parent.gpos is aux) + (((0 0.478261) (1 0.521739) 1)) + (((0 0.769231) (1 0.230769) 0)))) + (((0 0.755556) (1 0.244444) 0))) + (((0 0.797619) (1 0.202381) 0))))))) + (((0 0.870968) (1 0.129032) 0))))) + (((0 0.983806) (1 0.0161943) 0))) + (((0 0.977778) (1 0.0222222) 0))))) + ((next_accent < 21.6) + ((p.stress is 0) + ((R:SylStructure.parent.R:Word.p.gpos is md) + (((0 0.961538) (1 0.0384615) 0)) + ((position_type is mid) + (((0 0.977612) (1 0.0223881) 0)) + ((n.R:SylStructure.parent.gpos is det) + (((0 0.916667) (1 0.0833333) 0)) + ((R:SylStructure.parent.R:Word.n.gpos is 0) + (((0 0.915493) (1 0.084507) 0)) + ((R:SylStructure.parent.R:Word.n.gpos is pps) + (((0 0.884615) (1 0.115385) 0)) + ((n.stress is 0) + ((n.syl_break is 4) + (((0 0.986755) (1 0.013245) 0)) + ((p.syl_break is 4) + (((0 0.977011) (1 0.0229885) 0)) + ((n.syl_break is 4) + (((0 0.965517) (1 0.0344828) 0)) + ((last_accent < 1.2) + ((last_accent < 0.1) + (((0 0.910448) (1 0.0895522) 0)) + ((next_accent < 1.2) + ((R:SylStructure.parent.R:Word.n.gpos is in) + 
(((0 0.82) (1 0.18) 0)) + ((n.syl_break is 0) + ((R:SylStructure.parent.R:Word.p.gpos is content) + (((0 0.819672) (1 0.180328) 0)) + (((0 0.444444) (1 0.555556) 1))) + (((0 0.785714) (1 0.214286) 0)))) + (((0 0.836364) (1 0.163636) 0)))) + (((0 0.962025) (1 0.0379747) 0)))))) + ((stress is 0) + ((n.syl_break is 4) + (((0 0.21875) (1 0.78125) 1)) + ((R:SylStructure.parent.R:Word.p.gpos is aux) + (((0 0.259259) (1 0.740741) 1)) + ((p.syl_break is 1) + (((0 0.243094) (1 0.756906) 1)) + ((R:SylStructure.parent.R:Word.p.gpos is det) + (((0 0.290323) (1 0.709677) 1)) + ((R:SylStructure.parent.R:Word.p.gpos is in) + (((0 0.3) (1 0.7) 1)) + ((syl_break is 1) + (((0 0.289157) (1 0.710843) 1)) + ((p.syl_break is 4) + (((0 0.352941) (1 0.647059) 1)) + ((n.syl_break is 0) + (((0 0.311475) (1 0.688525) 1)) + ((syl_break is 4) + (((0 0.4) (1 0.6) 1)) + (((0 0.581395) (1 0.418605) 0))))))))))) + (((0 1) (1 0) 0))))))))) + ((stress is 0) + ((R:SylStructure.parent.R:Word.n.gpos is 0) + (((0 0.121212) (1 0.878788) 1)) + ((next_accent < 2.4) + ((R:SylStructure.parent.gpos is content) + ((position_type is mid) + (((0 0.176895) (1 0.823105) 1)) + ((p.syl_break is 1) + (((0 0.229167) (1 0.770833) 1)) + ((syl_break is 4) + (((0 0.242775) (1 0.757225) 1)) + ((p.syl_break is 0) + ((n.R:SylStructure.parent.gpos is in) + (((0 0.253521) (1 0.746479) 1)) + ((R:SylStructure.parent.R:Word.p.gpos is in) + (((0 0.262774) (1 0.737226) 1)) + ((last_accent < 2.1) + ((n.R:SylStructure.parent.gpos is aux) + (((0 0.304348) (1 0.695652) 1)) + ((next_accent < 1.2) + ((n.R:SylStructure.parent.gpos is cc) + (((0 0.291667) (1 0.708333) 1)) + ((syl_break is 1) + ((n.syl_break is 4) + (((0 0.344828) (1 0.655172) 1)) + ((R:SylStructure.parent.R:Word.p.gpos is det) + (((0 0.364706) (1 0.635294) 1)) + ((n.syl_break is 4) + (((0 0.384615) (1 0.615385) 1)) + ((last_accent < 1.2) + ((p.accented is 0) + (((0 0.584906) (1 0.415094) 0)) + ((n.accented is 0) + ((R:SylStructure.parent.R:Word.p.gpos is content) + (((0 
0.41) (1 0.59) 1)) + (((0 0.6) (1 0.4) 0))) + (((0 0.333333) (1 0.666667) 1)))) + (((0 0.380952) (1 0.619048) 1)))))) + ((p.accented is 0) + (((0 0.183673) (1 0.816327) 1)) + ((n.R:SylStructure.parent.gpos is content) + ((n.stress is 0) + (((0 0.295455) (1 0.704545) 1)) + ((R:SylStructure.parent.R:Word.p.gpos is content) + ((n.syl_break is 1) + (((0 0.5) (1 0.5) 0)) + (((0 0.40625) (1 0.59375) 1))) + (((0 0.333333) (1 0.666667) 1)))) + (((0 0.2) (1 0.8) 1)))))) + (((0 0.3) (1 0.7) 1)))) + (((0 0.302326) (1 0.697674) 1))))) + (((0 0.25) (1 0.75) 1)))))) + (((0 0.173913) (1 0.826087) 1))) + (((0 0.166667) (1 0.833333) 1)))) + (((0 1) (1 0) 0)))) + (((0 0.2) (1 0.8) 1))))))))) + (((0 0.15) (1 0.85) 1))))))) + +(defvar postlex_mrpa_r_cart_tree +'((name is r) + ((R:Segment.n.ph_vc is -) + ((delete)) + ((nil))) + ((nil))) +"postlex_mrpa_r_cart_tree +For remove final R when not between vowels.") + + +;; Changed this to actually work... (Rob 09/12/04) +;; Changed this to delete the syllable when schwa is unneccesary (awb 19/07/04) +(define (postlex_apos_s_check utt) + "(postlex_apos_s_check UTT) +Deal with possesive s for English (American and British). Delete +schwa of 's if previous is not an alveolar or palatal fricative or affricative, and +change voiced to unvoiced s if previous is not voiced." 
+ (mapcar + (lambda (syl) + ; word is 's + (if (string-equal "'s" (item.feat + syl "R:SylStructure.parent.name")) + (begin + ;; de-voice if last phone of previous word is unvoiced + (if (string-equal + "-" + (item.feat syl "p.R:SylStructure.daughtern.ph_cvox")) + (item.set_name + (item.relation.daughtern syl 'SylStructure) + "s")) ;; change it from "z" to "s" + ; if the previous seg is a aveolar or palatal, + ; fricative or affricate don't delete schwa otherwise delete it + (if (and + (member_string + (item.feat syl "p.R:SylStructure.daughtern.ph_ctype") '(f a)) + (member_string + (item.feat syl "p.R:SylStructure.daughtern.ph_cplace") '(a p))) + (begin + t) + (begin + ;; delete the schwa + (item.delete (item.relation.daughter1 syl 'SylStructure)) + ;; attach orphaned s/z to previous word + (item.relation.append_daughter + (item.prev syl) + 'SylStructure + (item.relation.daughtern syl 'SylStructure)) + ;; delete the now empty syllable + (item.delete syl)))))) + ;; never happens to if 's is first in an utterance + (cdr (utt.relation.items utt 'Syllable))) + utt) + +;; Changed this to work the other way round, too. Volker 10/08/06 +(define (postlex_the_vs_thee utt) +"(postlex_the_vs_thee utt) +Unnreduce the schwa in \"the\" when a vowel follows. +Reduce the vowel in \"the\" when no vowel follows (this +requires a lexicon entry for \"the\" with feature \"reduced\", +otherwise there will be no reduction)." +(let ((fullform (cadr (car (caar (cdr (cdar (lex.lookup_all 'thee))))))) + (reducedform (cadr(car(caar(cddr(lex.lookup 'the '(reduced))))))) + seg) + + (mapcar + (lambda (word) + (if (string-equal "the" (downcase (item.feat word "name"))) + (begin + (set! 
seg (item.relation (item.daughtern (item.relation.daughtern word 'SylStructure)) 'Segment)) + (if (string-equal "+" (item.feat (item.next seg) 'ph_vc)) + (item.set_feat seg 'name fullform) + (item.set_feat seg 'name reducedform))))) + (utt.relation.items utt 'Word))) +utt) + +(define (postlex_the_vs_thee_changeflag utt) +"(postlex_the_vs_thee_changeflag utt) +Unnreduce the schwa in \"the\" when a vowel follows. +Reduce the vowel in \"the\" when no vowel follows (this +requires a lexicon entry for \"the\" with feature \"reduced\", +otherwise there will be no reduction)." +(let ((fullform (cadr (car (caar (cdr (cdar (lex.lookup_all 'thee))))))) + (reducedform (cadr(car(caar(cddr(lex.lookup 'the '(reduced))))))) + seg) + + (mapcar + (lambda (word) + (if (string-equal "the" (downcase (item.feat word "name"))) + (begin + (set! seg (item.relation (item.daughtern (item.relation.daughtern word 'SylStructure)) 'Segment)) + (if (string-equal "+" (item.feat (item.next seg) 'ph_vc)) + (item.set_feat seg 'reducable 0) + (item.set_feat seg 'reducable 1))))) + (utt.relation.items utt 'Word))) +utt) + + +;; For Multisyn voices only. Volker 14/08/06 +(define (postlex_a utt) +"(postlex_a utt) +If POS of \"a\" is \"nn\" and segment feature \"reducable\", set it to 0. +This is a bugfix, but still requires the target cost function to add a +penalty if a candidate is reducable but the target is not. expro_target_cost +does that." +(let(seg) + (mapcar + (lambda(word) +;; (format t "%s\t%s\n" (item.feat word 'name)(item.feat word 'pos)) + (if(and(string-equal "a" (downcase (item.feat word "name"))) + (string-equal "nn" (item.feat word "pos"))) + (begin + (set! 
seg (item.relation (item.daughtern (item.relation.daughtern word +'SylStructure)) 'Segment)) +;; (format t "should not be reducable\n") + (if (eq 1 (parse-number (item.feat seg 'reducable))) + (item.set_feat seg 'reducable 0)))) + ) + (utt.relation.items utt 'Word))) +utt) + + + +(define (postlex_unilex_vowel_reduction utt) +"(postlex_unilex_vowel_reduction utt) +Perform vowel reduction based on unilex specification of what can be reduced." +(let () + (mapcar + (lambda (seg) + (if (and (eq? (parse-number (item.feat seg "reducable")) 1) + (not (> (parse-number (item.feat seg "R:SylStructure.parent.stress")) 0))) + (if (not (and (seg_word_final seg) + (string-equal (item.feat (item.next seg) 'ph_vc) "+"))) + (item.set_feat seg "name" (item.feat seg "reducedform"))))) + (utt.relation.items utt 'Segment))) +utt) + + + + +(define (seg_word_final seg) +"(seg_word_final seg) +Is this segment word final?" + (let ((this_seg_word (item.parent (item.relation.parent seg 'SylStructure))) + (silence (car (cadr (car (PhoneSet.description '(silences)))))) + next_seg_word) + (if (item.next seg) + (set! next_seg_word (item.parent (item.relation.parent (item.next seg) 'SylStructure)))) + (if (or (equal? this_seg_word next_seg_word) + (string-equal (item.feat seg "name") silence)) + nil + t))) + + + +;; imported from postlex_intervoc_r.scm Volker 14/08/06 +(define (postlex_intervoc_r utt) +"(postlex_intervoc_r UTT) + +Remove any word-final /r/ which is phrase-final or not going +to be inter-vocalic i.e. the following words does not start +with a vowel. + +NOTE: in older versions of unilex-rpx.out for Festival, there +is no word-final /r/. + +" +(let (word next_word last_phone following_phone) + (set! word (utt.relation.first utt 'Word)) + + (while word + (set! next_word (item.next word)) + (set! last_phone (item.daughtern + (item.daughtern(item.relation word 'SylStructure)))) + (if next_word + (begin + + (set! 
following_phone (item.daughter1 + (item.daughter1 + (item.relation next_word 'SylStructure)))) + ; last_phone and following_phone should always be defined at this point, + ; but since the upgrade to Fedora and characters no longer being in ISO + ; but in UTF8, the pound sterling is no longer treated correctly. + ; Probably (Token utt) should be fixed. + + (if (and following_phone last_phone) + (begin + (format t "%s\t%s %s %s %s\n" (item.name word) + (item.name last_phone) + (item.name following_phone) + (item.feat following_phone 'ph_vc) + (item.feat word 'pbreak)) + (if(and(equal? "r" (item.name last_phone)) + (or(not(equal? "NB" (item.feat word 'pbreak))) + (equal? "-" (item.feat following_phone 'ph_vc)))) + (begin + (format t "\t\t\t/r/ in \"%s %s\" deleted\n" + (item.name word)(item.name next_word)) + (item.delete last_phone)))))) + (if(and last_phone (equal? "r" (item.name last_phone))) + (begin + (format t "\t\t\tutterance-final /r/ deleted\n") + (item.delete last_phone))) + ) + + (set! word (item.next word)))) + utt) + + +(define (postlex_stop_deletion utt) +"(postlex_stop_deletion utt) + +Delete any stop or affricative (phone which has a closure) +immediately followed by another stop or affricative. + +Also save the identity of the deleted phone for the +context cost functions. Consider: + +backtrack /b a k t r a k/ -> /b a t r a k/ +(actually Jenny reduces : /b a k_cl k t_cl t r a k/ -> /b a k_cl t r a k/) +If we then look for a diphone /a t/ we want to favour +candidates coming from the same context i.e. which +are actually a reduced /a k t/. In the data base, +the 1st /a/ gets the feature right_context=k and the +/t/ gets the fearture left_context=k. + +" +(let(seg next_seg prev_seg) + (set! seg (utt.relation.first utt 'Segment)) + (while seg + (set! prev_seg (item.prev seg)) + (if prev_seg + (begin + ;(format t "%s %s %s\n" (item.name seg) + ; (item.feat seg 'ph_ctype) + ; (item.feat seg 'p.ph_ctype)) + (if(and(or(equal? 
"s" (item.feat seg 'ph_ctype)) + (equal? "a" (item.feat seg 'ph_ctype))) + (or(equal? "s" (item.feat seg 'p.ph_ctype)) + (equal? "a" (item.feat seg 'p.ph_ctype))) + ; When there are 3 stops in a row, and after the 1st has been + ; deleted, this prevents the 2nd to be deleted as well: + (equal? 0 (item.feat prev_seg 'left_context))) + (begin + (set! prev_prev_seg (item.prev prev_seg)) + (format t "postlex_stop_deletion: %s in %s\n" + (item.name prev_seg) + (item.name(item.parent(item.relation.parent prev_seg + 'SylStructure)))) + (if prev_prev_seg + (begin + ;(format t "setting left_context of %s and right context of %s to %s\n" + ; (item.name seg) + ; (item.name prev_prev_seg) + ; (item.name prev_seg)) + (item.set_feat seg 'left_context (item.name prev_seg)) + (item.set_feat prev_prev_seg 'right_context (item.name prev_seg)))) + (if(and(item.next seg) + (equal? (item.name seg) (item.name prev_seg))) + (begin + ;(format t "setting left_context of %s to %s\n" + ; (item.name (item.next seg) + ; (item.name prev_seg)) + + (item.set_feat (item.next seg) 'left_context (item.name prev_seg)))) + (item.delete prev_seg))))) + (set! seg (item.next seg)))) +utt) + +(provide 'postlex) diff --git a/CosyVoice-ttsfrd/resource/festival/radio_phones.scm b/CosyVoice-ttsfrd/resource/festival/radio_phones.scm new file mode 100644 index 0000000000000000000000000000000000000000..7c6b524e6bfebada7620ec85b4fba8398faac6c0 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/radio_phones.scm @@ -0,0 +1,122 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. 
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; A definition of the radio phone set used in the BU RADIO FM +;;; corpus, some people call this the darpa set. 
This one +;;; has the closures removed +;;; + +(defPhoneSet + radio + ;;; Phone Features + (;; vowel or consonant + (vc + -) + ;; vowel length: short long dipthong schwa + (vlng s l d a 0) + ;; vowel height: high mid low + (vheight 1 2 3 0) + ;; vowel frontness: front mid back + (vfront 1 2 3 0) + ;; lip rounding + (vrnd + - 0) + ;; consonant type: stop fricative affricate nasal lateral approximant + (ctype s f a n l r 0) + ;; place of articulation: labial alveolar palatal labio-dental + ;; dental velar glottal + (cplace l a p b d v g 0) + ;; consonant voicing + (cvox + - 0) + ) + ;; Phone set members + ( + ;; Note these features were set by awb so they are wrong !!! + (aa + l 3 3 - 0 0 0) ;; father + (ae + s 3 1 - 0 0 0) ;; fat + (ah + s 2 2 - 0 0 0) ;; but + (ao + l 3 3 + 0 0 0) ;; lawn + (aw + d 3 2 - 0 0 0) ;; how + (ax + a 2 2 - 0 0 0) ;; about + (axr + a 2 2 - r a +) + (ay + d 3 2 - 0 0 0) ;; hide + (b - 0 0 0 0 s l +) + (ch - 0 0 0 0 a p -) + (d - 0 0 0 0 s a +) + (dh - 0 0 0 0 f d +) + (dx - a 0 0 0 s a +) ;; ?? + (eh + s 2 1 - 0 0 0) ;; get + (el + s 0 0 0 l a +) + (em + s 0 0 0 n l +) + (en + s 0 0 0 n a +) + (er + a 2 2 - r 0 0) ;; always followed by r (er-r == axr) + (ey + d 2 1 - 0 0 0) ;; gate + (f - 0 0 0 0 f b -) + (g - 0 0 0 0 s v +) + (hh - 0 0 0 0 f g -) + (hv - 0 0 0 0 f g +) + (ih + s 1 1 - 0 0 0) ;; bit + (iy + l 1 1 - 0 0 0) ;; beet + (jh - 0 0 0 0 a p +) + (k - 0 0 0 0 s v -) + (l - 0 0 0 0 l a +) + (m - 0 0 0 0 n l +) + (n - 0 0 0 0 n a +) + (nx - 0 0 0 0 n d +) ;; ??? 
+ (ng - 0 0 0 0 n v +) + (ow + d 2 3 + 0 0 0) ;; lone + (oy + d 2 3 + 0 0 0) ;; toy + (p - 0 0 0 0 s l -) + (r - 0 0 0 0 r a +) + (s - 0 0 0 0 f a -) + (sh - 0 0 0 0 f p -) + (t - 0 0 0 0 s a -) + (th - 0 0 0 0 f d -) + (uh + s 1 3 + 0 0 0) ;; full + (uw + l 1 3 + 0 0 0) ;; fool + (v - 0 0 0 0 f b +) + (w - 0 0 0 0 r l +) + (y - 0 0 0 0 r p +) + (z - 0 0 0 0 f a +) + (zh - 0 0 0 0 f p +) + (pau - 0 0 0 0 0 0 -) + (h# - 0 0 0 0 0 0 -) + (brth - 0 0 0 0 0 0 -) + ) +) + +(PhoneSet.silences '(pau h# brth)) + +(provide 'radio_phones) + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/sable-latin.ent b/CosyVoice-ttsfrd/resource/festival/sable-latin.ent new file mode 100644 index 0000000000000000000000000000000000000000..f068020fc1bc2f26c724e83182487d4ed2f7a600 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/sable-latin.ent @@ -0,0 +1,171 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/sec.B.hept.ngrambin b/CosyVoice-ttsfrd/resource/festival/sec.B.hept.ngrambin new file mode 100644 index 0000000000000000000000000000000000000000..3434e0f5736b3582c1ca3d53b1a4b5c50aed9172 Binary files /dev/null and b/CosyVoice-ttsfrd/resource/festival/sec.B.hept.ngrambin differ diff --git a/CosyVoice-ttsfrd/resource/festival/sec.ts20.quad.ngrambin b/CosyVoice-ttsfrd/resource/festival/sec.ts20.quad.ngrambin new file mode 100644 index 0000000000000000000000000000000000000000..3b35f451bc36c3122ad5a5b2baf63991d9d029d1 Binary files /dev/null and b/CosyVoice-ttsfrd/resource/festival/sec.ts20.quad.ngrambin differ diff --git a/CosyVoice-ttsfrd/resource/festival/singing-mode.scm b/CosyVoice-ttsfrd/resource/festival/singing-mode.scm new file mode 100644 index 0000000000000000000000000000000000000000..91751e336807d40669eb347cef691309013fc352 --- /dev/null +++ 
b/CosyVoice-ttsfrd/resource/festival/singing-mode.scm @@ -0,0 +1,673 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Festival Singing Mode
;;;
;;; Written by Dominic Mazzoni
;;; Carnegie Mellon University
;;; 11-752 - "Speech: Phonetics, Prosody, Perception and Synthesis"
;;; Spring 2001
;;;
;;; Extended by Milan Zamazal , 2006:
;;; - Slur support.
;;; - Czech support.
;;; - Some cleanup.
;;; - Print debugging information only when singing-debug is true.
;;;
;;; This code is public domain; anyone may use it freely.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(require_module 'rxp)

;; Register the DTD and the entity file used by singing documents.
(xml_register_id "-//SINGING//DTD SINGING mark up//EN"
                 (path-append xml_dtd_dir "Singing.v0_1.dtd"))

(xml_register_id "-//SINGING//ENTITIES Added Latin 1 for SINGING//EN"
                 (path-append xml_dtd_dir "sable-latin.ent"))

;; Set this to t to enable debugging messages:
(defvar singing-debug nil)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; XML parsing functions
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; singing_xml_elements
;;
;; This variable defines the actions that are to be taken when
;; parsing each of our XML tags: SINGING, PITCH, DURATION, and REST.
;; Each entry is (TAG (ATTLIST UTT) body...), where a TAG starting
;; with "(" is the opening tag and one starting with ")" the closing tag.
;;
;; When we get the pitch and duration of each token, we store them
;; in features of the token.  Later our intonation and duration
;; functions access these features.
;;
;;   (SINGING   resets the per-song globals and derives the tempo
;;   )SINGING   synthesizes whatever tokens are still pending
;;   (PITCH     remembers the tag's attribute list
;;   )PITCH     converts it with get-freqs and stores it as 'freq
;;   (DURATION  remembers the tag's attribute list
;;   )DURATION  converts it with get-durs and stores it as 'dur
;;   (REST      stores the first duration of the tag as 'rest
;;

(defvar singing_xml_elements
  '(
    ("(SINGING" (ATTLIST UTT)
     (set! singing_pitch_att_list nil)
     (set! singing_dur_att_list nil)
     (set! singing_global_time 0.0)
     (set! singing_bpm (get-bpm ATTLIST))
     ;; beats per second = beats per minute / 60.  The divisor used to be
     ;; a temporary 50.0 (flagged "change this back to 60"), which made
     ;; every BEATS-based duration 60/50 times too short.
     (set! singing_bps (/ singing_bpm 60.0))
     nil)

    (")SINGING" (ATTLIST UTT)
     (xxml_synth UTT)  ;; Synthesize the remaining tokens
     nil)

    ("(PITCH" (ATTLIST UTT)
     (set! singing_pitch_att_list ATTLIST)
     UTT)

    (")PITCH" (ATTLIST UTT)
     (let ((freq (get-freqs singing_pitch_att_list)))
       (if singing-debug
           (begin
             (print "freqs")
             (print freq)))
       (singing-append-feature! UTT 'freq freq))
     UTT)

    ("(DURATION" (ATTLIST UTT)
     (set! singing_dur_att_list ATTLIST)
     UTT)

    (")DURATION" (ATTLIST UTT)
     (let ((dur (get-durs singing_dur_att_list)))
       (if singing-debug
           (begin
             (print "durs")
             (print dur)))
       (singing-append-feature! UTT 'dur dur))
     UTT)

    ("(REST" (ATTLIST UTT)
     (let ((dur (get-durs ATTLIST)))
       (if singing-debug
           (begin
             (print "rest durs")
             (print dur)))
       ;; a rest has a single length: the first value of the first group
       (singing-append-feature! UTT 'rest (caar dur)))
     UTT)
    ))

;;
;; get-bpm
;;
;; Given the attribute list of a SINGING tag, returns the beats
;; per minute of the song from the BPM parameter.
;;

(define (get-bpm atts)
  (parse-number (car (car (cdr (assoc 'BPM atts))))))

;;
;; get-durs
;;
;; Given the attribute list of a DURATION tag, returns a list of
;; durations in seconds for the syllables of the word enclosed by
;; this tag.  The result is a list of lists: one inner list per
;; comma-separated group, one number per "+"-separated value
;; (see string->list in this file).
;;
;; It uses the BEATS parameter, converting beats to seconds with
;; singing_bps (set when the SINGING tag was opened).  When BEATS
;; is the symbol X it uses the SECONDS parameter instead.
;; NOTE(review): X is presumably the DTD default for an attribute
;; that was not given -- confirm against Singing.v0_1.dtd.
;;

(define (get-durs atts)
  (let ((seconds (car (car (cdr (assoc 'SECONDS atts)))))
        (beats (car (car (cdr (assoc 'BEATS atts))))))
    (if (equal? beats 'X)
        (mapcar (lambda (lst) (mapcar parse-number lst))
                (string->list seconds))
        (mapcar (lambda (lst)
                  (mapcar (lambda (x) (/ (parse-number x) singing_bps)) lst))
                (string->list beats)))))

;;
;; get-freqs
;;
;; Given the attribute list of a PITCH tag, returns a list of
;; frequencies in Hertz for the syllables of the word enclosed by
;; this tag.
;;
;; It first looks for a NOTE parameter, which can contain a MIDI
;; note of the form "C4", "D#3", or "Ab6", and if this is not
;; present it looks for the FREQ parameter.
;;

;; Returns a list of lists of frequencies: one inner list per
;; comma-separated group of the attribute value, one frequency per
;; "+"-separated value.  When NOTE is the symbol X the numeric FREQ
;; attribute is used, otherwise every NOTE value goes through note->freq.
;; NOTE(review): X is presumably the DTD default for a missing
;; attribute -- confirm against Singing.v0_1.dtd.
(define (get-freqs atts)
  (let ((freqs (car (car (cdr (assoc 'FREQ atts)))))
        (notes (car (car (cdr (assoc 'NOTE atts))))))
    (if (equal? notes 'X)
        (mapcar (lambda (lst) (mapcar parse-number lst))
                (string->list freqs))
        (mapcar (lambda (lst) (mapcar note->freq lst))
                (string->list notes)))))

;;
;; note->freq
;;
;; Converts a string representing a MIDI note such as "C4" and
;; turns it into a frequency.  We use the convention that
;; A5=440 (some call this note A3).
;;
;; The last character of NOTE is taken as the octave and everything
;; before it as the note name, so only single-character octaves are
;; understood.  The note number is 12 * octave + offset-from-C.
;;

(define (note->freq note)
  (if singing-debug
      (format t "note is %l\n" note))
  ;; force NOTE to a plain string, whatever type the XML parser gave us
  (set! note (format nil "%s" note))
  (if singing-debug
      (print_string note))
  (let (l octave notename midinote thefreq)
    (set! l (string-length note))
    (set! octave (substring note (- l 1) 1))
    (set! notename (substring note 0 (- l 1)))
    (set! midinote (+ (* 12 (parse-number octave))
                      (notename->midioffset notename)))
    (set! thefreq (midinote->freq midinote))
    (if singing-debug
        (format t "note %s freq %f\n" note thefreq))
    thefreq))

;;
;; midinote->freq
;;
;; Converts a MIDI note number into a frequency in Hz using equal
;; temperament: each semitone is a factor of 2^(1/12).  We use
;; the convention that 69 = "A5" = 440 Hz.
;;

(define (midinote->freq midinote)
  (* 440.0 (pow 2.0 (/ (- midinote 69) 12))))

;;
;; notename->midioffset
;;
;; Utility function that looks up the name of a note like "F#" in
;; note_names and returns its offset from C.
;;
;; Signals an error for a name that is not in note_names, rather than
;; passing a failed lookup on to parse-number.
;;

(define (notename->midioffset notename)
  (let ((entry (assoc_string notename note_names)))
    (if entry
        (parse-number (car (cdr entry)))
        (error "singing-mode: unknown note name" notename))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Pitch modification functions
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; singing_f0_targets
;;
;; This function replaces the normal intonation function used in
;; festival.
;; For each syllable, it extracts the frequency that
;; was calculated from the XML tags and stored in the token this
;; syllable comes from, and sets this frequency as both the start
;; and end f0 target.  Really straightforward!
;;
;; The result is a list of (TIME F0) pairs, or nil when the syllable
;; has no frequencies.  A syllable may carry several notes; each note
;; gets a share of the syllable's time proportional to its duration
;; and contributes two targets, placed 10% into and 10% before the end
;; of that share.
;;

;; f0 of the last note of the previous syllable (nil at song start);
;; used to place targets inside an intervening silence.
(defvar singing-last-f0 nil)
(define (singing_f0_targets utt syl)
  "(singing_f0_targets utt syl)"
  (let ((start (item.feat syl 'syllable_start))
        (end (item.feat syl 'syllable_end))
        (freqs (mapcar parse-number (syl->freq syl)))
        (durs (syl->durations syl)))
    (let ((total-durs (apply + durs))
          (total-time (- end start))
          ;; running start time of the current note; advanced by the
          ;; lambda below, so the notes must be visited in order
          (time start)
          ;; segment just before this syllable's first segment
          (prev-segment (item.prev (item.relation (item.daughter1 (item.relation syl 'SylStructure)) 'Segment)))
          ;; capture the previous value before it is overwritten below
          (last-f0 singing-last-f0))
      (if freqs
          (begin
            (set! singing-last-f0 (car (last freqs)))
            (append
             ;; If the preceding segment is a silence (its name equals the
             ;; phone set's first silence), is not the very first segment,
             ;; and a previous f0 is known, add two targets inside that
             ;; silence: the old f0 at 80% and this syllable's first f0
             ;; at 90% of the way through it.
             (if (and last-f0
                      prev-segment
                      (item.prev prev-segment)
                      (string-equal (item.feat prev-segment 'name)
                                    (car (car (cdr (car (PhoneSet.description '(silences))))))))
                 (let ((s (item.feat prev-segment "p.end"))
                       (e (item.feat prev-segment "end")))
                   (list (list (+ s (* (- e s) 0.8)) last-f0)
                         (list (+ s (* (- e s) 0.9)) (car freqs)))))
             ;; two targets per note, flattened into one list
             (apply append
                    (mapcar (lambda (d f)
                              ;; range: this note's share of the real
                              ;; syllable time
                              (let ((range (* (/ d total-durs) total-time))
                                    (old-time time))
                                (set! time (+ time range))
                                (let ((range-fraction (* 0.1 range)))
                                  (list (list (+ old-time range-fraction) f)
                                        (list (- time range-fraction) f)))))
                            durs freqs))))))))

;;
;; syl->freq
;;
;; Given a syllable, looks up the frequency in its token.  The token
;; stores a list of all of the frequencies associated with its
;; syllables, so this syllable grabs the frequency out of the list
;; corresponding to its index within the word.  (This assumes that
;; a frequency was given for each syllable, and that a token
;; corresponds directly to a word.  Singing-mode is not guaranteed
;; to work at all if either of these things are not true.)
;;

;; Returns the entry of the token's 'freq feature at this syllable's
;; position in its word (a list, since singing_f0_targets maps over it).
(define (syl->freq syl)
  (let ((index (item.feat syl "R:Syllable.pos_in_word"))
        (freqs (singing-feat syl "R:SylStructure.parent.R:Token.parent.freq")))
    (nth index freqs)))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Duration modification functions
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; singing_duration_method
;;
;; Calculates the duration of each phone in the utterance, in three
;; passes over the utterance plus one step for a leading rest.
;; Consult the functions it calls, below, to see what each one does.
;;
;; The calls must stay in this order, and the item lists must be
;; fetched again for each call rather than cached: singing_do_initial
;; and singing_do_syllable insert silence segments, and
;; singing_fix_segment has to see those as well.
;;

(define (singing_duration_method utt)
  ;; pass 1: per-syllable onset offsets
  (mapcar singing_adjcons_syllable (utt.relation.items utt 'Syllable))
  ;; leading rest, if the song starts with one
  (singing_do_initial utt (car (utt.relation.items utt 'Token)))
  ;; pass 2: segment end times and rests
  (mapcar singing_do_syllable (utt.relation.items utt 'Syllable))
  ;; pass 3: segments that still have no end time
  (mapcar singing_fix_segment (utt.relation.items utt 'Segment))
  utt)

;;
;; singing_adjcons_syllable
;;
;; First pass.  Looks at the first phone of each syllable and
;; adjusts the starting time of this syllable such that the
;; perceived start time of the first phone is at the beginning
;; of the originally intended start time of the syllable.
;;
;; If this is not done, telling it to say the word "ta" at time
;; 2.0 actually doesn't "sound" like it says the "t" sound until
;; about 2.1 seconds.
;;
;; This function has a little bit of duplicated code from
;; singing_do_syllable, below - it could be modularized a little
;; better.
;;

;; Stores the onset offset of SYL's first phone as 'addoffset on SYL
;; and as 'subtractoffset on the previous syllable.  Nothing is stored
;; for the first syllable, which has no predecessor.
(define (singing_adjcons_syllable syl)
  (let ((totlen (apply + (mapcar (lambda (s)
                                   (get_avg_duration (item.feat s "name")))
                                 (item.leafs
                                  (item.relation syl 'SylStructure)))))
        (syldur (apply + (syl->durations syl)))
        ;; figure out the offset of the first phone
        (phone1 (item.daughter1 (item.relation syl 'SylStructure)))
        (prevsyl (item.prev (item.relation syl 'Syllable))))
    (let ((offset (get_duration_offset (item.feat phone1 "name"))))
      (if singing-debug
          (format t "offset: %f\n" offset) )
      ;; the phones will be compressed to fit the syllable, so compress
      ;; the offset by the same factor
      (if (< syldur totlen)
          (set! offset (* offset (/ syldur totlen))))
      (if singing-debug
          (format t "Want to adjust syl by %f\n" offset))
      (if prevsyl
          (begin
            (item.set_feat prevsyl 'subtractoffset offset)
            (item.set_feat syl 'addoffset offset))))))

;;
;; singing_do_syllable
;;
;; Second pass.  For each syllable, adds up the amount of time
;; that would normally be spent in consonants and vowels, based
;; on the average durations of these phones.  Then, if the
;; intended length of this syllable is longer than this total,
;; stretch only the vowels; otherwise shrink all phones
;; proportionally.  This function actually sets the "end" time
;; of each phone using a global "singing_global_time" variable.
;;
;; We also handle rests at this point, which are tagged onto the
;; end of the previous token.
;;

;; A stretched vowel longer than this (seconds) is replaced by its
;; long variant (the phone name with ":" appended) when the current
;; phone set has one.
(defvar singing-max-short-vowel-length 0.11)

;; Handles a rest at the very start of the song.  Such a rest is stored
;; on a token whose name is the empty string (singing-append-feature!
;; creates one when there is no token yet).  Starts the global clock at
;; the rest length and inserts a silence ending at that time before the
;; first segment.
(define (singing_do_initial utt token)
  (if (equal? (item.name token) "")
      ;; the feature value is wrapped in a list, hence the car
      (let ((restlen (car (item.feat token 'rest))))
        (if singing-debug
            (format t "restlen %l\n" restlen))
        (if (> restlen 0)
            (let ((silence (car (car (cdr (assoc 'silences (PhoneSet.description)))))))
              (set! singing_global_time restlen)
              (item.relation.insert (utt.relation.first utt 'Segment) 'Segment
                                    (list silence (list (list "end" singing_global_time)))
                                    'before))))))

(define (singing_do_syllable syl)
  (let ((conslen 0.0)
        (vowlen 0.0)
        (segments (item.leafs (item.relation syl 'SylStructure))))
    ;; if there are no vowels, turn a middle consonant into a vowel;
    ;; hopefully this works well for languages where syllables may be
    ;; created by some consonants too
    (let ((segments* segments)
          (vowel-found nil))
      (while (and segments* (not vowel-found))
        (if (equal? "+" (item.feat (car segments*) "ph_vc"))
            (set! vowel-found t)
            (set! segments* (cdr segments*))))
      (if (not vowel-found)
          (item.set_feat (nth (nint (/ (- (length segments) 1) 2))
                              segments)
                         "singing-vc" "+")))
    ;; sum up the length of all of the vowels and consonants in
    ;; this syllable
    (mapcar (lambda (s)
              (let ((slen (get_avg_duration (item.feat s "name"))))
                (if (or (equal? "+" (item.feat s "ph_vc"))
                        (equal? "+" (item.feat s "singing-vc")))
                    (set! vowlen (+ vowlen slen))
                    (set! conslen (+ conslen slen)))))
            segments)
    (let ((totlen (+ conslen vowlen))
          (syldur (apply + (syl->durations syl)))
          ;; onset offsets stored by singing_adjcons_syllable
          ;; NOTE(review): the subtraction below assumes item.feat gives
          ;; a number (0) when the feature was never set -- confirm
          (addoffset (item.feat syl 'addoffset))
          (subtractoffset (item.feat syl 'subtractoffset))
          offset)
      (set! offset (- subtractoffset addoffset))
      (if singing-debug
          (format t "Vowlen: %f conslen: %f totlen: %f\n" vowlen conslen totlen))
      ;; apply the onset correction only when it is smaller than half
      ;; of the syllable's duration
      (if (< offset (/ syldur 2.0))
          (begin
            (set! syldur (- syldur offset))
            (if singing-debug
                (format t "Offset: %f\n" offset))))
      (if singing-debug
          (format t "Syldur: %f\n" syldur))
      (if (> totlen syldur)
          ;; if the total length of the average durations in the syllable is
          ;; greater than the total desired duration of the syllable, shrink
          ;; the time proportionally for each phone (the factor is below 1)
          (let ((stretch (/ syldur totlen)))
            (mapcar (lambda (s)
                      (let ((slen (* stretch (get_avg_duration (item.feat s "name")))))
                        (set! singing_global_time (+ slen singing_global_time))
                        (item.set_feat s 'end singing_global_time)))
                    (item.leafs (item.relation syl 'SylStructure))))
          ;; otherwise, stretch the vowels and not the consonants
          (let ((voweltime (- syldur conslen)))
            (let ((vowelstretch (/ voweltime vowlen))
                  ;; names of all phones in the current phone set
                  (phones (mapcar car (car (cdar (PhoneSet.description '(phones)))))))
              (mapcar (lambda (s)
                        (let ((slen (get_avg_duration (item.feat s "name"))))
                          (if (or (equal? "+" (item.feat s "ph_vc"))
                                  (equal? "+" (item.feat s "singing-vc")))
                              (begin
                                (set! slen (* vowelstretch slen))
                                ;; If the sound is long enough, better results
                                ;; may be achieved by using longer versions of
                                ;; the vowels.
                                (if (> slen singing-max-short-vowel-length)
                                    (let ((sname (string-append (item.feat s "name") ":")))
                                      (if (member_string sname phones)
                                          (item.set_feat s "name" sname))))))
                          (set! singing_global_time (+ slen singing_global_time))
                          (item.set_feat s 'end singing_global_time)))
                      segments))))))
  ;; a rest following this syllable (syl->rest gives 0.0 unless this is
  ;; the last syllable of a token that carries a rest)
  (let ((restlen (car (syl->rest syl))))
    (if singing-debug
        (format t "restlen %l\n" restlen))
    (if (> restlen 0)
        (let ((lastseg (item.daughtern (item.relation syl 'SylStructure)))
              (silence (car (car (cdr (assoc 'silences (PhoneSet.description))))))
              ;; clock value at the end of the syllable, before the rest
              (singing_global_time* singing_global_time))
          (let ((seg (item.relation lastseg 'Segment))
                (extra-pause-length 0.00001))
            (set! singing_global_time (+ restlen singing_global_time))
            (item.insert seg (list silence (list (list "end" singing_global_time))) 'after)
            ;; insert a very short extra pause to avoid after-effects, especially
            ;; after vowels
            ;; (inserted after SEG as well, so it lands between SEG and
            ;; the rest silence inserted just above)
            (if (and seg
                     (equal? (item.feat seg "ph_vc") "+")
                     (< extra-pause-length restlen))
                (item.insert seg (list silence (list (list "end" (+ singing_global_time*
                                                                     extra-pause-length))))
                             'after)))))))

;;
;; singing_fix_segment
;;
;; Third pass.
;; Finds any segments (phones) that we didn't catch earlier
;; (say if they didn't belong to a syllable, like silence) and sets them
;; to zero duration
;;
;; A segment whose 'end is still 0.0 gets the end time of the segment
;; before it (or 0.0 when it is the first one).
;;

(define (singing_fix_segment seg)
  (if (equal? 0.0 (item.feat seg 'end))
      (if (equal? nil (item.prev seg))
          (item.set_feat seg 'end 0.0)
          (item.set_feat seg 'end (item.feat (item.prev seg) 'end)))
      (if singing-debug
          (format t "segment: %s end: %f\n" (item.name seg) (item.feat seg 'end)))))

;; returns the duration of a syllable (stored in its token)
;; as a list of numbers, one per note sung on the syllable
(define (syl->durations syl)
  (let ((index (item.feat syl "R:Syllable.pos_in_word"))
        (durs (singing-feat syl "R:SylStructure.parent.R:Token.parent.dur")))
    (mapcar parse-number (nth index durs))))

;; returns the duration of the rest following a syllable
;; as a one-element list; non-zero only for the last syllable of a
;; token that has a 'rest feature
(define (syl->rest syl)
  (let ((index (item.feat syl "R:Syllable.pos_in_word"))
        (durs (singing-feat syl "R:SylStructure.parent.R:Token.parent.dur"))
        (pauselen (singing-feat syl "R:SylStructure.parent.R:Token.parent.rest")))
    (if (equal? index (- (length durs) 1))
        (list (or pauselen 0.0))
        (list 0.0))))

;; get the average duration of a phone
;; (0.08 seconds when the phone is not listed in phoneme_durations)
(define (get_avg_duration phone)
  (let ((pd (assoc_string phone phoneme_durations)))
    (if pd
        (car (cdr pd))
        0.08)))

;; get the duration offset of a phone (see the description above)
;; A phone that is not listed in phoneme_offsets* gets an explicit
;; offset of 0.0, mirroring the default in get_avg_duration, so that a
;; failed lookup is never handed to parse-number.
(define (get_duration_offset phone)
  (let ((entry (assoc_string phone phoneme_offsets*)))
    (if entry
        (parse-number (car (cdr entry)))
        0.0)))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Other utility functions
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Wraps a one-character regex metacharacter in [...] so that it
;; matches literally; any other string is returned unchanged.
(define (char-quote string)
  (if (member string '("*" "+" "?" "[" "]" "."))
      (string-append "[" string "]")
      string))

;; Splits STRING at every SEPARATOR and returns the list of pieces.
;; A separator at the very start or end is not split on, because the
;; pattern requires at least one character on both sides.
(define (split-string string separator)
  (if (string-matches string (string-append ".+" (char-quote separator) ".+"))
      (cons (string-before string separator)
            (split-string (string-after string separator) separator))
      ;; We have to convert the weird XML attribute value type to string
      (list (string-append string ""))))

;; Parses an attribute value such as "1,2+3" into a list of lists:
;; "," separates the outer elements, "+" the values inside each one.
(define (string->list string)
  (mapcar (lambda (s) (split-string s "+")) (split-string string ",")))

;; Sets FEATURE to VALUE on the last token of UTT.  When UTT has no
;; token yet, an empty-named token is appended first to carry it.
(define (singing-append-feature! utt feature value)
  (let ((tokens (utt.relation.items utt 'Token)))
    (if tokens
        ;; we have to wrap value into a list to work around a Festival bug
        (item.set_feat (car (last tokens)) feature (list value))
        (begin
          (utt.relation.append utt 'Token '("" ((name "") (whitespace "")
                                                (prepunctuation "") (punc ""))))
          (item.set_feat (car (last (utt.relation.items utt 'Token))) feature (list value))))))

;; Reads a feature stored by singing-append-feature! and unwraps it.
;; A feature value of 0 is treated as "not set" and yields nil.
(define (singing-feat item feature)
  (let ((value (item.feat item feature)))
    (if (equal? value 0)
        nil
        (car value))))

;; Returns the language of the current voice.  The accessor is the one
;; in the commented-out lookup in singing_init_func: the feature list
;; is the cadr of the voice description, and the value is the cadr of
;; its (language ...) entry.  The previous body took the car of the
;; assoc result, so it never reached the language value.
(define (current-language)
  (cadr (assoc 'language (cadr (voice.description current-voice)))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Initializing and exiting singing mode
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; singing_init_func
;;

;; eou_tree as it was before singing mode was entered
(defvar singing_previous_eou_tree nil)

(define (singing_init_func)
  "(singing_init_func) - Initialization for Singing mode"
  (if (not (symbol-bound? 'phoneme_durations))
      (set! phoneme_durations '()))
  ;; use our intonation function
  (Parameter.set 'Int_Method 'General)
  (Parameter.set 'Int_Target_Method Int_Targets_General)
  (set! int_general_params `((targ_func ,singing_f0_targets)))
  (set! singing-last-f0 nil)
  ;; use our duration function
  (Parameter.set 'Duration_Method singing_duration_method)
  ;; set phoneme corrections for the current language
  ;; (hard-wired to english; the lookup from the voice is disabled)
  (let ((language 'english
;                  (cadr (assoc 'language
;                               (cadr (voice.description current-voice))))
                  ))
    (set! phoneme_offsets* (cdr (assoc language phoneme_offsets))))
  ;; avoid splitting to multiple utterances with insertion of unwanted pauses
  (set! singing_previous_eou_tree eou_tree)
  (set! eou_tree nil)
  ;; use our xml parsing function
  (set! singing_previous_elements xxml_elements)
  (set! xxml_elements singing_xml_elements))

;;
;; singing_exit_func
;;
;; Restores the two globals that singing_init_func saved.
;;

(define (singing_exit_func)
  "(singing_exit_func) - Exit function for Singing mode"
  (set! eou_tree singing_previous_eou_tree)
  (set! xxml_elements singing_previous_elements))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Data tables
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Note name -> offset in semitones from C (used by notename->midioffset).
(defvar note_names
  '((C 0)
    (C# 1)
    (Db 1)
    (D 2)
    (D# 3)
    (Eb 3)
    (E 4)
    (E# 5)
    (Fb 4)
    (F 5)
    (F# 6)
    (Gb 6)
    (G 7)
    (G# 8)
    (Ab 8)
    (A 9)
    (A# 10)
    (Bb 10)
    (B 11)
    (B# 12)
    (Cb 11)))

;;
;; The following list contains the offset into each phone that best
;; represents the perceptual onset of the phone.  This is important
;; to know to get durations right in singing.  For example, if the
;; offset for "t" is .060, and you want to start a "t" sound at
;; time 2.0 seconds, you should actually start the phone playing
;; at time 1.940 seconds in order for it to sound like the onset of
;; the "t" is really right at 2.0.
;;
;; These were derived empirically by looking at and listening to the
;; waveforms of each phone for mwm's voice.
+;; + +(defvar phoneme_offsets + `((english (t 0.050) + (T 0.050) + (d 0.090) + (D 0.090) + (p 0.080) + (b 0.080) + (k 0.090) + (g 0.100) + (9r 0.050) ;; r + (l 0.030) + (f 0.050) + (v 0.050) + (s 0.040) + (S 0.040) + (z 0.040) + (Z 0.040) + (n 0.040) + (N 0.040) + (m 0.040) + (j 0.090) + (E 0.0) + (> 0.0) + (>i 0.0) + (aI 0.0) + (& 0.0) + (3r 0.0) + (tS 0.0) + (oU 0.0) + (aU 0.0) + (A 0.0) + (ei 0.0) + (iU 0.0) + (U 0.0) + (@ 0.0) + (h 0.0) + (u 0.0) + (^ 0.0) + (I 0.0) + (dZ 0.0) + (i: 0.0) + (w 0.0) + (pau 0.0) + (brth 0.0) + (h# 0.0) + ))) + +(defvar phoneme_offsets* nil) + +;; +;; Declare the new mode to Festival +;; + +(set! tts_text_modes + (cons `(singing ;; mode name + ((init_func ,singing_init_func) + (exit_func ,singing_exit_func) + (analysis_type xml))) + tts_text_modes)) + +(provide 'singing-mode) diff --git a/CosyVoice-ttsfrd/resource/festival/siod.scm b/CosyVoice-ttsfrd/resource/festival/siod.scm new file mode 100644 index 0000000000000000000000000000000000000000..9584f88e32bb23da524881898c9d5aeb9bbe3e7c --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/siod.scm @@ -0,0 +1,638 @@ + + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;; DO NOT EDIT THIS FILE ON PAIN OF MORE PAIN. + ;;; + ;;; The master copy of this file is in ../../speech_tools/lib/siod/siod.scm + ;;; and is copied here at build time. + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + + + + + + + + +;; SIOD: Scheme In One Defun -*-mode: view-*- +;; +;; * COPYRIGHT (c) 1989-1992 BY * +;; * PARADIGM ASSOCIATES INCORPORATED, CAMBRIDGE, MASSACHUSETTS. * +;; * See the source file SLIB.C for more information. 
* +;; +;; A fair amount of modifications and tidy up was done by AWB, particularly +;; * adding documentation strings to all functions/variable +;; * removing some example functions not relevant to Festival (or siod) +;; * addition of new functions (require provide etc) + +;(puts ";; Optional Runtime Library for Release 2.8 +;") + +(define list + (lambda n +"(list A0 A1 ...) +Return list containing A0 A1 ..." + n)) + +(define (caar x) +"(caar X) +Return the (car (car X))." + (car (car x))) +(define (cadr x) +"(cadr X) +Return the (car (cdr X))." + (car (cdr x))) +(define (cdar x) +"(cdar X) +Return the (cdr (car X))." + (cdr (car x))) +(define (cddr x) +"(cddr X) +Return the (cdr (cdr X))." + (cdr (cdr x))) + +(define (caddr x) +"(caddr X) +Return the (car (cdr (cdr X)))." + (car (cdr (cdr x)))) +(define (cdddr x) +"(cdddr X) +Return the (cdr (cdr (cdr X)))." + (cdr (cdr (cdr x)))) + +(define consp pair?) + +(define (replace before after) +"(replace BEFORE AFTER) +Destructively replace contents of cons cell BEFORE with those of +AFTER." + (set-car! before (car after)) + (set-cdr! before (cdr after)) + after) + +(define (prognify forms) + (if (null? (cdr forms)) + (car forms) + (cons 'begin forms))) + +(define (defmac-macro form) +"(defmac-macro MACRONAME FORM) +Define a macro. Macro expand FORM in-line." + (let ((sname (car (cadr form))) + (argl (cdr (cadr form))) + (fname nil) + (body (prognify (cddr form)))) + (set! fname (symbolconc sname '-macro)) + (list 'begin + (list 'define (cons fname argl) + (list 'replace (car argl) body)) + (list 'define sname (list 'quote fname))))) + +(define defmac 'defmac-macro) + +(defmac (push form) + (list 'set! (caddr form) + (list 'cons (cadr form) (caddr form)))) + +(defmac (pop form) + (list 'let (list (list 'tmp (cadr form))) + (list 'set! (cadr form) '(cdr tmp)) + '(car tmp))) + +;;; Have to set var-docstrings to nil first as defvar requires it to be set +(set! 
var-docstrings nil) +(define (add-doc-var varname docstring) +"(add-doc-var VARNAME DOCSTRING) + Add document string DOCSTRING to VARNAME. If DOCSTRING is nil + this has no effect. If VARNAME already has a document string replace + it with DOCSTRING." + (if (null? docstring) + t + (let ((lpair (assq varname var-docstrings))) + (if lpair + (set-cdr! lpair docstring) + (set! var-docstrings (cons (cons varname docstring) + var-docstrings)))))) + +(set! boundp symbol-bound?) + +(defmac (defvar form) + (begin ;; always add the documentation string + (add-doc-var (cadr form) (car (cdddr form))) + (list 'or + (list 'symbol-bound? (list 'quote (cadr form))) + (list 'define (cadr form) (caddr form))))) + +(defvar var-docstrings nil + "var-docstrings + An assoc-list of variable names and their documentation strings.") + +(defmac (defun form) + (cons 'define + (cons (cons (cadr form) (caddr form)) + (cdddr form)))) + +(defmac (setq form) + (let ((l (cdr form)) + (result nil)) + (define (loop) + (if l + (begin (push (list 'set! (car l) (cadr l)) result) + (set! l (cddr l)) + (loop)))) + (loop) + (prognify (reverse result)))) + +(define progn begin) + +(defun atom (x) +"(atom X) +True if X is not a cons cells, nil otherwise." + (not (consp x))) + +(define eq eq?) + +(defmac (cond form) + (cond-convert (cdr form))) + +(define null null?) 
+ +(defun cond-convert (l) + (if (null l) + () + (if (null (cdar l)) + (if (null (cdr l)) + (caar l) + (let ((rest (cond-convert (cdr l)))) + (if (and (consp rest) (eq (car rest) 'or)) + (cons 'or (cons (caar l) (cdr rest))) + (list 'or (caar l) rest)))) + (if (or (eq (caar l) 't) + (and (consp (caar l)) (eq (car (caar l)) 'quote))) + (prognify (cdar l)) + (list 'if + (caar l) + (prognify (cdar l)) + (cond-convert (cdr l))))))) + +(defmac (+internal-comma form) + (error 'comma-not-inside-backquote)) + +(define +internal-comma-atsign +internal-comma) +(define +internal-comma-dot +internal-comma) + +(defmac (+internal-backquote form) + (backquotify (cdr form))) + +(defun backquotify (x) +"(backquote FORM) +Backquote function for expanding forms in macros." + (let (a d aa ad dqp) + (cond ((atom x) (list 'quote x)) + ((eq (car x) '+internal-comma) (cdr x)) + ((or (atom (car x)) + (not (or (eq (caar x) '+internal-comma-atsign) + (eq (caar x) '+internal-comma-dot)))) + (setq a (backquotify (car x)) d (backquotify (cdr x)) + ad (atom d) aa (atom a) + dqp (and (not ad) (eq (car d) 'quote))) + (cond ((and dqp (not (atom a)) (eq (car a) 'quote)) + (list 'quote (cons (cadr a) (cadr d)))) + ((and dqp (null (cadr d))) + (list 'list a)) + ((and (not ad) (eq (car d) 'list)) + (cons 'list (cons a (cdr d)))) + (t (list 'cons a d)))) + ((eq (caar x) '+internal-comma-atsign) + (list 'append (cdar x) (backquotify (cdr x)))) + ((eq (caar x) '+internal-comma-dot) + (list 'nconc (cdar x)(backquotify (cdr x))))))) + + +(defun append n +"(append L0 L1 ...) +Append each list to the first list in turn." + (appendl n)) + +(defun appendl (l) + (cond ((null l) nil) + ((null (cdr l)) (car l)) + ((null (cddr l)) + (append2 (car l) (cadr l))) + ('else + (append2 (car l) (appendl (cdr l)))))) + +(defun append2 (a b) + (if (null a) + b + (begin + (let ((x (reverse a)) + (y b)) + (while x + (set! y (cons (car x) y)) + (set! 
x (cdr x))) + y)))) + +(defun rplacd (a b) +"(rplacd A B) +Destructively replace the cdr of A with B. Returns A." + (set-cdr! a b) + a) + +(defun nconc (a b) +"(nconc A B) +Destructively append B to A and return the joined list, if A is nil +return B." + (if (null a) + b + (begin + ;; rplacd returns the cell it modified (the last cell of A), so + ;; return A itself to hand back the whole list, not just its tail + (rplacd (last a) b) + a))) + +(defun last (a) +"(last A) +Last cons cell (cdr) in list A. It is an error if A is nil." + (cond ((null a) (error'null-arg-to-last)) + ((null (cdr a)) a) + ((last (cdr a))))) + +(define (remove i l) +"(remove ITEM LIST) +(Non-destructively) remove the first occurrence of ITEM (compared +with eq?) from LIST." + (cond + ((null l) nil) + ((eq? i (car l)) + (cdr l)) + (t + (cons (car l) (remove i (cdr l)))))) + +(define (remove-duplicates l) +"(remove-duplicates LIST) +Remove duplicate items in LIST. Items are compared with member_string +and the last occurrence of a repeated item is the one that is kept." + (cond + ((null l) l) + ((member_string (car l) (cdr l)) + (remove-duplicates (cdr l))) + (t + (cons (car l) (remove-duplicates (cdr l)))))) + +(define (nth n l) + "(nth N LIST) +Returns nth car of LIST, 0 is car." + (if (< n 1) + (car l) + (nth (- n 1) (cdr l)))) + +(define (position thing l) + "(position thing l) +What position is thing in l (0 is the first item), nil if it doesn't +exist. Items are compared with equal?." + (let ((p 0) (m l)) + (while (and m (not (equal? thing (car m)))) + (set! p (+ 1 p)) + (set! m (cdr m))) + (if m p nil))) + +(define (nth_cdr n l) + "(nth_cdr N LIST) +Returns nth cdr of LIST, 0 is LIST." + (if (< n 1) + l + (nth_cdr (- n 1) (cdr l)))) + +(define (<= a b) +"(<= NUM1 NUM2) + Returns t if NUM1 is less than or equal to NUM2, nil otherwise. An error is + given if either argument is not a number." + (or (< a b) + (equal? a b))) + +(define (>= a b) +"(>= NUM1 NUM2) + Returns t if NUM1 is greater than or equal to NUM2, nil otherwise. + An error is given if either argument is not a number." + (or (> a b) + (equal? a b))) + +(define (approx-equal? a b diff) + "(approx-equal? a b diff) +True if the difference between a and b is less than diff. This allows equality +between floats which may have been written out and read in and hence have +slightly different precision." 
+ (< (if (> a b) (- a b) (- b a)) diff)) + +(define (assoc_string key alist) + "(assoc_string key alist) +Look up key in alist using string-equal. This allows indexing by +string rather than just symbols. Returns the matching (key . value) +entry, or nil if there is none." + (cond + ((null alist) nil) + ((string-equal key (car (car alist))) (car alist)) + (t (assoc_string key (cdr alist)))) +) + +(defvar *fasdump-hash* t) + +(defun fasl-open (filename mode) +"(fasl-open FILENAME MODE) +Open fasl FILENAME as MODE. Returns a fasl-table: a list of the open +file, a hash array (or nil) and a counter starting at 0." + (list (fopen filename mode) + (if (or (equal? mode "rb") *fasdump-hash*) + (cons-array 100)) + ;; If this is set NIL, then already hashed symbols will be + ;; optimized, and additional ones will not. + 0)) + +(defun fasl-close (table) +"(fasl-close TABLE) +Close the file held in fasl TABLE." + (fclose (car table))) + +(defun fasload args +"(fasload FILENAME ARGS) +Fast load FILENAME. If a second argument is given and is non-nil the +forms read are not evaluated but returned as a list, otherwise each +form is evaluated as it is read and nil is returned." + (let ((filename (car args)) + (head (and (cadr args) (cons nil nil)))) + (let ((table (fasl-open filename "rb")) + (exp) + (tail head)) + (while (not (eq table (setq exp (fast-read table)))) + (cond (head + (setq exp (cons exp nil)) + (set-cdr! tail exp) + (setq tail exp)) + ('else + (eval exp)))) + (fasl-close table) + (and head (cdr head))))) + +(defun fasdump (filename forms) +"(fasdump FILENAME FORMS) +Fast dump each form in the list FORMS into FILENAME." + (let ((table (fasl-open filename "wb")) + (l forms)) + (while l + (fast-print (car l) table) + (set! l (cdr l))) + (fasl-close table))) + +(defun compile-file (filename) +"(compile-file FILENAME) +Compile lisp forms in FILENAME.scm to FILENAME.bin." + (let ((forms (load (string-append filename ".scm") t))) + (puts "Saving forms +") + (fasdump (string-append filename ".bin") + forms))) + +(defvar *properties* (cons-array 100) + "*properties* +Array for holding symbol property lists.") + +(defun get (sym key) +"(get SYM KEY) +Get property named KEY for SYM." + (cdr (assq key (href *properties* sym)))) + +(defun putprop (sym val key) +"(putprop SYM VAL KEY) +Put property VAL named KEY for SYM." 
+ (let ((alist (href *properties* sym))) + (let ((cell (assq key alist))) + (cond (cell + (set-cdr! cell val)) + ('else + (hset *properties* sym (cons (cons key val) alist)) + val))))) + +;;(define (mapcar1 f l1) +;; (and l1 (cons (f (car l1)) (mapcar1 f (cdr l1))))) + +;; An iterative version of the above +(define (mapcar1 f l1) + (let ((l2 l1) (r nil)) + (while l2 + (set! r (cons (f (car l2)) r)) + (set! l2 (cdr l2))) + (reverse r))) + +;;(define (mapcar2 f l1 l2) +;; (and l1 l2 (cons (f (car l1) (car l2)) (mapcar2 f (cdr l1) (cdr l2))))) + +;; An iterative version. Note that, unlike the recursive version above, +;; this iterates over the whole of l1 even when l2 is shorter. +(define (mapcar2 f l1 l2) + (let ((a1 l1) (a2 l2) (r nil)) + (while a1 + (set! r (cons (f (car a1) (car a2)) r)) + (set! a1 (cdr a1)) + (set! a2 (cdr a2))) + (reverse r))) + +(define (mapcar . args) +"(mapcar FUNCTION ARGS [ARGS2]) +Apply FUNCTION to each member of ARGS (and [ARGS2]), returning list of +return values. An error is given if no list, or more than two lists, +are supplied." + (cond ((null args) + (error "too few arguments")) + ((null (cdr args)) + (error "too few arguments")) + ((null (cdr (cdr args))) + (mapcar1 (car args) (car (cdr args)))) + ((null (cdr (cdr (cdr args)))) + (mapcar2 (car args) (car (cdr args)) (car (cdr (cdr args))))) + ('else + (error "too many arguments")))) + +;; will be set automatically on start-up +(defvar libdir '<automatically_set> + "libdir + The pathname of the run-time library directory. Note resetting is + almost definitely not what you want to do. This value is automatically + set at start up from the value specified at compile-time or the value + specified with --libdir on the command line. A number of other variables + depend on this value.") + +(defvar load-path (list libdir) + "load-path + A list of directories containing .scm files. Used for various functions + such as load_library and require. Follows the same use as EMACS. By + default it is set up to the compile-time library directory but may be + changed by the user at run time, by adding a user's own library directory + or even replacing all of the standard library. 
[see Site initialization]") + +;; will be set automatically on start-up +(defvar *ostype* 'unknown + "*ostype* + Contains the name of the operating system type that Festival is running + on, e.g. SunOS5, FreeBSD, linux etc. The value is taken from the Makefile + variable OSTYPE at compile time.") + +(defvar etc-path (path-append libdir "etc/" *ostype*) + "etc-path + A list of directories where binaries specific to Festival may be located. + This variable is automatically set to LIBDIR/etc/OSTYPE/ + and that path is added to the end of the UNIX PATH environment variable.") + +(define (library_expand_filename fname) +"(library_expand_filename FILENAME) + Search for filename by appending FILENAME to each member of load-path. + Full expanded pathname is returned. If not found in load-path FILENAME + is returned." + (let ((p load-path) + (found nil)) + (while (and p (null? found)) + (let ((pot-file (path-append (car p) fname))) + (if (probe_file pot-file) + (setq found pot-file)) + (setq p (cdr p)))) + (if (null? found) + fname + found))) + +(define (load_library fname) +"(load_library FILENAME) + Load file from library, appends FILENAME to each path in load-path + until a valid file is found. If none found loads name itself" + (load (library_expand_filename fname))) + +(define (fasload_library fname) +"(fasload_library FILENAME) + Load binary file from library" + (fasload (library_expand_filename fname))) + +(define (member item list) +"(member ITEM LIST) + Returns subset of LIST whose car is ITEM if it exists, nil otherwise." + (if (consp list) + (if (equal? item (car list)) + list + (member item (cdr list))) + nil)) + +(define (member_string item list) +"(member_string STRING LIST) + Returns subset of LIST whose car is STRING if it exists, nil otherwise." + (if (consp list) + (if (string-equal item (car list)) + list + (member_string item (cdr list))) + nil)) + +(defvar provided nil + "provided + List of file names (omitting .scm) that have been provided. 
This list + is checked by the require function to find out if a file needs to be + loaded. If that file is already in this list it is not loaded. Typically + a file will have (provide 'MYNAME) at its end so that a call to + (require 'MYNAME) will only load MYNAME.scm once.") + +(define (require fname) +"(require FILENAME) + Checks if FNAME is already provided (member of variable provided) if not + loads it, appending \".scm\" to FILENAME. Uses load_library to find + the file. Returns t if the file was loaded and nil if it had already + been provided." + (let ((bname (intern (basename fname)))) + (if (null? (member bname provided)) + (progn + ;;; Compiled files aren't faster, so we don't do this + ; (fasload_library (string-append fname ".bin")) + (load_library (string-append fname ".scm")) + 't) + nil))) + +(define (request fname) +"(request FILENAME) + Checks if FNAME is already provided (member of variable provided) if not + tries to load it, appending \".scm\" to FILENAME. Uses load_library + to find the file. Unlike require, if FNAME isn't found no error occurs." + (unwind-protect (require fname))) + +(define (provide fname) +"(provide FILENAME) + Adds FNAME to the variable provided (if not already there). This means + that future calls to (require FILENAME) will not cause FILENAME to + be re-loaded." + (if (null? (member fname provided)) + (set! provided (cons fname provided)))) + +(define (apply_hooks hooks obj) +"(apply_hooks HOOK OBJ) + Apply HOOK(s) to OBJ. HOOK is a function or list of functions that + take one argument. When HOOK is a list each function is passed the + value returned by the previous one, and the final value is returned. + If HOOK is nil OBJ is returned unchanged." +(cond + ((null? hooks) obj) + ((consp hooks) + (apply_hooks (cdr hooks) ((car hooks) obj))) + (t (hooks obj)))) + +(define (apply func args) +"(apply FUNC ARGS) +Call FUNC with ARGS as arguments. Each member of the list ARGS is +quoted before the call is evaluated." + (eval + (cons func + (mapcar (lambda (a) (list 'quote a)) args)))) + +(defmac (autoload form) +"(autoload FUNCTION FILENAME DOCSTRING) +Define FUNCTION that when called automatically loads FILENAME +and calls FUNCTION (assumed to be defined in FILENAME)." 
+ (list 'define + (cadr form) + (list + 'lambda + 'n + (list 'let (list (list 'me (cadr form))) + (list 'require (car (cdr (cdr form)))) + (list 'if (list 'eq 'me (cadr form)) + (list 'error + (list 'string-append + "autoload: \"" + (car (cdr (cdr form))) + ".scm\" does not define " + (list 'quote (cadr form))))) + + (list 'apply (cadr form) 'n))))) + +(define (:backtrace frame) +"(:backtrace [FRAME]) +This function called *immediately* after an error will display a backtrace +of the functions evaluated before the error. With no arguments it +lists all stack frames, with the (possibly shortened) forms that were +evaluated at that level. With a numeric argument it displays +the form at that level in full. This function only works at +top level in the read-eval-print loop (command interpreter). Note +that any valid command will leave the backtrace stack empty. Also +note that backtrace itself does not reset the backtrace, unless you +make an error in calling it." + +"The function is interpreted specially by the read-eval-interpreter +and hence has no body, its actual body is defined in +src/arch/siod-3.0/slib.cc." +) + +(defvar hush_startup nil + "hush_startup + If set to non-nil, the copyright banner is not displayed at start up.") + +(defvar editline_histsize 256 + "editline_histsize + The number of lines to be saved in the user's history file when a + Festival session ends. The histfile is \".festival_history\" in the + user's home directory. Note this value is only checked when the + command interpreter is started, hence this should be set in a user's + \".festivalrc\" or system init file. Resetting it at the command + interpreter will have no effect.") + +(defvar editline_no_echo (getenv "EMACS") + "editline_no_echo + When running under Emacs as an inferior process, we don't want to + echo the content of the line, only the prompt.") + +(defvar ! nil + "! 
+ In interactive mode, this variable's value is the return value of the + previously evaluated expression.") + +(provide 'siod) diff --git a/CosyVoice-ttsfrd/resource/festival/siteinit.scm b/CosyVoice-ttsfrd/resource/festival/siteinit.scm new file mode 100644 index 0000000000000000000000000000000000000000..61f048dfe1939fe6323a3f9dec7efbe0057857ac --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/siteinit.scm @@ -0,0 +1,65 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1996,1997 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Site specific initialisation file +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; If festival's internal audio playing support doesn't work on your +;; machine you can make Festival use your own program to play waveform +;; files. Uncomment the following and change "play" to the name of +;; your local program that can play files + +;(Parameter.set 'Audio_Required_Format 'riff) +;(Parameter.set 'Audio_Method 'Audio_Command) + +;; Apple OSX (if you can file afplay) +;(Parameter.set 'Audio_Command "afplay $FILE") + +;; SOX (play) often a good alternative on cygwin and linux +;(Parameter.set 'Audio_Command "play -q $FILE") + +;; Windows 7 (when sox's play doesn't work -- but this might not exit) +;(Parameter.set 'Audio_Command "c:/Windows/System32/WindowsPowerShell/v1.0/powershell -c '(New-Object Media.Soundplayer C:/cygwin'$FILE').PlaySync(); Exit;'") + +;; If you want a voice different from the system installed default +;; uncomment the following line and change the name to the voice you +;; want + +;(set! 
voice_default 'voice_cmu_us_awb_arctic_hts) + +(provide 'siteinit) + + + + diff --git a/CosyVoice-ttsfrd/resource/festival/soleml-mode.scm b/CosyVoice-ttsfrd/resource/festival/soleml-mode.scm new file mode 100644 index 0000000000000000000000000000000000000000..9856fb2cdcdb96f24a888c762850ce122b604598 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/soleml-mode.scm @@ -0,0 +1,336 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; ;; +;;; Centre for Speech Technology Research ;; +;;; University of Edinburgh, UK ;; +;;; Copyright (c) 1998 ;; +;;; All Rights Reserved. ;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. 
;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Support for an SGML based mark-up language used in the SOLE +;;; project. This is all still experimental. +;;; +;;; This currently treats one file as one utterance (to make dealing with +;;; the SOLE museum database easy) + +(set! soleml_word_features_stack nil) +(defvar sole_current_node nil) + +(define (soleml_token_to_words utt token name) + "(soleml_token_to_words utt token name) +SOLEML mode token specific analysis. Currently every token is simply +passed on to soleml_previous_token_to_words." + (cond + + (t + (soleml_previous_token_to_words utt token name)))) + +(define (voice_soleml) +"(voice_soleml) +Speaker specific initialisation for SOLE museum data." + (voice_rab_diphone) + ;; Utterances only come at end of file + (set! eou_tree '((0))) +) + +(defvar soleml_elements +'( + ("(SOLEML" (ATTLIST UTT) + ;; required to identify type + (voice_soleml) ;; so we know what state we start in + (set! soleml_utt (Utterance Tokens nil)) + (utt.stream.create soleml_utt 'Token) + (utt.relation.create soleml_utt 'SOLEML) + (set! sole_current_node + (utt.relation_append soleml_utt 'SOLEML (cons "sole-ml" ATTLIST))) + soleml_utt + ) + (")SOLEML" (ATTLIST UTT) + ;; required to identify end token + ;; Don't really want to synthesize this + ;; (xxml_synth UTT) ;; Synthesize the remaining tokens + (set! 
soleml_utt UTT) + UTT + ) + ;; Utterance break elements + ("(LANGUAGE" (ATTLIST UTT) + ;; Select a new language + (select_language (car (xxml_attval "NAME" ATTLIST))) + UTT) + ("(VOICE" (ATTLIST UTT) + ;;(xxml_synth UTT) + ;; Select a new voice + (cond + ((equal? (car (xxml_attval "NAME" ATTLIST)) 'male1) + (voice_soleml_diphone)) + ((equal? (car (xxml_attval "NAME" ATTLIST)) 'male2) + (voice_soleml_diphone)) + ((equal? (car (xxml_attval "NAME" ATTLIST)) 'male3) + (voice_soleml_diphone)) + (t + (print "SOLEML: selecting unknown voice") + (voice_soleml_diphone))) + UTT) + ;; phrase-boundary // mark on token (??) + ;; punct-elem // mark on token + ;; sem-elem + ;; text-elem // ignore + ;; rhet-elem has nucleus and satellite + ;; anaphora-elem + ;; syn-elem + ;; info-struct-elem + ;; other-elem + ("(PUNCT-ELEM" (ATTLIST UTT) + (soleml_push_word_features) + (set! xxml_word_features + (cons (list "punct-elem" "1") + (soleml_conv_attlist ATTLIST))) + UTT) + (")PUNCT-ELEM" (ATTLIST UTT) + (set! xxml_word_features (soleml_pop_word_features)) + UTT) + ("(PHRASE-BOUNDARY" (ATTLIST UTT) + (if (string-equal "4" (car (xxml_attval "STRENGTH" ATTLIST))) + (begin +;; (xxml_synth UTT) + UTT) + (let ((last_token (car (last (utt.stream UTT 'Token))))) + (if last_token + (item.set_feat last_token "pbreak" "B")) + UTT))) + ;; For each recursive element simply build a new node + ("(RHET-ELEM" (ATTLIST UTT) + (let ((sdesc (list 'rhet-elem (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")RHET-ELEM" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(RHET-EMPH" (ATTLIST UTT) + (let ((sdesc (list 'rhet-emph (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")RHET-EMPH" (ATTLIST UTT) + (set! 
sole_current_node (node.parent sole_current_node)) + UTT) + ("(ANAPHORA-ELEM" (ATTLIST UTT) + (let ((sdesc (list 'anaphora-elem (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")ANAPHORA-ELEM" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(SYN-ELEM" (ATTLIST UTT) + (let ((sdesc (list 'syn-elem (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")SYN-ELEM" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(CONNECTIVE" (ATTLIST UTT) + (let ((sdesc (list 'connective (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")CONNECTIVE" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(TEXT-ELEM" (ATTLIST UTT) + (let ((sdesc (list 'text-elem (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")TEXT-ELEM" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(SEM-ELEM" (ATTLIST UTT) + (let ((sdesc (list 'sem-elem (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")SEM-ELEM" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(INFO-STRUCT-ELEM" (ATTLIST UTT) + (let ((sdesc (list 'info-struct-elem (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")INFO-STRUCT-ELEM" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(OTHER-ELEM" (ATTLIST UTT) + (let ((sdesc (list 'other-elem (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")OTHER-ELEM" (ATTLIST UTT) + (set! 
sole_current_node (node.parent sole_current_node)) + UTT) + ("(NUCLEUS" (ATTLIST UTT) + (let ((sdesc (list 'nucleus (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")NUCLEUS" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ("(SATELLITE" (ATTLIST UTT) + (let ((sdesc (list 'satellite (soleml_conv_attlist ATTLIST)))) + (set! sole_current_node + (node.append_daughter sole_current_node sdesc)) + UTT)) + (")SATELLITE" (ATTLIST UTT) + (set! sole_current_node (node.parent sole_current_node)) + UTT) + ;; Other control functions (probably not used in SOLE) + ("(CALL" (ATTLIST UTT) +;; (xxml_synth UTT) + (if (string-matches (car (xxml_attval "ENGID" ATTLIST)) "festival.*") + (let ((comstr "")) + (mapcar + (lambda (c) (set! comstr (string-append comstr " " c))) + (xxml_attval "COMMAND" ATTLIST)) + (eval (read-from-string comstr)))) + UTT) + ("(DEFINE" (ATTLIST UTT) +;; (xxml_synth UTT) + (if (not (string-equal "NATIVE" (car (xxml_attval "SCHEME" ATTLIST)))) + (format t "DEFINE: unsupported SCHEME %s, definition ignored\n" + (car (xxml_attval "SCHEME" ATTLIST))) + (lex.add.entry + (list + (car (xxml_attval "WORDS" ATTLIST)) ;; head form + nil ;; pos + (lex.syllabify.phstress (xxml_attval "PRONS" ATTLIST))))) + UTT) + ("(SOUND" (ATTLIST UTT) +;; (xxml_synth UTT) + (if (not soleml_omitted_mode) + (apply_hooks tts_hooks + (eval (list 'Utterance 'Wave + (car (xxml_attval "SRC" ATTLIST)))))) + UTT) + ("(EMPH" (ATTLIST UTT) + ;; Festival is particularly bad at adding specific emphasis + ;; that's what happens when you use statistical methods that + ;; don't include any notion of emphasis + ;; This is *not* recursive + (soleml_push_word_features) + (set! xxml_word_features + (cons (list "EMPH" "1") xxml_word_features)) + UTT) + (")EMPH" (ATTLIST UTT) + (set! 
xxml_word_features (soleml_pop_word_features)) + UTT) + ("(WORD" (ATTLIST UTT) + ;; a word in-line + (let ((name (xxml_attval "NAME" ATTLIST)) + (pos (xxml_attval "POS" ATTLIST)) + (accent (xxml_attval "ACCENT" ATTLIST)) + (tone (xxml_attval "TONE" ATTLIST)) + (phonemes (xxml_attval "PHONEMES" ATTLIST)) + token) + (utt.item.insert UTT 'Token) ;; add new Token + (set! token (utt.stream.tail UTT 'Token)) + (item.set_name token (car name)) + (if pos (item.set_feat token "pos" (car pos))) + (if accent (item.set_feat token "accent" (car accent))) + (if tone (item.set_feat token "tone" (car tone))) + (if phonemes (item.set_feat token "phonemes" + (format nil "%l" phonemes))) + UTT)) +)) + +(define (soleml_init_func) + "(soleml_init_func) +Initialisation for SOLEML mode" + (voice_soleml) + (set! soleml_previous_elements xxml_elements) + (set! xxml_elements soleml_elements) + (set! xxml_token_hooks soleml_token_function) + (set! soleml_previous_token_to_words english_token_to_words) + (set! english_token_to_words soleml_token_to_words) + (set! token_to_words soleml_token_to_words)) + +(define (soleml_exit_func) + "(soleml_exit_func) +Exit function for SOLEML mode" + (set! xxml_elements soleml_previous_elements) + (set! token_to_words soleml_previous_token_to_words) + (set! english_token_to_words soleml_previous_token_to_words)) + +(define (soleml_token_function si) +"(soleml_token_function si) +This is called for each token found." + (node.append_daughter sole_current_node si)) + +(define (soleml_push_word_features) +"(soleml_push_word_features) +Save current word features on stack." + (set! soleml_word_features_stack + (cons xxml_word_features soleml_word_features_stack))) + +(define (soleml_pop_word_features) +"(soleml_pop_word_features) +Pop word features from stack." + (let ((r (car soleml_word_features_stack))) + (set! 
soleml_word_features_stack (cdr soleml_word_features_stack)) + r)) + +(define (soleml_conv_attlist alist) +"(soleml_conv_attlist alist) +Flatten alist arguments." + (cond + ((null alist) nil) + ((null (car (cdr (car alist)))) + (soleml_conv_attlist (cdr alist))) + ((equal? (length (car (cdr (car alist)))) 1) + (cons + (list (car (car alist)) (car (car (cdr (car alist))))) + (soleml_conv_attlist (cdr alist)))) + (t + (cons + (list (car (car alist)) (format nil "%l" (car (cdr (car alist))))) + (soleml_conv_attlist (cdr alist)))))) + +(set! tts_text_modes + (cons + (list + 'soleml ;; mode name + (list ;; email mode params + (list 'init_func soleml_init_func) + (list 'exit_func soleml_exit_func) + '(analysis_type xxml) + (list 'filter + (format nil "%s -D %s " sgml_parse_progname libdir)))) + tts_text_modes)) + +(provide 'soleml-mode) diff --git a/CosyVoice-ttsfrd/resource/festival/speech.properties b/CosyVoice-ttsfrd/resource/festival/speech.properties new file mode 100644 index 0000000000000000000000000000000000000000..507a5197951156291622c5c71cf561c3cb7d6db2 --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/speech.properties @@ -0,0 +1,2 @@ +# Register speech engines +cstr.festival.EngineCentral=cstr.festival.jsapi.EngineCentral diff --git a/CosyVoice-ttsfrd/resource/festival/synthesis.scm b/CosyVoice-ttsfrd/resource/festival/synthesis.scm new file mode 100644 index 0000000000000000000000000000000000000000..69c5d56995162a77d5f83b474218ade57ec69ffd --- /dev/null +++ b/CosyVoice-ttsfrd/resource/festival/synthesis.scm @@ -0,0 +1,443 @@ + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; ;; + ;; Centre for Speech Technology Research ;; + ;; University of Edinburgh, UK ;; + ;; Copyright (c) 1996,1997 ;; + ;; All Rights Reserved. 
;; + ;; ;; + ;; Permission is hereby granted, free of charge, to use and distribute ;; + ;; this software and its documentation without restriction, including ;; + ;; without limitation the rights to use, copy, modify, merge, publish, ;; + ;; distribute, sublicense, and/or sell copies of this work, and to ;; + ;; permit persons to whom this work is furnished to do so, subject to ;; + ;; the following conditions: ;; + ;; 1. The code must retain the above copyright notice, this list of ;; + ;; conditions and the following disclaimer. ;; + ;; 2. Any modifications must be clearly marked as such. ;; + ;; 3. Original authors' names are not deleted. ;; + ;; 4. The authors' names are not used to endorse or promote products ;; + ;; derived from this software without specific prior written ;; + ;; permission. ;; + ;; ;; + ;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; + ;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; + ;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; + ;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; + ;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; + ;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; + ;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; + ;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; + ;; THIS SOFTWARE. ;; + ;; ;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; ;; + ;; Author: Richard Caley (rjc@cstr.ed.ac.uk) ;; + ;; Date: Fri Aug 15 1997 ;; + ;; ------------------------------------------------------------------- ;; + ;; New synthesis mainline. ;; + ;; ;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; ;; + ;; Hooks to add to the synthesis process. 
;; + ;; ;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(defvar default_before_synth_hooks nil + "default_before_synth_hooks + The default list of functions to be run on all synthesized utterances + before synthesis starts.") + +(defvar before_synth_hooks default_before_synth_hooks + "before_synth_hooks + List of functions to be run on synthesised utterances before synthesis + starts.") + +(defvar default_after_analysis_hooks nil + "default_after_analysis_hooks + The default list of functions to be run on all synthesized utterances + after analysis but before synthesis.") + +(defvar after_analysis_hooks default_after_analysis_hooks + "after_analysis_hooks + List of functions to be applied after analysis and before synthesis.") + +(defvar default_after_synth_hooks nil + "default_after_synth_hooks + The default list of functions to be run on all synthesized utterances + after Wave_Synth. This will normally be nil but if for some reason you + need to change the gain or rescale *all* waveforms you could set the + function here, in your siteinit.scm.") + +(defvar after_synth_hooks default_after_synth_hooks + "after_synth_hooks + List of functions to be applied after all synthesis modules have been + applied. This is primarily designed to allow waveform manipulation, + particularly resampling and volume changes.") + +(defvar default_access_strategy 'ondemand + "default_access_strategy + How to access units from databases.") + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; ;; + ;; Macro to define utterance types. ;; + ;; ;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(defmac (defUttType form) + (list 'defUttType_real + (list 'quote (cadr form)) + (list 'quote (cddr form)))) + +(defvar UttTypes nil + "UttTypes + List of types and functions used by the utt.synth function to call + appropriate methods.") + +(define (defUttType_real type form) + "(defUttType TYPE . 
BODY) + Define a new utterance type. TYPE is an atomic type that is specified + as the first argument to the function Utterance. BODY is evaluated + with argument utt, when utt.synth is called with an utterance of type + TYPE. You almost always require the function Initialize first. + [see Utterance types]" + ;;; Yes I am cheating a bit with the macro/function name. + ;;; should check about redefining and the syntax of the forms + (set! UttTypes + (cons + (cons type form) + UttTypes)) + type) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; ;; + ;; Macro to define synthesis types. ;; + ;; ;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(defmac (defSynthType form) + (list 'defSynthType_real + (list 'quote (cadr form)) + (list 'quote (cddr form)))) + +(defvar SynthTypes nil + "SynthTypes + List of synthesis types and functions used by the utt.synth function to + call appropriate methods for wave synthesis.") + +(define (defSynthType_real type form) + "(defSynthType TYPE . BODY) + Define a new wave synthesis type. TYPE is an atomic type that + identifies the type of synthesis. BODY is evaluated with argument + utt, when utt.synth is called with an utterance of type TYPE. + [see Utterance types]" + + (set! 
(defUttType Concept  ;; A rather grandiose name for utterances whose word
  (POS utt)          ;; relation has already been preloaded (probably from
  (Phrasify utt)     ;; XML); SOLE uses this.  The sequence starts at POS:
  (Word utt)         ;; there is no Initialize, Text or Token step here.
  (Pauses utt)
  (Intonation utt)
  (PostLex utt)
  (Duration utt)
  (Int_Targets utt)
  (Wave_Synth utt)
  )
;; Synthesis type UniSyn.  The forms below are stored unevaluated in
;; SynthTypes and run as the body of (lambda (utt) ...) by Wave_Synth
;; when Synth_Method selects this type.
(defSynthType UniSyn
  ;; Declarations evaluated on every run of this body.
  ;; NOTE(review): presumably defvar / Param.def / Parameter.def only
  ;; install a value when none is set yet -- confirm.
  (defvar UniSyn_module_hooks nil)
  (Param.def "unisyn.window_name" "hanning")
  (Param.def "unisyn.window_factor" 1.0)
  (Parameter.def 'us_sigpr 'lpc)

  (apply_hooks UniSyn_module_hooks utt)  ;; for processing of diphone names
  (us_get_diphones utt)
  (us_unit_concat utt)

  ;; Only derive an f0 relation from the targets when the utterance does
  ;; not already carry one.
  (if (not (member 'f0 (utt.relationnames utt)))
      (targets_to_f0 utt))
;; temporary fix
  ;; pm_end is the "end" feature of the last Segment plus 0.02, or just
  ;; 0.02 when the Segment relation is empty.  pm_end is not bound
  ;; locally in this body, so set! assigns a variable from the
  ;; enclosing/global environment.
  (if (utt.relation.last utt 'Segment)
      (set! pm_end (+ (item.feat (utt.relation.last utt 'Segment) "end") 0.02))
      (set! pm_end 0.02))

  (us_f0_to_pitchmarks  utt 'f0 'TargetCoef pm_end)
  (us_mapping utt 'segment_single)
  ;; The value of the us_sigpr parameter picks the waveform generator.
  (cond
   ((string-equal "td_psola" (Parameter.get 'us_sigpr))
    ;; Not in standard distribution, so has to be separate function
    (us_tdpsola_synthesis utt 'analysis_period))
   (t
    ;; All the rest
    (us_generate_wave utt (Parameter.get 'us_sigpr)
		      'analysis_period)))
)
(define (utt.synth utt)
  "(utt.synth UTT)
  The main synthesis function.  Applies before_synth_hooks to UTT, then
  runs the module sequence registered for UTT's type with defUttType,
  then applies after_synth_hooks.  Signals an error when the type of UTT
  has no entry in UttTypes.  Returns UTT.
  [see Utterance types]"
  (apply_hooks before_synth_hooks utt)
  (let ((utt_type (utt.type utt)))
    (let ((entry (assoc utt_type UttTypes)))
      (if entry
          ;; The stored forms become the body of (lambda (utt) ...),
          ;; which is then applied to this utterance.
          (apply (eval (cons 'lambda (cons '(utt) (cdr entry))))
                 (list utt))
          (error "Unknown utterance type" utt_type))))
  (apply_hooks after_synth_hooks utt)
  utt)
(define (Wave_Synth utt)
  "(Wave_Synth UTT)
  Apply after_analysis_hooks to UTT and then generate its waveform with
  the method held in the Parameter Synth_Method.  A bound symbol is
  called through its value, a subr or closure is called directly, and
  anything else is looked up in SynthTypes as a type defined with
  defSynthType.  Signals an error when Synth_Method is unset or names
  no known synthesis type.  Returns UTT.
  [see Utterance types]"
  (apply_hooks after_analysis_hooks utt)
  (let ((method (Parameter.get 'Synth_Method))
        (args (list utt)))
    (cond
     ((null method)
      (error "Undefined Synth_Method"))
     ;; A symbol with a value: call whatever it is bound to.
     ((and (symbol? method) (symbol-bound? method))
      (apply (symbol-value method) args))
     ;; Already a callable object.
     ((member (typeof method) '(subr closure))
      (apply method args))
     ;; Otherwise it must name a defSynthType entry.
     (t
      (let ((synth_type (assoc_string method SynthTypes)))
        (if synth_type
            (apply (eval (cons 'lambda (cons '(utt) (cdr synth_type))))
                   args)
            (error (format nil "Undefined SynthType %s\n" method)))))))
  utt)
;; +;;; ;; +;;; Permission is hereby granted, free of charge, to use and distribute ;; +;;; this software and its documentation without restriction, including ;; +;;; without limitation the rights to use, copy, modify, merge, publish, ;; +;;; distribute, sublicense, and/or sell copies of this work, and to ;; +;;; permit persons to whom this work is furnished to do so, subject to ;; +;;; the following conditions: ;; +;;; 1. The code must retain the above copyright notice, this list of ;; +;;; conditions and the following disclaimer. ;; +;;; 2. Any modifications must be clearly marked as such. ;; +;;; 3. Original authors' names are not deleted. ;; +;;; 4. The authors' names are not used to endorse or promote products ;; +;;; derived from this software without specific prior written ;; +;;; permission. ;; +;;; ;; +;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; +;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; +;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; +;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; +;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; +;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; +;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; +;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; +;;; THIS SOFTWARE. ;; +;;; ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Author: Alan W Black, Kurt Dusterhoff, Janet Hitzeman +;;; Date: April 1999 +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Tilt intonation modules, accent/boundary preditions and F0 generation +;;; The F0 generation is done using models as described in +;;; Dusterhoff, K. and Black, A. (1997). 
(define (Intonation_Tilt utt)
  "(Intonation_Tilt utt)
Create the Intonation and IntonationSyllable relations and, for every
syllable, add the accent and boundary IntEvents predicted by
tilt_a_cart_tree and tilt_b_cart_tree.  Syllables next to a pau segment
also get phrase_start / phrase_end events.  Returns utt."
  (utt.relation.create utt 'Intonation)
  (utt.relation.create utt 'IntonationSyllable)
  (mapcar
   (lambda (syl)
     ;; A pause before the first segment marks the start of a phrase.
     (if (string-equal
          "pau"
          (item.feat syl "R:SylStructure.daughter1_to.Segment.p.name"))
         (tilt_add_intevent utt syl 'phrase_start))
     (let ((accent (wagon_predict syl tilt_a_cart_tree))
           (boundary (wagon_predict syl tilt_b_cart_tree)))
       (if (not (string-equal accent "0"))
           (tilt_add_intevent utt syl accent))
       (cond
        ((string-equal boundary "0")
         nil)
        ;; An accent plus "afb" would double the accent, so only the
        ;; falling boundary part is kept.
        ((and (string-equal boundary "afb")
              (not (string-equal accent "0")))
         (tilt_add_intevent utt syl "fb"))
        (t
         (tilt_add_intevent utt syl boundary))))
     ;; A pause after the last segment marks the end of a phrase.
     (if (string-equal
          "pau"
          (item.feat syl "R:SylStructure.daughtern_to.Segment.n.name"))
         (tilt_add_intevent utt syl 'phrase_end)))
   (utt.relation.items utt 'Syllable))
  utt)
(define (tilt_map_f0_range utt)
  "(tilt_map_f0_range utt)
Map the F0 targets of utt into the pitch range of another speaker, so
that a model trained on one voice can be used for a voice without
enough data of its own.  Pitch ranges do not map that cleanly, but the
result can still beat a less sophisticated F0 model.  Each target value
is converted to a z-score (distance from the model speaker's mean in
standard deviations) and back into the target speaker's range:
  new = target_f0_mean + target_f0_std * (old - model_f0_mean) / model_f0_std
The four values are read from int_tilt_params.  Nothing is changed
when target_f0_mean is not specified there."
  (let ((target_f0_mean (car (cdr (assoc 'target_f0_mean int_tilt_params))))
        (target_f0_std (car (cdr (assoc 'target_f0_std int_tilt_params))))
        (model_f0_std (car (cdr (assoc 'model_f0_std int_tilt_params))))
        (model_f0_mean (car (cdr (assoc 'model_f0_mean int_tilt_params)))))
    (if target_f0_mean      ;; only if one is specified
        ;; The mapcar was missing: the lambda used to be the then-branch
        ;; and the target list the else-branch, so nothing was mapped.
        (mapcar
         (lambda (targ)
           (item.set_name
            targ
            (+ target_f0_mean
               (* target_f0_std
                  (/ (- (parse-number (item.name targ))
                        model_f0_mean)
                     model_f0_std)))))
         (utt.relation.leafs utt 'Target)))))
(define (tilt_assign_parameters_lr utt)
  "(tilt_assign_parameters_lr utt)
Assign the tilt parameters to each IntEvent in the Intonation relation
of utt, predicting them with the linear regression models listed for
the event's name in tilt_param_lrmodels.  Events with no models get a
warning on stderr and fixed default values."
  (mapcar
   (lambda (ie)
     (let ((param_lrmodels (cdr (assoc_string (item.name ie)
                                              tilt_param_lrmodels))))
       (cond
        ((null param_lrmodels)
         (format stderr "Tilt: unknown IntEvent type %s, ignored\n"
                 (item.name ie))
         ;; *need* to assign default values
         (item.set_feat ie "ev.f0" 100)
         (item.set_feat ie "tilt.amp" 20.0)
         (item.set_feat ie "tilt.dur" 0.25)
         (item.set_feat ie "tilt.tilt" -0.2)
         (item.set_feat ie "rel_pos" 0.0))
        (t
         (tilt_assign_params_lr ie param_lrmodels)))))
   ;; Events are stored in the Intonation relation (see Intonation_Tilt
   ;; and tilt_add_intevent); this used to read 'IntEvent, which nothing
   ;; in this file creates.
   (utt.relation.items utt 'Intonation)))
(define (time_to_next_vowel syl)
  "(time_to_next_vowel syl)
  Time from this syllable's vowel_start to the next syllable's
  vowel_start, or 0.00 when the n.vowel_start feature is \"0\"."
  (if (string-equal "0" (item.feat syl "n.vowel_start"))
      0.00
      (- (item.feat syl "n.vowel_start")
         (item.feat syl "vowel_start"))))
(define (get_rhyme_length syl)
  ;; Length of the rhyme: the syllable's end minus its vowel_start.
  ;; item.feat takes an item and a feature name; the stray third
  ;; argument previously passed in the vowel_start lookup is removed.
  (- (item.feat syl 'end)
     (item.feat syl 'vowel_start)))
(define (vowel_height syl)
  ;; Value of the df.height feature reached from syl through the path
  ;; daughtern.daughter1.daughter1.
  (item.feat syl "daughtern.daughter1.daughter1.df.height"))
(define (sonority_scale_coda syl)
  ;; Lowest sonority rank among the daughters of syl's last daughter's
  ;; last daughter.  Ranks: 1 voiceless obstruent, 2 voiced obstruent,
  ;; 3 nasal, 4 liquid, 5 glide, 6 anything else.  With no such
  ;; segments the result is 6.
  (let ((lowest 6)
        (rank 6))
    (mapcar
     (lambda (seg)
       (set! rank
             (cond
              ((member_string (item.name seg) sonority_vless_obst) 1)
              ((member_string (item.name seg) sonority_v_obst) 2)
              ((member_string (item.name seg) sonority_nas) 3)
              ((member_string (item.name seg) sonority_liq) 4)
              ((member_string (item.name seg) sonority_glides) 5)
              (t 6)))
       (if (< rank lowest)
           (set! lowest rank)))
     (item.daughters (item.daughtern (item.daughtern syl))))
    lowest))
(define (pos_in_syl seg)
  ;; Feature function for a segment's position within its syllable.
  ;; The syllable's segments are collected with mt_segs_from_syl, then
  ;; seg_count (starting at 1) is incremented once for every collected
  ;; segment that is not eqv? to seg.
  ;; NOTE(review): the count does not stop at seg, so as written this
  ;; yields the number of other segments plus one rather than an index
  ;; -- confirm the intended semantics before relying on it.
  (let ((segments (mt_segs_from_syl
                   (item.relation (item.parent_to
                                   (item.relation seg 'SylStructure)
                                   'Syllable)
                                  'SylStructure)))
        (seg_count 1))
    (mapcar
     (lambda (s)
       (if (not (eqv? s seg))
           (set! seg_count (+ 1.0 seg_count))
           nil))
     ;; Was the unbound name segs; the let above binds segments.
     segments)
    seg_count))
(define (get_anchor_value sylSyl seg_num)
  "(get_anchor_value sylSyl seg_num)
Gets the c/v value of the segment within a syllable.
Walks the segments returned by mt_segs_from_syl for sylSyl and returns
a string built for the segment whose pos_in_syl feature matches
seg_num - 1.0, or \"none\" when no segment of df.type consonant or vowel
matches."
  ;; NOTE(review): the local syl is bound here but never referenced.
  (let ((syl (item.relation sylSyl "SylStructure"))
	(seg_val "none") segs (ccnt -1) (vcnt -1) (vpis 0))
    (set! segs (mt_segs_from_syl sylSyl))
    (mapcar
     (lambda (seg)
       ;; Running counters, updated before the match test below.
       ;; NOTE(review): the "consonant" branch bumps vcnt and records
       ;; vpis while every other type bumps ccnt, which looks inverted
       ;; relative to the variable names; ccnt is never read -- confirm.
       (cond
	((string-equal "consonant" (item.feat seg "df.type"))
	 (set! vcnt (+ 1 vcnt))
	 (set! vpis (item.feat seg "pos_in_syl")))
	(t
	 (set! ccnt (+ 1 ccnt))))
       ;; Build the label only for the segment at position seg_num - 1.0.
       (cond
	((and
	  (eq (- seg_num 1.0) (item.feat seg "pos_in_syl"))
	  ( string-equal "consonant" (item.feat seg "df.type")))
	 (set! seg_val (string-append "V" vcnt)))
	((and
	  (eq (- seg_num 1.0) (item.feat seg "pos_in_syl"))
	  ( string-equal "vowel" (item.feat seg "df.type")))
	 (set! seg_val (string-append "C" (- (item.feat seg "pos_in_syl")
					     vpis) "V" vcnt)))
	(t nil))
       )
     segs)
    seg_val))
(define (find_peak_seg_anchor ie pk_pos)
  "find_peak_seg_anchor ie pk_pos
Part of the workings of peak_segment_anchor.
Looks for the peak position pk_pos first in the syllable that ie is
attached to, then in the previous syllable, then in the next one, and
returns a segment offset.  Returns 11, the initial value, when no
clause finds an anchor."
  (let (( syl (item.relation
	       (item.parent (item.relation ie 'IntonationSyllable))
	       'SylStructure))
	(seg_anchor 11))
    (cond
     ;; segs_to_peak starts from 9 and only changes it when a segment
     ;; contains the peak, so any other value means "found here".
     ((not (eq 9.0 (segs_to_peak syl pk_pos)))
      (set! seg_anchor (segs_to_peak syl pk_pos)))

     ;; Found in the previous syllable: the result is negated.
     ((and (item.prev syl)
	   (not (eq 9.0 (segs_to_peak (item.prev syl) pk_pos))))
;      (format stderr "%s\n" (item.name (item.prev syl)))
      (set! seg_anchor (* -1
			  (- (+ 1 (item.feat syl "p.syl_numphones"))
			     (segs_to_peak (item.prev syl) pk_pos)))))

     ;; Peak lies after the start of the next syllable: offset past
     ;; this syllable's segment count.
     ((and (item.next syl)
	   (> pk_pos (item.feat syl "n.start")))
;      (format stderr "%s\n" (item.name (item.next syl)))
      (set! seg_anchor (+ 1
			  (item.feat syl "syl_numphones")
			  (segs_to_peak (item.next syl) pk_pos))))
     (t
      (format stderr "No seg anchor could be found\n")))
;    (format stderr "seg_anchor: %f\n" seg_anchor)
    seg_anchor))
(define (peak_wi_seg segment pk_pos)
  "peak_wi_seg segment pk_pos
Returns 1.0 when pk_pos lies strictly between the start and end
features of segment, 0.0 otherwise."
  (if (and (< (item.feat segment "start") pk_pos)
           (< pk_pos (item.feat segment "end")))
      1.0
      0.0))
nie) "0") + ((member_string (item.name nie) tilt_boundary_list) + (item.name nie)) + (t (next_tilt_boundary nie))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Some basic functions for metrical tree structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define (mt_syl_from_seg seg) "(mt_syl_from_seg seg) Returns the root of seg in the SylStructure relation, or nil if seg is nil." + (if seg + (item.root (item.relation seg 'SylStructure)) + nil)) +(define (mt_word_from_syl syl) "(mt_word_from_syl syl) Returns the root of syl in the WordStructure relation, or nil if syl is nil." + (if syl + (item.root (item.relation syl 'WordStructure)) + nil)) +(define (mt_word_from_seg seg) "(mt_word_from_seg seg) Returns the WordStructure root reached from seg via mt_syl_from_seg and mt_word_from_syl." + (mt_word_from_syl (mt_syl_from_seg seg))) + +(define (mt_segs_from_syl s) "(mt_segs_from_syl s) Returns the list of items at or below s in SylStructure that are in the Segment relation; nil if s is nil." + (cond + ((null s) nil) + ((member_string 'Segment (item.relations s)) + (list s)) + (t + (apply + append + (mapcar mt_segs_from_syl (item.relation.daughters s 'SylStructure)))))) + +(define (sylmtval s) "(sylmtval s) MetricalValue feature of the item given by mt_syl_from_seg for s, or the string 0 if there is none." + (let ((syl (mt_syl_from_seg s))) + (if syl + (item.feat syl "MetricalValue") + "0"))) + +(define (sylpmtval s) "(sylpmtval s) MetricalValue of the MetricalTree parent of the item given by mt_syl_from_seg for s, or the string 0 if there is none." + (let ((syl (mt_syl_from_seg s))) + (if syl + (item.feat syl "R:MetricalTree.parent.MetricalValue") + "0"))) + +(define (mt_numsyls w) "(mt_numsyls w) Counts the items from the first to the last Syllable item found under w in WordStructure, stepping with item.next. Returns 0 if the walk runs out of items before reaching the last one." + (let ((s1 (item.daughter1_to (item.relation w 'WordStructure) 'Syllable)) + (sn (item.daughtern_to (item.relation w 'WordStructure) 'Syllable)) + (count 1)) + (while (and s1 (not (equal? s1 sn))) + (set! count (+ 1 count)) + (set! s1 (item.next s1))) + (if s1 + count + 0))) + +(define (mt_seg_numsyls s) "(mt_seg_numsyls s) Syllable count, as given by mt_numsyls, of the word found from s by mt_word_from_seg, or 0 if there is no such word." + (let ((w (mt_word_from_seg s))) + (if w + (mt_numsyls w) + 0))) + + +;;; These functions should be sorted out some time + +;;; Difference between this syl and the next +;;; number of closing brackets, number of opening brackets +;;; difference between them + +(define (mt_close n) + "(mt_close n) +The number of constituents this is the end of. Effectively the +number of closing brackets after this word." + (if (or (not n) (item.next n)) + 0 + (+ 1 (mt_close (item.parent n))))) + +(define (mt_open n) + "(mt_open n) +The number of constituents this is the start of. Effectively the +number of opening brackets before this word."
+ (if (or (not n) (item.prev n)) + 0 + (+ 1 (mt_open (item.parent n))))) + +(define (mt_postype syl) + "(mt_postype syl) +Returns single, initial, final or middle." + (let ((w (mt_word_from_syl syl)) + (psw (mt_word_from_syl (item.relation.prev syl 'Syllable))) + (nsw (mt_word_from_syl (item.relation.next syl 'Syllable)))) + (cond + ((and (equal? w psw) + (equal? w nsw)) + 'middle) + ((and (not (equal? w psw)) + (not (equal? w nsw))) + 'single) + ((equal? w psw) + 'final) + (t + 'initial)))) + +(define (mt_accent syl) + "(mt_accent syl) +The string a if any IntonationSyllable daughter of syl has a name starting with a, or 0 if none." + (let ((a 0)) + (mapcar + (lambda (i) + (if (string-matches (item.name i) "^a.*") + (set! a "a"))) + (item.relation.daughters syl 'IntonationSyllable)) + a)) + +(define (mt_break syl) + "(mt_break syl) +Name of the last IntonationSyllable daughter of syl whose name ends in b, or 0 if none." + (let ((a 0)) + (mapcar + (lambda (i) + (if (string-matches (item.name i) ".*b$") + (set! a (item.name i)))) + (item.relation.daughters syl 'IntonationSyllable)) + a)) + +(define (mt_ssyl_out s) "(mt_ssyl_out s) Counts syllables with MetricalValue s from s onwards in the Syllable relation, stopping (without counting) at the first syllable whose word has a lisp_word_mt_break other than 0, or at the end of the relation." + (cond + ((null s) 0) + ((not (string-equal + "0" (item.feat s "R:WordStructure.root.lisp_word_mt_break"))) + 0) + ((string-equal "s" (item.feat s "MetricalValue")) + (+ 1 (mt_ssyl_out (item.relation.next s 'Syllable)))) + (t + (mt_ssyl_out (item.relation.next s 'Syllable))))) + +(define (mt_num_s s) + "(mt_num_s s) +The number of s MetricalValues from here to a w or top." + (cond + ((null s) 0) + ((string-equal "w" (item.feat s "MetricalValue")) + 0) + (t + (+ 1 (mt_num_s (item.parent s)))))) + +(define (mt_num_w s) + "(mt_num_w s) +The number of w MetricalValues from here to a s or top." + (cond + ((null s) 0) + ((string-equal "s" (item.feat s "MetricalValue")) + 0) + (t + (+ 1 (mt_num_w (item.parent s)))))) + +(define (mt_strong s) + "(mt_strong s) +1 if no MetricalValue from here up to the word is w, 0 otherwise."
+ (cond + ((string-equal "w" (item.feat s "MetricalValue")) + "0") + ((member_string 'Word (item.relations s)) "1") + (t + (mt_strong (item.relation.parent s 'MetricalTree))))) + +(define (mt_lssp s) + "(mt_lssp s) +1 if last stressed syllable in phrase, 0 otherwise." + (if (and (string-equal "s" (item.feat s "MetricalValue")) + (equal? 0 (mt_ssyl_out s))) + "1" + "0")) + +(define (mt_fssw s) + "(mt_fssw s) +1 if first stressed syllable in word, 0 otherwise." + (if (and (string-equal "s" (item.feat s "MetricalValue")) + (mt_no_stress_before (item.relation.prev s 'Syllable))) + "1" + "0")) + +(define (mt_nfssw s) + "(mt_nfssw s) +1 if second or later stressed syllable in word, 0 otherwise." + (if (and (string-equal "s" (item.feat s "MetricalValue")) + (null (mt_no_stress_before (item.relation.prev s 'Syllable)))) + "1" + "0")) + +(define (mt_no_stress_before ss) "(mt_no_stress_before ss) Returns t if, walking back from ss with item.prev, no item with MetricalValue s is met before running out of items or reaching one whose WordStructure root differs from that of the item after it; returns nil as soon as such an s item is met." + (cond + ((null ss) t) + ((not (string-equal + (item.feat ss "R:WordStructure.root.addr") + (item.feat (item.next ss) "R:WordStructure.root.addr"))) + t) + ((string-equal "s" (item.feat ss "MetricalValue")) + nil) + (t + (mt_no_stress_before (item.prev ss))))) + +(define (word_mt_break w) "(word_mt_break w) Returns BB if the sentence_end feature of w is 1, B if its phrase_end feature is 1, and the string 0 otherwise." + (cond + ((string-equal "1" (item.feat w "sentence_end")) + "BB") + ((string-equal "1" (item.feat w "phrase_end")) + "B") + (t + "0"))) + +(provide 'tilt)