CosyVoice
/
CosyVoice-ttsfrd
/resource
/festival
/voices
/english
/rab_diphone
/festvox
/rab_diphone.scm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;;; ;; | |
;;; Centre for Speech Technology Research ;; | |
;;; University of Edinburgh, UK ;; | |
;;; Copyright (c) 1996,1997 ;; | |
;;; All Rights Reserved. ;; | |
;;; ;; | |
;;; Permission is hereby granted, free of charge, to use and distribute ;; | |
;;; this software and its documentation without restriction, including ;; | |
;;; without limitation the rights to use, copy, modify, merge, publish, ;; | |
;;; distribute, sublicense, and/or sell copies of this work, and to ;; | |
;;; permit persons to whom this work is furnished to do so, subject to ;; | |
;;; the following conditions: ;; | |
;;; 1. The code must retain the above copyright notice, this list of ;; | |
;;; conditions and the following disclaimer. ;; | |
;;; 2. Any modifications must be clearly marked as such. ;; | |
;;; 3. Original authors' names are not deleted. ;; | |
;;; 4. The authors' names are not used to endorse or promote products ;; | |
;;; derived from this software without specific prior written ;; | |
;;; permission. ;; | |
;;; ;; | |
;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; | |
;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; | |
;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; | |
;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; | |
;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; | |
;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; | |
;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; | |
;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; | |
;;; THIS SOFTWARE. ;; | |
;;; ;; | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;;; Set up rab_diphones using the standard UniSyn diphone synthesizer | |
;;; | |
;;; Roger diphones: male RP English collected October 1996 | |
;;; | |
;; Directory of the rab diphone database: the value stored under the
;; rab_diphone key in voice-locations.  defvar leaves any value that
;; was already set before this file was loaded untouched.
(defvar rab_diphone_dir
  (cdr (assoc 'rab_diphone voice-locations))
  "rab_diphone_dir
The default directory for the rab diphone database.")
(require 'mrpa_phones) | |
(require 'pos) | |
(require 'phrase) | |
(require 'tobi) | |
(require 'f2bf0lr) | |
(require 'mrpa_durs) | |
(require 'gswdurtreeZ) | |
(require_module 'UniSyn) | |
(setup_oald_lex) | |
;; Signal processing method: set this to lpc or psola
(defvar rab_sigpr 'lpc)

;; Set this to ungroup for the ungrouped version
(defvar rab_groupungroup 'group)

;; Index file for the grouped LPC database: use group/rablpc16k.group
;; when that file exists, otherwise fall back to group/rablpc8k.group.
(defvar rab_index_file
  (path-append
   rab_diphone_dir
   (if (probe_file (path-append rab_diphone_dir "group/rablpc16k.group"))
       "group/rablpc16k.group"
       "group/rablpc8k.group")))
;;; Database descriptions handed to us_diphone_init below.  Each one is
;;; a list of (feature value) pairs.

;; Ungrouped PSOLA database: pitchmarks in pm/, waveforms in wav/
(set! rab_psola_sep
      (list
       (list 'name "rab_psola_sep")
       (list 'index_file
             (path-append rab_diphone_dir "dic/diphdic_full.est"))
       (list 'grouped "false")
       (list 'coef_dir (path-append rab_diphone_dir "pm"))
       (list 'sig_dir (path-append rab_diphone_dir "wav"))
       (list 'coef_ext ".pm")
       (list 'sig_ext ".wav")))

;; Ungrouped LPC database: coefficients (.lpc) and residuals (.res)
;; both live in lpc/
(set! rab_lpc_sep
      (list
       (list 'name "rab_lpc_sep")
       (list 'index_file
             (path-append rab_diphone_dir "dic/diphdic_full.est"))
       (list 'grouped "false")
       (list 'coef_dir (path-append rab_diphone_dir "lpc"))
       (list 'sig_dir (path-append rab_diphone_dir "lpc"))
       (list 'coef_ext ".lpc")
       (list 'sig_ext ".res")))

;; Grouped PSOLA database: a single group file
(set! rab_psola_group
      (list
       (list 'name "rab_psola_group")
       (list 'index_file (path-append rab_diphone_dir "group/rab.group"))
       (list 'grouped "true")))

;; Grouped LPC database: group file chosen by rab_index_file, plus the
;; alternates_left / alternates_right phone substitution tables and the
;; default_diphone entry.
(set! rab_lpc_group
      (list
       (list 'name "rab_lpc_group")
       (list 'index_file rab_index_file)
       '(alternates_left ((i ii) (ll l) (u uu) (i@ ii) (uh @) (a aa)
                          (u@ uu) (w @) (o oo) (e@ ei) (e ei)
                          (r @)))
       '(alternates_right ((i ii) (ll l) (u uu) (i@ ii)
                           (y i) (uh @) (r @) (w @)))
       '(default_diphone @-@@)
       (list 'grouped "true")))
;;; Set up the desired DB: pick the description matching rab_sigpr and
;;; rab_groupungroup, initialise it and remember the result in
;;; rab_db_name.  Any other combination of values leaves rab_db_name
;;; unset.
(cond
 ((eq rab_sigpr 'psola)
  (cond
   ((eq rab_groupungroup 'group)
    (set! rab_db_name (us_diphone_init rab_psola_group)))
   ((eq rab_groupungroup 'ungroup)
    (set! rab_db_name (us_diphone_init rab_psola_sep)))))
 ((eq rab_sigpr 'lpc)
  (cond
   ((eq rab_groupungroup 'group)
    (set! rab_db_name (us_diphone_init rab_lpc_group)))
   ((eq rab_groupungroup 'ungroup)
    (set! rab_db_name (us_diphone_init rab_lpc_sep))))))
(define (rab_postlex_syllabics utt)
"(rab_postlex_syllabics UTT)
The lexicon is inconsistent in its use of syllabic l, n and m, so this
post-processes the Segment relation and inserts a schwa (@) before
them.  A schwa is inserted before a segment named l, n or m whose
seg_onsetcoda feature is coda, whose previous segment is not l or r,
and whose previous segment has ph_vc equal to \"-\".  Ideally the
lexicon should be fixed."
  (let ((segments (utt.relation.items utt 'Segment)))
    (mapcar
     (lambda (seg)
       (cond
        ;; Each guard rejects a segment that fails one of the conditions
        ((not (member_string (item.name seg) '("l" "n" "m"))) nil)
        ((not (string-equal "coda" (item.feat seg "seg_onsetcoda"))) nil)
        ((member_string (item.feat seg "p.name") '(l r)) nil)
        ((not (string-equal "-" (item.feat seg "p.ph_vc"))) nil)
        (t
         ;; Add the new @ item before SEG in the Segment relation, then
         ;; link that same item before SEG in SylStructure too.
         (item.relation.insert
          seg 'SylStructure
          (item.insert seg (list "@") 'before)
          'before))))
     segments)))
(define (rab_diphone_const_clusters utt)
"(rab_diphone_const_clusters UTT)
Identify consonant clusters, dark ls etc in the Segment relation ready
for diphone resynthesis, by applying rab_diphone_fix_phone_name to
every segment of UTT.  Returns UTT.  voice_rab_diphone installs this
function on UniSyn_module_hooks."
  (let ((segments (utt.relation.items utt 'Segment)))
    (mapcar
     (lambda (seg) (rab_diphone_fix_phone_name utt seg))
     segments)
    utt))
(define (rab_diphone_fix_phone_name utt seg)
"(rab_diphone_fix_phone_name UTT SEG)
Set the features us_diphone_left and us_diphone_right on SEG to the
context dependent names to be used when constructing diphones.
Basically adds _ if either side is part of the same consonant cluster,
adds $ either side if in different syllable for preceding/succeeding
vowel syllable, and converts l to ll in coda part of syllables.
Segments named # and segments whose ph_vc is not \"-\" are left
unchanged.  The rules are applied in order, so a later rule overrides
an earlier one that set the same feature."
  (let ((name (item.name seg)))
    (cond
     ((string-equal name "#") t)
     ((string-equal "-" (item.feat seg 'ph_vc))
      ;; r w y l after a stop, with a previous item in SylStructure: _X
      (if (and (member_string name '(r w y l))
               (member_string (item.feat seg "p.name") '(p t k b d g))
               (item.relation.prev seg "SylStructure"))
          (item.set_feat seg "us_diphone_right" (format nil "_%s" name)))
      ;; second member of an s cluster: _X
      (if (and (member_string name '(w y l m n p t k))
               (string-equal (item.feat seg "p.name") 's)
               (item.relation.prev seg "SylStructure"))
          (item.set_feat seg "us_diphone_right" (format nil "_%s" name)))
      ;; s as first member of a cluster: s_
      (if (and (string-equal name 's)
               (member_string (item.feat seg "n.name") '(w y l m n p t k))
               (item.relation.next seg "SylStructure"))
          (item.set_feat seg "us_diphone_left" (format nil "%s_" name)))
      ;; stop followed by r w y l, with a next item in SylStructure: X_
      (if (and (member_string name '(p t k b d g))
               (member_string (item.feat seg "n.name") '(r w y l))
               (item.relation.next seg "SylStructure"))
          (item.set_feat seg "us_diphone_left" (format nil "%s_" name)))
      ;; stop with no previous item in SylStructure, after a vowel other
      ;; than @ aa o: $X.  The stop list now includes t, matching the
      ;; X$ rule below and every other stop list in this function.
      (if (and (member_string name '(p t k b d g))
               (string-equal "+" (item.feat seg 'p.ph_vc))
               (not (member_string (item.feat seg "p.name") '(@ aa o)))
               (not (item.relation.prev seg "SylStructure")))
          (item.set_feat seg "us_diphone_right" (format nil "$%s" name)))
      ;; stop with no next item in SylStructure, before a vowel other
      ;; than @ aa: X$
      (if (and (member_string name '(p t k b d g))
               (string-equal "+" (item.feat seg 'n.ph_vc))
               (not (member_string (item.feat seg "n.name") '(@ aa)))
               (not (item.relation.next seg "SylStructure")))
          (item.set_feat seg "us_diphone_left" (format nil "%s$" name)))
      ;; l after a vowel whose ph_vlng is not "a", with a previous item
      ;; in SylStructure: ll
      (if (and (string-equal "l" name)
               (string-equal "+" (item.feat seg "p.ph_vc"))
               (not (string-equal "a" (item.feat seg "p.ph_vlng")))
               (item.relation.prev seg 'SylStructure))
          (item.set_feat seg "us_diphone_right" "ll"))
      ;; ch and jh after a vowel use t as their right-hand name
      (if (and (member_string name '(ch jh))
               (string-equal "+" (item.feat seg 'p.ph_vc)))
          (item.set_feat seg "us_diphone_right" "t"))
      ))))
(define (voice_rab_diphone)
"(voice_rab_diphone)
Make the British male RP (Roger) speaker, built on the rab diphone
set, the current voice."
  (voice_reset)
  (Parameter.set 'Language 'britishenglish)

  ;; Phone set: mrpa
  (PhoneSet.select 'mrpa)
  (Parameter.set 'PhoneSet 'mrpa)

  ;; Tokenization rules
  (set! token_to_words english_token_to_words)

  ;; Part of speech tagging
  (set! pos_supported t)
  (set! pos_lex_name "english_poslex")
  (set! pos_ngram_name 'english_pos_ngram)
  (set! guess_pos english_guess_pos)   ;; need this for accents

  ;; Lexicon and post-lexical rules
  (lex.select "oald")
  (set! postlex_rules_hooks
        (list postlex_apos_s_check
              rab_postlex_syllabics))
  (set! postlex_s_check postlex_apos_s_check)

  ;; Phrase break prediction
  (set! phr_break_params english_phr_break_params)
  (Parameter.set 'Phrase_Method 'prob_models)

  ;; Accent and tone prediction
  (set! int_accent_cart_tree f2b_int_accent_cart_tree)
  (set! int_tone_cart_tree f2b_int_tone_cart_tree)
  (Parameter.set 'Int_Method Intonation_Tree)

  ;; F0 prediction
  (set! f0_lr_end f2b_f0_lr_end)
  (set! f0_lr_mid f2b_f0_lr_mid)
  (set! f0_lr_start f2b_f0_lr_start)
  (set! int_lr_params
        '((target_f0_mean 105) (target_f0_std 14)
          (model_f0_mean 170) (model_f0_std 34)))
  (Parameter.set 'Int_Target_Method Int_Targets_LR)

  ;; Duration prediction -- use gsw durations
  (set! duration_ph_info gsw_durs)
  (set! duration_cart_tree gsw_duration_cart_tree)
  (Parameter.set 'Duration_Method Duration_Tree_ZScores)
  (Parameter.set 'Duration_Stretch 1.05)

  ;; Waveform synthesizer: Roger diphones
  ;; This hook assigns the diphone names from their context (_ $ etc)
  (set! UniSyn_module_hooks (list rab_diphone_const_clusters))
  (set! us_gain 0.9)
  (set! window_factor 1.0)
  (set! us_abs_offset 0.0)
  (set! us_rel_offset 0.0)
  (Parameter.set 'Synth_Method 'UniSyn)
  (Parameter.set 'us_sigpr rab_sigpr)
  (us_db_select rab_db_name)

  (set! current-voice 'rab_diphone)
)
;; Announce the voice together with its descriptive features, then mark
;; this file as loaded.
(proclaim_voice
 'rab_diphone
 (list
  '(language english)
  '(gender male)
  '(dialect british)
  (list
   'description
   "This voice provides a British RP English male voice using a
residual excited LPC diphone synthesis method. It uses a
modified Oxford Advanced Learners' Dictionary for pronunciations.
Prosodic phrasing is provided by a statistically trained model
using part of speech and local distribution of breaks. Intonation
is provided by a CART tree predicting ToBI accents and an F0
contour generated from a model trained from natural speech. The
duration model is also trained from data using a CART tree.")))

(provide 'rab_diphone)