update
Changed files:
- .gitignore +1 -0
- .models/clip.pth +0 -3
- api.py +11 -22
- data/mel_norms.pth +0 -0
- do_tts.py +5 -1
- models/arch_util.py +1 -2
- models/clvp.py +1 -1
- models/xtransformers.py +0 -47
- read.py +10 -12
- requirements.txt +1 -2
 
    	
.gitignore CHANGED
@@ -130,3 +130,4 @@ dmypy.json
 .pyre/
 
 .idea/*
+.models/*
    	
.models/clip.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ab5a7751b6098b7e57528b5d812ea2ffbaa16f1b36c02e143c501c74900140d
-size 271601435
    	
api.py CHANGED
@@ -23,9 +23,11 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
 pbar = None
 def download_models():
     MODELS = {
-        '
-        '
-        '
+        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
+        'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
+        'cvvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth',
+        'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth',
+        'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
     }
     os.makedirs('.models', exist_ok=True)
     def show_progress(block_num, block_size, total_size):
@@ -162,25 +164,12 @@ class TextToSpeech:
                                            train_solo_embeddings=False,
                                            average_conditioning_embeddings=True).cpu().eval()
         self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
-        '''
-        self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
-                                      model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
-                                      average_conditioning_embeddings=True, types=2).cpu().eval()
-        self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
-        '''
-
-        self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                      model_dim=1024,
-                                      heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                      train_solo_embeddings=False,
-                                      average_conditioning_embeddings=True).cpu().eval()
-        self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
 
         self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
                          text_seq_len=350, text_heads=8,
                          num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load('.models/
+        self.clvp.load_state_dict(torch.load('.models/clvp.pth'))
 
         self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
@@ -213,7 +202,7 @@ class TextToSpeech:
             'ultra_fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 16, 'cond_free': False},
             'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
-            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations':
+            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
         }
         kwargs.update(presets[preset])
         return self.tts(text, voice_samples, **kwargs)
@@ -281,11 +270,11 @@ class TextToSpeech:
            # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
            # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
            # results, but will increase memory usage.
-           self.
-           best_latents = self.
-                                              torch.tensor([best_results.shape[-1]*self.
+           self.autoregressive = self.autoregressive.cuda()
+           best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+                                              torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
                                               return_latent=True, clip_inputs=False)
-           self.
+           self.autoregressive = self.autoregressive.cpu()
 
            print("Performing vocoding..")
            wav_candidates = []
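For orientation, the retuned presets and the new clvp_cvvp_slider pass-through can be exercised directly through TextToSpeech.tts_with_preset(). A minimal sketch, assuming TextToSpeech is imported from api.py, load_audio from utils/audio.py (as do_tts.py uses it), and a hypothetical conditioning clip path:

    # Sketch only; module paths and the sample clip path are assumptions, not part of the diff.
    import torchaudio
    from api import TextToSpeech
    from utils.audio import load_audio

    tts = TextToSpeech()  # download_models() fetches the .models/*.pth weights listed above on first use
    conds = [load_audio('voices/patrick_stewart/1.wav', 22050)]  # hypothetical sample path
    gen = tts.tts_with_preset('Hello world.', conds,
                              preset='high_quality',   # now 512 samples / 1024 diffusion iterations
                              clvp_cvvp_slider=.5)     # forwarded to tts() via **kwargs
    torchaudio.save('results/hello.wav', gen.squeeze(0).cpu(), 24000)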
    	
data/mel_norms.pth CHANGED
Binary files a/data/mel_norms.pth and b/data/mel_norms.pth differ
    	
do_tts.py CHANGED
@@ -11,6 +11,10 @@ if __name__ == '__main__':
     parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intelligibility',
+                        default=.5)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
@@ -25,6 +29,6 @@ if __name__ == '__main__':
         for cond_path in cond_paths:
             c = load_audio(cond_path, 22050)
             conds.append(c)
-        gen = tts.tts_with_preset(args.text, conds, preset=
+        gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
         torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
 
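With the two new flags wired into do_tts.py, a typical invocation (values illustrative, flag names taken from the argparse lines above) might be:

    python do_tts.py --text "I am a language model that has learned to speak." \
        --voice patrick_stewart --preset fast \
        --voice_diversity_intelligibility_slider 0.5 --output_path results/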
    	
models/arch_util.py CHANGED
@@ -5,8 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
-from
-from x_transformers.x_transformers import RelativePositionBias
+from models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
 
 
 def zero_module(module):
    	
models/clvp.py CHANGED
@@ -2,10 +2,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import einsum
-from x_transformers import Encoder
 
 from models.arch_util import CheckpointedXTransformerEncoder
 from models.transformer import Transformer
+from models.xtransformers import Encoder
 
 
 def exists(val):
    	
models/xtransformers.py CHANGED
@@ -1253,50 +1253,3 @@ class ContinuousTransformerWrapper(nn.Module):
             return tuple(res)
         return res[0]
 
-
-class XTransformer(nn.Module):
-    def __init__(
-            self,
-            *,
-            dim,
-            tie_token_emb=False,
-            **kwargs
-    ):
-        super().__init__()
-        enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
-        dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
-
-        assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
-        enc_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], enc_kwargs)
-        enc_transformer_kwargs['emb_dropout'] = enc_kwargs.pop('emb_dropout', 0)
-        enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)
-        enc_transformer_kwargs['use_pos_emb'] = enc_kwargs.pop('use_pos_emb', True)
-
-        dec_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], dec_kwargs)
-        dec_transformer_kwargs['emb_dropout'] = dec_kwargs.pop('emb_dropout', 0)
-        dec_transformer_kwargs['use_pos_emb'] = dec_kwargs.pop('use_pos_emb', True)
-
-        self.encoder = TransformerWrapper(
-            **enc_transformer_kwargs,
-            attn_layers=Encoder(dim=dim, **enc_kwargs)
-        )
-
-        self.decoder = TransformerWrapper(
-            **dec_transformer_kwargs,
-            attn_layers=Decoder(dim=dim, cross_attend=True, **dec_kwargs)
-        )
-
-        if tie_token_emb:
-            self.decoder.token_emb = self.encoder.token_emb
-
-        self.decoder = AutoregressiveWrapper(self.decoder)
-
-    @torch.no_grad()
-    def generate(self, seq_in, seq_out_start, seq_len, src_mask=None, src_attn_mask=None, **kwargs):
-        encodings = self.encoder(seq_in, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        return self.decoder.generate(seq_out_start, seq_len, context=encodings, context_mask=src_mask, **kwargs)
-
-    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_attn_mask=None):
-        enc = self.encoder(src, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        out = self.decoder(tgt, context=enc, mask=tgt_mask, context_mask=src_mask)
-        return out
    	
read.py CHANGED
@@ -28,11 +28,14 @@ def split_and_recombine_text(texts, desired_length=200, max_len=300):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/
+    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
-    parser.add_argument('--
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intelligibility',
+                        default=.5)
     args = parser.parse_args()
 
     outpath = args.output_path
@@ -60,16 +63,11 @@ if __name__ == '__main__':
         if not cond_paths:
             print('Error: no valid voices specified. Try again.')
 
-
+        conds = []
+        for cond_path in cond_paths:
+            c = load_audio(cond_path, 22050)
+            conds.append(c)
         for j, text in enumerate(texts):
-            conds =
-            for cond_path in cond_paths:
-                c = load_audio(cond_path, 22050)
-                conds.append(c)
-            gen = tts.tts_with_preset(text, conds, preset=args.generation_preset)
+            gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
             torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)
 
-            priors.append(torchaudio.functional.resample(gen, 24000, 22050).squeeze(0))
-            while len(priors) > 2:
-                priors.pop(0)
-
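read.py gains the same two flags, so a long-form run could look like this (values illustrative):

    python read.py --textfile data/riding_hood.txt --voice patrick_stewart \
        --preset standard --voice_diversity_intelligibility_slider 0.5 \
        --output_path results/longform/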
    	
requirements.txt CHANGED
@@ -6,5 +6,4 @@ tokenizers
 inflect
 progressbar
 einops
-unidecode
-x-transformers
+unidecode
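Taken together, the models/arch_util.py, models/clvp.py, and requirements.txt changes drop the external x-transformers package in favor of the vendored models/xtransformers.py module (with the unused XTransformer wrapper class removed); callers switch imports accordingly, for example:

    # before: pip dependency, now removed from requirements.txt
    # from x_transformers import Encoder
    # after: the copy vendored in this repository
    from models.xtransformers import Encoder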