from abc import ABC, abstractmethod

import numpy as np
import torch
from transformers import pipeline
from datasets import load_dataset

class TTSModel(ABC):
    """Common interface for the text-to-speech backends below."""

    def __init__(self, model_name):
        self.hf_name = model_name
        # Prefer the first GPU when available, otherwise fall back to CPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    @abstractmethod
    def synthesize(self, text):
        """Return synthesized audio for `text`."""
        pass

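# The subclasses below return playable numpy arrays (the original comments
# mention returning "a numpy array of int to play"). As a minimal sketch of
# that playback step, the helper below uses the third-party `sounddevice`
# package (pip install sounddevice); it is an illustrative addition, not part
# of the original module, and the 16 kHz default is an assumption matching
# SpeechT5's native output rate.
def play_audio(samples, sample_rate=16_000):
    """Play a numpy array of audio samples on the default output device."""
    import sounddevice as sd  # hypothetical extra dependency
    sd.play(samples, samplerate=sample_rate)
    sd.wait()  # block until playback finishes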

####################################################
class SpeechT5(TTSModel):
    def __init__(self, name="microsoft/speecht5_tts"):
        super().__init__(name)
        self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
        # SpeechT5 requires a speaker embedding; index 7306 is the speaker used
        # in the Hugging Face SpeechT5 examples.
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        self.speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    def synthesize(self, text):
        speech = self.synthesiser(text, forward_params={"speaker_embeddings": self.speaker_embedding})
        print("[SpeechT5 - synthesize]", speech)
        # Scale float audio in [-1, 1] to 16-bit PCM so it can be played directly.
        return (np.array(speech["audio"]) * 32767).astype(np.int16)
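
# Hedged usage sketch for SpeechT5: `synthesize` returns 16-bit PCM samples,
# and SpeechT5 generates audio at 16 kHz natively. The helper below is an
# illustrative addition (requires scipy), not part of the original module.
def save_speecht5_wav(path, samples, sample_rate=16_000):
    """Write int16 samples to a WAV file."""
    from scipy.io import wavfile
    wavfile.write(path, sample_rate, samples)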

####################################################
# PENDING: NOT WORKING FROM HF
# from MeloTTS.melo.api import TTS as meloTTS
# import nltk

# class MeloTTS(TTSModel):
#     def __init__(self, name="myshell-ai/MeloTTS-English"):
#         super(MeloTTS, self).__init__(name)
#         nltk.download('averaged_perceptron_tagger_eng')
#         self.synthesiser = meloTTS(language='EN', device=self.device)
#         self.speaker_ids = self.synthesiser.hps.data.spk2id

#     def synthesize(self, text):
#         speech = self.synthesiser.tts_to_file(text, self.speaker_ids['EN-Default'])
#         print("[MeloTTS - synthesize]", speech)
#         return speech

####################################################
class Bark(TTSModel):
    def __init__(self, name="suno/bark"):
        super().__init__(name)
        self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)

    def synthesize(self, text):
        # Returns the raw pipeline dict: {"audio": float array, "sampling_rate": int}.
        speech = self.synthesiser(text)
        print("[Bark - synthesize]", speech)
        return speech
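
# Hedged sketch: unlike SpeechT5, Bark's `synthesize` returns the raw pipeline
# dict rather than int16 samples. The converter below is an illustrative
# addition showing how to bring that output to the format SpeechT5 uses above.
def bark_to_int16(speech):
    """Convert a Bark pipeline result to (int16 samples, sampling rate)."""
    audio = np.array(speech["audio"]).squeeze()
    return (audio * 32767).astype(np.int16), speech["sampling_rate"]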

####################################################
# pip install git+https://github.com/huggingface/parler-tts.git

# from parler_tts import ParlerTTSForConditionalGeneration
# from transformers import AutoTokenizer

# class ParlerTTS(TTSModel):
#     def __init__(self, name="parler-tts/parler-tts-large-v1"):
#         super(ParlerTTS, self).__init__(name)
#         self.description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
#         self.model = ParlerTTSForConditionalGeneration.from_pretrained(self.hf_name).to(self.device)
#         self.tokenizer = AutoTokenizer.from_pretrained(self.hf_name)
#         # self.synthesiser = pipeline("text-to-speech", model=self.model, tokenizer=self.tokenizer, device=self.device)

#     def synthesize(self, text):
#         input_ids = self.tokenizer(self.description, return_tensors="pt").input_ids.to(self.device)
#         prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)

#         generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
#         speech = generation.cpu().numpy().squeeze()
#         print("[ParlerTTS - synthesize]", speech)
#         return speech

####################################################
# Loaded through Coqui TTS (pip install coqui-tts,
# https://github.com/idiap/coqui-ai-TTS) because the Hugging Face
# `pipeline` route is not working for this model.

from TTS.api import TTS

class XTTS(TTSModel):
    # Note: despite the class name, the default checkpoint is Glow-TTS
    # trained on LJSpeech, served through the Coqui TTS API.
    def __init__(self, name="tts_models/en/ljspeech/glow-tts"):
        super().__init__(name)
        self.synthesiser = TTS(model_name=name, progress_bar=False).to(self.device)

    def synthesize(self, text):
        # Coqui's `tts` returns the waveform as a list of float samples.
        speech = self.synthesiser.tts(text=text)
        print("[XTTS - synthesize]", len(speech), text)
        return speech
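
####################################################
# Minimal end-to-end sketch (illustrative addition, not part of the original
# module): synthesize one sentence with SpeechT5 and save it to disk using the
# hypothetical helper defined above. The file name is an assumption.
if __name__ == "__main__":
    tts = SpeechT5()
    samples = tts.synthesize("Hello from the SpeechT5 wrapper.")
    save_speecht5_wav("speecht5_demo.wav", samples)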