File size: 2,418 Bytes
011084a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from os import getenv
from huggingface_hub import hf_hub_download
from json import load as json_load , dump as json_dump
from torch import device as Device
from torch.cuda import is_available as cuda_is_available
from TTS.utils.synthesizer import Synthesizer

# Supported language codes mapped to the human-readable language names that
# are interpolated into the Gradio interface title further down in this file.
# The selected code is also passed to Indic_TTS, which uses it as the filename
# prefix of the model artifacts it downloads.
lang_conf = {
    "as": "Assamese - অসমীয়া",
    "bn": "Bangla - বাংলা",
    "brx": "Boro - बड़ो",
    "en": "English (Indian accent)",
    "en+hi": "English+Hindi (Hinglish code-mixed)",
    "gu": "Gujarati - ગુજરાતી",
    "hi": "Hindi - हिंदी",
    "kn": "Kannada - ಕನ್ನಡ",
    "ml": "Malayalam - മലയാളം",
    "mni": "Manipuri - মিতৈলোন",
    "mr": "Marathi - मराठी",
    "or": "Oriya - ଓଡ଼ିଆ",
    "pa": "Panjabi - ਪੰਜਾਬੀ",
    "raj": "Rajasthani - राजस्थानी",
    "ta": "Tamil - தமிழ்",
    "te": "Telugu - తెలుగు"
}


class Indic_TTS:
    """Single-language text-to-speech wrapper around ``Synthesizer``.

    The FastPitch model, HiFi-GAN vocoder, their JSON configs and the speaker
    file for the requested language are fetched from the
    ``shethjenil/INDIC_TTS`` Hugging Face repository.

    Attributes:
        synthesizer: The ``Synthesizer`` built from the downloaded artifacts.
        speakers: Speaker names reported by the model's speaker manager.
    """

    def __init__(self, lang, device):
        """Download the artifacts for ``lang`` and build the synthesizer.

        Args:
            lang: Language code used as the filename prefix inside the model
                repository (for example ``"hi"``).
            device: Object exposing a ``type`` attribute (such as a
                ``torch.device``); CUDA is enabled when it equals ``"cuda"``.
        """
        model_id = "shethjenil/INDIC_TTS"
        model_path = hf_hub_download(model_id, f"{lang}_fastpitch_best_model.pth")
        vocoder_path = hf_hub_download(model_id, f"{lang}_hifigan_best_model.pth")
        vocoder_config_path = hf_hub_download(model_id, f"{lang}_hifigan_config.json")
        config_path = hf_hub_download(model_id, f"{lang}_fastpitch_config.json")
        speaker_path = hf_hub_download(model_id, f"{lang}_fastpitch_speakers.pth")

        # The config must reference the locally downloaded speaker file, so it
        # is patched in place before the synthesizer reads it.  Context
        # managers guarantee the rewritten file is flushed and closed before
        # Synthesizer opens it, and no file handles are leaked.
        with open(config_path, encoding="utf-8") as config_file:
            conf = json_load(config_file)
        conf["speakers_file"] = conf["model_args"]["speakers_file"] = speaker_path
        with open(config_path, "w", encoding="utf-8") as config_file:
            json_dump(conf, config_file)

        self.synthesizer = Synthesizer(
            model_path,
            config_path,
            vocoder_checkpoint=vocoder_path,
            vocoder_config=vocoder_config_path,
            use_cuda=device.type == "cuda",
        )
        self.speakers = self.synthesizer.tts_model.speaker_manager.speaker_names

    def text2speech(self, text: str, speaker: str):
        """Synthesize ``text`` with ``speaker`` and write it to ``output.wav``.

        Args:
            text: The text to synthesize.
            speaker: Speaker name forwarded to the synthesizer.

        Returns:
            The path of the written audio file, always ``"output.wav"``.
        """
        # NOTE: the output path is fixed, so every call overwrites the
        # previous result.
        output_path = "output.wav"
        wav = self.synthesizer.tts(text, speaker)
        self.synthesizer.save_wav(wav, output_path)
        return output_path

# The language to serve is selected through the ``indic_tts_lang`` environment
# variable and must be one of the codes listed in ``lang_conf``.
indic_tts_lang = getenv("indic_tts_lang")
if indic_tts_lang not in lang_conf:
    # getenv returns None when the variable is unset; fail with an actionable
    # message instead of an opaque ``KeyError: None`` from the dict lookup.
    raise ValueError(
        "Environment variable 'indic_tts_lang' must be set to one of "
        f"{sorted(lang_conf)}, got {indic_tts_lang!r}"
    )
tts_lang_name = lang_conf[indic_tts_lang]
# Prefer the GPU when one is available, otherwise run on the CPU.
tts_model = Indic_TTS(indic_tts_lang, Device("cuda" if cuda_is_available() else "cpu"))

import gradio as gr

# Build and start the web UI: text + speaker in, path to the rendered wav out.
gr.Interface(
    tts_model.text2speech,
    [
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(tts_model.speakers, label="speaker"),
    ],
    gr.Audio(type="filepath", label="Speech"),
    title=f"{tts_lang_name} TTS",
).launch()