spark-tts
committed on
Commit
·
e2f41b6
1
Parent(s):
23a5ae6
clean structure
Browse files
- cli/SparkTTS.py +4 -3
- inference.py → cli/inference.py +0 -0
- example/infer.sh +2 -2
- {models → sparktts/models}/audio_tokenizer.py +3 -3
- {models → sparktts/models}/bicodec.py +6 -6
- {modules → sparktts/modules}/blocks/layers.py +0 -0
- {modules → sparktts/modules}/blocks/samper.py +0 -0
- {modules → sparktts/modules}/blocks/vocos.py +0 -0
- {modules → sparktts/modules}/encoder_decoder/feat_decoder.py +2 -2
- {modules → sparktts/modules}/encoder_decoder/feat_encoder.py +2 -2
- {modules → sparktts/modules}/encoder_decoder/wave_generator.py +1 -1
- {modules → sparktts/modules}/fsq/finite_scalar_quantization.py +0 -0
- {modules → sparktts/modules}/fsq/residual_fsq.py +1 -1
- {modules → sparktts/modules}/speaker/ecapa_tdnn.py +1 -1
- {modules → sparktts/modules}/speaker/perceiver_encoder.py +0 -0
- {modules → sparktts/modules}/speaker/pooling_layers.py +0 -0
- {modules → sparktts/modules}/speaker/speaker_encoder.py +3 -3
- {modules → sparktts/modules}/vq/factorized_vector_quantize.py +0 -0
- {utils → sparktts/utils}/__init__.py +0 -0
- {utils → sparktts/utils}/audio.py +0 -0
- {utils → sparktts/utils}/file.py +0 -0
- {utils → sparktts/utils}/parse_options.sh +0 -0
- {utils → sparktts/utils}/token_parser.py +0 -0
cli/SparkTTS.py
CHANGED
@@ -17,9 +17,10 @@ import re
|
|
17 |
import torch
|
18 |
from pathlib import Path
|
19 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
20 |
-
|
21 |
-
from
|
22 |
-
from
|
|
|
23 |
|
24 |
|
25 |
class SparkTTS:
|
|
|
17 |
import torch
|
18 |
from pathlib import Path
|
19 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
20 |
+
|
21 |
+
from sparktts.utils.file import load_config
|
22 |
+
from sparktts.models.audio_tokenizer import BiCodecTokenizer
|
23 |
+
from sparktts.utils.token_parser import TASK_TOKEN_MAP
|
24 |
|
25 |
|
26 |
class SparkTTS:
|
inference.py → cli/inference.py
RENAMED
File without changes
|
example/infer.sh
CHANGED
@@ -33,10 +33,10 @@ prompt_speech_path="example/prompt_audio.wav"
|
|
33 |
# Change directory to the root directory
|
34 |
cd "$root_dir" || exit
|
35 |
|
36 |
-
source utils/parse_options.sh
|
37 |
|
38 |
# Run inference for each JSON file
|
39 |
-
python inference
|
40 |
--text "${text}" \
|
41 |
--device "${device}" \
|
42 |
--save_dir "${save_dir}" \
|
|
|
33 |
# Change directory to the root directory
|
34 |
cd "$root_dir" || exit
|
35 |
|
36 |
+
source sparktts/utils/parse_options.sh
|
37 |
|
38 |
# Run inference for each JSON file
|
39 |
+
python -m cli.inference \
|
40 |
--text "${text}" \
|
41 |
--device "${device}" \
|
42 |
--save_dir "${save_dir}" \
|
{models → sparktts/models}/audio_tokenizer.py
RENAMED
@@ -21,9 +21,9 @@ from pathlib import Path
|
|
21 |
from typing import Any, Dict, Tuple
|
22 |
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
|
23 |
|
24 |
-
from utils.file import load_config
|
25 |
-
from utils.audio import load_audio
|
26 |
-
from models.bicodec import BiCodec
|
27 |
|
28 |
|
29 |
class BiCodecTokenizer:
|
|
|
21 |
from typing import Any, Dict, Tuple
|
22 |
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
|
23 |
|
24 |
+
from sparktts.utils.file import load_config
|
25 |
+
from sparktts.utils.audio import load_audio
|
26 |
+
from sparktts.models.bicodec import BiCodec
|
27 |
|
28 |
|
29 |
class BiCodecTokenizer:
|
{models → sparktts/models}/bicodec.py
RENAMED
@@ -20,12 +20,12 @@ from typing import Dict, Any
|
|
20 |
from omegaconf import DictConfig
|
21 |
from safetensors.torch import load_file
|
22 |
|
23 |
-
from utils.file import load_config
|
24 |
-
from modules.speaker.speaker_encoder import SpeakerEncoder
|
25 |
-
from modules.encoder_decoder.feat_encoder import Encoder
|
26 |
-
from modules.encoder_decoder.feat_decoder import Decoder
|
27 |
-
from modules.encoder_decoder.wave_generator import WaveGenerator
|
28 |
-
from modules.vq.factorized_vector_quantize import FactorizedVectorQuantize
|
29 |
|
30 |
|
31 |
class BiCodec(nn.Module):
|
|
|
20 |
from omegaconf import DictConfig
|
21 |
from safetensors.torch import load_file
|
22 |
|
23 |
+
from sparktts.utils.file import load_config
|
24 |
+
from sparktts.modules.speaker.speaker_encoder import SpeakerEncoder
|
25 |
+
from sparktts.modules.encoder_decoder.feat_encoder import Encoder
|
26 |
+
from sparktts.modules.encoder_decoder.feat_decoder import Decoder
|
27 |
+
from sparktts.modules.encoder_decoder.wave_generator import WaveGenerator
|
28 |
+
from sparktts.modules.vq.factorized_vector_quantize import FactorizedVectorQuantize
|
29 |
|
30 |
|
31 |
class BiCodec(nn.Module):
|
{modules → sparktts/modules}/blocks/layers.py
RENAMED
File without changes
|
{modules → sparktts/modules}/blocks/samper.py
RENAMED
File without changes
|
{modules → sparktts/modules}/blocks/vocos.py
RENAMED
File without changes
|
{modules → sparktts/modules}/encoder_decoder/feat_decoder.py
RENAMED
@@ -19,8 +19,8 @@ import torch.nn as nn
|
|
19 |
|
20 |
from typing import List
|
21 |
|
22 |
-
from modules.blocks.vocos import VocosBackbone
|
23 |
-
from modules.blocks.samper import SamplingBlock
|
24 |
|
25 |
|
26 |
class Decoder(nn.Module):
|
|
|
19 |
|
20 |
from typing import List
|
21 |
|
22 |
+
from sparktts.modules.blocks.vocos import VocosBackbone
|
23 |
+
from sparktts.modules.blocks.samper import SamplingBlock
|
24 |
|
25 |
|
26 |
class Decoder(nn.Module):
|
{modules → sparktts/modules}/encoder_decoder/feat_encoder.py
RENAMED
@@ -19,8 +19,8 @@ import torch.nn as nn
|
|
19 |
|
20 |
from typing import List
|
21 |
|
22 |
-
from modules.blocks.vocos import VocosBackbone
|
23 |
-
from modules.blocks.samper import SamplingBlock
|
24 |
|
25 |
|
26 |
class Encoder(nn.Module):
|
|
|
19 |
|
20 |
from typing import List
|
21 |
|
22 |
+
from sparktts.modules.blocks.vocos import VocosBackbone
|
23 |
+
from sparktts.modules.blocks.samper import SamplingBlock
|
24 |
|
25 |
|
26 |
class Encoder(nn.Module):
|
{modules → sparktts/modules}/encoder_decoder/wave_generator.py
RENAMED
@@ -17,7 +17,7 @@
|
|
17 |
|
18 |
import torch.nn as nn
|
19 |
|
20 |
-
from modules.blocks.layers import (
|
21 |
Snake1d,
|
22 |
WNConv1d,
|
23 |
ResidualUnit,
|
|
|
17 |
|
18 |
import torch.nn as nn
|
19 |
|
20 |
+
from sparktts.modules.blocks.layers import (
|
21 |
Snake1d,
|
22 |
WNConv1d,
|
23 |
ResidualUnit,
|
{modules → sparktts/modules}/fsq/finite_scalar_quantization.py
RENAMED
File without changes
|
{modules → sparktts/modules}/fsq/residual_fsq.py
RENAMED
@@ -10,7 +10,7 @@ from torch.amp import autocast
|
|
10 |
from einx import get_at
|
11 |
from einops import rearrange, reduce, pack, unpack
|
12 |
|
13 |
-
from modules.fsq.finite_scalar_quantization import FSQ
|
14 |
|
15 |
|
16 |
def exists(val):
|
|
|
10 |
from einx import get_at
|
11 |
from einops import rearrange, reduce, pack, unpack
|
12 |
|
13 |
+
from sparktts.modules.fsq.finite_scalar_quantization import FSQ
|
14 |
|
15 |
|
16 |
def exists(val):
|
{modules → sparktts/modules}/speaker/ecapa_tdnn.py
RENAMED
@@ -22,7 +22,7 @@ import torch
|
|
22 |
import torch.nn as nn
|
23 |
import torch.nn.functional as F
|
24 |
|
25 |
-
import modules.speaker.pooling_layers as pooling_layers
|
26 |
|
27 |
|
28 |
class Res2Conv1dReluBn(nn.Module):
|
|
|
22 |
import torch.nn as nn
|
23 |
import torch.nn.functional as F
|
24 |
|
25 |
+
import sparktts.modules.speaker.pooling_layers as pooling_layers
|
26 |
|
27 |
|
28 |
class Res2Conv1dReluBn(nn.Module):
|
{modules → sparktts/modules}/speaker/perceiver_encoder.py
RENAMED
File without changes
|
{modules → sparktts/modules}/speaker/pooling_layers.py
RENAMED
File without changes
|
{modules → sparktts/modules}/speaker/speaker_encoder.py
RENAMED
@@ -17,9 +17,9 @@ import torch
|
|
17 |
import torch.nn as nn
|
18 |
|
19 |
from typing import List, Tuple
|
20 |
-
from modules.fsq.residual_fsq import ResidualFSQ
|
21 |
-
from modules.speaker.ecapa_tdnn import ECAPA_TDNN_GLOB_c512
|
22 |
-
from modules.speaker.perceiver_encoder import PerceiverResampler
|
23 |
|
24 |
"""
|
25 |
x-vector + d-vector
|
|
|
17 |
import torch.nn as nn
|
18 |
|
19 |
from typing import List, Tuple
|
20 |
+
from sparktts.modules.fsq.residual_fsq import ResidualFSQ
|
21 |
+
from sparktts.modules.speaker.ecapa_tdnn import ECAPA_TDNN_GLOB_c512
|
22 |
+
from sparktts.modules.speaker.perceiver_encoder import PerceiverResampler
|
23 |
|
24 |
"""
|
25 |
x-vector + d-vector
|
{modules → sparktts/modules}/vq/factorized_vector_quantize.py
RENAMED
File without changes
|
{utils → sparktts/utils}/__init__.py
RENAMED
File without changes
|
{utils → sparktts/utils}/audio.py
RENAMED
File without changes
|
{utils → sparktts/utils}/file.py
RENAMED
File without changes
|
{utils → sparktts/utils}/parse_options.sh
RENAMED
File without changes
|
{utils → sparktts/utils}/token_parser.py
RENAMED
File without changes
|