TArtx committed
Commit 5e385c1 · verified · 1 Parent(s): e231341

Update app.py

Files changed (1):
  1. app.py  +32 -61
app.py CHANGED
@@ -1,87 +1,59 @@
 import gradio as gr
 import torch
-from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
-from string import punctuation
-import re
-import numpy as np  # Ensure NumPy is imported for audio data processing
-
 from parler_tts import ParlerTTSForConditionalGeneration
-from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
+from transformers import AutoTokenizer, set_seed
+import numpy as np

-# Set device to CPU only
-device = "cpu"
+# Set device
+device = "cuda:0" if torch.cuda.is_available() else "cpu"

-# Load Mini model and associated components with low memory usage
-repo_id = "TArtx/parler-tts-mini-v1-finetuned-12"
-model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+# Load model and tokenizer
+model = ParlerTTSForConditionalGeneration.from_pretrained("TArtx/parler-tts-mini-v1-finetuned-12").to(device)
 tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
-feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")

 # Constants
-SAMPLE_RATE = feature_extractor.sampling_rate
+SAMPLE_RATE = model.config.sampling_rate
 SEED = 42

-# Default input text and description
+# Default inputs
 default_text = "This is a demonstration of my ability to convert written words into spoken language, seamlessly and naturally. As a text-to-speech model, my goal is to sound as clear and engaging as a human, making sure every word I say leaves an impression."
 default_description = "moderate speed, very clear, monotone, wonderful speech quality"

-# Number normalizer
-number_normalizer = EnglishNumberNormalizer()
-
-# Preprocessing function
-def preprocess(text):
-    text = number_normalizer(text).strip()
-    text = text.replace("-", " ")
-    if text[-1] not in punctuation:
-        text = f"{text}."
-    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
-
-    def separate_abb(chunk):
-        chunk = chunk.replace(".", "")
-        return " ".join(chunk)
-
-    abbreviations = re.findall(abbreviations_pattern, text)
-    for abv in abbreviations:
-        if abv in text:
-            text = text.replace(abv, separate_abb(abv))
-    return text
-
 # TTS generation function
 def gen_tts(text, description):
     try:
-        # Tokenize inputs and prompts with truncation to avoid memory issues
-        inputs = tokenizer(description.strip(), return_tensors="pt", truncation=True, max_length=128).to(device)
-        prompt = tokenizer(preprocess(text), return_tensors="pt", truncation=True, max_length=128).to(device)
-
+        # Set seed for reproducibility
         set_seed(SEED)
+
+        # Prepare inputs
+        input_ids = tokenizer(description.strip(), return_tensors="pt").input_ids.to(device)
+        prompt_input_ids = tokenizer(text.strip(), return_tensors="pt").input_ids.to(device)
+
+        # Generate audio
         generation = model.generate(
-            input_ids=inputs.input_ids,
-            prompt_input_ids=prompt.input_ids,
-            attention_mask=inputs.attention_mask,
-            prompt_attention_mask=prompt.prompt_attention_mask,
+            input_ids=input_ids,
+            prompt_input_ids=prompt_input_ids,
             do_sample=True,
-            temperature=1.0,
+            temperature=0.7
         )

-        # Inspect the raw audio generation output
-        print(f"Generated audio shape: {generation.shape}")
-        print(f"Generated audio values: {generation.cpu().numpy().squeeze()}")
-
-        # Check if there are any meaningful values in the audio output
+        # Convert to numpy array
         audio_arr = generation.cpu().numpy().squeeze()
-        if np.all(audio_arr == 0):
-            raise ValueError("Generated audio is empty or silent.")

-        # Normalize the audio array to the range [-1, 1]
-        audio_arr = audio_arr / np.max(np.abs(audio_arr))
-
-        # Convert the audio to 16-bit PCM (int16 format)
-        audio_arr = (audio_arr * np.iinfo(np.int16).max).astype(np.int16)
+        # Normalize audio
+        if np.max(np.abs(audio_arr)) > 0:
+            audio_arr = audio_arr / np.max(np.abs(audio_arr))
+            audio_arr = (audio_arr * np.iinfo(np.int16).max).astype(np.int16)
+        else:
+            # Fallback to white noise if generation fails
+            audio_arr = np.random.randint(-32768, 32767, SAMPLE_RATE * 10, dtype=np.int16)

-        return SAMPLE_RATE, audio_arr  # Return sample rate and audio array
+        return SAMPLE_RATE, audio_arr
+
     except Exception as e:
         print(f"Error in TTS generation: {str(e)}")
-        return SAMPLE_RATE, np.zeros((SAMPLE_RATE,))  # Return silence in case of error
+        # Return white noise as fallback
+        return SAMPLE_RATE, np.random.randint(-32768, 32767, SAMPLE_RATE * 10, dtype=np.int16)

 # Gradio interface
 with gr.Blocks() as block:
@@ -100,9 +72,8 @@ with gr.Blocks() as block:
     audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

     inputs = [input_text, description]
-    outputs = audio_out  # Only output the audio component
-    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
+    outputs = audio_out
+    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs)

 # Launch the interface
-block.queue()
-block.launch()
+block.launch(debug=True)
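For anyone who wants to exercise the updated gen_tts outside the Gradio UI, here is a minimal sketch. It assumes the revised app.py is importable as a module named app and that the soundfile package is installed; the module name and the output path sample.wav are illustrative assumptions, not part of this commit.

# Minimal usage sketch: call gen_tts directly and write the result to disk.
# Assumes app.py (as updated above) is on the import path as `app`; importing
# it loads the model, so the first call may take a while.
import soundfile as sf
from app import gen_tts, default_text, default_description

sample_rate, audio = gen_tts(default_text, default_description)  # (sampling rate, int16 PCM array)
sf.write("sample.wav", audio, sample_rate)  # soundfile infers 16-bit PCM from the int16 dtype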