vitaliy-sharandin
committed on
Commit
•
ef230f3
1
Parent(s):
31bd8c2
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,6 @@ import soundfile as sf
|
|
12 |
import librosa
|
13 |
from TTS.api import TTS
|
14 |
|
15 |
-
os.environ["COQUI_TOS_AGREED"] = "1"
|
16 |
HF_TOKEN = os.environ["HF_TOKEN"]
|
17 |
DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
|
18 |
|
@@ -132,7 +131,7 @@ def translate_transcript(transcript, target_language, deepl_token):
|
|
132 |
def adjust_voice_pace(sound_array, sample_rate, target_duration):
    """Time-stretch audio so that it fits within ``target_duration`` seconds.

    Args:
        sound_array: 1-D sequence of audio samples (list or ``np.ndarray``).
        sample_rate: Sample rate of ``sound_array`` in Hz.
        target_duration: Desired output duration in seconds.

    Returns:
        ``np.ndarray`` of samples stretched by ``duration / target_duration``.
    """
    duration = len(sound_array) / sample_rate
    tempo_change = duration / target_duration
    # pyrubberband's time_stretch expects an ndarray; coerce explicitly so a
    # plain Python list (as returned by some TTS backends) does not crash it.
    sound_array_stretched = pyrb.time_stretch(np.array(sound_array), sample_rate, tempo_change)
    return sound_array_stretched
|
137 |
|
138 |
|
@@ -186,23 +185,20 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
|
|
186 |
sample_rate = None
|
187 |
audio = None
|
188 |
if 'vits' in selected_model:
|
189 |
-
audio = tts.
|
190 |
-
sample_rate = tts.
|
191 |
else:
|
192 |
-
|
193 |
-
|
194 |
-
audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
|
195 |
-
sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
|
196 |
-
del tts; import gc; gc.collect(); torch.cuda.empty_cache()
|
197 |
|
198 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
199 |
audio_duration = len(audio) / sample_rate
|
200 |
if speech_item_duration < audio_duration:
|
201 |
audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
|
202 |
-
|
203 |
# Resample to higher rate
|
204 |
new_sample_rate = 44100
|
205 |
-
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate)
|
206 |
|
207 |
# Transform to AudioArrayClip object
|
208 |
audio = np.expand_dims(audio, axis=1)
|
|
|
12 |
import librosa
|
13 |
from TTS.api import TTS
|
14 |
|
|
|
15 |
HF_TOKEN = os.environ["HF_TOKEN"]
|
16 |
DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
|
17 |
|
|
|
131 |
def adjust_voice_pace(sound_array, sample_rate, target_duration):
    """Stretch audio in time so its length approaches ``target_duration`` seconds.

    The stretch ratio is the current duration divided by the target duration,
    so audio longer than the target is sped up proportionally.

    Args:
        sound_array: 1-D audio samples (list or ``np.ndarray``).
        sample_rate: Sample rate in Hz.
        target_duration: Target duration in seconds.

    Returns:
        ``np.ndarray`` with the time-stretched samples.
    """
    current_duration = len(sound_array) / sample_rate
    stretch_ratio = current_duration / target_duration
    return pyrb.time_stretch(np.array(sound_array), sample_rate, stretch_ratio)
|
136 |
|
137 |
|
|
|
185 |
sample_rate = None
|
186 |
audio = None
|
187 |
if 'vits' in selected_model:
|
188 |
+
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
|
189 |
+
sample_rate = tts.synthesizer.output_sample_rate
|
190 |
else:
|
191 |
+
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
|
192 |
+
sample_rate = tts.synthesizer.output_sample_rate
|
|
|
|
|
|
|
193 |
|
194 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
195 |
audio_duration = len(audio) / sample_rate
|
196 |
if speech_item_duration < audio_duration:
|
197 |
audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
|
198 |
+
|
199 |
# Resample to higher rate
|
200 |
new_sample_rate = 44100
|
201 |
+
audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
|
202 |
|
203 |
# Transform to AudioArrayClip object
|
204 |
audio = np.expand_dims(audio, axis=1)
|