import soundfile
import pyrubberband
import configparser
import pathlib
import os
import io
from Scripts.shared_imports import *
import Scripts.TTS as TTS
from Scripts.utils import parseBool
from pydub import AudioSegment
from pydub.silence import detect_leading_silence
import langcodes
# Set working folder
workingFolder = "workingFolder"
def trim_clip(inputSound):
    trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x):]
    trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
    strip_silence: AudioSegment = lambda x: trim_trailing_silence(trim_leading_silence(x))
    strippedSound = strip_silence(inputSound)
    return strippedSound
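
# --- Example (illustrative sketch, not called anywhere in the pipeline) ---
# Minimal usage of trim_clip, assuming a hypothetical file 'example_clip.wav' exists.
def _example_trim_clip():
    clip = AudioSegment.from_file("example_clip.wav", format="wav")  # Hypothetical input file
    trimmed = trim_clip(clip)  # Leading and trailing silence removed
    print(f"Before: {clip.duration_seconds:.2f}s  After: {trimmed.duration_seconds:.2f}s")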
# Function to insert audio into canvas at specific point
def insert_audio(canvas, audioToOverlay, startTimeMs):
    # Create a copy of the canvas
    canvasCopy = canvas
    # Overlay the audio onto the copy
    canvasCopy = canvasCopy.overlay(audioToOverlay, position=int(startTimeMs))
    # Return the copy
    return canvasCopy
# Function to create a canvas of a specific duration in milliseconds
def create_canvas(canvasDuration, frame_rate=int(config['synth_sample_rate'])):
    canvas = AudioSegment.silent(duration=canvasDuration, frame_rate=frame_rate)
    return canvas
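
# --- Example (illustrative sketch, not called anywhere in the pipeline) ---
# How create_canvas and insert_audio work together: a 10 second silent canvas with a
# clip overlaid starting at the 5 second mark. 'clip' can be any pydub AudioSegment.
def _example_canvas_overlay(clip):
    canvas = create_canvas(10000)  # 10,000 ms of silence at the configured sample rate
    combined = insert_audio(canvas, clip, 5000)  # AudioSegments are immutable, so 'canvas' itself is unchanged
    return combined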
def get_speed_factor(subsDict, trimmedAudio, desiredDuration, num):
    virtualTempFile = AudioSegment.from_file(trimmedAudio, format="wav")
    rawDuration = virtualTempFile.duration_seconds
    trimmedAudio.seek(0) # Must reset the file pointer to the start, otherwise later reads of the virtual file will fail
    # Calculate the speed factor and store it in the dictionary
    desiredDuration = float(desiredDuration)
    speedFactor = (rawDuration * 1000) / desiredDuration
    subsDict[num]['speed_factor'] = speedFactor
    return subsDict
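
# --- Example (illustrative sketch, not called anywhere in the pipeline) ---
# Worked example of the speed factor math: a trimmed clip of 2.5 s that must fit a 2000 ms
# subtitle slot gets speedFactor = (2.5 * 1000) / 2000 = 1.25, i.e. played 25% faster.
# The in-memory WAV below is built the same way build_audio does it; the sample dictionary
# only contains the key this module actually reads.
def _example_get_speed_factor():
    clip = AudioSegment.silent(duration=2500)  # Stands in for a trimmed TTS clip
    virtualFile = io.BytesIO()
    clip.export(virtualFile, format="wav")
    virtualFile.seek(0)
    sampleSubsDict = {1: {'duration_ms': 2000}}  # The time slot the clip must fit into
    sampleSubsDict = get_speed_factor(sampleSubsDict, virtualFile, 2000, num=1)
    print(sampleSubsDict[1]['speed_factor'])  # Approximately 1.25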
def stretch_audio(audioFileToStretch, speedFactor, num):
    virtualTempAudioFile = io.BytesIO()
    # Read the audio, stretch it with rubberband, then write it to an in-memory WAV
    y, sampleRate = soundfile.read(audioFileToStretch)
    stretched_audio = pyrubberband.time_stretch(y, sampleRate, speedFactor, rbargs={'--fine': '--fine'}) # rbargs must be a dictionary of option/value pairs, hence the odd-looking key/value
    #soundfile.write(f'{workingFolder}\\temp_stretched.wav', stretched_audio, sampleRate)
    soundfile.write(virtualTempAudioFile, stretched_audio, sampleRate, format='wav')
    if config['debug_mode']:
        soundfile.write(os.path.join(workingFolder, f'{num}_s.wav'), stretched_audio, sampleRate) # For debugging, saves the stretched audio files
    #return AudioSegment.from_file(f'{workingFolder}\\temp_stretched.wav', format="wav")
    return AudioSegment.from_file(virtualTempAudioFile, format="wav")
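
# --- Example (illustrative sketch, not called anywhere in the pipeline) ---
# Stretching an in-memory clip by a factor of 1.25 (a factor greater than 1 makes the clip
# shorter and faster). Assumes the Rubber Band command line tool that pyrubberband shells
# out to is installed, and that debug_mode is off (otherwise workingFolder must already exist).
def _example_stretch_audio():
    clip = AudioSegment.silent(duration=2500, frame_rate=int(config['synth_sample_rate']))
    virtualFile = io.BytesIO()
    clip.export(virtualFile, format="wav")
    virtualFile.seek(0)
    stretched = stretch_audio(virtualFile, speedFactor=1.25, num='example')
    print(f"{clip.duration_seconds:.2f}s -> {stretched.duration_seconds:.2f}s")  # Roughly 2.5s -> 2.0s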
def build_audio(subsDict, langDict, totalAudioLength, twoPassVoiceSynth=False):
    if cloudConfig['tts_service'] == 'azure':
        twoPassVoiceSynth = False # Azure doesn't need two pass voice synth, so disable it

    virtualTrimmedFileDict = {}
    # First trim silence off the audio files
    for key, value in subsDict.items():
        filePathTrimmed = os.path.join(workingFolder, str(key)) + "_t.wav"
        subsDict[key]['TTS_FilePath_Trimmed'] = filePathTrimmed

        # Trim the clip and re-write file
        rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(config['synth_sample_rate']))
        trimmedClip = trim_clip(rawClip)
        if config['debug_mode']:
            trimmedClip.export(filePathTrimmed, format="wav")

        # Create virtual file in dictionary with audio to be read later
        tempTrimmedFile = io.BytesIO()
        trimmedClip.export(tempTrimmedFile, format="wav")
        virtualTrimmedFileDict[key] = tempTrimmedFile
        keyIndex = list(subsDict.keys()).index(key)
        print(f" Trimmed Audio: {keyIndex + 1} of {len(subsDict)}", end="\r")
    print("\n")
    # Calculate speed factors if necessary. Azure doesn't need this, so skip it
    if not cloudConfig['tts_service'] == 'azure':
        # Calculate speed factors for each clip, aka how much to stretch the audio
        for key, value in subsDict.items():
            # subsDict = get_speed_factor(subsDict, value['TTS_FilePath_Trimmed'], value['duration_ms'], num=key)
            subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
            keyIndex = list(subsDict.keys()).index(key)
            print(f" Calculated Speed Factor: {keyIndex + 1} of {len(subsDict)}", end="\r")
        print("\n")
    # If two pass voice synth is enabled, have the API re-synthesize the clips at the new speed
    # Azure allows direct specification of audio duration, so no need to re-synthesize
    if twoPassVoiceSynth and not cloudConfig['tts_service'] == 'azure':
        if cloudConfig['batch_tts_synthesize'] and cloudConfig['tts_service'] == 'azure':
            subsDict = TTS.synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=config['skip_synthesize'], secondPass=True)
        else:
            subsDict = TTS.synthesize_dictionary(subsDict, langDict, skipSynthesize=config['skip_synthesize'], secondPass=True)

        for key, value in subsDict.items():
            # Trim the clip and re-write file
            rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(config['synth_sample_rate']))
            trimmedClip = trim_clip(rawClip)
            if config['debug_mode']:
                # Remove '.wav' from the end of the file path
                secondPassTrimmedFile = value['TTS_FilePath_Trimmed'][:-4] + "_p2_t.wav"
                trimmedClip.export(secondPassTrimmedFile, format="wav")
            trimmedClip.export(virtualTrimmedFileDict[key], format="wav")
            keyIndex = list(subsDict.keys()).index(key)
            print(f" Trimmed Audio (2nd Pass): {keyIndex + 1} of {len(subsDict)}", end="\r")
        print("\n")

        if config['force_stretch_with_twopass']:
            for key, value in subsDict.items():
                subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
                keyIndex = list(subsDict.keys()).index(key)
                print(f" Calculated Speed Factor (2nd Pass): {keyIndex + 1} of {len(subsDict)}", end="\r")
            print("\n")
    # Create the canvas to overlay audio onto
    canvas = create_canvas(totalAudioLength)

    # Stretch audio and insert into canvas
    for key, value in subsDict.items():
        if (not twoPassVoiceSynth or config['force_stretch_with_twopass']) and not cloudConfig['tts_service'] == 'azure':
            # stretchedClip = stretch_audio(value['TTS_FilePath_Trimmed'], speedFactor=subsDict[key]['speed_factor'], num=key)
            stretchedClip = stretch_audio(virtualTrimmedFileDict[key], speedFactor=subsDict[key]['speed_factor'], num=key)
        else:
            # stretchedClip = AudioSegment.from_file(value['TTS_FilePath_Trimmed'], format="wav")
            stretchedClip = AudioSegment.from_file(virtualTrimmedFileDict[key], format="wav")
            virtualTrimmedFileDict[key].seek(0) # Not 100% sure this is necessary, but it matches how the virtual file is handled elsewhere

        canvas = insert_audio(canvas, stretchedClip, value['start_ms'])
        keyIndex = list(subsDict.keys()).index(key)
        print(f" Final Audio Processed: {keyIndex + 1} of {len(subsDict)}", end="\r")
    print("\n")
    # Use the video file name in the name of the output file, along with the language name and code
    lang = langcodes.get(langDict['languageCode'])
    langName = langcodes.get(lang.to_alpha3()).display_name()
    if config['debug_mode'] and not os.path.isfile(ORIGINAL_VIDEO_PATH):
        outputFileName = "debug" + f" - {langName} - {langDict['languageCode']}."
    else:
        outputFileName = pathlib.Path(ORIGINAL_VIDEO_PATH).stem + f" - {langName} - {langDict['languageCode']}."
    # Set output path
    outputFileName = os.path.join(OUTPUT_FOLDER, outputFileName)

    # Determine the string to use for the output format and file extension based on the config setting
    outputFormat = config['output_format'].lower()
    if outputFormat == "mp3":
        outputFileName += "mp3"
        formatString = "mp3"
    elif outputFormat == "wav":
        outputFileName += "wav"
        formatString = "wav"
    elif outputFormat == "aac":
        #outputFileName += "m4a"
        #formatString = "mp4" # "mp4" with an "m4a" extension would also work
        outputFileName += "aac"
        formatString = "adts" # Pydub doesn't accept "aac" as a format, so use "adts" with the ".aac" file extension

    canvas = canvas.set_channels(2) # Change from mono to stereo
    try:
        print("\nExporting audio file...")
        canvas.export(outputFileName, format=formatString, bitrate="192k")
    except:
        outputFileName = outputFileName + ".bak"
        canvas.export(outputFileName, format=formatString, bitrate="192k")
        print("\nThere was an issue exporting the audio, it might be a permission error. The file was saved as a backup with the extension .bak")
        print("Try removing the .bak extension, then listen to the file to see if it worked.\n")
        input("Press Enter to exit...")

    return subsDict
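
# --- Example (illustrative sketch, not called anywhere in the pipeline) ---
# Rough shape of how the rest of the program is expected to call build_audio, assuming the
# TTS step already wrote 'workingFolder/1.mp3' and 'workingFolder/2.mp3'. The fields shown
# ('TTS_FilePath', 'duration_ms', 'start_ms') are the ones this module reads; the real
# subsDict and langDict carry additional keys used by other modules, and ORIGINAL_VIDEO_PATH
# and OUTPUT_FOLDER are expected to be set up by the main program via shared_imports.
def _example_build_audio():
    sampleSubsDict = {
        1: {'TTS_FilePath': os.path.join(workingFolder, '1.mp3'), 'start_ms': 0, 'duration_ms': 2000},
        2: {'TTS_FilePath': os.path.join(workingFolder, '2.mp3'), 'start_ms': 2500, 'duration_ms': 1800},
    }
    sampleLangDict = {'languageCode': 'es'}  # Target language tag
    totalAudioLength = 5000  # Length of the final audio track in milliseconds
    return build_audio(sampleSubsDict, sampleLangDict, totalAudioLength, twoPassVoiceSynth=False)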