"""Synthesize a spoken audio track from a subtitle file using pyttsx3."""
import argparse
import os
import tempfile

from pydub import AudioSegment
from pysubparser import parser
import pyttsx3

# Directory (relative to the current working directory) that receives the
# generated WAV file.
OUTPUT_DIR = "output"

# Names of the voices reported by the TTS engine, in engine order. Kept at
# module level so existing importers of ``engine``/``voices``/``vlist`` keep
# working.
engine = pyttsx3.init()
voices = engine.getProperty('voices')
vlist = [voice.name for voice in voices]


def time_to_ms(time):
    """Convert a time-of-day value to milliseconds since 00:00:00.

    Args:
        time: Object exposing ``hour``, ``minute``, ``second`` and
            ``microsecond`` attributes (e.g. ``datetime.time``).

    Returns:
        The number of milliseconds as a float; the microsecond component
        contributes the fractional part.
    """
    whole_seconds = (time.hour * 60 + time.minute) * 60 + time.second
    return whole_seconds * 1000 + time.microsecond / 1000


def generate_audio(path, rate=200, voice_idx=0):
    """Speak every subtitle in *path* and write the result as one WAV file.

    Each subtitle's text is synthesized with pyttsx3 and appended to a
    single track, padded with silence so that the clip begins at the
    subtitle's start time. The track is written to
    ``output/<voice name>.wav``; the ``output`` directory is created when
    it does not exist.

    Args:
        path: Path of the subtitle file to parse.
        rate: Value assigned to the engine's ``rate`` property.
        voice_idx: Index into the engine's ``voices`` list selecting the
            voice to use.

    Raises:
        IndexError: If *voice_idx* is outside the engine's voice list.
    """
    print("Generating audio file for {} with {}".format(path, "pyttsx3"))

    subtitles = parser.parse(path)

    tts_engine = pyttsx3.init()
    selected_voice = tts_engine.getProperty('voices')[voice_idx]
    tts_engine.setProperty('rate', rate)
    tts_engine.setProperty('voice', selected_voice.id)

    audio_sum = AudioSegment.empty()

    with tempfile.TemporaryDirectory() as tmpdirname:
        print('created temporary directory', tmpdirname)
        # One scratch file is reused for every subtitle; it is loaded into
        # an AudioSegment before the next synthesis overwrites it.
        temp_file_path = os.path.join(tmpdirname, "temp.wav")

        for subtitle in subtitles:
            tts_engine.save_to_file(subtitle.text, temp_file_path)
            tts_engine.runAndWait()
            audio_segment = AudioSegment.from_wav(temp_file_path)

            print(subtitle.start, subtitle.text)

            # Pad from the current end of the track up to the subtitle's
            # start. Measuring against the real track length (rather than
            # the previous subtitle's nominal start) keeps later clips on
            # schedule after a clip overruns; the gap is clamped at zero
            # when the previous clip is still "playing" at this start time.
            silence_duration_ms = max(
                0, time_to_ms(subtitle.start) - len(audio_sum)
            )
            audio_sum = (
                audio_sum
                + AudioSegment.silent(duration=silence_duration_ms)
                + audio_segment
            )

    # Opening the output file fails if its directory is missing, so make
    # sure it exists first.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, f'{selected_voice.name}.wav')
    with open(output_path, 'wb') as out_f:
        audio_sum.export(out_f, format='wav')


def main():
    """Parse command-line options and generate the audio file."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-p", "--path", help="subtitle file path", default="two_single.srt"
    )
    arg_parser.add_argument(
        "-r", "--rate", help="speech rate(words per minute)",
        type=int, default=240,
    )
    arg_parser.add_argument(
        "-v", "--voice-idx", help="voice selection",
        type=int, default=1, choices=[0, 1],
    )
    args = arg_parser.parse_args()

    generate_audio(path=args.path, rate=args.rate, voice_idx=args.voice_idx)


if __name__ == "__main__":
    main()