|
|
|
import contextlib
import hashlib
import os
import random
import re
from xml.sax.saxutils import escape

import azure.cognitiveservices.speech as speechsdk
|
|
|
# Module-wide Azure Speech configuration shared by every synthesis call.
# The subscription key and region are read from the SPEECH_KEY and
# SPEECH_REGION environment variables (None is passed when one is unset).
speech_config = speechsdk.SpeechConfig(
    subscription=os.getenv('SPEECH_KEY'),
    region=os.getenv('SPEECH_REGION'),
)
|
|
|
def do_cleanup(dir='wavs', num_files=100):
    """Prune a directory so that only its ``num_files`` newest files remain.

    Regular files directly inside ``dir`` are ranked by modification time and
    the oldest ones are deleted until at most ``num_files`` are left.
    Subdirectories are neither counted nor removed.

    Args:
        dir: Directory to prune.
        num_files: Maximum number of files to keep.

    Raises:
        FileNotFoundError: If ``dir`` does not exist.
    """
    with os.scandir(dir) as entries:
        candidates = [
            (entry.stat().st_mtime, entry.path)
            for entry in entries
            if entry.is_file()
        ]

    excess = len(candidates) - num_files
    if excess <= 0:
        return

    # Directory listing order is arbitrary, so sort explicitly (oldest first,
    # path as tie-breaker) to make sure the most recent files survive.
    candidates.sort()
    for _, path in candidates[:excess]:
        try:
            os.remove(path)
        except FileNotFoundError:
            # The file disappeared after it was listed; the goal is met anyway.
            pass
|
|
|
def add_sukun(text):
    """Append a sukun to the final letter of every Arabic word in ``text``.

    The text is split on whitespace. For each word, any run of trailing
    punctuation is set aside; if what remains ends in one of the listed
    Arabic letters (or a shadda), a sukun is inserted right after it and the
    punctuation is put back. Words that end in anything else (Latin letters,
    digits, an existing sukun or other diacritic) are left untouched.

    Args:
        text: Input string.

    Returns:
        The processed words joined by single spaces (so runs of whitespace,
        including newlines, collapse to one space).
    """
    sukun = '\u0652'
    shadda = '\u0651'
    # Characters after which a sukun may be placed.
    sukun_carriers = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي' + shadda
    punctuation = '.,;!?،؛؟'

    def process_word(word):
        # Peel off *all* trailing punctuation so that "word..." or "word?!"
        # is handled the same way as "word." is.
        core = word.rstrip(punctuation)
        if not core or core[-1] not in sukun_carriers:
            return word
        return core + sukun + word[len(core):]

    return ' '.join(process_word(word) for word in re.findall(r'\S+', text))
|
|
|
def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural', lang='ar-SA'):
    """Wrap plain text in an SSML document for the given voice and language.

    ``text`` is treated as plain text: ``&``, ``<`` and ``>`` are escaped so
    that arbitrary input cannot produce malformed SSML or inject markup.
    ``voice`` and ``lang`` are escaped for use inside attribute values.

    Args:
        text: Plain text to be spoken.
        voice: Value of the ``<voice name="...">`` attribute.
        lang: Value used for ``xml:lang`` on both ``<speak>`` and ``<lang>``.

    Returns:
        The SSML document as a single-line string.
    """
    attr_entities = {'"': '&quot;'}
    safe_text = escape(text)
    safe_voice = escape(voice, attr_entities)
    safe_lang = escape(lang, attr_entities)
    return (
        f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{safe_lang}">'
        f'<voice name="{safe_voice}">'
        f'<lang xml:lang="{safe_lang}">{safe_text}</lang>'
        f'</voice></speak>'
    )
|
|
|
|
|
def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    """Synthesize ``input_text`` to a WAV file under ``wavs/`` and return its path.

    The text is first passed through ``add_sukun``. Results are cached on
    disk: the file name is an MD5 digest of the voice, the SSML flag and the
    processed text, so the same request is only synthesized once while
    different voices or modes never share a file.

    Args:
        input_text: Text to speak.
        voice: Voice name. Used in the SSML ``<voice>`` element when
            ``use_ssml`` is true, otherwise set on the shared speech config.
        use_ssml: Send the text wrapped in SSML (via ``get_ssml``) instead of
            as plain text.

    Returns:
        The path ``wavs/<digest>.wav``. If synthesis did not complete, the
        path is still returned, but any file left at that path is removed
        (best effort) so the failure is not served from the cache later.
    """
    input_text = add_sukun(input_text)

    # The digest is only a cache key (not security related). It covers every
    # input that influences the audio, not just the text.
    cache_key = hashlib.md5(
        f"{voice}|{use_ssml}|{input_text}".encode()
    ).hexdigest()
    wav_path = f"wavs/{cache_key}.wav"

    if os.path.exists(wav_path):
        return wav_path

    os.makedirs("wavs", exist_ok=True)

    # Occasionally trim the cache. This runs *before* synthesis so that the
    # file about to be created can never be among the ones deleted.
    if random.randint(1, 50) == 1:
        do_cleanup()

    audio_config = speechsdk.audio.AudioOutputConfig(filename=wav_path)

    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )
    if not use_ssml:
        # Without SSML there is no <voice> element, so the requested voice
        # has to be selected on the configuration itself.
        speech_config.speech_synthesis_voice_name = voice

    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    if use_ssml:
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        result = speech_synthesizer.speak_text_async(input_text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(input_text))
    else:
        if result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
        # The SDK may have left an empty or partial file behind; drop it so
        # the next call retries instead of treating it as a cache hit.
        # Best effort: the file may not exist or may still be held open.
        with contextlib.suppress(OSError):
            os.remove(wav_path)

    return wav_path
|
|