# audio_translation / translate.py
# Provenance (residue of the hosting page this file was copied from):
# user Salman11223, commit ed2fbbc ("Create translate.py"), 9.18 kB.
import os
import moviepy.editor as mp
import assemblyai as aai
import requests
import azure.cognitiveservices.speech as speechsdk
from moviepy.editor import AudioFileClip
from gradio_client import Client
class Translate:
    """Dub a video into another language.

    The pipeline (see :meth:`transcribe_and_translate`) extracts the audio
    track, transcribes it with AssemblyAI, translates every subtitle with the
    Microsoft Translator REST API, synthesizes the translated text with Azure
    Speech and writes ``translated_video.mp4`` with the new audio track.

    All intermediate files are created in the current working directory under
    fixed names (``audio.wav``, ``transcript.srt``, ``t.txt``, ...), so two
    instances must not run concurrently in the same directory.
    """

    # Azure region used for both the Translator and the Speech resources.
    _AZURE_REGION = "southeastasia"

    # Seconds to wait for the Translator HTTP API before giving up.
    _HTTP_TIMEOUT = 30

    # Original-language display name -> language code handed to AssemblyAI.
    _SOURCE_LANGUAGE_CODES = {
        'English': 'en',
        'German': 'de',
        'French': 'fr',
        'Spanish': 'es',
    }

    # Target-language display name ->
    #   (speech locale, translator code, male voice, female voice)
    _TARGET_LANGUAGES = {
        'English': ('en-US', 'en', 'en-US-GuyNeural', 'en-US-AriaNeural'),
        'German': ('de-DE', 'de', 'de-DE-ConradNeural', 'de-DE-KatjaNeural'),
        'French': ('fr-CA', 'fr', 'fr-CA-JeanNeural', 'fr-CA-SylvieNeural'),
        'Spanish': ('es-ES', 'es', 'es-ES-AlvaroNeural', 'es-ES-ElviraNeural'),
        'Urdu': ('ur-PK', 'ur', 'ur-PK-AsadNeural', 'ur-PK-UzmaNeural'),
    }

    def __init__(self, video_path, target_language,original_language,speaking_rate):
        """Store the job parameters and the service credentials.

        Args:
            video_path: Path of the video file to dub.
            target_language: Display name of the language to translate into
                (one of the keys of ``_TARGET_LANGUAGES``).
            original_language: Display name of the language spoken in the
                video (see ``_SOURCE_LANGUAGE_CODES``).
            speaking_rate: Value placed in the SSML ``<prosody rate=...>``
                attribute when synthesizing speech.
        """
        self.video_path = video_path
        self.target_language = target_language
        self.original_language = original_language
        # SECURITY: these credentials are committed in source control. Each
        # one can now be overridden through an environment variable; the
        # literals remain only as fallbacks so existing deployments keep
        # working. They should be rotated and removed from the repository.
        self.aai_api_key = os.environ.get(
            "ASSEMBLYAI_API_KEY", "c29eb650444a4ae4be6a787ebb15d5e2")
        self.txtospech_key = os.environ.get(
            "AZURE_TTS_KEY", "358c77527e48454cbf5bf2bd54f03161")
        self.translation_api_key = os.environ.get(
            "AZURE_TRANSLATOR_KEY", "394833878dd54214886cd81a35ac35dc")
        self.spechtotxt_key = os.environ.get(
            "AZURE_STT_KEY", "07ac642da789462d87ad47a790ec6d5f")
        self.speaking_rate = speaking_rate

    def extract_audio(self):
        """Write the video's audio track to ``audio.wav`` and return that path.

        Raises:
            ValueError: If the video has no audio track.
        """
        aai.settings.api_key = self.aai_api_key
        video = mp.VideoFileClip(self.video_path)
        try:
            audio = video.audio
            if audio is None:
                raise ValueError(f"No audio track found in {self.video_path!r}")
            audio_path = "audio.wav"
            audio.write_audiofile(audio_path)
        finally:
            # Release the ffmpeg reader so the file handles are not leaked.
            video.close()
        print("Audio extracted successfully!")
        return audio_path

    def gender_detection(self, audio_path='audio.wav'):
        """Send an audio file to the hosted gender model and return its answer.

        Args:
            audio_path: File passed to the model's ``/predict`` endpoint.
                Defaults to ``audio.wav``, the file written by
                :meth:`extract_audio`.

        Returns:
            Whatever the remote endpoint returns; the value is compared with
            ``'male'`` in :meth:`set_language_parameters`.
        """
        # NOTE(review): the URL pins a specific Space replica ("wml9f"); it
        # presumably stops resolving when the Space restarts - confirm.
        gender_model_url = "https://salman11223-gender-detection.hf.space/--replicas/wml9f/"
        gender_client = Client(gender_model_url)
        gender = gender_client.predict(
            audio_path, api_name="/predict"
        )
        print(gender)
        return gender

    def org_language_parameters(self,original_language):
        """Set ``self.lan_code`` for the spoken language ('' if unsupported)."""
        self.lan_code = self._SOURCE_LANGUAGE_CODES.get(original_language, '')

    def set_language_parameters(self, target_language, detected_gender):
        """Set the speech locale, translator code and voice for the target.

        Sets ``self.language_code``, ``self.trans_code`` and
        ``self.voice_names``. For an unsupported ``target_language`` all three
        become empty strings.

        Args:
            target_language: Display name of the target language.
            detected_gender: Gender label; ``'male'`` (ignoring case and
                surrounding whitespace) selects the male voice, anything else
                selects the female voice.
        """
        is_male = str(detected_gender).strip().lower() == 'male'
        entry = self._TARGET_LANGUAGES.get(target_language)
        if entry is None:
            # Unsupported language: keep every attribute a string so the
            # getters and the SSML template see a consistent type.
            self.voice_names = ''
            self.language_code = ''
            self.trans_code = ''
        else:
            self.language_code, self.trans_code, male_voice, female_voice = entry
            self.voice_names = male_voice if is_male else female_voice
        print("Target Language:", target_language)
        print("Trans Code:", self.trans_code)

    def get_voice_names(self):
        """Return the voice name chosen by :meth:`set_language_parameters`."""
        return self.voice_names

    def get_language_code(self):
        """Return the speech locale chosen by :meth:`set_language_parameters`."""
        return self.language_code

    def get_audio_duration(self, audio_path):
        """Return the duration of ``audio_path`` as reported by moviepy."""
        audio_clip = AudioFileClip(audio_path)
        try:
            return audio_clip.duration
        finally:
            audio_clip.close()

    def transcribe_audio(self, audio_path):
        """Transcribe ``audio_path`` and write ``transcript.srt`` and ``t.txt``.

        Uses ``self.lan_code`` (set by :meth:`org_language_parameters`); when
        it is empty, automatic language detection is requested instead of
        sending an empty language code.

        Raises:
            RuntimeError: If AssemblyAI reports that the transcription failed.
        """
        aai.settings.api_key = self.aai_api_key
        if self.lan_code:
            config = aai.TranscriptionConfig(language_code=self.lan_code)
        else:
            config = aai.TranscriptionConfig(language_detection=True)
        transcriber = aai.Transcriber(config=config)
        transcript = transcriber.transcribe(audio_path)
        if transcript.status == aai.TranscriptStatus.error:
            raise RuntimeError(f"Transcription failed: {transcript.error}")
        file_path = "transcript.srt"
        filepath = "t.txt"
        # Explicit UTF-8 so non-ASCII transcripts do not depend on the
        # platform's default encoding.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(transcript.export_subtitles_srt())
        with open(filepath, "w", encoding="utf-8") as file:
            file.write(transcript.text or "")

    def generate_ssml(self, text, speaking_rate):
        """Build the SSML document for ``text``.

        The text and all attribute values are XML-escaped so characters such
        as ``&`` or ``<`` in a translation cannot produce invalid SSML.

        Args:
            text: Plain text to be spoken.
            speaking_rate: Value for the ``<prosody rate=...>`` attribute.

        Returns:
            The SSML string using ``self.language_code`` and
            ``self.voice_names``.
        """
        from xml.sax.saxutils import escape, quoteattr

        lang_attr = quoteattr(str(self.language_code))
        voice_attr = quoteattr(str(self.voice_names))
        rate_attr = quoteattr(str(speaking_rate))
        return (
            '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
            f'xml:lang={lang_attr}><voice name={voice_attr}>'
            f'<prosody rate={rate_attr}>{escape(str(text))}</prosody>'
            '</voice></speak>'
        )

    def text_to_speech(self, text, apikey, reggion, out_aud_file, speaking_rate):
        """Synthesize ``text`` into the audio file ``out_aud_file``.

        Args:
            text: Plain text to speak.
            apikey: Azure Speech subscription key.
            reggion: Azure Speech region.
            out_aud_file: Path of the audio file to write.
            speaking_rate: Value for the SSML prosody rate.

        Raises:
            RuntimeError: If the synthesis did not complete.
        """
        ssml = self.generate_ssml(text, speaking_rate)
        speech_config = speechsdk.SpeechConfig(subscription=apikey, region=reggion)
        audio_config = speechsdk.audio.AudioOutputConfig(filename=out_aud_file)
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
        # Fail here with the service's own explanation instead of later when
        # the missing or empty audio file is opened.
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            detail = ""
            if result.reason == speechsdk.ResultReason.Canceled:
                cancellation = result.cancellation_details
                detail = f": {cancellation.reason} {cancellation.error_details}"
            raise RuntimeError(f"Speech synthesis failed for {out_aud_file!r}{detail}")

    def translate_text(self, text):
        """Translate ``text`` into ``self.trans_code`` and return the result.

        Raises:
            requests.HTTPError: If the Translator API answers with an error
                status.
            requests.Timeout: If no answer arrives within ``_HTTP_TIMEOUT``
                seconds.
        """
        base_url = "https://api.cognitive.microsofttranslator.com"
        endpoint = "/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": self.translation_api_key,
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": self._AZURE_REGION,
        }
        params = {
            "api-version": "3.0",
            "to": self.trans_code,
        }
        body = [{"text": text}]
        response = requests.post(
            base_url + endpoint,
            headers=headers,
            params=params,
            json=body,
            timeout=self._HTTP_TIMEOUT,  # never block forever on the network
        )
        response.raise_for_status()
        return response.json()[0]["translations"][0]["text"]

    @staticmethod
    def _parse_srt(srt_content):
        """Split SRT text into ``(sequence_number, timestamp, text)`` tuples.

        Blocks with fewer than three lines or with blank text are skipped.
        """
        entries = []
        for block in srt_content.strip().split('\n\n'):
            lines = block.strip().split('\n')
            if len(lines) < 3:
                continue
            text = '\n'.join(lines[2:])
            if not text.strip():
                continue
            entries.append((lines[0].strip(), lines[1].strip(), text))
        return entries

    def _concatenate_audio(self, clip_paths, output_path):
        """Join the audio files in ``clip_paths`` back-to-back into ``output_path``."""
        # NOTE(review): clips are joined without gaps and the SRT timestamps
        # are not used here, so the speech may drift from the picture -
        # confirm whether alignment to the timestamps is wanted.
        clips = []
        try:
            for path in clip_paths:
                clips.append(mp.AudioFileClip(path))
            mp.concatenate_audioclips(clips).write_audiofile(output_path)
        finally:
            for clip in clips:
                clip.close()

    def _replace_audio_track(self, audio_path, output_video_path):
        """Write the original video with ``audio_path`` as its audio track."""
        video = mp.VideoFileClip(self.video_path)
        try:
            dubbed_audio = mp.AudioFileClip(audio_path)
            try:
                dubbed_video = video.set_audio(dubbed_audio)
                dubbed_video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
            finally:
                dubbed_audio.close()
        finally:
            video.close()

    def transcribe_and_translate(self):
        """Run the whole dubbing pipeline and write ``translated_video.mp4``.

        Temporary files are removed whether or not the run succeeds.

        Returns:
            The path of the dubbed video (``"translated_video.mp4"``).

        Raises:
            ValueError: If the target language is unsupported, the video has
                no audio track, or the transcript contains no subtitles.
            RuntimeError: If transcription or speech synthesis fails.
        """
        # Validate before any paid API call is made.
        if self.target_language not in self._TARGET_LANGUAGES:
            raise ValueError(f"Unsupported target language: {self.target_language!r}")

        try:
            audio_path = self.extract_audio()
            self.org_language_parameters(self.original_language)
            self.transcribe_audio(audio_path)
            gender = self.gender_detection(audio_path)
            print("Detected Gender:", gender)
            self.set_language_parameters(self.target_language,gender)

            with open("transcript.srt", 'r', encoding='utf-8') as srt_file:
                subtitles = self._parse_srt(srt_file.read())
            if not subtitles:
                raise ValueError("Transcript contains no subtitles; nothing to translate.")

            translated = [
                (sequence_number, timestamp, self.translate_text(original_text))
                for sequence_number, timestamp, original_text in subtitles
            ]

            translated_srt_path = "translated_file.srt"
            with open(translated_srt_path, 'w', encoding='utf-8') as srt_file:
                srt_file.write('\n\n'.join(
                    f"{sequence_number}\n{timestamp}\n{translated_text}"
                    for sequence_number, timestamp, translated_text in translated
                ))

            # One synthesized audio file per translated subtitle.
            translated_audio_paths = []
            for sequence_number, _timestamp, translated_text in translated:
                translated_audio_path = f"translated_audio_{sequence_number}.wav"
                self.text_to_speech(translated_text, self.txtospech_key, self._AZURE_REGION, translated_audio_path, self.speaking_rate)
                translated_audio_paths.append(translated_audio_path)

            output_audio_path = "translated_audio.wav"
            self._concatenate_audio(translated_audio_paths, output_audio_path)

            output_video_path = "translated_video.mp4"
            self._replace_audio_track(output_audio_path, output_video_path)
            return output_video_path
        finally:
            # Clean up temporary files even when a step above failed.
            self.cleanup_temp_files()

    def cleanup_temp_files(self):
        """Delete the pipeline's intermediate files from the working directory.

        Removes the fixed-name files plus every ``translated_audio_<N>.wav``
        present, whatever the value of ``N``. Deletion is best-effort: a file
        that cannot be removed is reported and skipped.
        """
        fixed_files = ["audio.wav", "t.txt", "transcript.srt","translated_audio.wav","translated_file.srt"]
        prefix, suffix = "translated_audio_", ".wav"
        numbered = [
            name for name in os.listdir(".")
            if name.startswith(prefix)
            and name.endswith(suffix)
            and name[len(prefix):-len(suffix)].isdecimal()
        ]
        numbered.sort(key=lambda name: int(name[len(prefix):-len(suffix)]))
        for file in fixed_files + numbered:
            try:
                os.remove(file)
            except FileNotFoundError:
                continue
            except OSError as error:
                print(f"Could not delete {file}: {error}")
                continue
            print(f"Deleted {file}")