OpenVoiceV2

Running

App Files Files Community

OpenVoiceV2 / app.py

XuminYu

update

18e4ba8 8 months ago

raw

history blame

12.4 kB

	import os
	import gradio as gr
	import requests
	import langid
	import base64
	import json
	import time
	import re
	import hashlib
	import hash_code_for_cached_output


	# Endpoint of the remote synthesis service, read from the environment.
	# `os.environ.get` yields None when the variable is not set.
	API_URL = os.environ.get("API_URL")

	# Language codes that the demo accepts for the input text.
	supported_languages = ['zh', 'en', 'ja', 'ko', 'es', 'fr']

	# Style identifiers allowed for each language code.
	# Every value is a list so that `style in supported_styles[lang]` is an
	# exact-match membership test. With a bare string the same expression is a
	# substring test (e.g. "zh" in "zh_default" is True).
	supported_styles = {
	    'zh': ["zh_default"],
	    'en': [
	        "en_default",
	        "en_us",
	        "en_br",
	        "en_au",
	        "en_in",
	    ],
	    "es": ["es_default"],
	    "fr": ["fr_default"],
	    "ja": ["jp_default"],
	    "ko": ["kr_default"],
	}

	# Directory that receives synthesised audio; created at import time.
	output_dir = 'outputs'
	os.makedirs(output_dir, exist_ok=True)

	def audio_to_base64(audio_file):
	    """Read the file at `audio_file` and return its bytes as a base64 string.

	    The file is opened in binary mode and the base64 output is decoded as
	    UTF-8, so the result is a plain `str`.
	    """
	    with open(audio_file, "rb") as handle:
	        raw_bytes = handle.read()
	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode("utf-8")

	def count_chars_words(sentence):
	    """Return a length measure for text that may mix Chinese and other scripts.

	    The sentence is split into runs: a run of characters in the range
	    U+4E00-U+9FA5 contributes one unit per character, and any other run of
	    word characters contributes one unit. Punctuation and whitespace are
	    ignored.

	    Args:
	        sentence: The text to measure.

	    Returns:
	        The number of Chinese characters plus the number of other words.
	    """
	    # The two alternatives must be separated by an unescaped `|`. With `\|`
	    # the pattern only matches text containing a literal pipe character, so
	    # ordinary sentences would always measure 0.
	    segments = re.findall(r'[\u4e00-\u9fa5]+|\w+', sentence)

	    total = 0
	    for segment in segments:
	        if re.match(r'[\u4e00-\u9fa5]', segment):
	            # A run matched by the first alternative: count every character.
	            total += len(segment)
	        else:
	            # A `\w+` run never contains whitespace, so it is exactly one word.
	            total += 1
	    return total

	def predict(prompt, style, audio_file_pth, speed, agree):
	    """Synthesise `prompt` with the voice of the reference audio via the remote API.

	    Args:
	        prompt: Text to synthesise.
	        style: Style identifier; sent to the API as its "language" field.
	        audio_file_pth: Path of the reference speaker audio file.
	        speed: Forwarded unchanged as the API's "speed" field.
	        agree: Whether the user accepted the terms.

	    Returns:
	        A 3-tuple `(info_text, synthesised_audio_path, reference_audio_path)`.
	        Both paths are None whenever the request is rejected or fails.
	    """
	    # Local import: only needed to name the per-request output file.
	    import uuid

	    # initialize an empty info
	    text_hint = ''

	    # agree with the terms
	    if not agree:
	        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
	        gr.Warning("Please accept the Terms & Condition!")
	        return (text_hint, None, None)

	    # Without a reference file the later `open()` would raise; report it instead.
	    if not audio_file_pth:
	        text_hint += '[ERROR] Please provide a reference audio!\n'
	        gr.Warning("Please provide a reference audio!")
	        return (text_hint, None, None)

	    # Before we get into inference, we detect whether the request matches an
	    # example / default value. If so, a cached audio is returned. This is only
	    # for demo efficiency.
	    # The hash codes were generated by `hash_code_for_cached_output.py`.
	    cached_outputs = {
	        "d0f5806f6e_60565a5c20_en_us": "cached_outputs/0.wav",
	        "d0f5806f6e_420ab8211d_en_us": "cached_outputs/1.wav",
	        "6e8a024342_0f96bf44f5_es_default": "cached_outputs/2.wav",
	        "54ad3237d7_3fef5adc6f_zh_default": "cached_outputs/3.wav",
	        "8190e911f8_9897b60a4e_jp_default": "cached_outputs/4.wav",
	    }
	    unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style)
	    print("audio_file_pth is", audio_file_pth)
	    print("unique_code is", unique_code)
	    if unique_code in cached_outputs:
	        return (
	            'We get the cached output for you, since you are try to generating an example cloning.',
	            cached_outputs[unique_code],
	            audio_file_pth,
	        )

	    # first detect the input language
	    language_predicted = langid.classify(prompt)[0].strip()
	    print(f"Detected language:{language_predicted}")

	    if language_predicted not in supported_languages:
	        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
	        gr.Warning(
	            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
	        )
	        return (text_hint, None, None)

	    # check the style (a mismatch only warns; the request still goes ahead)
	    allowed_styles = supported_styles[language_predicted]
	    if isinstance(allowed_styles, str):
	        # A bare string would turn `in` into a substring test; compare exactly.
	        allowed_styles = [allowed_styles]
	    if style not in allowed_styles:
	        text_hint += f"[Warming] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior.\n"
	        gr.Warning(f"[Warming] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior.")

	    prompt_length = count_chars_words(prompt)

	    speaker_wav = audio_file_pth

	    if prompt_length < 2:
	        text_hint += f"[ERROR] Please give a longer prompt text \n"
	        gr.Warning("Please give a longer prompt text")
	        return (text_hint, None, None)
	    if prompt_length > 50:
	        text_hint += f"[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749 \n"
	        gr.Warning(
	            "Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749"
	        )
	        return (text_hint, None, None)

	    # One file per request: the queue serves several requests at once, and a
	    # single shared file name would let them overwrite each other's audio.
	    # NOTE: files are not removed here, so `output_dir` grows over time.
	    save_path = f'{output_dir}/output_{uuid.uuid4().hex}.wav'
	    speaker_audio_base64 = audio_to_base64(speaker_wav)
	    if style == 'en_us':  # we update us accent
	        style = 'en_newest'
	    data = {
	        "text": prompt,
	        "reference_speaker": speaker_audio_base64,
	        "language": style,
	        "speed": speed,
	    }

	    start = time.time()
	    # Send the data as a POST request. Connection errors, timeouts and an
	    # invalid/unset API_URL all surface as requests.RequestException.
	    try:
	        response = requests.post(API_URL, json=data, timeout=60)
	    except requests.RequestException as exc:
	        text_hint += f"[HTTP ERROR] {exc} \n"
	        gr.Warning(f"[HTTP ERROR] {exc} \n")
	        return (text_hint, None, None)
	    print(f'Get response successfully within {time.time() - start}')

	    # Check the response
	    if response.status_code != 200:
	        text_hint += f"[HTTP ERROR] {response.status_code} - {response.text} \n"
	        gr.Warning(
	            f"[HTTP ERROR] {response.status_code} - {response.text} \n"
	        )
	        return (text_hint, None, None)

	    # A 200 response is either a JSON object carrying an "error" field or the
	    # audio bytes themselves. Only the parse step is guarded; both
	    # JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
	    try:
	        json_data = json.loads(response.content)
	    except ValueError:
	        json_data = None

	    if isinstance(json_data, dict) and 'error' in json_data:
	        text_hint += f"[ERROR] {json_data['error']} \n"
	        gr.Warning(
	            f"[ERROR] {json_data['error']} \n"
	        )
	        return (text_hint, None, None)

	    # Anything else is treated as audio and written to disk as-is.
	    with open(save_path, 'wb') as f:
	        f.write(response.content)

	    text_hint += f'''Get response successfully \n'''
	    return (
	        text_hint,
	        save_path,
	        speaker_wav,
	    )


	# Display name of the demo.
	title = "MyShell OpenVoice V2"

	# Markdown paragraph describing the V1 release.
	description = """
	In December 2023, we released [OpenVoice V1](https://huggingface.co/spaces/myshell-ai/OpenVoice), an instant voice cloning approach that replicates a speaker's voice and generates speech in multiple languages using only a short audio clip. OpenVoice V1 enables granular control over voice styles, replicates the tone color of the reference speaker and achieves zero-shot cross-lingual voice cloning.
	"""

	# Markdown paragraph and bullet list describing what V2 adds on top of V1.
	description_v2 = """
	In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
	- Better Audio Quality. OpenVoice V2 adopts a different training strategy that delivers better audio quality.
	- Native Multi-lingual Support. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
	- Free Commercial Use. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
	"""

	# Three-column Markdown table with links to the repository, the project page
	# and the community server.
	# Column separators are plain `|`: a backslash-escaped pipe is an invalid
	# escape sequence in a normal string literal and is rendered by Markdown as a
	# literal pipe instead of a cell boundary.
	markdown_table = """
	<div align="center" style="margin-bottom: 10px;">

	| | | |
	| :-----------: | :-----------: | :-----------: |
	| OpenSource Repo | Project Page | Join the Community |
	| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |

	</div>
	"""

	# Same links laid out as two stacked tables: a four-column row for the
	# repository and project page, then a two-column row for the community link.
	markdown_table_v2 = """
	<div align="center" style="margin-bottom: 2px;">

	| | | | |
	| :-----------: | :-----------: | :-----------: | :-----------: |
	| Github Repo | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | Project Page | [OpenVoice](https://research.myshell.ai/open-voice) |

	| | |
	| :-----------: | :-----------: |
	| Join the Community | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |

	</div>
	"""
	# HTML notice pointing users to the QnA page and the self-hosting notebook.
	content = """
	<div>
	<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>If you want to deploy the model by yourself and perform inference, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb'>this jupyter notebook</a>.</strong>
	</div>
	"""
	# The notice wrapped in a bordered, padded box.
	wrapped_markdown_content = "".join(
	    (
	        "<div style='border: 1px solid #000; padding: 10px;'>",
	        content,
	        "</div>",
	    )
	)


	# Example rows for the demo: [prompt text, style, reference audio path, agree].
	# Every row has the "agree" flag set to True.
	examples = [
	    [text, style_name, reference_audio, True]
	    for text, style_name, reference_audio in (
	        (
	            "Did you ever hear a folk tale about a giant turtle?",
	            "en_us",
	            "examples/speaker0.mp3",
	        ),
	        (
	            "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
	            "es_default",
	            "examples/speaker1.mp3",
	        ),
	        (
	            "我最近在学习machine learning，希望能够在未来的artificial intelligence领域有所建树。",
	            "zh_default",
	            "examples/speaker2.mp3",
	        ),
	        (
	            "彼は毎朝ジョギングをして体を健康に保っています。",
	            "jp_default",
	            "examples/speaker3.mp3",
	        ),
	    )
	]

	# Speed passed to predict() for requests coming from this UI, which has no
	# speed control.
	# NOTE(review): 1.0 is assumed to be the backend's normal speaking rate - confirm.
	DEFAULT_SPEED = 1.0


	def _predict_from_ui(prompt, style, audio_file_pth, agree):
	    """Adapt the four UI inputs to predict(), which also takes a speed.

	    predict() has five positional parameters (prompt, style, audio_file_pth,
	    speed, agree). Wiring the four components to it directly would put the
	    checkbox value into `speed` and leave `agree` unfilled.
	    """
	    return predict(prompt, style, audio_file_pth, DEFAULT_SPEED, agree)


	# NOTE(review): the nesting below is reconstructed from a flattened source -
	# confirm the layout against the running app.
	with gr.Blocks(analytics_enabled=False) as demo:

	    # Header: logo, link table and V1 description on the left, video on the right.
	    with gr.Row():
	        with gr.Column():
	            with gr.Row():
	                gr.Markdown(
	                    """
	## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
	"""
	                )
	            with gr.Row():
	                gr.Markdown(markdown_table_v2)
	            with gr.Row():
	                gr.Markdown(description)
	        with gr.Column():
	            gr.Video('./openvoicev2.mp4', autoplay=True)

	    with gr.Row():
	        gr.Markdown(description_v2)

	    with gr.Row():
	        gr.HTML(wrapped_markdown_content)

	    # Main area: inputs on the left, results and examples on the right.
	    with gr.Row():
	        with gr.Column():
	            input_text_gr = gr.Textbox(
	                label="Text Prompt",
	                # The hint states the limit that predict() actually enforces.
	                info="One or two sentences at a time is better. Up to 50 words.",
	                value="The bustling city square bustled with street performers, tourists, and local vendors.",
	            )
	            style_gr = gr.Dropdown(
	                label="Style",
	                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
	                choices=["en_default", "en_us", "en_br", "en_au", "en_in", "es_default", "fr_default", "jp_default", "zh_default", "kr_default",],
	                max_choices=1,
	                value="en_us",
	            )
	            ref_gr = gr.Audio(
	                label="Reference Audio",
	                info="Click on the ✎ button to upload your own target speaker audio",
	                type="filepath",
	                value="examples/speaker0.mp3",
	            )
	            tos_gr = gr.Checkbox(
	                label="Agree",
	                value=False,
	                info="I agree to the terms of the MIT license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
	            )

	            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

	        with gr.Column():
	            out_text_gr = gr.Text(label="Info")
	            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
	            ref_audio_gr = gr.Audio(label="Reference Audio Used")

	            gr.Examples(
	                examples,
	                label="Examples",
	                inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
	                outputs=[out_text_gr, audio_gr, ref_audio_gr],
	                fn=_predict_from_ui,
	                cache_examples=False,
	            )
	            tts_button.click(
	                _predict_from_ui,
	                [input_text_gr, style_gr, ref_gr, tos_gr],
	                outputs=[out_text_gr, audio_gr, ref_audio_gr],
	            )

	demo.queue(concurrency_count=6)
	demo.launch(debug=True, show_api=True)