# piper-tts / app.py
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
# Load the NSFW classifier model
nsfw_detector = pipeline("text-classification", model="michellejieli/NSFW_text_classifier")
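# The text-classification pipeline returns a list with one {'label', 'score'}
# dict per input, e.g. [{'label': 'NSFW', 'score': 0.97}] (illustrative values).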
def synthesize_speech(text):
    # Check for NSFW content using the classifier
    nsfw_result = nsfw_detector(text)
    label = nsfw_result[0]['label']
    score = nsfw_result[0]['score']
    if label == 'NSFW' and score >= 0.95:
        error_audio_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="error_audio.wav")
        # Read the pre-recorded error message and return it as a
        # (sample_rate, samples) tuple, the format gr.Audio(type="numpy") expects
        try:
            with wave.open(error_audio_path, 'rb') as error_audio_file:
                frames = error_audio_file.readframes(error_audio_file.getnframes())
                error_audio = np.frombuffer(frames, dtype=np.int16)
                sample_rate = error_audio_file.getframerate()
        except Exception as e:
            print(f"Error reading audio file: {e}")
            return None, "Error in processing audio file."
        return (sample_rate, error_audio), "NSFW content detected. Cannot process."
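    # Note: hf_hub_download caches files under the local Hugging Face cache,
    # so only the first call for each file hits the network.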
    model_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx")
    config_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx.json")
    voice = PiperVoice.load(model_path, config_path)
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit PCM
        wav_file.setnchannels(1)  # mono
        # sentence_silence adds a pause after each sentence;
        # length_scale > 1.0 slows the speech down
        voice.synthesize(text, wav_file, sentence_silence=0.75, length_scale=1.2)
    # Re-open the in-memory WAV so the header is parsed rather than
    # decoded as audio samples
    buffer.seek(0)
    with wave.open(buffer, 'rb') as wav_read:
        audio_data = np.frombuffer(wav_read.readframes(wav_read.getnframes()), dtype=np.int16)
    return (voice.config.sample_rate, audio_data), None
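
# Note: gr.Audio(type="numpy") expects a (sample_rate, numpy_array) tuple
# (or a filepath/None), which is why synthesize_speech returns
# (rate, samples) rather than raw bytes.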
# Gradio Interface
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text and synthesize it into speech with Piper, using voice models from the State Library of Queensland's collection.")
    input_text = gr.Textbox(label="Input Text")
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy", show_download_button=False)
    output_text = gr.Textbox(label="Output Text", visible=False)
    submit_button = gr.Button("Synthesize")

    def process_and_output(text):
        # synthesize_speech already returns (audio, message-or-None),
        # so its result maps directly onto the two outputs
        return synthesize_speech(text)

    submit_button.click(process_and_output, inputs=input_text, outputs=[output_audio, output_text])

blocks.launch()