# piper-tts / app.py
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
# Load the NSFW classifier model
nsfw_detector = pipeline("text-classification", model="michellejieli/NSFW_text_classifier")
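# The text-classification pipeline returns a list with one {'label', 'score'}
# dict per input, e.g. [{'label': 'NSFW', 'score': 0.97}] (illustrative values).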
def synthesize_speech(text):
    # Check for NSFW content using the classifier
    nsfw_result = nsfw_detector(text)
    label = nsfw_result[0]['label']
    score = nsfw_result[0]['score']
    if label == 'NSFW' and score >= 0.95:
        error_audio_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="error_audio.wav")
        # Read the pre-recorded error message and return it as a
        # (sample_rate, samples) tuple, the format gr.Audio(type="numpy") expects
        try:
            with wave.open(error_audio_path, 'rb') as error_audio_file:
                frames = error_audio_file.readframes(error_audio_file.getnframes())
                error_audio = np.frombuffer(frames, dtype=np.int16)
                sample_rate = error_audio_file.getframerate()
        except Exception as e:
            print(f"Error reading audio file: {e}")
            return None, "Error in processing audio file."
        return (sample_rate, error_audio), "NSFW content detected. Cannot process."
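    # Note: hf_hub_download caches files under the local Hugging Face cache,
    # so only the first call for each file hits the network.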
    model_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx")
    config_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx.json")
    voice = PiperVoice.load(model_path, config_path)
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit PCM
        wav_file.setnchannels(1)  # mono
        # sentence_silence adds a pause after each sentence;
        # length_scale > 1.0 slows the speech down
        voice.synthesize(text, wav_file, sentence_silence=0.75, length_scale=1.2)
    # Re-open the in-memory WAV so the header is parsed rather than
    # decoded as audio samples
    buffer.seek(0)
    with wave.open(buffer, 'rb') as wav_read:
        audio_data = np.frombuffer(wav_read.readframes(wav_read.getnframes()), dtype=np.int16)
    return (voice.config.sample_rate, audio_data), None
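
# Note: gr.Audio(type="numpy") expects a (sample_rate, numpy_array) tuple
# (or a filepath/None), which is why synthesize_speech returns
# (rate, samples) rather than raw bytes.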
# Gradio Interface
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text and synthesize it into speech with Piper, using voice models from the State Library of Queensland's collection.")
    input_text = gr.Textbox(label="Input Text")
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy", show_download_button=False)
    output_text = gr.Textbox(label="Output Text", visible=False)
    submit_button = gr.Button("Synthesize")

    def process_and_output(text):
        # synthesize_speech already returns (audio, message-or-None),
        # so its result maps directly onto the two outputs
        return synthesize_speech(text)

    submit_button.click(process_and_output, inputs=input_text, outputs=[output_audio, output_text])

blocks.launch()