File size: 3,934 Bytes
4e63be7 4c3b8f4 4e63be7 1672a38 4c3b8f4 1672a38 1dd2e00 ba6183f 4c3b8f4 1672a38 ba6183f 4e63be7 4c3b8f4 ef6d811 1dd2e00 ba6183f 4c3b8f4 ba6183f 4c3b8f4 4e63be7 4c3b8f4 4e63be7 4c3b8f4 4e63be7 4c3b8f4 4e63be7 99fb0c5 4c3b8f4 4e63be7 4c3b8f4 4e63be7 f618670 4c3b8f4 4e63be7 4c3b8f4 4e63be7 4c3b8f4 99fb0c5 4c3b8f4 99fb0c5 4c3b8f4 99fb0c5 4c3b8f4 99fb0c5 4e63be7 4c3b8f4 985726e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import functools
import wave
from io import BytesIO

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
# Import necessary libraries:
# gradio for creating the web interface,
# wave for handling WAV audio format,
# numpy for numerical operations,
# BytesIO for in-memory byte handling,
# huggingface_hub for downloading models from the Hugging Face Hub,
# PiperVoice for the text-to-speech functionality,
# pipeline from transformers for the NSFW classifier.
# Build the Hugging Face text-classification pipeline at module level so that
# every call to synthesize_speech reuses the same NSFW classifier object.
nsfw_detector = pipeline(
    task="text-classification",
    model="michellejieli/NSFW_text_classifier",
)
@functools.lru_cache(maxsize=1)
def _load_voice():
    """Fetch the Piper model files from the Hub and load the voice once.

    The result is memoized, so the download lookup and ``PiperVoice.load``
    run on the first call only; later calls return the same voice object.
    """
    model_path = hf_hub_download(
        repo_id="DLI-SLQ/speaker_01234",
        filename="speaker__01234_model.onnx",
    )
    # NOTE(review): this filename says "1234" while the model file says
    # "01234". It is kept exactly as it was -- confirm against the repo listing.
    config_path = hf_hub_download(
        repo_id="DLI-SLQ/speaker_01234",
        filename="speaker__1234_model.onnx.json",
    )
    return PiperVoice.load(model_path, config_path)


def synthesize_speech(text):
    """Screen *text* for NSFW content and, if it passes, synthesize it.

    Args:
        text: The text to speak.

    Returns:
        A ``(audio, message)`` tuple:

        * Blank input: ``(None, <prompt to enter text>)``.
        * NSFW detected (label ``'NSFW'`` with score >= 0.95): the raw bytes
          of ``error_audio.wav`` from the Hub and a warning string.
        * Otherwise: the bytes of an in-memory WAV file and ``None``.
    """
    # Nothing to classify or speak; skip both models and tell the user.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    # Only the first classification result is consulted.
    top_prediction = nsfw_detector(text)[0]
    label = top_prediction['label']
    score = top_prediction['score']

    if label == 'NSFW' and score >= 0.95:
        # Download and read the pre-recorded error clip.
        error_audio_path = hf_hub_download(
            repo_id="DLI-SLQ/speaker_01234",
            filename="error_audio.wav",
        )
        with open(error_audio_path, 'rb') as error_audio_file:
            error_audio = error_audio_file.read()
        return error_audio, "NSFW content detected. Cannot process."

    # Content is acceptable: reuse the cached voice instead of reloading the
    # model on every request.
    voice = _load_voice()

    # Write the WAV into memory rather than to disk.
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono
        voice.synthesize(text, wav_file)

    # The buffer already holds the complete WAV bytes, so they are returned
    # directly; the former int16-array round trip produced the same bytes.
    return buffer.getvalue(), None
# Assemble the Gradio page.
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    # Heading followed by a one-line description of the app.
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text to synthesize it into speech using models from the State Library of Queensland's collection using Piper.")

    # Components, in display order: text input, audio player, message box.
    input_text = gr.Textbox(label="Input Text")
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy")
    output_text = gr.Textbox(label="Output Text", visible=True)  # shows error messages

    def process_and_output(text):
        """Map the result of synthesize_speech onto the two output components.

        Returns ``(audio, None)`` when synthesize_speech gives no message.
        When it does give one, the audio is dropped and ``(None, message)``
        is returned so that only the message is displayed.
        """
        synthesized, notice = synthesize_speech(text)
        if not notice:
            return synthesized, None
        return None, notice

    # The button triggers synthesis and fills both outputs.
    submit_button = gr.Button("Synthesize")
    submit_button.click(
        fn=process_and_output,
        inputs=input_text,
        outputs=[output_audio, output_text],
    )

# Start serving the web application.
blocks.launch()
|