File size: 3,934 Bytes
4e63be7
 
 
 
 
 
 
 
4c3b8f4
 
 
 
 
 
 
 
 
 
4e63be7
 
 
1672a38
 
 
 
 
4c3b8f4
 
1672a38
 
 
1dd2e00
ba6183f
4c3b8f4
1672a38
ba6183f
4e63be7
4c3b8f4
 
ef6d811
1dd2e00
ba6183f
4c3b8f4
ba6183f
 
4c3b8f4
4e63be7
 
4c3b8f4
4e63be7
 
 
 
4c3b8f4
4e63be7
 
4c3b8f4
4e63be7
 
 
99fb0c5
4c3b8f4
4e63be7
4c3b8f4
4e63be7
f618670
4c3b8f4
 
4e63be7
 
4c3b8f4
4e63be7
4c3b8f4
99fb0c5
 
 
4c3b8f4
 
99fb0c5
4c3b8f4
 
99fb0c5
4c3b8f4
 
99fb0c5
4e63be7
4c3b8f4
985726e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice 
from transformers import pipeline

# Import necessary libraries:
# gradio for creating the web interface,
# wave for handling WAV audio format,
# numpy for numerical operations,
# BytesIO for in-memory byte handling,
# huggingface_hub for downloading models from the Hugging Face Hub,
# PiperVoice for the text-to-speech functionality,
# pipeline from transformers for the NSFW classifier.

# Load the NSFW text classifier once at import time; synthesize_speech uses it
# to screen every input string before any speech is synthesized.
nsfw_detector = pipeline("text-classification", model="michellejieli/NSFW_text_classifier")

def _load_voice():
    """Download and load the Piper voice model, caching it across calls.

    The original code re-downloaded and re-loaded the ONNX model on every
    request; the loaded voice is reused unchanged between syntheses, so it
    is cached on the function object after the first call.
    """
    if not hasattr(_load_voice, "_voice"):
        model_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx")
        # NOTE(review): the config filename ("speaker__1234") differs from the
        # model filename ("speaker__01234") -- confirm the mismatch is intentional.
        config_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__1234_model.onnx.json")
        _load_voice._voice = PiperVoice.load(model_path, config_path)
    return _load_voice._voice


def synthesize_speech(text):
    """Synthesize *text* to speech, refusing high-confidence NSFW input.

    Returns:
        (audio, message): ``audio`` is a ``(sample_rate, int16 numpy array)``
        pair in the shape a Gradio ``Audio(type="numpy")`` component expects;
        ``message`` is an error string when the input is rejected, otherwise
        ``None``.
    """
    # Screen the input text with the NSFW classifier before doing any work.
    nsfw_result = nsfw_detector(text)
    label = nsfw_result[0]['label']
    score = nsfw_result[0]['score']

    # Only reject when the classifier is highly confident (score >= 0.95).
    if label == 'NSFW' and score >= 0.95:
        # Serve a pre-recorded error clip instead of synthesizing the text.
        error_audio_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="error_audio.wav")
        # Parse the WAV so the return shape matches the success path: the
        # original returned raw file bytes, which a numpy-typed Gradio Audio
        # component cannot play.
        with wave.open(error_audio_path, 'rb') as error_wav:
            sample_rate = error_wav.getframerate()
            # Assumes the error clip is 16-bit PCM -- TODO confirm.
            frames = error_wav.readframes(error_wav.getnframes())
        error_audio = np.frombuffer(frames, dtype=np.int16)
        return (sample_rate, error_audio), "NSFW content detected. Cannot process."

    # Input is safe: synthesize with the (cached) Piper voice.
    voice = _load_voice()

    # Render the synthesized speech into an in-memory WAV buffer.
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setnchannels(1)  # mono
        voice.synthesize(text, wav_file)

    # Re-read only the PCM frames: the original decoded buffer.read() -- the
    # whole buffer INCLUDING the 44-byte WAV header -- as int16 samples.
    buffer.seek(0)
    with wave.open(buffer, 'rb') as wav_in:
        frames = wav_in.readframes(wav_in.getnframes())
    audio_data = np.frombuffer(frames, dtype=np.int16)
    return (voice.config.sample_rate, audio_data), None

# Build the Gradio web UI.
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    # Page title and short usage description.
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text to synthesize it into speech using models from the State Library of Queensland's collection using Piper.")

    # Interface widgets: one text input, audio + status-text outputs.
    text_in = gr.Textbox(label="Input Text")
    audio_out = gr.Audio(label="Synthesized Speech", type="numpy")
    message_out = gr.Textbox(label="Output Text", visible=True)  # surfaces error messages

    def handle_submit(text):
        """Run synthesis and route the result to the two output widgets."""
        audio, message = synthesize_speech(text)
        # A non-empty message means synthesis was refused: show the message
        # and clear the audio; otherwise play the audio with no message.
        return (None, message) if message else (audio, None)

    # Wire the button click to the handler.
    synthesize_btn = gr.Button("Synthesize")
    synthesize_btn.click(handle_submit, inputs=text_in, outputs=[audio_out, message_out])

# Start the web application server.
blocks.launch()