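"""Gradio text-to-speech demo for the State Library of Queensland.

Synthesizes speech with a Piper voice model hosted on the Hugging Face Hub,
after screening the input text with an NSFW text classifier.
"""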
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline

# Load the NSFW classifier model
nsfw_detector = pipeline("text-classification", model="michellejieli/NSFW_text_classifier")

def synthesize_speech(text):
    # Check for NSFW content; the pipeline returns [{'label': ..., 'score': ...}]
    nsfw_result = nsfw_detector(text)
    label = nsfw_result[0]['label']
    score = nsfw_result[0]['score']

    # Only refuse synthesis on a high-confidence NSFW prediction
    if label == 'NSFW' and score >= 0.95:
        error_audio_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="error_audio.wav")

        # Read the pre-recorded error message. gr.Audio(type="numpy") expects
        # a (sample_rate, ndarray) tuple rather than raw bytes.
        try:
            with wave.open(error_audio_path, 'rb') as error_audio_file:
                sample_rate = error_audio_file.getframerate()
                frames = error_audio_file.readframes(error_audio_file.getnframes())
                error_audio_data = np.frombuffer(frames, dtype=np.int16)
        except Exception as e:
            print(f"Error reading audio file: {e}")
            return None, "Error in processing audio file."

        return (sample_rate, error_audio_data), "NSFW content detected. Cannot process."

    # Download the voice model and config (cached by hf_hub_download after the
    # first call) and load the Piper voice
    model_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx")
    config_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx.json")

    voice = PiperVoice.load(model_path, config_path)

    # Synthesize into an in-memory WAV file
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setnchannels(1)  # mono
        voice.synthesize(text, wav_file, sentence_silence=0.75, length_scale=1.2)

    # Re-open the buffer with wave so the RIFF/WAV header is not
    # misinterpreted as int16 samples by np.frombuffer
    buffer.seek(0)
    with wave.open(buffer, 'rb') as wav_file:
        audio_data = np.frombuffer(wav_file.readframes(wav_file.getnframes()), dtype=np.int16)

    return (voice.config.sample_rate, audio_data), None

# Gradio Interface
with gr.Blocks(theme=gr.themes.Base(), css="footer {visibility: hidden}") as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown(
        "Enter text to synthesize it into speech using models from the State Library of Queensland's "
        "collection. This model uses data from the following collections: the Suzanne Mulligan Oral "
        "Histories Archive, the Peter Gray audio tapes, Five Years On: Toowoomba and Lockyer Valley "
        "flash floods: oral history interviews, and Our Rocklea: connecting with the heart through "
        "story and creativity 2012."
    )
    input_text = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Synthesize")
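    # With type="numpy", Gradio expects audio values as a (sample_rate, ndarray) tuple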
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy", show_download_button=False)
    output_text = gr.Textbox(label="Output Text", visible=False)

    def process_and_output(text):
        # synthesize_speech returns (audio, message); message is None on success
        return synthesize_speech(text)

    submit_button.click(process_and_output, inputs=input_text, outputs=[output_audio, output_text])

blocks.launch()