File size: 2,623 Bytes
4e63be7 c4a9b80 4e63be7 c4a9b80 4e63be7 1672a38 4c3b8f4 1672a38 1dd2e00 4e63be7 c4a9b80 ef6d811 c040c42 c4a9b80 ba6183f c4a9b80 4e63be7 c4a9b80 70c66a8 4e63be7 99fb0c5 c4a9b80 4e63be7 f618670 4e63be7 3957f90 2dde117 c4a9b80 3957f90 4e63be7 99fb0c5 c4a9b80 99fb0c5 4c3b8f4 99fb0c5 4e63be7 985726e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import functools
import wave
from io import BytesIO

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
# Text classifier used to screen user input before it is synthesized.
# It is built once at import time and reused for every request.
NSFW_MODEL_ID = "michellejieli/NSFW_text_classifier"
nsfw_detector = pipeline(task="text-classification", model=NSFW_MODEL_ID)
# Hugging Face repository that holds the voice model and the canned error clip.
_VOICE_REPO_ID = "DLI-SLQ/speaker_01234"
# Minimum classifier confidence at which input labelled NSFW is refused.
_NSFW_SCORE_THRESHOLD = 0.95


@functools.lru_cache(maxsize=1)
def _load_voice():
    """Fetch the Piper model files and load the voice, memoising the result.

    Loading is the expensive part of a request, so it is done once and the
    same voice object is reused by every later call.
    """
    model_path = hf_hub_download(
        repo_id=_VOICE_REPO_ID, filename="speaker__01234_model.onnx"
    )
    config_path = hf_hub_download(
        repo_id=_VOICE_REPO_ID, filename="speaker__01234_model.onnx.json"
    )
    return PiperVoice.load(model_path, config_path)


def _read_wav_as_bytes(path):
    """Return the audio stored at *path* as a complete in-memory WAV file.

    The file is parsed with ``wave`` (so a malformed file raises instead of
    being passed through) and re-encoded with its original channel count,
    sample width and frame rate.
    """
    with wave.open(path, "rb") as source:
        params = source.getparams()
        frames = source.readframes(params.nframes)

    container = BytesIO()
    with wave.open(container, "wb") as target:
        target.setnchannels(params.nchannels)
        target.setsampwidth(params.sampwidth)
        target.setframerate(params.framerate)
        target.writeframes(frames)
    return container.getvalue()


def synthesize_speech(text):
    """Turn *text* into speech, refusing input the classifier flags as NSFW.

    Args:
        text: The text to speak.

    Returns:
        A ``(audio, message)`` pair:

        * ``(wav_bytes, None)`` on success;
        * ``(error_clip_wav_bytes, "NSFW content detected. Cannot process.")``
          when the top classifier label is ``NSFW`` with score >= 0.95;
        * ``(None, "Error in processing audio file.")`` when the error clip
          cannot be read;
        * ``(None, "No text provided.")`` for empty or whitespace-only input.

        Audio is always a complete WAV file (header included), so both
        branches hand the caller the same kind of payload.
        NOTE(review): this assumes the consumer treats ``bytes`` as an encoded
        audio file, as Gradio's Audio output does -- confirm for the pinned
        Gradio version.
    """
    # Nothing to classify or speak; skip the model calls entirely.
    if text is None or not text.strip():
        return None, "No text provided."

    top_prediction = nsfw_detector(text)[0]
    flagged = (
        top_prediction["label"] == "NSFW"
        and top_prediction["score"] >= _NSFW_SCORE_THRESHOLD
    )
    if flagged:
        error_audio_path = hf_hub_download(
            repo_id=_VOICE_REPO_ID, filename="error_audio.wav"
        )
        try:
            error_audio_data = _read_wav_as_bytes(error_audio_path)
        except (OSError, EOFError, wave.Error) as e:
            print(f"Error reading audio file: {e}")
            return None, "Error in processing audio file."
        return error_audio_data, "NSFW content detected. Cannot process."

    voice = _load_voice()
    buffer = BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setnchannels(1)  # mono
        voice.synthesize(text, wav_file, sentence_silence=0.75, length_scale=1.2)
    # Read only after the writer is closed: closing is what finalises the
    # frame count in the WAV header.
    return buffer.getvalue(), None
# Gradio UI: one text box in, one audio player out, plus a hidden status box.
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown(
        "Enter text to synthesize it into speech using models from the "
        "State Library of Queensland's collection using Piper."
    )

    input_text = gr.Textbox(label="Input Text")
    output_audio = gr.Audio(
        label="Synthesized Speech",
        type="numpy",
        show_download_button=False,
    )
    # Receives the status message from the handler; not shown to the user.
    output_text = gr.Textbox(label="Output Text", visible=False)
    submit_button = gr.Button("Synthesize")

    def process_and_output(text):
        """Run synthesis and map the result onto the (audio, text) outputs.

        A falsy status message is normalised to ``None``.
        """
        audio, message = synthesize_speech(text)
        return audio, (message or None)

    submit_button.click(
        process_and_output,
        inputs=input_text,
        outputs=[output_audio, output_text],
    )

blocks.launch()
|