File size: 2,936 Bytes
4e63be7 c4a9b80 4e63be7 c4a9b80 4e63be7 1672a38 4c3b8f4 1672a38 1dd2e00 4e63be7 c4a9b80 ef6d811 c040c42 c4a9b80 ba6183f c4a9b80 4e63be7 c4a9b80 70c66a8 4e63be7 99fb0c5 c4a9b80 581e370 4e63be7 2bf7573 4e63be7 63f13ad 3957f90 2dde117 a41f904 3957f90 4e63be7 99fb0c5 c4a9b80 99fb0c5 4c3b8f4 99fb0c5 4e63be7 985726e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
# Load the NSFW text classifier once at import time; synthesize_speech()
# uses it to screen input text before any synthesis happens.
nsfw_detector = pipeline("text-classification", model="michellejieli/NSFW_text_classifier")
_VOICE = None  # lazily-loaded PiperVoice instance, cached across calls


def _load_voice():
    """Download (hf_hub caches the files) and load the Piper voice once.

    Previously the model was re-loaded on every request; caching it in a
    module-level global avoids that per-call cost without changing behavior.
    """
    global _VOICE
    if _VOICE is None:
        model_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx")
        config_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="speaker__01234_model.onnx.json")
        _VOICE = PiperVoice.load(model_path, config_path)
    return _VOICE


def synthesize_speech(text):
    """Synthesize `text` into 16-bit mono PCM audio bytes with the Piper voice.

    Returns a ``(audio_bytes, message)`` tuple:
      * on success: ``(pcm_bytes, None)``;
      * if the NSFW classifier flags the text with score >= 0.95: a
        pre-recorded error clip plus a warning message;
      * if that error clip cannot be read: ``(None, error_message)``.
    """
    # Screen the input with the NSFW text classifier before synthesizing.
    result = nsfw_detector(text)[0]
    if result['label'] == 'NSFW' and result['score'] >= 0.95:
        error_audio_path = hf_hub_download(repo_id="DLI-SLQ/speaker_01234", filename="error_audio.wav")
        try:
            with wave.open(error_audio_path, 'rb') as error_audio_file:
                frames = error_audio_file.readframes(error_audio_file.getnframes())
        except (wave.Error, OSError, EOFError) as e:
            # Narrowed from a bare `except Exception`: only file-access or
            # WAV-format errors are expected when reading the local clip.
            print(f"Error reading audio file: {e}")
            return None, "Error in processing audio file."
        return np.frombuffer(frames, dtype=np.int16).tobytes(), "NSFW content detected. Cannot process."

    voice = _load_voice()
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)   # 16-bit samples
        wav_file.setnchannels(1)   # mono
        voice.synthesize(text, wav_file, sentence_silence=0.75, length_scale=1.2)
    buffer.seek(0)
    audio_data = np.frombuffer(buffer.read(), dtype=np.int16)
    return audio_data.tobytes(), None
# Gradio Interface: textbox in, synthesized audio out (plus a hidden textbox
# that carries any warning/error message from synthesis).
with gr.Blocks(theme=gr.themes.Base(), css="footer {visibility: hidden}") as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text to synthesize it into speech using models from the State Library of Queensland's collection. This model uses data from the following collections: Suzanne Mulligan Oral Histories Archive, the Peter Gray audio tapes, Five Years On : Toowoomba and Lockyer Valley flash floods: oral history interviews and Our Rocklea: connecting with the heart through story and creativity 2012.")
    input_text = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Synthesize")
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy", show_download_button=False)
    output_text = gr.Textbox(label="Output Text", visible=False)

    def process_and_output(text):
        # synthesize_speech already returns an (audio, message-or-None) pair,
        # so the former if/else that rebuilt the identical tuple was redundant.
        return synthesize_speech(text)

    submit_button.click(process_and_output, inputs=input_text, outputs=[output_audio, output_text])

blocks.launch()
|