# RapGPT / app.py
import concurrent.futures
import functools
import os
import time
from io import BytesIO
from typing import Tuple

import gradio as gr
import librosa
import numpy as np
import requests
import soundfile as sf
from scipy.io import wavfile

API_KEY = (os.environ["UBERDUCK_USER"], os.environ["UBERDUCK_PASS"])
API_URL = "https://api.uberduck.ai"
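
# Uberduck's speech API is asynchronous: POST /speak starts a synthesis job and
# returns a uuid, /speak-status is polled until it reports a download path, and
# the finished audio is then fetched from that path.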
def start_synthesis(text, voice):
    url = f"{API_URL}/speak"
    data = {
        "speech": text,
        "voice": voice,
    }
    response = requests.post(url, auth=API_KEY, json=data)
    response.raise_for_status()
    return response.json()["uuid"]


def check_synthesis_status(uuid):
    url = f"{API_URL}/speak-status?uuid={uuid}"
    response = requests.get(url, auth=API_KEY)
    response.raise_for_status()
    return response.json()


def download_synthesis(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.content

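# functools.cache memoizes on (text, voice, sr), so a repeated lyric line
# (e.g. a chorus) is only synthesized and downloaded once per process.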
@functools.cache
def download_and_process_speech(text, voice, sr):
    uuid = start_synthesis(text, voice)
    # Poll until the synthesis job exposes a download path.
    while True:
        synthesis_status = check_synthesis_status(uuid)
        url = synthesis_status["path"]
        if url:
            break
        time.sleep(1)
    audio_data = download_synthesis(url)
    # Decode in memory and resample to the instrumental's sample rate.
    vocal, _ = librosa.load(BytesIO(audio_data), sr=sr)
    return vocal

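# place_vocals_on_track() aligns one synthesized line per measure: librosa's
# beat tracker estimates the beat grid, every `time_signature`-th beat is taken
# as a measure start, and line i is mixed in at measure i + offset. When `voice`
# is a dict, each line selects its own voice via a "voice_key: lyric" prefix,
# e.g. (illustrative) voice={"snoop": "snoop-dogg"} with the line
# "snoop: hello world".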
def place_vocals_on_track(instrumental_file, text_list, voice, name='output', offset=8, time_signature=4):
    instrumental, sr = librosa.load(instrumental_file)
    # Estimate the beat grid, then take every `time_signature`-th beat as a measure start.
    tempo, beat_frames = librosa.beat.beat_track(y=instrumental, sr=sr)
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    measure_starts = beat_times[::time_signature]
    # Synthesize all lines concurrently, preserving their original order.
    vocals = [None] * len(text_list)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {}
        for i, text in enumerate(text_list):
            if isinstance(voice, dict):
                # Per-line voice selection: lines are formatted "voice_key: lyric".
                tvoice, ttext = text.split(':', maxsplit=1)
                futures[executor.submit(download_and_process_speech, ttext, voice[tvoice], sr)] = i
            else:
                futures[executor.submit(download_and_process_speech, text, voice, sr)] = i
        for future in concurrent.futures.as_completed(futures.keys()):
            vocals[futures[future]] = future.result()
    # Mix each vocal line onto the instrumental at its measure start.
    output = instrumental.copy()
    for i, vocal in enumerate(vocals):
        if i + offset < len(measure_starts):
            start_sample = librosa.time_to_samples(measure_starts[i + offset], sr=sr)
            # Clip the vocal so it does not run past the end of the track;
            # `segment` is a view, so the addition modifies `output` in place.
            segment = output[start_sample:start_sample + len(vocal)]
            segment += vocal[:len(segment)]
    if name is not None:
        sf.write(name + '.wav', output, sr, 'PCM_24')
    return sr, output

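# solve() adapts raw lyric text for placement: commas are stripped, blank lines
# and bracketed section markers like "(Chorus)" or "[Verse]" are dropped, and
# the remaining lines are rendered in the hard-coded "snoop-dogg" voice.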
def solve(text, beat, offset, time_signature):
    lines = text.replace(",", "").splitlines()
    lines = [l for l in lines if l.strip() and not l.startswith(("(", "["))]
    sr, output = place_vocals_on_track(beat, lines, "snoop-dogg", name=None, offset=offset, time_signature=time_signature)
    return sr, output

def process_and_play(text: str, file: Tuple[int, np.ndarray], offset, time_signature):
    # Gradio delivers the uploaded audio as a (sample_rate, samples) tuple;
    # serialize it to an in-memory WAV so librosa can load it inside solve().
    buffer = BytesIO()
    wavfile.write(buffer, *file)
    buffer.seek(0)  # Rewind to the start of the buffer before reading.
    sr, output_wav = solve(text, buffer, offset, time_signature)
    # Return a 1-tuple of output values; the Audio output takes (sample_rate, samples).
    return (sr, output_wav),

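# NOTE: gr.inputs / gr.outputs is the legacy Gradio interface API; newer Gradio
# releases expose these components directly (gr.Textbox, gr.Audio, gr.Number).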
inputs = [
    gr.inputs.Textbox(label="Input Text"),
    gr.inputs.Audio(label="Input Audio"),
    gr.inputs.Number(label="Offset", default=2),
    gr.inputs.Number(label="Time Signature", default=8),
]
outputs = [
    gr.outputs.Audio(label="Processed Audio", type='numpy'),
]
iface = gr.Interface(fn=process_and_play, inputs=inputs, outputs=outputs, title="Text and File Processor")
iface.launch()
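
# A minimal local usage sketch, bypassing the UI (assumptions: a local
# "beat.wav" exists and UBERDUCK_USER / UBERDUCK_PASS are set in the
# environment):
#
#   lyrics = "First line of the verse\nSecond line of the verse"
#   sr, mix = solve(lyrics, "beat.wav", offset=2, time_signature=8)
#   sf.write("demo.wav", mix, sr, 'PCM_24')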