Spaces:
Build error
Build error
Commit
·
7ee8f06
1
Parent(s):
9666de8
Code
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import io
|
| 2 |
-
import math
|
| 3 |
from threading import Thread
|
| 4 |
import random
|
|
|
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
import spaces
|
|
@@ -26,7 +26,7 @@ model = ParlerTTSForConditionalGeneration.from_pretrained(
|
|
| 26 |
jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
| 27 |
).to(device)
|
| 28 |
|
| 29 |
-
client = InferenceClient()
|
| 30 |
|
| 31 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
| 32 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
|
@@ -63,10 +63,9 @@ def numpy_to_mp3(audio_array, sampling_rate):
|
|
| 63 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
| 64 |
frame_rate = model.audio_encoder.config.frame_rate
|
| 65 |
|
| 66 |
-
@spaces.GPU
|
| 67 |
-
def generate_base(audio):
|
| 68 |
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
messages = [{"role": "sytem", "content": ("You are a magic 8 ball."
|
| 72 |
"Someone will present to you a situation or question and your job "
|
|
@@ -74,9 +73,13 @@ def generate_base(audio):
|
|
| 74 |
"'curiosity killed the cat' or 'The early bird gets the worm'.")},
|
| 75 |
{"role": "user", "content": f"Please tell me what to do about {question}"}]
|
| 76 |
|
| 77 |
-
response = client.chat_completion(messages, max_tokens=
|
| 78 |
response = response.choices[0].message.content
|
|
|
|
|
|
|
| 79 |
|
|
|
|
|
|
|
| 80 |
|
| 81 |
play_steps_in_s = 1.0
|
| 82 |
play_steps = int(frame_rate * play_steps_in_s)
|
|
@@ -85,7 +88,7 @@ def generate_base(audio):
|
|
| 85 |
description_tokens = tokenizer(description, return_tensors="pt").to(device)
|
| 86 |
|
| 87 |
streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
| 88 |
-
prompt = tokenizer(
|
| 89 |
|
| 90 |
generation_kwargs = dict(
|
| 91 |
input_ids=description_tokens.input_ids,
|
|
@@ -102,11 +105,12 @@ def generate_base(audio):
|
|
| 102 |
|
| 103 |
for new_audio in streamer:
|
| 104 |
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
| 105 |
-
yield
|
| 106 |
|
| 107 |
css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
|
| 108 |
.my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
|
| 109 |
|
|
|
|
| 110 |
with gr.Blocks() as block:
|
| 111 |
gr.HTML(
|
| 112 |
f"""
|
|
@@ -117,10 +121,12 @@ with gr.Blocks() as block:
|
|
| 117 |
with gr.Group():
|
| 118 |
with gr.Row():
|
| 119 |
audio_out = gr.Audio(visible=False, streaming=True)
|
| 120 |
-
answer = gr.Textbox(label="Answer")
|
|
|
|
|
|
|
| 121 |
with gr.Row():
|
| 122 |
-
audio_in = gr.Audio(label="Speak you question", sources="microphone",
|
| 123 |
|
| 124 |
-
audio_in.stop_recording(fn=generate_base, inputs=
|
| 125 |
|
| 126 |
block.launch()
|
|
|
|
| 1 |
import io
|
|
|
|
| 2 |
from threading import Thread
|
| 3 |
import random
|
| 4 |
+
import os
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
import spaces
|
|
|
|
| 26 |
jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
| 27 |
).to(device)
|
| 28 |
|
| 29 |
+
client = InferenceClient(token=os.getenv("HF_TOKEN"))
|
| 30 |
|
| 31 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
| 32 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
|
|
|
| 63 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
| 64 |
frame_rate = model.audio_encoder.config.frame_rate
|
| 65 |
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
def generate_response(audio):
|
| 68 |
+
question = client.automatic_speech_recognition(audio)
|
| 69 |
|
| 70 |
messages = [{"role": "sytem", "content": ("You are a magic 8 ball."
|
| 71 |
"Someone will present to you a situation or question and your job "
|
|
|
|
| 73 |
"'curiosity killed the cat' or 'The early bird gets the worm'.")},
|
| 74 |
{"role": "user", "content": f"Please tell me what to do about {question}"}]
|
| 75 |
|
| 76 |
+
response = client.chat_completion(messages, max_tokens=128, seed=random.randint(1, 5000))
|
| 77 |
response = response.choices[0].message.content
|
| 78 |
+
yield response, gr.Textbox(visible=True)
|
| 79 |
+
|
| 80 |
|
| 81 |
+
@spaces.GPU
|
| 82 |
+
def generate_base(answer):
|
| 83 |
|
| 84 |
play_steps_in_s = 1.0
|
| 85 |
play_steps = int(frame_rate * play_steps_in_s)
|
|
|
|
| 88 |
description_tokens = tokenizer(description, return_tensors="pt").to(device)
|
| 89 |
|
| 90 |
streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
| 91 |
+
prompt = tokenizer(answer, return_tensors="pt").to(device)
|
| 92 |
|
| 93 |
generation_kwargs = dict(
|
| 94 |
input_ids=description_tokens.input_ids,
|
|
|
|
| 105 |
|
| 106 |
for new_audio in streamer:
|
| 107 |
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
| 108 |
+
yield gr.Textbox(value=answer, visible=True), numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
| 109 |
|
| 110 |
css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
|
| 111 |
.my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
|
| 112 |
|
| 113 |
+
|
| 114 |
with gr.Blocks() as block:
|
| 115 |
gr.HTML(
|
| 116 |
f"""
|
|
|
|
| 121 |
with gr.Group():
|
| 122 |
with gr.Row():
|
| 123 |
audio_out = gr.Audio(visible=False, streaming=True)
|
| 124 |
+
answer = gr.Textbox(visible=False, label="Answer")
|
| 125 |
+
state = gr.State()
|
| 126 |
+
|
| 127 |
with gr.Row():
|
| 128 |
+
audio_in = gr.Audio(label="Speak you question", sources="microphone", type="filepath")
|
| 129 |
|
| 130 |
+
audio_in.stop_recording(generate_response, audio_in, [state, answer]).then(fn=generate_base, inputs=state, outputs=[answer, audio_out])
|
| 131 |
|
| 132 |
block.launch()
|