Spaces:
Running
on
A10G
Running
on
A10G
Another big commit
Browse files- README.md +8 -1
- data/voice_presets/metadata.json +4 -4
- examples/1.mp4 +0 -0
- examples/2.mp4 +0 -0
- examples/3.mp4 +0 -0
- examples/examples.py +76 -13
- pipeline.py +8 -0
- services.py +2 -1
- ui_client.py +26 -22
- utils.py +3 -0
README.md
CHANGED
@@ -45,7 +45,14 @@ export WAVJOURNEY_OPENAI_KEY=your_openai_key_here
|
|
45 |
|
46 |
6. Set environment variables for using API services
|
47 |
```bash
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
```
|
50 |
|
51 |
|
|
|
45 |
|
46 |
6. Set environment variables for using API services
|
47 |
```bash
|
48 |
+
# Set the port for the WAVJOURNEY service to 8021
|
49 |
+
export WAVJOURNEY_SERVICE_PORT=8021
|
50 |
+
|
51 |
+
# Set the URL for the WAVJOURNEY service to 127.0.0.1
|
52 |
+
export WAVJOURNEY_SERVICE_URL=127.0.0.1
|
53 |
+
|
54 |
+
# Limit the maximum number of script lines for WAVJOURNEY to 999
|
55 |
+
export WAVJOURNEY_MAX_SCRIPT_LINES=999
|
56 |
```
|
57 |
|
58 |
|
data/voice_presets/metadata.json
CHANGED
@@ -34,13 +34,13 @@
|
|
34 |
"desc": "a female voice of a off-site news reporter, suitable for news scenario",
|
35 |
"npz_path": "data/voice_presets/npz/news_female_speaker_outside.npz"
|
36 |
},
|
37 |
-
"
|
38 |
-
"id": "
|
39 |
"desc": "a small young boy voice",
|
40 |
"npz_path": "data/voice_presets/npz/child_boy.npz"
|
41 |
},
|
42 |
-
"
|
43 |
-
"id": "
|
44 |
"desc": "a voice of an old man",
|
45 |
"npz_path": "data/voice_presets/npz/elder_morgen.npz"
|
46 |
}
|
|
|
34 |
"desc": "a female voice of a off-site news reporter, suitable for news scenario",
|
35 |
"npz_path": "data/voice_presets/npz/news_female_speaker_outside.npz"
|
36 |
},
|
37 |
+
"Child": {
|
38 |
+
"id": "Child",
|
39 |
"desc": "a small young boy voice",
|
40 |
"npz_path": "data/voice_presets/npz/child_boy.npz"
|
41 |
},
|
42 |
+
"Old_man": {
|
43 |
+
"id": "Old_man",
|
44 |
"desc": "a voice of an old man",
|
45 |
"npz_path": "data/voice_presets/npz/elder_morgen.npz"
|
46 |
}
|
examples/1.mp4
ADDED
Binary file (365 kB). View file
|
|
examples/2.mp4
ADDED
Binary file (241 kB). View file
|
|
examples/3.mp4
ADDED
Binary file (346 kB). View file
|
|
examples/examples.py
CHANGED
@@ -1,24 +1,87 @@
|
|
1 |
|
2 |
example1 = {
|
3 |
-
'text': "
|
4 |
-
'
|
5 |
-
| Audio Type | Layout | ID
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
""",
|
10 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
}
|
12 |
|
13 |
example2 = {
|
14 |
-
'text': "
|
15 |
-
'
|
16 |
-
| Audio Type | Layout | ID
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
""",
|
21 |
-
'wav_file': 'examples/
|
22 |
}
|
23 |
|
24 |
-
|
|
|
|
|
|
1 |
|
2 |
example1 = {
|
3 |
+
'text': "An introduction to AI-assisted audio content creation.",
|
4 |
+
'table_script': """
|
5 |
+
| Audio Type | Layout | ID | Character | Action | Volume | Description | Length |
|
6 |
+
|--------------|------------|----|-----------|--------|--------|------------------------------------------------------------------|--------|
|
7 |
+
| music | background | 1 | N/A | begin | -35 | Inspirational technology-themed music | Auto |
|
8 |
+
| speech | foreground | N/A| Narrator | N/A | -15 | Welcome to the future of audio content creation. | Auto |
|
9 |
+
| sound_effect | foreground | N/A| N/A | N/A | -35 | Digital startup sound | 2 |
|
10 |
+
| speech | foreground | N/A| Narrator | N/A | -15 | With evolving technology, we are introducing AI-assisted tools for pristine audio production. | Auto |
|
11 |
+
| sound_effect | foreground | N/A| N/A | N/A | -35 | Keyboard typing noise | 3 |
|
12 |
+
| speech | foreground | N/A| Narrator | N/A | -15 | Imagine crafting audio content with the power of AI at your fingertips. | Auto |
|
13 |
+
| sound_effect | background | 2 | N/A | begin | -35 | Ambiance of a busy control room | Auto |
|
14 |
+
| speech | foreground | N/A| Narrator | N/A | -15 | Enhanced quality, efficient production and limitless creativity, all under one roof. | Auto |
|
15 |
+
| sound_effect | background | 2 | N/A | end | N/A | N/A | Auto |
|
16 |
+
| speech | foreground | N/A| Narrator | N/A | -15 | Unleash your potential with AI-assisted audio content creation. | Auto |
|
17 |
+
| music | background | 1 | N/A | end | N/A | N/A | Auto |
|
18 |
|
19 |
""",
|
20 |
+
'table_voice': """
|
21 |
+
| Character | Voice |
|
22 |
+
|-------------|-----------|
|
23 |
+
| Narrator | News_Male |
|
24 |
+
|
25 |
+
""",
|
26 |
+
'wav_file': 'examples/1.mp4',
|
27 |
}
|
28 |
|
29 |
example2 = {
|
30 |
+
'text': "A couple dating in a cafe.",
|
31 |
+
'table_script': """
|
32 |
+
| Audio Type | Layout | ID | Character | Action | Volume | Description | Length |
|
33 |
+
|--------------|------------|----|-----------|--------|--------|-----------------------------------------------|--------|
|
34 |
+
| sound_effect | background | 1 | N/A | begin | -35 | Soft chattering in a cafe | Auto |
|
35 |
+
| sound_effect | background | 2 | N/A | begin | -38 | Coffee brewing noises | Auto |
|
36 |
+
| music | background | 3 | N/A | begin | -35 | Soft jazz playing in the background | Auto |
|
37 |
+
| speech | foreground | N/A| Man | N/A | -15 | It’s really nice to finally get out and relax a little, isn’t it? | Auto |
|
38 |
+
| speech | foreground | N/A| Woman | N/A | -15 | I know, right? We should do this more often. | Auto |
|
39 |
+
| sound_effect | background | 2 | N/A | end | N/A | N/A | Auto |
|
40 |
+
| speech | foreground | N/A| Man | N/A | -15 | Here’s your coffee, just as you like it. | Auto |
|
41 |
+
| speech | foreground | N/A| Woman | N/A | -15 | Thank you, it smells wonderful. | Auto |
|
42 |
+
| music | background | 3 | N/A | end | N/A | N/A | Auto |
|
43 |
+
| sound_effect | background | 1 | N/A | end | N/A | N/A | Auto |
|
44 |
+
|
45 |
+
""",
|
46 |
+
'table_voice': """
|
47 |
+
| Character | Voice |
|
48 |
+
|-------------|-----------|
|
49 |
+
| Man | Male1 |
|
50 |
+
| Woman | Female1 |
|
51 |
+
|
52 |
+
""",
|
53 |
+
'wav_file': 'examples/2.mp4',
|
54 |
+
}
|
55 |
+
|
56 |
+
|
57 |
+
example3 = {
|
58 |
+
'text': "A child is participating in a farting contest.",
|
59 |
+
'table_script': """
|
60 |
+
| Audio Type | Layout | ID | Character | Action | Volume | Description | Length |
|
61 |
+
|--------------|------------|----|-----------|--------|--------|------------------------------------------------------|--------|
|
62 |
+
| sound_effect | background | 1 | N/A | begin | -35 | Outdoor park ambiance, people chattering | Auto |
|
63 |
+
| music | background | 2 | N/A | begin | -35 | Light comedy theme music, quirky | Auto |
|
64 |
+
| speech | foreground | N/A| Host | N/A | -15 | Welcome to the annual Fart Competition. | Auto |
|
65 |
+
| speech | foreground | N/A| Host | N/A | -15 | Now, let’s welcome our youngest participant. | Auto |
|
66 |
+
| sound_effect | foreground | N/A| N/A | N/A | -35 | Clapping sound | 2 |
|
67 |
+
| speech | foreground | N/A| Child | N/A | -15 | Hi, I’m excited to be here. | Auto |
|
68 |
+
| sound_effect | foreground | N/A| N/A | N/A | -35 | Short, cartoonish duration of a fart sound | 4 |
|
69 |
+
| sound_effect | foreground | N/A| N/A | N/A | -35 | Audience laughing and applauding | 2 |
|
70 |
+
| speech | foreground | N/A| Host | N/A | -15 | Wow, that was impressive! Let’s give another round of applause! | Auto |
|
71 |
+
| sound_effect | foreground | N/A| N/A | N/A | -35 | Audience clapping and cheering | 3 |
|
72 |
+
| music | background | 2 | N/A | end | N/A | N/A | Auto |
|
73 |
+
| sound_effect | background | 1 | N/A | end | N/A | N/A | Auto |
|
74 |
+
""",
|
75 |
+
'table_voice': """
|
76 |
+
| Character | Voice |
|
77 |
+
|-------------|-----------|
|
78 |
+
| Host | Male1 |
|
79 |
+
| Child | Child |
|
80 |
|
81 |
""",
|
82 |
+
'wav_file': 'examples/3.mp4',
|
83 |
}
|
84 |
|
85 |
+
|
86 |
+
|
87 |
+
examples = [example1, example2, example3]
|
pipeline.py
CHANGED
@@ -194,6 +194,14 @@ def generate_json_file(session_id, input_text, api_key):
|
|
194 |
|
195 |
# Function called by Gradio: turns the JSON script into the resulting WAV audio
|
196 |
def generate_audio(session_id, json_script, api_key):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
output_path = utils.get_session_path(session_id)
|
198 |
output_audio_path = utils.get_session_audio_path(session_id)
|
199 |
voices = voice_presets.get_merged_voice_presets(session_id)
|
|
|
194 |
|
195 |
# Function called by Gradio: turns the JSON script into the resulting WAV audio
|
196 |
def generate_audio(session_id, json_script, api_key):
|
197 |
+
def count_lines(content):
|
198 |
+
# Split the string using the newline character and count the non-empty lines
|
199 |
+
return sum(1 for line in content.split('\n') if line.strip())
|
200 |
+
|
201 |
+
max_lines = utils.get_max_script_lines()
|
202 |
+
if count_lines(json_script) > max_lines:
|
203 |
+
raise ValueError(f'The number of lines of the JSON script has exceeded {max_lines}!')
|
204 |
+
|
205 |
output_path = utils.get_session_path(session_id)
|
206 |
output_audio_path = utils.get_session_audio_path(session_id)
|
207 |
voices = voice_presets.get_merged_voice_presets(session_id)
|
services.py
CHANGED
@@ -227,4 +227,5 @@ def parse_voice():
|
|
227 |
|
228 |
if __name__ == '__main__':
|
229 |
service_port = get_service_port()
|
230 |
-
|
|
|
|
227 |
|
228 |
if __name__ == '__main__':
|
229 |
service_port = get_service_port()
|
230 |
+
# Multithreading is disabled so the service handles one request at a time, which avoids CUDA out-of-memory (OOM) errors
|
231 |
+
app.run(debug=False, threaded=False, port=service_port)
|
ui_client.py
CHANGED
@@ -54,7 +54,7 @@ def generate_script_fn(instruction, _state: gr.State):
|
|
54 |
json_script = generate_json_file(session_id, instruction, api_key)
|
55 |
table_text = convert_json_to_md(json_script)
|
56 |
except Exception as e:
|
57 |
-
gr.Warning(str(e)
|
58 |
print(f"Generating script error: {str(e)}")
|
59 |
traceback.print_exc()
|
60 |
return [
|
@@ -99,7 +99,7 @@ def generate_audio_fn(state):
|
|
99 |
except Exception as e:
|
100 |
print(f"Generation audio error: {str(e)}")
|
101 |
traceback.print_exc()
|
102 |
-
gr.Warning(str(e)
|
103 |
|
104 |
return [
|
105 |
None,
|
@@ -210,7 +210,7 @@ def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset):
|
|
210 |
except Exception as exception:
|
211 |
print(exception)
|
212 |
traceback.print_exc()
|
213 |
-
gr.Warning(str(exception)
|
214 |
|
215 |
# After added
|
216 |
dataframe = get_voice_preset_to_list(ui_state)
|
@@ -451,10 +451,29 @@ with gr.Blocks(css=css) as interface:
|
|
451 |
loading_icon = gr.HTML(loading_icon_html)
|
452 |
share_button = gr.Button(value="Share to community", elem_id="share-btn")
|
453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
# System Voice Presets
|
455 |
gr.Markdown(label='System Voice Presets', value='# System Voice Presets')
|
456 |
-
|
457 |
-
|
|
|
458 |
# User Voice Preset Related
|
459 |
gr.Markdown('# (Optional) Speaker Customization ')
|
460 |
with gr.Accordion("Click to add speakers", open=False):
|
@@ -476,22 +495,7 @@ with gr.Blocks(css=css) as interface:
|
|
476 |
vp_file = gr.File(label='Wav File', type='file', file_types=['.wav'],
|
477 |
interactive=True)
|
478 |
vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
|
479 |
-
|
480 |
-
from examples.examples import examples as WJExamples
|
481 |
-
def example_fn(idx, _text_input):
|
482 |
-
print('from example', idx, _text_input)
|
483 |
-
example = WJExamples[int(idx)-1]
|
484 |
-
return example['table_text'], gr.make_waveform(example['wav_file'])
|
485 |
-
|
486 |
-
_idx_input = gr.Textbox(label='Example No')
|
487 |
-
_idx_input.visible=False
|
488 |
-
gr.Examples(
|
489 |
-
[[idx+1, x['text']] for idx, x in enumerate(WJExamples)],
|
490 |
-
fn=example_fn,
|
491 |
-
inputs=[_idx_input, text_input],
|
492 |
-
outputs=[char_voice_map_markdown, audio_output],
|
493 |
-
cache_examples=True,
|
494 |
-
)
|
495 |
# Clear button: clicking it starts a new session
|
496 |
clear_btn = gr.ClearButton(value='Clear All')
|
497 |
|
@@ -579,5 +583,5 @@ with gr.Blocks(css=css) as interface:
|
|
579 |
# debug only
|
580 |
# print_state_btn = gr.Button(value='Print State')
|
581 |
# print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
|
582 |
-
interface.queue(concurrency_count=
|
583 |
interface.launch()
|
|
|
54 |
json_script = generate_json_file(session_id, instruction, api_key)
|
55 |
table_text = convert_json_to_md(json_script)
|
56 |
except Exception as e:
|
57 |
+
gr.Warning(str(e))
|
58 |
print(f"Generating script error: {str(e)}")
|
59 |
traceback.print_exc()
|
60 |
return [
|
|
|
99 |
except Exception as e:
|
100 |
print(f"Generation audio error: {str(e)}")
|
101 |
traceback.print_exc()
|
102 |
+
gr.Warning(str(e))
|
103 |
|
104 |
return [
|
105 |
None,
|
|
|
210 |
except Exception as exception:
|
211 |
print(exception)
|
212 |
traceback.print_exc()
|
213 |
+
gr.Warning(str(exception))
|
214 |
|
215 |
# After added
|
216 |
dataframe = get_voice_preset_to_list(ui_state)
|
|
|
451 |
loading_icon = gr.HTML(loading_icon_html)
|
452 |
share_button = gr.Button(value="Share to community", elem_id="share-btn")
|
453 |
|
454 |
+
# add examples
|
455 |
+
from examples.examples import examples as WJExamples
|
456 |
+
def example_fn(idx, _text_input):
|
457 |
+
print('from example', idx, _text_input)
|
458 |
+
example = WJExamples[int(idx)-1]
|
459 |
+
print(example['table_script'], example['table_voice'], gr.make_waveform(example['wav_file']))
|
460 |
+
return example['table_script'], example['table_voice'], gr.make_waveform(example['wav_file'])
|
461 |
+
|
462 |
+
_idx_input = gr.Textbox(label='Example No.')
|
463 |
+
_idx_input.visible=False
|
464 |
+
gr.Examples(
|
465 |
+
[[idx+1, x['text']] for idx, x in enumerate(WJExamples)],
|
466 |
+
fn=example_fn,
|
467 |
+
inputs=[_idx_input, text_input],
|
468 |
+
outputs=[audio_script_markdown, char_voice_map_markdown, audio_output],
|
469 |
+
cache_examples=True,
|
470 |
+
)
|
471 |
+
|
472 |
# System Voice Presets
|
473 |
gr.Markdown(label='System Voice Presets', value='# System Voice Presets')
|
474 |
+
with gr.Accordion("Click to see system speakers", open=False):
|
475 |
+
system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS,
|
476 |
+
value=system_voice_presets)
|
477 |
# User Voice Preset Related
|
478 |
gr.Markdown('# (Optional) Speaker Customization ')
|
479 |
with gr.Accordion("Click to add speakers", open=False):
|
|
|
495 |
vp_file = gr.File(label='Wav File', type='file', file_types=['.wav'],
|
496 |
interactive=True)
|
497 |
vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
|
498 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
499 |
# Clear button: clicking it starts a new session
|
500 |
clear_btn = gr.ClearButton(value='Clear All')
|
501 |
|
|
|
583 |
# debug only
|
584 |
# print_state_btn = gr.Button(value='Print State')
|
585 |
# print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
|
586 |
+
interface.queue(concurrency_count=1, max_size=20)
|
587 |
interface.launch()
|
utils.py
CHANGED
@@ -77,3 +77,6 @@ def get_api_key():
|
|
77 |
api_key = os.environ.get('WAVJOURNEY_OPENAI_KEY')
|
78 |
return api_key
|
79 |
|
|
|
|
|
|
|
|
77 |
api_key = os.environ.get('WAVJOURNEY_OPENAI_KEY')
|
78 |
return api_key
|
79 |
|
80 |
+
def get_max_script_lines():
|
81 |
+
max_lines = int(os.environ.get('WAVJOURNEY_MAX_SCRIPT_LINES', 999))
|
82 |
+
return max_lines
|