ajayarora1235
committed on
Commit
•
4f1e982
1
Parent(s):
8de3ef1
add models direct to space
Browse files- .gitattributes +2 -0
- app.py +167 -1
- pretrained_models/encodec_4cb2048_giga.th +3 -0
- pretrained_models/giga330M.pth +3 -0
.gitattributes
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
|
2 |
ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
1 |
ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
|
2 |
ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
|
3 |
+
pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
|
4 |
+
pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -1502,6 +1502,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
|
|
1502 |
# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
|
1503 |
cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
|
1504 |
target_transcript = transcribed_text + target_transcript
|
|
|
1505 |
info = torchaudio.info(audio_fn)
|
1506 |
audio_dur = info.num_frames / info.sample_rate
|
1507 |
|
@@ -1545,6 +1546,136 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
|
|
1545 |
|
1546 |
return [seg_save_fn_concat, seg_save_fn_gen]
|
1547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1548 |
def upload_to_dataset(files, dir):
|
1549 |
if dir == '':
|
1550 |
dir = './dataset'
|
@@ -1678,6 +1809,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
1678 |
output_audio_gen = gr.Audio(label="Output Audio generated")
|
1679 |
cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
|
1680 |
run_btn = gr.Button(value="run")
|
|
|
1681 |
target_transcript = gr.Textbox(label="target transcript")
|
1682 |
|
1683 |
transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
|
@@ -1704,7 +1836,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
1704 |
output_audio_con,
|
1705 |
output_audio_gen
|
1706 |
])
|
1707 |
-
|
1708 |
with gr.Column():
|
1709 |
vc_output2 = gr.Audio(
|
1710 |
label="Final Result! (Click on the three dots to download the audio)",
|
@@ -1864,6 +1996,40 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
1864 |
],
|
1865 |
[vc_output1, vc_output2],
|
1866 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1867 |
|
1868 |
with gr.Accordion("Batch Conversion",open=False, visible=False):
|
1869 |
with gr.Row():
|
|
|
1502 |
# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
|
1503 |
cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
|
1504 |
target_transcript = transcribed_text + target_transcript
|
1505 |
+
print(target_transcript)
|
1506 |
info = torchaudio.info(audio_fn)
|
1507 |
audio_dur = info.num_frames / info.sample_rate
|
1508 |
|
|
|
1546 |
|
1547 |
return [seg_save_fn_concat, seg_save_fn_gen]
|
1548 |
|
1549 |
+
def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
              temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
              sid,
              f0_up_key,
              f0_file,
              f0_method,
              file_index,
              index_rate,
              filter_radius,
              resample_sr,
              rms_mix_rate,
              protect,
              crepe_hop_length):
    """Generate speech with VoiceCraft, then run the result through the RVC pipeline.

    Stage 1 feeds the prompt audio (everything before ``cutoff_value`` seconds)
    and ``transcribed_text + target_transcript`` to ``inference_one_sample`` and
    writes the generated and the concatenated waveforms to
    ``./demo/generated_tts``. Stage 2 passes the generated waveform to
    ``vc.pipeline`` together with the voice-conversion settings.

    Args:
        seed: Only used here to build the output file names.
        stop_repetition, sample_batch_size, codec_audio_sr, codec_sr, top_k,
        top_p, temperature, kvcache, silence_tokens: Forwarded to
            ``inference_one_sample`` inside ``decode_config``.
        left_margin, right_margin: Accepted so the handler signature lines up
            with the UI inputs; not used by this function.
        cutoff_value: End of the prompt, in seconds; must be smaller than the
            duration of the prompt audio.
        target_transcript: Text to synthesize, appended to ``transcribed_text``.
        transcribed_text: Transcript of the prompt audio.
        sid, f0_up_key, f0_file, f0_method, file_index, index_rate,
        filter_radius, resample_sr, rms_mix_rate, protect, crepe_hop_length:
            Forwarded to ``vc.pipeline``. ``f0_up_key`` is cast to ``int``;
            ``file_index`` is stripped of quotes/whitespace first.

    Returns:
        ``(status_message, (sample_rate, converted_audio))`` on success, or
        ``(traceback_text, (None, None))`` when the conversion stage fails.

    Raises:
        AssertionError: If ``cutoff_value`` is not smaller than the prompt
            audio duration.
    """
    # The model objects (voicecraft_model, voicecraft_config, phn2num, config,
    # hubert_model, net_g, vc, cpt, tgt_sr, version) are module-level state that
    # is only *read* here, so no ``global`` declaration is required.
    # NOTE(review): ``audio_fn`` is not a parameter of this function and is not
    # defined in it; it has to exist as a module-level name before this handler
    # runs -- confirm, otherwise this raises NameError.

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    os.environ["USER"] = "USER"

    # ---- Stage 1: VoiceCraft text-to-speech --------------------------------
    # The cut-off should land on a word boundary of the prompt audio (the
    # original note refers to demo/temp/mfa_alignment); it differs per file.
    cut_off_sec = cutoff_value
    target_transcript = transcribed_text + target_transcript
    print(target_transcript)

    audio_info = torchaudio.info(audio_fn)
    audio_dur = audio_info.num_frames / audio_info.sample_rate

    assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
    prompt_end_frame = int(cut_off_sec * audio_info.sample_rate)

    if voicecraft_model is None:
        load_voicecraft()

    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
    text_tokenizer = TextTokenizer(backend="espeak")
    audio_tokenizer = AudioTokenizer(signature=encodec_fn)  # will also put the neural codec model on gpu

    decode_config = {
        'top_k': top_k,
        'top_p': top_p,
        'temperature': temperature,
        'stop_repetition': stop_repetition,
        'kvcache': kvcache,
        "codec_audio_sr": codec_audio_sr,
        "codec_sr": codec_sr,
        "silence_tokens": silence_tokens,
        "sample_batch_size": sample_batch_size,
    }
    from lib.voicecraft.inference_tts_scale import inference_one_sample
    concated_audio, gen_audio = inference_one_sample(
        voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
        audio_fn, target_transcript, config.device, decode_config,
        prompt_end_frame,
    )

    # Save both segments so they can be compared afterwards.
    concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()

    output_dir = "./demo/generated_tts"
    os.makedirs(output_dir, exist_ok=True)
    # splitext instead of a fixed [:-4] slice so extensions that are not three
    # characters long (e.g. ".flac") are stripped correctly as well.
    stem = os.path.splitext(os.path.basename(audio_fn))[0]
    seg_save_fn_gen = f"{output_dir}/{stem}_gen_seed{seed}.wav"
    seg_save_fn_concat = f"{output_dir}/{stem}_concat_seed{seed}.wav"

    torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
    torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))

    # ---- Stage 2: RVC voice conversion -------------------------------------
    f0_up_key = int(f0_up_key)
    try:
        # Hand the pipeline a flat NumPy waveform rather than the torch tensor
        # that torchaudio.save consumed above.
        # NOTE(review): presumably gen_audio is (1, frames) and vc.pipeline
        # expects a mono 1-D array sampled at 16 kHz; no resampling is done
        # here, so confirm codec_audio_sr matches what the pipeline expects.
        audio = gen_audio.detach().numpy().reshape(-1)
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            # Out-of-place on purpose: ``audio`` shares memory with gen_audio.
            audio = audio / audio_max
        times = [0, 0, 0]
        if hubert_model is None:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        # Guard against common user mistakes in the index path: drop stray
        # quotes/whitespace and point "trained" index names at the "added" one.
        file_index = (
            file_index.strip(" ")
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip(" ")
            .replace("trained", "added")
        )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            seg_save_fn_gen,
            times,
            f0_up_key,
            f0_method,
            file_index,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            crepe_hop_length,
            f0_file=f0_file,
        )
        # Report the resampled rate through a local variable. Assigning to the
        # module-level tgt_sr (as before) would make every later call pass the
        # wrong model sample rate to vc.pipeline.
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            output_sr = resample_sr
        else:
            output_sr = tgt_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        # Best-effort UI handler: surface the traceback as the status text
        # instead of crashing, but let KeyboardInterrupt/SystemExit propagate.
        error_text = traceback.format_exc()
        print(error_text)
        return error_text, (None, None)
|
1675 |
+
|
1676 |
+
|
1677 |
+
|
1678 |
+
|
1679 |
def upload_to_dataset(files, dir):
|
1680 |
if dir == '':
|
1681 |
dir = './dataset'
|
|
|
1809 |
output_audio_gen = gr.Audio(label="Output Audio generated")
|
1810 |
cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
|
1811 |
run_btn = gr.Button(value="run")
|
1812 |
+
run_btn_joint = gr.Button(value="run with RVC")
|
1813 |
target_transcript = gr.Textbox(label="target transcript")
|
1814 |
|
1815 |
transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
|
|
|
1836 |
output_audio_con,
|
1837 |
output_audio_gen
|
1838 |
])
|
1839 |
+
|
1840 |
with gr.Column():
|
1841 |
vc_output2 = gr.Audio(
|
1842 |
label="Final Result! (Click on the three dots to download the audio)",
|
|
|
1996 |
],
|
1997 |
[vc_output1, vc_output2],
|
1998 |
)
|
1999 |
+
|
2000 |
+
run_btn_joint.click(
|
2001 |
+
fn=run_joint,
|
2002 |
+
inputs=[
|
2003 |
+
seed,
|
2004 |
+
stop_repitition,
|
2005 |
+
sample_batch_size,
|
2006 |
+
left_margin,
|
2007 |
+
right_margin,
|
2008 |
+
codecaudio_sr,
|
2009 |
+
codec_sr,
|
2010 |
+
top_k,
|
2011 |
+
top_p,
|
2012 |
+
temperature,
|
2013 |
+
kvcache,
|
2014 |
+
cutoff_value,
|
2015 |
+
target_transcript,
|
2016 |
+
silence_tokens,
|
2017 |
+
transcribed_text,
|
2018 |
+
spk_item,
|
2019 |
+
vc_transform0,
|
2020 |
+
f0_file,
|
2021 |
+
f0method0,
|
2022 |
+
file_index1,
|
2023 |
+
# file_index2,
|
2024 |
+
# file_big_npy1,
|
2025 |
+
index_rate1,
|
2026 |
+
filter_radius0,
|
2027 |
+
resample_sr0,
|
2028 |
+
rms_mix_rate0,
|
2029 |
+
protect0,
|
2030 |
+
crepe_hop_length
|
2031 |
+
],
|
2032 |
+
outputs=[vc_output1, vc_output2])
|
2033 |
|
2034 |
with gr.Accordion("Batch Conversion",open=False, visible=False):
|
2035 |
with gr.Row():
|
pretrained_models/encodec_4cb2048_giga.th
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:caa0c595d4919527a9728d627150aa2a0b15b6d117b21855165851333dc63378
|
3 |
+
size 1167842971
|
pretrained_models/giga330M.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35e028b8c5237cb4a6050ca81d4569b98e3a34ad9175fa252f7b1d13e6a9ad26
|
3 |
+
size 1746844161
|