ajayarora1235 committed
Commit: 4f1e982
Parent: 8de3ef1

add models direct to space
.gitattributes CHANGED
@@ -1,2 +1,4 @@
 ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
 ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
+pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
+pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
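The two new attribute lines route the checkpoint blobs through Git LFS, so the repository stores small pointer files instead of multi-gigabyte binaries. A minimal sketch (not part of this commit; the helper name is ours) to confirm the new paths are covered:

def lfs_tracked_patterns(path=".gitattributes"):
    # a pattern is LFS-tracked when its attributes include filter=lfs
    patterns = []
    with open(path) as fh:
        for line in fh:
            parts = line.split()
            if len(parts) > 1 and "filter=lfs" in parts[1:]:
                patterns.append(parts[0])
    return patterns

print(lfs_tracked_patterns())
# expected to include 'pretrained_models/giga330M.pth' and
# 'pretrained_models/encodec_4cb2048_giga.th'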
app.py CHANGED
@@ -1502,6 +1502,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value  # NOTE: according to the forced-alignment file, the word "common" stops at 3.01 sec; this will differ for different audio
     target_transcript = transcribed_text + target_transcript
+    print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
@@ -1545,6 +1546,136 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
 
     return [seg_save_fn_concat, seg_save_fn_gen]
 
+def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+              temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
+              sid,
+              f0_up_key,
+              f0_file,
+              f0_method,
+              file_index,
+              # file_index2,
+              # file_big_npy,
+              index_rate,
+              filter_radius,
+              resample_sr,
+              rms_mix_rate,
+              protect,
+              crepe_hop_length):
+
+    global voicecraft_model, voicecraft_config, phn2num
+
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    os.environ["USER"] = "USER"
+    # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
+    cut_off_sec = cutoff_value  # NOTE: according to the forced-alignment file, the word "common" stops at 3.01 sec; this will differ for different audio
+    target_transcript = transcribed_text + target_transcript
+    print(target_transcript)
+    info = torchaudio.info(audio_fn)
+    audio_dur = info.num_frames / info.sample_rate
+
+    assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
+    prompt_end_frame = int(cut_off_sec * info.sample_rate)
+
+    if voicecraft_model is None:
+        load_voicecraft()
+
+    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+    text_tokenizer = TextTokenizer(backend="espeak")
+    audio_tokenizer = AudioTokenizer(signature=encodec_fn)  # will also put the neural codec model on gpu
+
+    # run the model to get the output
+    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
+                     'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
+                     "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
+    from lib.voicecraft.inference_tts_scale import inference_one_sample
+    concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
+                                                     audio_fn, target_transcript, config.device, decode_config,
+                                                     prompt_end_frame)
+
+    # save segments for comparison
+    concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
+    # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
+
+    output_dir = "./demo/generated_tts"
+    os.makedirs(output_dir, exist_ok=True)
+    seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
+    seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
+
+    torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
+    torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
+
+    global tgt_sr, net_g, vc, hubert_model, version
+
+    f0_up_key = int(f0_up_key)
+    try:
+        audio = gen_audio
+        audio_max = np.abs(audio).max() / 0.95
+        if audio_max > 1:
+            audio /= audio_max
+        times = [0, 0, 0]
+        if hubert_model is None:
+            load_hubert()
+        if_f0 = cpt.get("f0", 1)
+        file_index = (
+            file_index.strip(" ")
+            .strip('"')
+            .strip("\n")
+            .strip('"')
+            .strip(" ")
+            .replace("trained", "added")
+        )  # guard against a mistyped index path; auto-replace "trained" with "added"
+        # file_big_npy = (
+        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        # )
+        audio_opt = vc.pipeline(
+            hubert_model,
+            net_g,
+            sid,
+            audio,
+            seg_save_fn_gen,
+            times,
+            f0_up_key,
+            f0_method,
+            file_index,
+            # file_big_npy,
+            index_rate,
+            if_f0,
+            filter_radius,
+            tgt_sr,
+            resample_sr,
+            rms_mix_rate,
+            version,
+            protect,
+            crepe_hop_length,
+            f0_file=f0_file,
+        )
+        if resample_sr >= 16000 and tgt_sr != resample_sr:
+            tgt_sr = resample_sr
+        index_info = (
+            "Using index:%s." % file_index
+            if os.path.exists(file_index)
+            else "Index not used."
+        )
+        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
+            index_info,
+            times[0],
+            times[1],
+            times[2],
+        ), (tgt_sr, audio_opt)
+    except:
+        info = traceback.format_exc()
+        print(info)
+        return info, (None, None)
+
+
+
 def upload_to_dataset(files, dir):
     if dir == '':
         dir = './dataset'
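The prompt-cutoff arithmetic that run_joint shares with run is worth a worked example: the prompt is everything before cut_off_sec, converted to a sample index of the source file. All numbers below are illustrative, not taken from the app:

# assume a 5-second prompt recorded at 16 kHz
sample_rate = 16000
num_frames = 80000
audio_dur = num_frames / sample_rate      # 5.0 seconds
cut_off_sec = 3.01                        # e.g. where the word "common" ends per MFA
assert cut_off_sec < audio_dur, "cutoff must fall inside the prompt audio"
prompt_end_frame = int(cut_off_sec * sample_rate)
print(prompt_end_frame)                   # 48160 samples are used as the voice prompt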
 
@@ -1678,6 +1809,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
     output_audio_gen = gr.Audio(label="Output Audio generated")
     cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
     run_btn = gr.Button(value="run")
+    run_btn_joint = gr.Button(value="run with RVC")
    target_transcript = gr.Textbox(label="target transcript")
 
     transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
 
@@ -1704,7 +1836,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
             output_audio_con,
             output_audio_gen
         ])
-
+
     with gr.Column():
         vc_output2 = gr.Audio(
             label="Final Result! (Click on the three dots to download the audio)",
 
@@ -1864,6 +1996,40 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
         ],
         [vc_output1, vc_output2],
     )
+
+    run_btn_joint.click(
+        fn=run_joint,
+        inputs=[
+            seed,
+            stop_repitition,
+            sample_batch_size,
+            left_margin,
+            right_margin,
+            codecaudio_sr,
+            codec_sr,
+            top_k,
+            top_p,
+            temperature,
+            kvcache,
+            cutoff_value,
+            target_transcript,
+            silence_tokens,
+            transcribed_text,
+            spk_item,
+            vc_transform0,
+            f0_file,
+            f0method0,
+            file_index1,
+            # file_index2,
+            # file_big_npy1,
+            index_rate1,
+            filter_radius0,
+            resample_sr0,
+            rms_mix_rate0,
+            protect0,
+            crepe_hop_length
+        ],
+        outputs=[vc_output1, vc_output2])
 
     with gr.Accordion("Batch Conversion",open=False, visible=False):
         with gr.Row():
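For reference, a self-contained sketch of the gr.Button.click wiring pattern used above; the components and callback here are toy stand-ins, not the ones defined in app.py:

import gradio as gr

def toy_run(transcript, transpose):
    # stand-in for run_joint: returns a status string and a result string
    return "Success.", f"{transcript} (transposed by {transpose})"

with gr.Blocks() as demo:
    transcript = gr.Textbox(label="target transcript")
    transpose = gr.Number(label="transpose", value=0)
    btn = gr.Button(value="run with RVC")
    status = gr.Textbox(label="status")
    result = gr.Textbox(label="result")
    # inputs are passed to fn positionally, so their order must match the
    # function signature; run_btn_joint.click relies on the same contract
    btn.click(fn=toy_run, inputs=[transcript, transpose], outputs=[status, result])

# demo.launch()  # uncomment to serve locally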
pretrained_models/encodec_4cb2048_giga.th ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caa0c595d4919527a9728d627150aa2a0b15b6d117b21855165851333dc63378
+size 1167842971
pretrained_models/giga330M.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35e028b8c5237cb4a6050ca81d4569b98e3a34ad9175fa252f7b1d13e6a9ad26
+size 1746844161
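Both ADDED files above are Git LFS pointers, three key/value lines each, rather than the weights themselves; an LFS-aware checkout replaces them with the real binaries. A small sketch (the helper is ours, assuming the standard pointer layout shown above) that reads the fields directly:

def parse_lfs_pointer(path):
    # pointer files hold exactly three "key value" lines: version, oid, size
    fields = {}
    with open(path) as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

ptr = parse_lfs_pointer("pretrained_models/giga330M.pth")
print(ptr["oid"])                    # sha256:35e0... as recorded in the diff
print(int(ptr["size"]) / 1e9, "GB")  # ~1.75 GB once the real file is fetched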