import os
import random

import gradio as gr

from src.cost_time import calculate_time
from configs import *

os.environ["GRADIO_TEMP_DIR"] = './temp'

description = """

Linly Intelligent Conversation System (Linly-Talker)
[Zhihu] [bilibili] [GitHub] [Homepage]
Linly-Talker is an intelligent AI conversation system that combines large language models (LLMs) with visual models, forming a novel mode of human-AI interaction.

""" # 设定默认参数值,可修改 # source_image = r'example.png' blink_every = True size_of_image = 256 preprocess_type = 'crop' facerender = 'facevid2vid' enhancer = False is_still_mode = False exp_weight = 1 use_ref_video = False ref_video = None ref_info = 'pose' use_idle_mode = False length_of_audio = 5 @calculate_time def TTS_response(text, voice, rate, volume, pitch, am, voc, lang, male, tts_method = 'PaddleTTS', save_path = 'answer.wav'): print(text, voice, rate, volume, pitch, am, voc, lang, male, tts_method, save_path) if tts_method == 'Edge-TTS': try: edgetts.predict(text, voice, rate, volume, pitch , 'answer.wav', 'answer.vtt') except: os.system(f'edge-tts --text "{text}" --voice {voice} --write-media answer.wav') return 'answer.wav' elif tts_method == 'PaddleTTS': paddletts.predict(text, am, voc, lang = lang, male=male, save_path = save_path) return save_path @calculate_time def Talker_response(source_image, source_video, method = 'SadTalker', driven_audio = '', batch_size = 2): # print(source_image, method , driven_audio, batch_size) if source_video: source_image = source_video print(source_image, method , driven_audio, batch_size) pose_style = random.randint(0, 45) if method == 'SadTalker': video = sadtalker.test2(source_image, driven_audio, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every, fps=20) elif method == 'Wav2Lip': video = wav2lip.predict(source_image, driven_audio, batch_size) elif method == 'ER-NeRF': video = ernerf.predict(driven_audio) else: gr.Warning("不支持的方法:" + method) return None return video def main(): with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference: gr.HTML(description) with gr.Row(equal_height=False): with gr.Column(variant='panel'): with gr.Tabs(): with gr.Tab("图片人物"): source_image = gr.Image(label='Source image', type = 'filepath') with gr.Tab("视频人物"): source_video = gr.Video(label="Source video") with gr.Tabs(): input_audio = gr.Audio(sources=['upload', 'microphone'], type="filepath", label = '语音') input_text = gr.Textbox(label="Input Text", lines=3) with gr.Column(): tts_method = gr.Radio(["Edge-TTS", "PaddleTTS"], label="Text To Speech Method (Edge-TTS利用微软的TTS,PaddleSpeech是离线的TTS,不过第一次运行会自动下载模型)", value = 'Edge-TTS') with gr.Tabs("TTS Method"): # with gr.Accordion("Advanced Settings(高级设置语音参数) ", open=False): with gr.Tab("Edge-TTS"): voice = gr.Dropdown(edgetts.SUPPORTED_VOICE, value='zh-CN-XiaoxiaoNeural', label="Voice") rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='Rate') volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='Volume') pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='Pitch') with gr.Tab("PaddleTTS"): am = gr.Dropdown(["FastSpeech2"], label="声学模型选择", value = 'FastSpeech2') voc = gr.Dropdown(["PWGan", "HifiGan"], label="声码器选择", value = 'PWGan') lang = gr.Dropdown(["zh", "en", "mix", "canton"], label="语言选择", value = 'zh') male = gr.Checkbox(label="男声(Male)", value=False) with gr.Column(variant='panel'): batch_size = gr.Slider(minimum=1, maximum=10, value=2, step=1, label='Talker Batch size') button_text = gr.Button('语音生成') button_text.click(fn=TTS_response,inputs=[input_text, voice, rate, volume, pitch, am, voc, lang, male, tts_method], outputs=[input_audio]) with gr.Column(variant='panel'): with gr.Tabs(): with gr.TabItem('数字人问答'): method = gr.Radio(choices = ['SadTalker', 'Wav2Lip', 'ER-NeRF'], value = 'SadTalker', label = '模型选择') 
                        gen_video = gr.Video(label="Generated video", format="mp4", scale=1, autoplay=True)
                        video_button = gr.Button("Submit", variant='primary')
                        video_button.click(fn=Talker_response,
                                           inputs=[source_image, source_video, method, input_audio, batch_size],
                                           outputs=[gen_video])
                with gr.Row():
                    examples = [
                        [
                            'examples/source_image/full_body_2.png',
                            'What is the most effective way to cope with stress?',
                        ],
                        [
                            'examples/source_image/full_body_1.png',
                            'How should I manage my time?',
                        ],
                        [
                            'examples/source_image/full3.png',
                            'Why do some people choose to use paper maps or ask for directions instead of relying on GPS devices or smartphone apps?',
                        ],
                        [
                            'examples/source_image/full4.jpeg',
                            'Recently, Apple sued Qualcomm, accusing it of failing to cooperate under the relevant contracts; Qualcomm has not yet responded. Who does "it" refer to in this sentence?',
                        ],
                        [
                            'examples/source_image/art_13.png',
                            'The third grade planted 80 trees, and the fourth and fifth grades planted 14 more than twice as many as the third grade. How many trees did the three grades plant in total?',
                        ],
                        [
                            'examples/source_image/art_5.png',
                            "Write a review of a symphony concert, discussing the orchestra's performance and the overall audience experience.",
                        ],
                    ]
                    gr.Examples(examples=examples,
                                inputs=[source_image, input_text])
    return inference


if __name__ == "__main__":
    # Each model is loaded independently so that a missing checkpoint only
    # disables the corresponding option instead of crashing the whole app.
    try:
        from TFG import SadTalker
        sadtalker = SadTalker(lazy_load=True)
    except Exception as e:
        print("SadTalker Error: ", e)
        print("To use SadTalker, please download the SadTalker model first")
    try:
        from TFG import Wav2Lip
        wav2lip = Wav2Lip("checkpoints/wav2lip_gan.pth")
    except Exception as e:
        print("Wav2Lip Error: ", e)
        print("To use Wav2Lip, please download the Wav2Lip model first")
    try:
        from TFG import ERNeRF
        ernerf = ERNeRF()
        ernerf.init_model('checkpoints/Obama_ave.pth', 'checkpoints/Obama.json')
    except Exception as e:
        print("ERNeRF Error: ", e)
        print("To use ERNeRF, please download the ERNeRF model first")
    try:
        from TTS import EdgeTTS
        edgetts = EdgeTTS()
    except Exception as e:
        print("EdgeTTS Error: ", e)
        print("To use EdgeTTS, please download the EdgeTTS model first")
    try:
        from TTS import PaddleTTS
        paddletts = PaddleTTS()
    except Exception as e:
        print("PaddleTTS Error: ", e)
        print("To use PaddleTTS, please download the PaddleTTS model first")

    gr.close_all()
    demo = main()
    demo.queue()
    # demo.launch()
    demo.launch(server_name=ip,  # "127.0.0.1" for local access only, "0.0.0.0" to expose on all interfaces
                server_port=port,
                # Gradio >= 4.0 appears to allow microphone input even without an SSL certificate
                ssl_certfile=ssl_certfile,
                ssl_keyfile=ssl_keyfile,
                ssl_verify=False,
                debug=True)
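# Note: `ip`, `port`, `ssl_certfile`, and `ssl_keyfile` are provided by `configs.py`
# through the wildcard import at the top of this file. A minimal sketch of that module,
# assuming only the names used above (the values are illustrative, not the project's
# actual defaults):
#
#   # configs.py
#   ip = "127.0.0.1"      # or "0.0.0.0" to listen on all interfaces
#   port = 7860           # Gradio's conventional default port
#   ssl_certfile = None   # e.g. "cert.pem"; HTTPS is required for remote microphone access
#   ssl_keyfile = None    # e.g. "key.pem"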