David Victor committed
Commit 6ae1dc9
1 Parent(s): bc3753a
Files changed (3):
  1. app.py +514 -73
  2. app12.py +193 -0
  3. webui.py +0 -634
app.py CHANGED
@@ -1,6 +1,7 @@
  import os
  import random
  import gradio as gr
+ import time
  from zhconv import convert
  from LLM import LLM
  from ASR import WhisperASR
@@ -11,35 +12,44 @@ from src.cost_time import calculate_time
  from configs import *
  os.environ["GRADIO_TEMP_DIR"] = './temp'
  
- description = """<p style="text-align: center; font-weight: bold;">
- <span style="font-size: 28px;">Linly 智能对话系统 (Linly-Talker)</span>
- <br>
- <span style="font-size: 18px;" id="paper-info">
- [<a href="https://zhuanlan.zhihu.com/p/671006998" target="_blank">知乎</a>]
- [<a href="https://www.bilibili.com/video/BV1rN4y1a76x/" target="_blank">bilibili</a>]
- [<a href="https://github.com/Kedreamix/Linly-Talker" target="_blank">GitHub</a>]
- [<a href="https://kedreamix.github.io/" target="_blank">个人主页</a>]
- </span>
- <br>
- <span>Linly-Talker 是一款智能 AI 对话系统,结合了大型语言模型 (LLMs) 与视觉模型,是一种新颖的人工智能交互方式。</span>
- </p>
- """
+ def get_title(title='Linly 智能对话系统 (Linly-Talker)'):
+     description = f"""
+     <p style="text-align: center; font-weight: bold;">
+         <span style="font-size: 28px;">{title}</span>
+         <br>
+         <span style="font-size: 18px;" id="paper-info">
+             [<a href="https://zhuanlan.zhihu.com/p/671006998" target="_blank">知乎</a>]
+             [<a href="https://www.bilibili.com/video/BV1rN4y1a76x/" target="_blank">bilibili</a>]
+             [<a href="https://github.com/Kedreamix/Linly-Talker" target="_blank">GitHub</a>]
+             [<a href="https://kedreamix.github.io/" target="_blank">个人主页</a>]
+         </span>
+         <br>
+         <span>Linly-Talker 是一款智能 AI 对话系统,结合了大型语言模型 (LLMs) 与视觉模型,是一种新颖的人工智能交互方式。</span>
+     </p>
+     """
+     return description
+ 
+ # Default text examples
+ examples = [
+     ['应对压力最有效的方法是什么?', '女性角色', 'SadTalker', 'zh-CN-XiaoxiaoNeural'],
+     ['如何进行时间管理?', '男性角色', 'SadTalker', 'zh-CN-YunyangNeural'],
+     ['为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?', '女性角色', 'SadTalker', 'zh-HK-HiuMaanNeural'],
+     ['近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?', '男性角色', 'SadTalker', 'zh-TW-YunJheNeural'],
+     ['撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。', '男性角色', 'Wav2Lip', 'zh-CN-YunyangNeural'],
+     ['翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.', '女性角色', 'SadTalker', 'zh-CN-XiaoxiaoNeural'],
+ ]
+ 
+ # Default system prompt
+ default_system = '你是一个很有帮助的助手'
  
  # Default parameter values, can be modified
- source_image = r'example.png'
  blink_every = True
  size_of_image = 256
  preprocess_type = 'crop'
  facerender = 'facevid2vid'
  enhancer = False
  is_still_mode = False
- pic_path = "./inputs/girl.png"
- crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
- first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
- crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])
- 
  exp_weight = 1
- 
  use_ref_video = False
  ref_video = None
  ref_info = 'pose'
@@ -58,22 +68,78 @@ def Asr(audio):
      return question
  
  @calculate_time
- def LLM_response(question, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=0, pitch=0):
+ def LLM_response(question_audio, question, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=0, pitch=0):
      answer = llm.generate(question)
      print(answer)
-     try:
-         tts.predict(answer, voice, rate, volume, pitch, 'answer.wav', 'answer.vtt')
-     except:
-         os.system(f'edge-tts --text "{answer}" --voice {voice} --write-media answer.wav')
-     return 'answer.wav', 'answer.vtt', answer
+     if voice in tts.SUPPORTED_VOICE:
+         try:
+             tts.predict(answer, voice, rate, volume, pitch, 'answer.wav', 'answer.vtt')
+         except:
+             os.system(f'edge-tts --text "{answer}" --voice {voice} --write-media answer.wav')
+         return 'answer.wav', 'answer.vtt', answer
+     elif voice == "克隆烟嗓音":
+         try:
+             gpt_path = "../GPT-SoVITS/GPT_weights/yansang-e15.ckpt"
+             sovits_path = "../GPT-SoVITS/SoVITS_weights/yansang_e16_s144.pth"
+             vits.load_model(gpt_path, sovits_path)
+             vits.predict(ref_wav_path="examples/slicer_opt/vocal_output.wav_10.wav_0000846400_0000957760.wav",
+                          prompt_text="你为什么要一次一次的伤我的心啊?",
+                          prompt_language="中文",
+                          text=answer,
+                          text_language="中英混合",
+                          how_to_cut="按标点符号切",
+                          save_path='answer.wav')
+             return 'answer.wav', None, answer
+         except Exception as e:
+             gr.Error("无克隆环境或者无克隆模型权重,无法克隆声音", e)
+             return None, None, None
+     elif voice == "克隆声音":
+         try:
+             if question_audio is None:
+                 gr.Error("无声音输入,无法克隆声音")
+                 # print("无声音输入,无法克隆声音")
+                 return None, None, None
+             gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+             sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+             vits.load_model(gpt_path, sovits_path)
+             vits.predict(ref_wav_path=question_audio,
+                          prompt_text=question,
+                          prompt_language="中文",
+                          text=answer,
+                          text_language="中英混合",
+                          how_to_cut="凑四句一切",
+                          save_path='answer.wav')
+             return 'answer.wav', None, answer
+         except Exception as e:
+             gr.Error("无克隆环境或者无克隆模型权重,无法克隆声音", e)
+             return None, None, None
  
  @calculate_time
- def Talker_response(text, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=100, pitch=0, batch_size=2):
-     voice = 'zh-CN-XiaoxiaoNeural' if voice not in tts.SUPPORTED_VOICE else voice
-     # print(voice, rate, volume, pitch)
-     driven_audio, driven_vtt, _ = LLM_response(text, voice, rate, volume, pitch)
+ def Talker_response(question_audio=None, method='SadTalker', text='', voice='zh-CN-XiaoxiaoNeural',
+                     rate=0, volume=100, pitch=0, batch_size=2, character='女性角色'):
+     if character == '女性角色':
+         # Female character
+         source_image, pic_path = r'inputs/girl.png', r'inputs/girl.png'
+         crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
+         first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
+         crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])
+         default_voice = 'zh-CN-XiaoxiaoNeural'
+     elif character == '男性角色':
+         # Male character
+         source_image = r'./inputs/boy.png'
+         pic_path = "./inputs/boy.png"
+         crop_pic_path = "./inputs/first_frame_dir_boy/boy.png"
+         first_coeff_path = "./inputs/first_frame_dir_boy/boy.mat"
+         crop_info = ((876, 747), (0, 0, 886, 838), [10.382158280494476, 0, 886, 747.7078990925525])
+         default_voice = 'zh-CN-YunyangNeural'
+     else:
+         gr.Error('未知角色')
+         return None
+     voice = default_voice if voice not in tts.SUPPORTED_VOICE + ["克隆烟嗓音", "克隆声音"] else voice
+     print(voice, character)
+     driven_audio, driven_vtt, _ = LLM_response(question_audio, text, voice, rate, volume, pitch)
      pose_style = random.randint(0, 45)
-     video = talker.test(pic_path,
+     if method == 'SadTalker':
+         video = talker.test(pic_path,
                          crop_pic_path,
                          first_coeff_path,
                          crop_info,
@@ -94,14 +160,154 @@ def Talker_response(text, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 100
                          length_of_audio,
                          blink_every,
                          fps=20)
+     elif method == 'Wav2Lip':
+         video = wav2lip.predict(crop_pic_path, driven_audio, batch_size)
+     else:
+         return None
      if driven_vtt:
          return video, driven_vtt
      else:
          return video
  
- def main():
+ def chat_response(system, message, history):
+     # response = llm.generate(message)
+     response, history = llm.chat(system, message, history)
+     print(history)
+     # Stream the output character by character
+     for i in range(len(response)):
+         time.sleep(0.01)
+         yield "", history[:-1] + [(message, response[:i+1])]
+     return "", history
+ 
+ def human_response(history, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=0, pitch=0, batch_size=2, character='女性角色'):
+     response = history[-1][1]
+     driven_audio, video_vtt = 'answer.wav', 'answer.vtt'
+     if character == '女性角色':
+         # Female character
+         source_image, pic_path = r'./inputs/girl.png', r"./inputs/girl.png"
+         crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
+         first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
+         crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])
+         default_voice = 'zh-CN-XiaoxiaoNeural'
+     elif character == '男性角色':
+         # Male character
+         source_image = r'./inputs/boy.png'
+         pic_path = "./inputs/boy.png"
+         crop_pic_path = "./inputs/first_frame_dir_boy/boy.png"
+         first_coeff_path = "./inputs/first_frame_dir_boy/boy.mat"
+         crop_info = ((876, 747), (0, 0, 886, 838), [10.382158280494476, 0, 886, 747.7078990925525])
+         default_voice = 'zh-CN-YunyangNeural'
+     voice = default_voice if voice not in tts.SUPPORTED_VOICE else voice
+     tts.predict(response, voice, rate, volume, pitch, driven_audio, video_vtt)
+     pose_style = random.randint(0, 45)  # chosen at random
+     video_path = talker.test(pic_path, crop_pic_path, first_coeff_path, crop_info,
+                              source_image, driven_audio, preprocess_type,
+                              is_still_mode, enhancer, batch_size, size_of_image,
+                              pose_style, facerender, exp_weight, use_ref_video,
+                              ref_video, ref_info, use_idle_mode, length_of_audio,
+                              blink_every, fps=20)
+ 
+     return video_path, video_vtt
+ 
+ def modify_system_session(system: str) -> str:
+     if system is None or len(system) == 0:
+         system = default_system
+     llm.clear_history()
+     return system, system, []
+ 
+ def clear_session():
+     # clear history
+     llm.clear_history()
+     return '', []
+ 
+ def voice_setting(support_voice):
+     with gr.Accordion("Advanced Settings(高级设置语音参数) ", open=False):
+         voice = gr.Dropdown(support_voice, label="声音选择 Voice",
+                             value="克隆声音" if '克隆声音' in support_voice else None)
+         rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='声音速率 Rate')
+         volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='声音音量 Volume')
+         pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='声音音调 Pitch')
+         batch_size = gr.Slider(minimum=1, maximum=10, value=2, step=1, label='模型参数 调节可以加快生成速度 Talker Batch size')
+ 
+     character = gr.Radio(['女性角色', '男性角色'], label="角色选择", value='女性角色')
+     method = gr.Radio(choices=['SadTalker', 'Wav2Lip', 'ER-NeRF(Coming Soon!!!)'], value='SadTalker', label='模型选择')
+     return voice, rate, volume, pitch, batch_size, character, method
+ 
+ @calculate_time
+ def Talker_response_img(question_audio, method, text, voice, rate, volume, pitch, source_image,
+                         preprocess_type, is_still_mode, enhancer, batch_size, size_of_image,
+                         pose_style, facerender, exp_weight, blink_every, fps):
+     driven_audio, driven_vtt, _ = LLM_response(question_audio, text, voice, rate, volume, pitch)
+     if method == 'SadTalker':
+         video = talker.test2(source_image, driven_audio, preprocess_type,
+                              is_still_mode, enhancer, batch_size, size_of_image,
+                              pose_style, facerender, exp_weight, use_ref_video,
+                              ref_video, ref_info, use_idle_mode, length_of_audio,
+                              blink_every, fps=fps)
+     elif method == 'Wav2Lip':
+         video = wav2lip.predict(source_image, driven_audio, batch_size)
+     else:
+         return None
+     if driven_vtt:
+         return video, driven_vtt
+     else:
+         return video
+ 
+ def app():
      with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
-         gr.HTML(description)
+         gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 文本/语音对话"))
          with gr.Row(equal_height=False):
              with gr.Column(variant='panel'):
                  with gr.Tabs(elem_id="question_audio"):
@@ -109,9 +315,98 @@ def main():
                          with gr.Column(variant='panel'):
                              question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话')
                              input_text = gr.Textbox(label="Input Text", lines=3)
- 
-                             with gr.Accordion("Advanced Settings(高级设置语音参数) ",
-                                               open=False):
+                             voice, rate, volume, pitch, batch_size, character, method = voice_setting(tts.SUPPORTED_VOICE)
+                             asr_text = gr.Button('语音识别(语音对话后点击)')
+                             asr_text.click(fn=Asr, inputs=[question_audio], outputs=[input_text])
+ 
+             with gr.Column(variant='panel'):
+                 with gr.Tabs():
+                     with gr.TabItem('数字人问答'):
+                         gen_video = gr.Video(label="生成视频", format="mp4", scale=1, autoplay=False)
+                         video_button = gr.Button("提交视频生成", variant='primary')
+                         video_button.click(fn=Talker_response,
+                                            inputs=[question_audio, method, input_text, voice, rate, volume, pitch, batch_size, character],
+                                            outputs=[gen_video])
+ 
+         with gr.Row():
+             with gr.Column(variant='panel'):
+                 gr.Markdown("## Test Examples")
+                 gr.Examples(examples=examples,
+                             fn=Talker_response,
+                             inputs=[input_text, character, method, voice],
+                             )
+     return inference
+ 
+ def app_multi():
+     with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
+         gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 多轮GPT对话"))
+         with gr.Row():
+             with gr.Column():
+                 voice, rate, volume, pitch, batch_size, character, method = voice_setting(tts.SUPPORTED_VOICE)
+                 video = gr.Video(label='数字人问答', scale=0.5)
+                 video_button = gr.Button("🎬 生成数字人视频(对话后)", variant='primary')
+ 
+             with gr.Column():
+                 with gr.Row():
+                     with gr.Column(scale=3):
+                         system_input = gr.Textbox(value=default_system, lines=1, label='System (设定角色)')
+                     with gr.Column(scale=1):
+                         modify_system = gr.Button("🛠️ 设置system并清除历史对话", scale=2)
+                     system_state = gr.Textbox(value=default_system, visible=False)
+ 
+                 chatbot = gr.Chatbot(height=400, show_copy_button=True)
+                 audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话', autoplay=False)
+                 asr_text = gr.Button('🎤 语音识别(语音对话后点击)')
+ 
+                 # Textbox component for entering the prompt
+                 msg = gr.Textbox(label="Prompt/问题")
+                 asr_text.click(fn=Asr, inputs=[audio], outputs=[msg])
+ 
+                 with gr.Row():
+                     clear_history = gr.Button("🧹 清除历史对话")
+                     submit = gr.Button("🚀 发送", variant='primary')
+ 
+                 # On click, pass the user message and chat history to chat_response,
+                 # then update the textbox and the chatbot component.
+                 submit.click(chat_response, inputs=[system_input, msg, chatbot],
+                              outputs=[msg, chatbot])
+ 
+                 # Clear the chat history stored in the backend
+                 clear_history.click(fn=clear_session, outputs=[msg, chatbot])
+ 
+                 # Set the system prompt and clear the chat history
+                 modify_system.click(fn=modify_system_session,
+                                     inputs=[system_input],
+                                     outputs=[system_state, system_input, chatbot])
+ 
+         video_button.click(fn=human_response, inputs=[chatbot, voice, rate, volume, pitch, batch_size, character], outputs=[video])
+ 
+         with gr.Row(variant='panel'):
+             with gr.Column(variant='panel'):
+                 gr.Markdown("## Test Examples")
+                 gr.Examples(examples=examples,
+                             fn=Talker_response,
+                             inputs=[msg, character, method, voice],
+                             )
+     return inference
+ 
+ def app_img():
+     with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
+         gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 任意图片对话"))
+         with gr.Row(equal_height=False):
+             with gr.Column(variant='panel'):
+                 with gr.Tabs(elem_id="sadtalker_source_image"):
+                     with gr.TabItem('Source image'):
+                         with gr.Row():
+                             source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image", width=512)
+ 
+                 with gr.Tabs(elem_id="question_audio"):
+                     with gr.TabItem('对话'):
+                         with gr.Column(variant='panel'):
+                             question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话')
+                             input_text = gr.Textbox(label="Input Text", lines=3, info='文字对话')
+                             with gr.Accordion("Advanced Settings",
+                                               open=False,
+                                               visible=True) as parameter_article:
                                  voice = gr.Dropdown(tts.SUPPORTED_VOICE,
                                                      value='zh-CN-XiaoxiaoNeural',
                                                      label="Voice")
@@ -130,64 +425,210 @@ def main():
                                                   value=0,
                                                   step=1,
                                                   label='Pitch')
-                             batch_size = gr.Slider(minimum=1,
-                                                    maximum=10,
-                                                    value=2,
-                                                    step=1,
-                                                    label='Talker Batch size')
+ 
                              asr_text = gr.Button('语音识别(语音对话后点击)')
                              asr_text.click(fn=Asr, inputs=[question_audio], outputs=[input_text])
- 
-             # with gr.Column(variant='panel'):
-             #     input_text = gr.Textbox(label="Input Text", lines=3)
-             #     text_button = gr.Button("文字对话", variant='primary')
  
-             with gr.Column(variant='panel'):
-                 with gr.Tabs():
-                     with gr.TabItem('数字人问答'):
-                         gen_video = gr.Video(label="Generated video", format="mp4", scale=1, autoplay=True)
-                         video_button = gr.Button("提交", variant='primary')
-                         video_button.click(fn=Talker_response, inputs=[input_text, voice, rate, volume, pitch, batch_size], outputs=[gen_video])
- 
-         with gr.Row():
-             with gr.Column(variant='panel'):
+                 # with gr.Tabs(elem_id="response_audio"):
+                 #     with gr.TabItem("语音选择"):
+                 #         with gr.Column(variant='panel'):
+                 #             voice = gr.Dropdown(VOICES, values='zh-CN-XiaoxiaoNeural')
+ 
+                 with gr.Tabs(elem_id="text_examples"):
                      gr.Markdown("## Text Examples")
-                 examples = ['应对压力最有效的方法是什么?',
-                             '如何进行时间管理?',
-                             '为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?',
-                             '近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?',
-                             '三年级同学种树80颗,四、五年级种的棵树比三年级种的2倍多14棵,三个年级共种树多少棵?',
-                             '撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。',
-                             '翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.',
-                             ]
+                     examples = [
+                         ['应对压力最有效的方法是什么?'],
+                         ['如何进行时间管理?'],
+                         ['为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?'],
+                         ['近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?'],
+                         ['三年级同学种树80颗,四、五年级种的棵树比三年级种的2倍多14棵,三个年级共种树多少棵?'],
+                         ['撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。'],
+                         ['翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.'],
+                     ]
                      gr.Examples(
                          examples = examples,
-                         fn = Talker_response,
                          inputs = [input_text],
-                         outputs=[gen_video],
-                         # cache_examples = True,
                      )
+ 
+             # driven_audio = 'answer.wav'
+             with gr.Column(variant='panel'):
+                 method = gr.Radio(choices=['SadTalker', 'Wav2Lip', 'ER-NeRF(Coming Soon!!!)'], value='SadTalker', label='模型选择')
+                 with gr.Tabs(elem_id="sadtalker_checkbox"):
+                     with gr.TabItem('Settings'):
+                         with gr.Accordion("Advanced Settings", open=False):
+                             gr.Markdown("SadTalker: need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more details")
+                             with gr.Column(variant='panel'):
+                                 # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512)
+                                 # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512)
+                                 with gr.Row():
+                                     pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
+                                     exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
+                                     blink_every = gr.Checkbox(label="use eye blink", value=True)
+ 
+                                 with gr.Row():
+                                     size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster")
+                                     preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
+ 
+                                 with gr.Row():
+                                     is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
+                                     facerender = gr.Radio(['facevid2vid', 'PIRender'], value='facevid2vid', label='facerender', info="which face render?")
+ 
+                                 with gr.Row():
+                                     batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
+                                     fps = gr.Slider(label='fps in generation', step=1, maximum=30, value=20)
+                                     enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")
+ 
+                 with gr.Tabs(elem_id="sadtalker_generated"):
+                     gen_video = gr.Video(label="Generated video", format="mp4", scale=0.8)
+ 
+                 submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
+                 submit.click(fn=Talker_response_img,
+                              inputs=[question_audio, method, input_text,
+                                      voice, rate, volume, pitch,
+                                      source_image, preprocess_type, is_still_mode,
+                                      enhancer, batch_size, size_of_image, pose_style,
+                                      facerender, exp_weight, blink_every, fps],
+                              outputs=[gen_video]
+                              )
+ 
+         with gr.Row():
+             examples = [
+                 ['examples/source_image/full_body_2.png', 'crop', False, False],
+                 ['examples/source_image/full_body_1.png', 'crop', False, False],
+                 ['examples/source_image/full3.png', 'crop', False, False],
+                 ['examples/source_image/full4.jpeg', 'crop', False, False],
+                 ['examples/source_image/art_13.png', 'crop', False, False],
+                 ['examples/source_image/art_5.png', 'crop', False, False],
+             ]
+             gr.Examples(examples=examples,
+                         fn=Talker_response,
+                         inputs=[source_image, preprocess_type, is_still_mode, enhancer],
+                         outputs=[gen_video],
+                         # cache_examples=True,
+                         )
      return inference
  
+ def app_vits():
+     with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
+         gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 语音克隆"))
+         with gr.Row(equal_height=False):
+             with gr.Column(variant='panel'):
+                 with gr.Tabs(elem_id="question_audio"):
+                     with gr.TabItem('对话'):
+                         with gr.Column(variant='panel'):
+                             question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话')
+                             input_text = gr.Textbox(label="Input Text", lines=3)
+                             voice, rate, volume, pitch, batch_size, character, method = voice_setting(["克隆声音", "克隆烟嗓音"] + tts.SUPPORTED_VOICE)
+                             asr_text = gr.Button('语音识别(语音对话后点击)')
+                             asr_text.click(fn=Asr, inputs=[question_audio], outputs=[input_text])
+             with gr.Column(variant='panel'):
+                 with gr.Tabs():
+                     with gr.TabItem('数字人问答'):
+                         gen_video = gr.Video(label="Generated video", format="mp4", scale=1, autoplay=False)
+                         video_button = gr.Button("提交", variant='primary')
+                         video_button.click(fn=Talker_response,
+                                            inputs=[question_audio, method, input_text, voice, rate, volume, pitch, batch_size, character],
+                                            outputs=[gen_video])
+ 
+         with gr.Row():
+             with gr.Column(variant='panel'):
+                 gr.Markdown("## Test Examples")
+                 gr.Examples(examples=[["如何应对压力", "男性角色", "SadTalker", "克隆烟嗓音"],
+                                       ["北京有什么好玩的地方", "男性角色", "SadTalker", "克隆声音"]] + examples,
+                             fn=Talker_response,
+                             inputs=[input_text, character, method, voice],
+                             )
+     return inference
  
  if __name__ == "__main__":
      # llm = LLM(mode='offline').init_model('Linly', 'Linly-AI/Chinese-LLaMA-2-7B-hf')
      # llm = LLM(mode='offline').init_model('Gemini', 'gemini-pro', api_key = "your api key")
      # llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
      llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
-     talker = SadTalker(lazy_load=True)
-     asr = WhisperASR('base')
+     try:
+         talker = SadTalker(lazy_load=True)
+     except Exception as e:
+         print("SadTalker Error: ", e)
+         # print("如果使用SadTalker,请先下载SadTalker模型")
+         gr.Warning("如果使用SadTalker,请先下载SadTalker模型")
+     try:
+         from TFG import Wav2Lip
+         wav2lip = Wav2Lip("checkpoints/wav2lip_gan.pth")
+     except Exception as e:
+         print("Wav2Lip Error: ", e)
+         print("如果使用Wav2Lip,请先下载Wav2Lip模型")
+     try:
+         from VITS import GPT_SoVITS
+         vits = GPT_SoVITS()
+     except Exception as e:
+         print("GPT-SoVITS Error: ", e)
+         print("如果使用VITS,请先下载GPT-SoVITS模型和安装环境")
+     try:
+         from ASR import FunASR
+         asr = FunASR()
+     except Exception as e:
+         print("ASR Error: ", e)
+         print("如果使用FunASR,请先下载FunASR模型和安装环境")
+         asr = WhisperASR('base')  # fall back to Whisper
      tts = EdgeTTS()
      gr.close_all()
-     demo = main()
-     demo.queue()
-     # demo.launch()
-     demo.launch(server_name=ip,  # localhost: "127.0.0.1"; "0.0.0.0" forwards the port globally
+     demo_app = app()
+     demo_img = app_img()
+     demo_multi = app_multi()
+     demo_vits = app_vits()
+     demo = gr.TabbedInterface(interface_list=[demo_app, demo_img, demo_multi, demo_vits],
+                               tab_names=["文本/语音对话", "任意图片对话", "多轮GPT对话", "语音克隆数字人对话"],
+                               title="Linly-Talker WebUI")
+     demo.launch(server_name="127.0.0.1",  # localhost; use "0.0.0.0" to forward the port globally
                  server_port=port,
                  # On Gradio >= 4.0 the microphone seems to work even without an SSL certificate
                  ssl_certfile=ssl_certfile,
                  ssl_keyfile=ssl_keyfile,
                  ssl_verify=False,
-                 debug=True)
+                 debug=True,
+                 )
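
Note on the new TTS dispatch: `LLM_response` now routes on the `voice` value. Edge TTS voices go through `tts.predict`, while the two clone options ("克隆声音" / "克隆烟嗓音") load GPT-SoVITS weights and synthesize the answer from a reference clip. A minimal sketch of driving that clone path outside Gradio, assuming the repo's `VITS.GPT_SoVITS` wrapper with the `load_model`/`predict` interface used above (the reference clip, its transcript, the synthesized text, and the output path are illustrative, not shipped files):

    from VITS import GPT_SoVITS  # the repo's GPT-SoVITS wrapper

    vits = GPT_SoVITS()
    # Pretrained (non-fine-tuned) weights, as in the "克隆声音" branch above
    vits.load_model("GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
                    "GPT_SoVITS/pretrained_models/s2G488k.pth")
    vits.predict(ref_wav_path="my_reference.wav",    # illustrative: the voice to clone
                 prompt_text="参考音频对应的文字",     # illustrative: transcript of the reference clip
                 prompt_language="中文",
                 text="要合成的回答文本",              # illustrative: text to synthesize
                 text_language="中英混合",
                 how_to_cut="凑四句一切",
                 save_path="cloned.wav")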
app12.py ADDED
@@ -0,0 +1,193 @@
import os
import random
import gradio as gr
from zhconv import convert
from LLM import LLM
from ASR import WhisperASR
from TFG import SadTalker
from TTS import EdgeTTS
from src.cost_time import calculate_time

from configs import *
os.environ["GRADIO_TEMP_DIR"] = './temp'

description = """<p style="text-align: center; font-weight: bold;">
    <span style="font-size: 28px;">Linly 智能对话系统 (Linly-Talker)</span>
    <br>
    <span style="font-size: 18px;" id="paper-info">
        [<a href="https://zhuanlan.zhihu.com/p/671006998" target="_blank">知乎</a>]
        [<a href="https://www.bilibili.com/video/BV1rN4y1a76x/" target="_blank">bilibili</a>]
        [<a href="https://github.com/Kedreamix/Linly-Talker" target="_blank">GitHub</a>]
        [<a href="https://kedreamix.github.io/" target="_blank">个人主页</a>]
    </span>
    <br>
    <span>Linly-Talker 是一款智能 AI 对话系统,结合了大型语言模型 (LLMs) 与视觉模型,是一种新颖的人工智能交互方式。</span>
</p>
"""

# Default parameter values, can be modified
source_image = r'example.png'
blink_every = True
size_of_image = 256
preprocess_type = 'crop'
facerender = 'facevid2vid'
enhancer = False
is_still_mode = False
pic_path = "./inputs/girl.png"
crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])

exp_weight = 1

use_ref_video = False
ref_video = None
ref_info = 'pose'
use_idle_mode = False
length_of_audio = 5

@calculate_time
def Asr(audio):
    try:
        question = asr.transcribe(audio)
        question = convert(question, 'zh-cn')
    except Exception as e:
        print("ASR Error: ", e)
        question = 'Gradio存在一些bug,麦克风模式有时候可能音频还未传入,请重新点击一下语音识别即可'
        gr.Warning(question)
    return question

@calculate_time
def LLM_response(question, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=0, pitch=0):
    answer = llm.generate(question)
    print(answer)
    try:
        tts.predict(answer, voice, rate, volume, pitch, 'answer.wav', 'answer.vtt')
    except:
        # Fall back to the edge-tts CLI if the TTS wrapper fails
        os.system(f'edge-tts --text "{answer}" --voice {voice} --write-media answer.wav')
    return 'answer.wav', 'answer.vtt', answer

@calculate_time
def Talker_response(text, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=100, pitch=0, batch_size=2):
    voice = 'zh-CN-XiaoxiaoNeural' if voice not in tts.SUPPORTED_VOICE else voice
    # print(voice, rate, volume, pitch)
    driven_audio, driven_vtt, _ = LLM_response(text, voice, rate, volume, pitch)
    pose_style = random.randint(0, 45)
    video = talker.test(pic_path, crop_pic_path, first_coeff_path, crop_info,
                        source_image, driven_audio, preprocess_type,
                        is_still_mode, enhancer, batch_size, size_of_image,
                        pose_style, facerender, exp_weight, use_ref_video,
                        ref_video, ref_info, use_idle_mode, length_of_audio,
                        blink_every, fps=20)
    if driven_vtt:
        return video, driven_vtt
    else:
        return video

def main():
    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
        gr.HTML(description)
        with gr.Row(equal_height=False):
            with gr.Column(variant='panel'):
                with gr.Tabs(elem_id="question_audio"):
                    with gr.TabItem('对话'):
                        with gr.Column(variant='panel'):
                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话')
                            input_text = gr.Textbox(label="Input Text", lines=3)

                            with gr.Accordion("Advanced Settings(高级设置语音参数) ", open=False):
                                voice = gr.Dropdown(tts.SUPPORTED_VOICE, value='zh-CN-XiaoxiaoNeural', label="Voice")
                                rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='Rate')
                                volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='Volume')
                                pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='Pitch')
                                batch_size = gr.Slider(minimum=1, maximum=10, value=2, step=1, label='Talker Batch size')
                            asr_text = gr.Button('语音识别(语音对话后点击)')
                            asr_text.click(fn=Asr, inputs=[question_audio], outputs=[input_text])

            # with gr.Column(variant='panel'):
            #     input_text = gr.Textbox(label="Input Text", lines=3)
            #     text_button = gr.Button("文字对话", variant='primary')

            with gr.Column(variant='panel'):
                with gr.Tabs():
                    with gr.TabItem('数字人问答'):
                        gen_video = gr.Video(label="Generated video", format="mp4", scale=1, autoplay=True)
                        video_button = gr.Button("提交", variant='primary')
                        video_button.click(fn=Talker_response, inputs=[input_text, voice, rate, volume, pitch, batch_size], outputs=[gen_video])

        with gr.Row():
            with gr.Column(variant='panel'):
                gr.Markdown("## Text Examples")
                examples = ['应对压力最有效的方法是什么?',
                            '如何进行时间管理?',
                            '为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?',
                            '近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?',
                            '三年级同学种树80颗,四、五年级种的棵树比三年级种的2倍多14棵,三个年级共种树多少棵?',
                            '撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。',
                            '翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.',
                            ]
                gr.Examples(examples=examples,
                            fn=Talker_response,
                            inputs=[input_text],
                            outputs=[gen_video],
                            # cache_examples=True,
                            )
    return inference

if __name__ == "__main__":
    # llm = LLM(mode='offline').init_model('Linly', 'Linly-AI/Chinese-LLaMA-2-7B-hf')
    # llm = LLM(mode='offline').init_model('Gemini', 'gemini-pro', api_key = "your api key")
    # llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
    llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
    talker = SadTalker(lazy_load=True)
    asr = WhisperASR('base')
    tts = EdgeTTS()
    gr.close_all()
    demo = main()
    demo.queue()
    # demo.launch()
    demo.launch(server_name=ip,  # localhost: "127.0.0.1"; "0.0.0.0" forwards the port globally
                server_port=port,
                # On Gradio >= 4.0 the microphone seems to work even without an SSL certificate
                ssl_certfile=ssl_certfile,
                ssl_keyfile=ssl_keyfile,
                ssl_verify=False,
                debug=True)
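
A note on the `except` fallback in `LLM_response` (both the app12.py version above and the app.py version): it shells out to the edge-tts CLI with the answer interpolated into a shell string, so an answer containing double quotes breaks the command. A minimal sketch of a safer variant, using the standard-library `subprocess` module with the same CLI flags used above:

    import subprocess
    # Passing arguments as a list avoids shell quoting issues in the answer text
    subprocess.run(["edge-tts", "--text", answer, "--voice", voice,
                    "--write-media", "answer.wav"], check=True)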
webui.py DELETED
@@ -1,634 +0,0 @@
import os
import random
import gradio as gr
import time
from zhconv import convert
from LLM import LLM
from ASR import WhisperASR
from TFG import SadTalker
from TTS import EdgeTTS
from src.cost_time import calculate_time

from configs import *
os.environ["GRADIO_TEMP_DIR"] = './temp'

def get_title(title='Linly 智能对话系统 (Linly-Talker)'):
    description = f"""
    <p style="text-align: center; font-weight: bold;">
        <span style="font-size: 28px;">{title}</span>
        <br>
        <span style="font-size: 18px;" id="paper-info">
            [<a href="https://zhuanlan.zhihu.com/p/671006998" target="_blank">知乎</a>]
            [<a href="https://www.bilibili.com/video/BV1rN4y1a76x/" target="_blank">bilibili</a>]
            [<a href="https://github.com/Kedreamix/Linly-Talker" target="_blank">GitHub</a>]
            [<a href="https://kedreamix.github.io/" target="_blank">个人主页</a>]
        </span>
        <br>
        <span>Linly-Talker 是一款智能 AI 对话系统,结合了大型语言模型 (LLMs) 与视觉模型,是一种新颖的人工智能交互方式。</span>
    </p>
    """
    return description

# Default text examples
examples = [
    ['应对压力最有效的方法是什么?', '女性角色', 'SadTalker', 'zh-CN-XiaoxiaoNeural'],
    ['如何进行时间管理?', '男性角色', 'SadTalker', 'zh-CN-YunyangNeural'],
    ['为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?', '女性角色', 'SadTalker', 'zh-HK-HiuMaanNeural'],
    ['近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?', '男性角色', 'SadTalker', 'zh-TW-YunJheNeural'],
    ['撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。', '男性角色', 'Wav2Lip', 'zh-CN-YunyangNeural'],
    ['翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.', '女性角色', 'SadTalker', 'zh-CN-XiaoxiaoNeural'],
]

# Default system prompt
default_system = '你是一个很有帮助的助手'

# Default parameter values, can be modified
blink_every = True
size_of_image = 256
preprocess_type = 'crop'
facerender = 'facevid2vid'
enhancer = False
is_still_mode = False
exp_weight = 1
use_ref_video = False
ref_video = None
ref_info = 'pose'
use_idle_mode = False
length_of_audio = 5

@calculate_time
def Asr(audio):
    try:
        question = asr.transcribe(audio)
        question = convert(question, 'zh-cn')
    except Exception as e:
        print("ASR Error: ", e)
        question = 'Gradio存在一些bug,麦克风模式有时候可能音频还未传入,请重新点击一下语音识别即可'
        gr.Warning(question)
    return question

@calculate_time
def LLM_response(question_audio, question, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=0, pitch=0):
    answer = llm.generate(question)
    print(answer)
    if voice in tts.SUPPORTED_VOICE:
        try:
            tts.predict(answer, voice, rate, volume, pitch, 'answer.wav', 'answer.vtt')
        except:
            os.system(f'edge-tts --text "{answer}" --voice {voice} --write-media answer.wav')
        return 'answer.wav', 'answer.vtt', answer
    elif voice == "克隆烟嗓音":
        try:
            gpt_path = "../GPT-SoVITS/GPT_weights/yansang-e15.ckpt"
            sovits_path = "../GPT-SoVITS/SoVITS_weights/yansang_e16_s144.pth"
            vits.load_model(gpt_path, sovits_path)
            vits.predict(ref_wav_path="examples/slicer_opt/vocal_output.wav_10.wav_0000846400_0000957760.wav",
                         prompt_text="你为什么要一次一次的伤我的心啊?",
                         prompt_language="中文",
                         text=answer,
                         text_language="中英混合",
                         how_to_cut="按标点符号切",
                         save_path='answer.wav')
            return 'answer.wav', None, answer
        except Exception as e:
            gr.Error("无克隆环境或者无克隆模型权重,无法克隆声音", e)
            return None, None, None
    elif voice == "克隆声音":
        try:
            if question_audio is None:
                gr.Error("无声音输入,无法克隆声音")
                # print("无声音输入,无法克隆声音")
                return None, None, None
            gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
            sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
            vits.load_model(gpt_path, sovits_path)
            vits.predict(ref_wav_path=question_audio,
                         prompt_text=question,
                         prompt_language="中文",
                         text=answer,
                         text_language="中英混合",
                         how_to_cut="凑四句一切",
                         save_path='answer.wav')
            return 'answer.wav', None, answer
        except Exception as e:
            gr.Error("无克隆环境或者无克隆模型权重,无法克隆声音", e)
            return None, None, None

@calculate_time
def Talker_response(question_audio=None, method='SadTalker', text='', voice='zh-CN-XiaoxiaoNeural',
                    rate=0, volume=100, pitch=0, batch_size=2, character='女性角色'):
    if character == '女性角色':
        # Female character
        source_image, pic_path = r'inputs/girl.png', r'inputs/girl.png'
        crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
        first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
        crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])
        default_voice = 'zh-CN-XiaoxiaoNeural'
    elif character == '男性角色':
        # Male character
        source_image = r'./inputs/boy.png'
        pic_path = "./inputs/boy.png"
        crop_pic_path = "./inputs/first_frame_dir_boy/boy.png"
        first_coeff_path = "./inputs/first_frame_dir_boy/boy.mat"
        crop_info = ((876, 747), (0, 0, 886, 838), [10.382158280494476, 0, 886, 747.7078990925525])
        default_voice = 'zh-CN-YunyangNeural'
    else:
        gr.Error('未知角色')
        return None
    voice = default_voice if voice not in tts.SUPPORTED_VOICE + ["克隆烟嗓音", "克隆声音"] else voice
    print(voice, character)
    driven_audio, driven_vtt, _ = LLM_response(question_audio, text, voice, rate, volume, pitch)
    pose_style = random.randint(0, 45)
    if method == 'SadTalker':
        video = talker.test(pic_path, crop_pic_path, first_coeff_path, crop_info,
                            source_image, driven_audio, preprocess_type,
                            is_still_mode, enhancer, batch_size, size_of_image,
                            pose_style, facerender, exp_weight, use_ref_video,
                            ref_video, ref_info, use_idle_mode, length_of_audio,
                            blink_every, fps=20)
    elif method == 'Wav2Lip':
        video = wav2lip.predict(crop_pic_path, driven_audio, batch_size)
    else:
        return None
    if driven_vtt:
        return video, driven_vtt
    else:
        return video

def chat_response(system, message, history):
    # response = llm.generate(message)
    response, history = llm.chat(system, message, history)
    print(history)
    # Stream the output character by character
    for i in range(len(response)):
        time.sleep(0.01)
        yield "", history[:-1] + [(message, response[:i+1])]
    return "", history

def human_response(history, voice='zh-CN-XiaoxiaoNeural', rate=0, volume=0, pitch=0, batch_size=2, character='女性角色'):
    response = history[-1][1]
    driven_audio, video_vtt = 'answer.wav', 'answer.vtt'
    if character == '女性角色':
        # Female character
        source_image, pic_path = r'./inputs/girl.png', r"./inputs/girl.png"
        crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
        first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
        crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])
        default_voice = 'zh-CN-XiaoxiaoNeural'
    elif character == '男性角色':
        # Male character
        source_image = r'./inputs/boy.png'
        pic_path = "./inputs/boy.png"
        crop_pic_path = "./inputs/first_frame_dir_boy/boy.png"
        first_coeff_path = "./inputs/first_frame_dir_boy/boy.mat"
        crop_info = ((876, 747), (0, 0, 886, 838), [10.382158280494476, 0, 886, 747.7078990925525])
        default_voice = 'zh-CN-YunyangNeural'
    voice = default_voice if voice not in tts.SUPPORTED_VOICE else voice
    tts.predict(response, voice, rate, volume, pitch, driven_audio, video_vtt)
    pose_style = random.randint(0, 45)  # chosen at random
    video_path = talker.test(pic_path, crop_pic_path, first_coeff_path, crop_info,
                             source_image, driven_audio, preprocess_type,
                             is_still_mode, enhancer, batch_size, size_of_image,
                             pose_style, facerender, exp_weight, use_ref_video,
                             ref_video, ref_info, use_idle_mode, length_of_audio,
                             blink_every, fps=20)

    return video_path, video_vtt

def modify_system_session(system: str) -> str:
    if system is None or len(system) == 0:
        system = default_system
    llm.clear_history()
    return system, system, []

def clear_session():
    # clear history
    llm.clear_history()
    return '', []

def voice_setting(support_voice):
    with gr.Accordion("Advanced Settings(高级设置语音参数) ", open=False):
        voice = gr.Dropdown(support_voice, label="声音选择 Voice",
                            value="克隆声音" if '克隆声音' in support_voice else None)
        rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='声音速率 Rate')
        volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='声音音量 Volume')
        pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='声音音调 Pitch')
        batch_size = gr.Slider(minimum=1, maximum=10, value=2, step=1, label='模型参数 调节可以加快生成速度 Talker Batch size')

    character = gr.Radio(['女性角色', '男性角色'], label="角色选择", value='女性角色')
    method = gr.Radio(choices=['SadTalker', 'Wav2Lip', 'ER-NeRF(Coming Soon!!!)'], value='SadTalker', label='模型选择')
    return voice, rate, volume, pitch, batch_size, character, method

@calculate_time
def Talker_response_img(question_audio, method, text, voice, rate, volume, pitch, source_image,
                        preprocess_type, is_still_mode, enhancer, batch_size, size_of_image,
                        pose_style, facerender, exp_weight, blink_every, fps):
    driven_audio, driven_vtt, _ = LLM_response(question_audio, text, voice, rate, volume, pitch)
    if method == 'SadTalker':
        video = talker.test2(source_image, driven_audio, preprocess_type,
                             is_still_mode, enhancer, batch_size, size_of_image,
                             pose_style, facerender, exp_weight, use_ref_video,
                             ref_video, ref_info, use_idle_mode, length_of_audio,
                             blink_every, fps=fps)
    elif method == 'Wav2Lip':
        video = wav2lip.predict(source_image, driven_audio, batch_size)
    else:
        return None
    if driven_vtt:
        return video, driven_vtt
    else:
        return video

def app():
    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
        gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 文本/语音对话"))
        with gr.Row(equal_height=False):
            with gr.Column(variant='panel'):
                with gr.Tabs(elem_id="question_audio"):
                    with gr.TabItem('对话'):
                        with gr.Column(variant='panel'):
                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话')
                            input_text = gr.Textbox(label="Input Text", lines=3)
                            voice, rate, volume, pitch, batch_size, character, method = voice_setting(tts.SUPPORTED_VOICE)
                            asr_text = gr.Button('语音识别(语音对话后点击)')
                            asr_text.click(fn=Asr, inputs=[question_audio], outputs=[input_text])

            with gr.Column(variant='panel'):
                with gr.Tabs():
                    with gr.TabItem('数字人问答'):
                        gen_video = gr.Video(label="生成视频", format="mp4", scale=1, autoplay=False)
                        video_button = gr.Button("提交视频生成", variant='primary')
                        video_button.click(fn=Talker_response,
                                           inputs=[question_audio, method, input_text, voice, rate, volume, pitch, batch_size, character],
                                           outputs=[gen_video])

        with gr.Row():
            with gr.Column(variant='panel'):
                gr.Markdown("## Test Examples")
                gr.Examples(examples=examples,
                            fn=Talker_response,
                            inputs=[input_text, character, method, voice],
                            )
    return inference

def app_multi():
    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
        gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 多轮GPT对话"))
        with gr.Row():
            with gr.Column():
                voice, rate, volume, pitch, batch_size, character, method = voice_setting(tts.SUPPORTED_VOICE)
                video = gr.Video(label='数字人问答', scale=0.5)
                video_button = gr.Button("🎬 生成数字人视频(对话后)", variant='primary')

            with gr.Column():
                with gr.Row():
                    with gr.Column(scale=3):
                        system_input = gr.Textbox(value=default_system, lines=1, label='System (设定角色)')
                    with gr.Column(scale=1):
                        modify_system = gr.Button("🛠️ 设置system并清除历史对话", scale=2)
                    system_state = gr.Textbox(value=default_system, visible=False)

                chatbot = gr.Chatbot(height=400, show_copy_button=True)
                audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话', autoplay=False)
                asr_text = gr.Button('🎤 语音识别(语音对话后点击)')

                # Textbox component for entering the prompt
                msg = gr.Textbox(label="Prompt/问题")
                asr_text.click(fn=Asr, inputs=[audio], outputs=[msg])

                with gr.Row():
                    clear_history = gr.Button("🧹 清除历史对话")
                    submit = gr.Button("🚀 发送", variant='primary')

                # On click, pass the user message and chat history to chat_response,
                # then update the textbox and the chatbot component.
                submit.click(chat_response, inputs=[system_input, msg, chatbot],
                             outputs=[msg, chatbot])

                # Clear the chat history stored in the backend
                clear_history.click(fn=clear_session, outputs=[msg, chatbot])

                # Set the system prompt and clear the chat history
                modify_system.click(fn=modify_system_session,
                                    inputs=[system_input],
                                    outputs=[system_state, system_input, chatbot])

        video_button.click(fn=human_response, inputs=[chatbot, voice, rate, volume, pitch, batch_size, character], outputs=[video])

        with gr.Row(variant='panel'):
            with gr.Column(variant='panel'):
                gr.Markdown("## Test Examples")
                gr.Examples(examples=examples,
                            fn=Talker_response,
                            inputs=[msg, character, method, voice],
                            )
    return inference

def app_img():
    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
        gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 任意图片对话"))
        with gr.Row(equal_height=False):
            with gr.Column(variant='panel'):
                with gr.Tabs(elem_id="sadtalker_source_image"):
                    with gr.TabItem('Source image'):
                        with gr.Row():
                            source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image", width=512)

                with gr.Tabs(elem_id="question_audio"):
                    with gr.TabItem('对话'):
                        with gr.Column(variant='panel'):
                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话')
                            input_text = gr.Textbox(label="Input Text", lines=3, info='文字对话')
                            with gr.Accordion("Advanced Settings", open=False, visible=True) as parameter_article:
                                voice = gr.Dropdown(tts.SUPPORTED_VOICE, value='zh-CN-XiaoxiaoNeural', label="Voice")
                                rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='Rate')
                                volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='Volume')
                                pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='Pitch')

                            asr_text = gr.Button('语音识别(语音对话后点击)')
                            asr_text.click(fn=Asr, inputs=[question_audio], outputs=[input_text])

                # with gr.Tabs(elem_id="response_audio"):
                #     with gr.TabItem("语音选择"):
                #         with gr.Column(variant='panel'):
                #             voice = gr.Dropdown(VOICES, values='zh-CN-XiaoxiaoNeural')

                with gr.Tabs(elem_id="text_examples"):
                    gr.Markdown("## Text Examples")
                    examples = [
                        ['应对压力最有效的方法是什么?'],
                        ['如何进行时间管理?'],
                        ['为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?'],
                        ['近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?'],
                        ['三年级同学种树80颗,四、五年级种的棵树比三年级种的2倍多14棵,三个年级共种树多少棵?'],
                        ['撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。'],
                        ['翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.'],
                    ]
                    gr.Examples(examples=examples,
                                inputs=[input_text],
                                )

            # driven_audio = 'answer.wav'
            with gr.Column(variant='panel'):
                method = gr.Radio(choices=['SadTalker', 'Wav2Lip', 'ER-NeRF(Coming Soon!!!)'], value='SadTalker', label='模型选择')
                with gr.Tabs(elem_id="sadtalker_checkbox"):
                    with gr.TabItem('Settings'):
                        with gr.Accordion("Advanced Settings", open=False):
                            gr.Markdown("SadTalker: need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more details")
                            with gr.Column(variant='panel'):
                                # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512)
                                # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512)
                                with gr.Row():
                                    pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
                                    exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
                                    blink_every = gr.Checkbox(label="use eye blink", value=True)

                                with gr.Row():
                                    size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster")
                                    preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")

                                with gr.Row():
                                    is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
                                    facerender = gr.Radio(['facevid2vid', 'PIRender'], value='facevid2vid', label='facerender', info="which face render?")

                                with gr.Row():
                                    batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
                                    fps = gr.Slider(label='fps in generation', step=1, maximum=30, value=20)
                                    enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")

                with gr.Tabs(elem_id="sadtalker_generated"):
                    gen_video = gr.Video(label="Generated video", format="mp4", scale=0.8)

                submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
                submit.click(fn=Talker_response_img,
                             inputs=[question_audio, method, input_text,
                                     voice, rate, volume, pitch,
                                     source_image, preprocess_type, is_still_mode,
                                     enhancer, batch_size, size_of_image, pose_style,
                                     facerender, exp_weight, blink_every, fps],
                             outputs=[gen_video]
                             )

        with gr.Row():
            examples = [
                ['examples/source_image/full_body_2.png', 'crop', False, False],
                ['examples/source_image/full_body_1.png', 'crop', False, False],
                ['examples/source_image/full3.png', 'crop', False, False],
                ['examples/source_image/full4.jpeg', 'crop', False, False],
                ['examples/source_image/art_13.png', 'crop', False, False],
                ['examples/source_image/art_5.png', 'crop', False, False],
            ]
            gr.Examples(examples=examples,
                        fn=Talker_response,
                        inputs=[source_image, preprocess_type, is_still_mode, enhancer],
                        outputs=[gen_video],
                        # cache_examples=True,
                        )
    return inference

def app_vits():
    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
        gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 语音克隆"))
        with gr.Row(equal_height=False):
            with gr.Column(variant='panel'):
                with gr.Tabs(elem_id="question_audio"):
                    with gr.TabItem('对话'):
                        with gr.Column(variant='panel'):
                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话')
                            input_text = gr.Textbox(label="Input Text", lines=3)
                            voice, rate, volume, pitch, batch_size, character, method = voice_setting(["克隆声音", "克隆烟嗓音"] + tts.SUPPORTED_VOICE)
                            asr_text = gr.Button('语音识别(语音对话后点击)')
                            asr_text.click(fn=Asr, inputs=[question_audio], outputs=[input_text])
            with gr.Column(variant='panel'):
                with gr.Tabs():
                    with gr.TabItem('数字人问答'):
                        gen_video = gr.Video(label="Generated video", format="mp4", scale=1, autoplay=False)
                        video_button = gr.Button("提交", variant='primary')
                        video_button.click(fn=Talker_response,
                                           inputs=[question_audio, method, input_text, voice, rate, volume, pitch, batch_size, character],
                                           outputs=[gen_video])

        with gr.Row():
            with gr.Column(variant='panel'):
                gr.Markdown("## Test Examples")
                gr.Examples(examples=[["如何应对压力", "男性角色", "SadTalker", "克隆烟嗓音"],
                                      ["北京有什么好玩的地方", "男性角色", "SadTalker", "克隆声音"]] + examples,
                            fn=Talker_response,
                            inputs=[input_text, character, method, voice],
                            )
    return inference

if __name__ == "__main__":
    # llm = LLM(mode='offline').init_model('Linly', 'Linly-AI/Chinese-LLaMA-2-7B-hf')
    # llm = LLM(mode='offline').init_model('Gemini', 'gemini-pro', api_key = "your api key")
    # llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
    llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
    try:
        talker = SadTalker(lazy_load=True)
    except Exception as e:
        print("SadTalker Error: ", e)
        # print("如果使用SadTalker,请先下载SadTalker模型")
        gr.Warning("如果使用SadTalker,请先下载SadTalker模型")
    try:
        from TFG import Wav2Lip
        wav2lip = Wav2Lip("checkpoints/wav2lip_gan.pth")
    except Exception as e:
        print("Wav2Lip Error: ", e)
        print("如果使用Wav2Lip,请先下载Wav2Lip模型")
    try:
        from VITS import GPT_SoVITS
        vits = GPT_SoVITS()
    except Exception as e:
        print("GPT-SoVITS Error: ", e)
        print("如果使用VITS,请先下载GPT-SoVITS模型和安装环境")
    try:
        from ASR import FunASR
        asr = FunASR()
    except Exception as e:
        print("ASR Error: ", e)
        print("如果使用FunASR,请先下载FunASR模型和安装环境")
        asr = WhisperASR('base')  # fall back to Whisper
    tts = EdgeTTS()
    gr.close_all()
    demo_app = app()
    demo_img = app_img()
    demo_multi = app_multi()
    demo_vits = app_vits()
    demo = gr.TabbedInterface(interface_list=[demo_app, demo_img, demo_multi, demo_vits],
                              tab_names=["文本/语音对话", "任意图片对话", "多轮GPT对话", "语音克隆数字人对话"],
                              title="Linly-Talker WebUI")
    demo.launch(server_name="127.0.0.1",  # localhost; use "0.0.0.0" to forward the port globally
                server_port=port,
                # On Gradio >= 4.0 the microphone seems to work even without an SSL certificate
                ssl_certfile=ssl_certfile,
                ssl_keyfile=ssl_keyfile,
                ssl_verify=False,
                debug=True,
                )
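
For reference: the launch blocks above read `ip`, `port`, `ssl_certfile`, and `ssl_keyfile` from `configs.py` (imported via `from configs import *`). A minimal sketch of such a config, with illustrative values only (the repo's actual configs.py may differ):

    # configs.py -- minimal sketch, values are illustrative assumptions
    ip = "127.0.0.1"      # bind address; "0.0.0.0" forwards the port globally
    port = 7860           # Gradio's default port
    ssl_certfile = None   # set both paths to enable HTTPS, which some browsers
    ssl_keyfile = None    # require before allowing microphone capture

With such a config in place, the merged WebUI starts with `python app.py` and serves the four Blocks UIs as tabs of a single gr.TabbedInterface.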