Yijun-Yang committed on
Commit
766ebf2
·
1 Parent(s): 3702bb5

Add .ipynb_checkpoints to .gitignore and remove from tracking

Browse files
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  __pycache__/
2
  *.pyc
3
-
 
 
1
  __pycache__/
2
  *.pyc
3
+ .ipynb_checkpoints
4
+ */.ipynb_checkpoints/*
.ipynb_checkpoints/app-checkpoint.py DELETED
@@ -1,644 +0,0 @@
1
- import argparse
2
- import json
3
- import time
4
- import os
5
- import glob
6
- import random
7
- import shutil
8
- from enum import Enum
9
- from threading import Thread
10
- from multiprocessing import Process, Value
11
-
12
- import gradio as gr
13
- import pytoml
14
- from loguru import logger
15
- import spaces
16
-
17
- from huixiangdou.service import Worker, llm_serve, ArticleRetrieval, CacheRetriever, FeatureStore, FileOperation
18
-
19
- class PARAM_CODE(Enum):
20
- """Parameter code."""
21
- SUCCESS = 0
22
- FAILED = 1
23
- ERROR = 2
24
-
25
- def parse_args():
26
- """Parse args."""
27
- parser = argparse.ArgumentParser(description='Worker.')
28
- parser.add_argument('--work_dir',
29
- type=str,
30
- default='workdir',
31
- help='Working directory.')
32
- parser.add_argument('--repo_dir',
33
- type=str,
34
- default='repodir',
35
- help='Repository directory.')
36
- parser.add_argument(
37
- '--config_path',
38
- default='config.ini',
39
- type=str,
40
- help='Worker configuration path. Default value is config.ini')
41
- parser.add_argument('--standalone',
42
- action='store_true',
43
- default=True,
44
- help='Auto deploy required Hybrid LLM Service.')
45
- args = parser.parse_args()
46
- return args
47
-
48
- def update_remote_buttons(remote):
49
- if remote:
50
- return [
51
- gr.Markdown("[如何配置API]('https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md')",
52
- visible=True),
53
- gr.Dropdown(["kimi", "deepseek", "zhipuai",'gpt'],
54
- label="选择大模型提供商",
55
- interactive=True,visible=True),
56
- gr.Textbox(label="您的API",lines = 1,
57
- interactive=True,visible=True),
58
- gr.Dropdown([],label="选择模型",
59
- interactive=True,visible=True)
60
- ]
61
- else:
62
- return [
63
- gr.Markdown("[如何配置API]('https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md')",
64
- visible=False),
65
- gr.Dropdown(["kimi", "deepseek", "zhipuai",'gpt'],
66
- label="选择大模型提供商",
67
- interactive=False,visible=False),
68
- gr.Textbox(label="您的API",lines = 1,
69
- interactive=False,visible=False),
70
- gr.Dropdown([],label="选择模型",
71
- interactive=False,visible=False)
72
- ]
73
-
74
- def udate_model_dropdown(remote_company):
75
- model_choices = {
76
- 'kimi': ['moonshot-v1-128k'],
77
- 'deepseek': ['deepseek-chat'],
78
- 'zhipuai': ['glm-4'],
79
- 'gpt': ['gpt-4-32k-0613','gpt-3.5-turbo']
80
- }
81
- return gr.Dropdown(choices= model_choices[remote_company])
82
-
83
- def update_remote_config(remote_ornot,remote_company = None,api = None,model = None):
84
- with open(CONFIG_PATH, encoding='utf8') as f:
85
- config = pytoml.load(f)
86
-
87
- if remote_ornot:
88
- if remote_company == None or api == None or model == None:
89
- raise ValueError('remote_company, api, model not provided')
90
- config['llm']['enable_local'] = 0
91
- config['llm']['enable_remote'] = 1
92
- config['llm']['server']['remote_type'] = remote_company
93
- config['llm']['server']['remote_api_key'] = api
94
- config['llm']['server']['remote_llm_model'] = model
95
- else:
96
- config['llm']['enable_local'] = 1
97
- config['llm']['enable_remote'] = 0
98
- with open(CONFIG_PATH, 'w') as f:
99
- pytoml.dump(config, f)
100
- return gr.Button("配置已保存")
101
-
102
- @spaces.GPU
103
- def get_ready(query:str,chunksize=None,k=None):
104
-
105
- with open(CONFIG_PATH, encoding='utf8') as f:
106
- config = pytoml.load(f)
107
- workdir = config['feature_store']['work_dir']
108
- repodir = config['feature_store']['repo_dir']
109
-
110
- if query == 'repo_work': # no need to return assistant
111
- return repodir, workdir, config
112
- theme = ''
113
- try:
114
- with open(os.path.join(config['feature_store']['repo_dir'],'config.json'), 'r') as f:
115
- repo_config = json.load(f)
116
- theme = ' '.join(repo_config['keywords'])
117
- except:
118
- pass
119
-
120
- if query == 'annotation':
121
- if not chunksize or not k:
122
- raise ValueError('chunksize or k not provided')
123
- chunkdir = os.path.join(workdir, f'chunksize_{chunksize}')
124
- clusterdir = os.path.join(chunkdir, 'cluster_features', f'cluster_features_{k}')
125
- assistant = Worker(work_dir=chunkdir, config_path=CONFIG_PATH,language='en')
126
- samples_json = os.path.join(clusterdir,'samples.json')
127
- with open(samples_json, 'r') as f:
128
- samples = json.load(f)
129
- f.close()
130
- return clusterdir, samples, assistant, theme
131
-
132
- elif query == 'inspiration':
133
- if not chunksize or not k:
134
- raise ValueError('chunksize or k not provided')
135
-
136
- chunkdir = os.path.join(workdir, f'chunksize_{chunksize}')
137
- clusterdir = os.path.join(chunkdir, 'cluster_features', f'cluster_features_{k}')
138
- assistant = Worker(work_dir=chunkdir, config_path=CONFIG_PATH,language='en')
139
- annofile = os.path.join(clusterdir,'annotation.jsonl')
140
- with open(annofile, 'r') as f:
141
- annoresult = f.readlines()
142
-
143
- f.close()
144
- annoresult = [json.loads(obj) for obj in annoresult]
145
- return clusterdir, annoresult, assistant, theme
146
- elif query == 'summarize': # no need for params k
147
- if not chunksize:
148
- raise ValueError('chunksize not provided')
149
- chunkdir = os.path.join(workdir, f'chunksize_{chunksize}')
150
- assistant = Worker(work_dir=chunkdir, config_path=CONFIG_PATH,language='en')
151
- return assistant,theme
152
-
153
- else:
154
- raise ValueError('query not recognized')
155
-
156
- def update_repo_info():
157
- with open(CONFIG_PATH, encoding='utf8') as f:
158
- config = pytoml.load(f)
159
- repodir = config['feature_store']['repo_dir']
160
- if os.path.exists(repodir):
161
- pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
162
- number_of_pdf = len(pdffiles)
163
- if os.path.exists(os.path.join(repodir,'config.json')):
164
-
165
- with open(os.path.join(repodir,'config.json'), 'r') as f:
166
- repo_config = json.load(f)
167
-
168
- keywords = repo_config['keywords']
169
- length = repo_config['len']
170
- retmax = repo_config['retmax']
171
-
172
- return keywords,length,retmax,number_of_pdf
173
- else:
174
- return None,None,None,number_of_pdf
175
- else:
176
- return None,None,None,None
177
-
178
- def upload_file(files):
179
- repodir, workdir, _ = get_ready('repo_work')
180
- if not os.path.exists(repodir):
181
- os.makedirs(repodir)
182
-
183
- for file in files:
184
- destination_path = os.path.join(repodir, os.path.basename(file.name))
185
-
186
- shutil.copy(file.name, destination_path)
187
-
188
-
189
- return files
190
-
191
- def generate_articles_repo(keywords:str,retmax:int):
192
- keys= [k.strip() for k in keywords.split('\n')]
193
- repodir, _, _ = get_ready('repo_work')
194
-
195
- articelfinder = ArticleRetrieval(keywords = keys,
196
- repo_dir = repodir,
197
- retmax = retmax)
198
- articelfinder.initiallize()
199
- return update_repo()
200
-
201
- def delete_articles_repo():
202
- # 在这里运行生成数据库的函数
203
- repodir, workdir, _ = get_ready('repo_work')
204
- if os.path.exists(repodir):
205
- shutil.rmtree(repodir)
206
- if os.path.exists(workdir):
207
- shutil.rmtree(workdir)
208
-
209
- return gr.Textbox(label="文献库概况",lines =3,
210
- value = '文献库和相关数据库已删除',
211
- visible = True)
212
-
213
- def update_repo():
214
- keys,len,retmax,pdflen = update_repo_info()
215
- if keys:
216
- newinfo = f"搜索得到文献:\n 关键词:{keys}\n 文献数量:{len}\n 获取上限:{retmax}\n\n上传文献:\n 数量:{pdflen}"
217
- else:
218
- if pdflen:
219
- newinfo = f'搜索得到文献:无\n上传文献:\n 数量:{pdflen}'
220
- else:
221
- newinfo = '目前还没有文献库'
222
-
223
- return gr.Textbox(label="文献库概况",lines =1,
224
- value = newinfo,
225
- visible = True)
226
-
227
- def update_database_info():
228
- with open(CONFIG_PATH, encoding='utf8') as f:
229
- config = pytoml.load(f)
230
- workdir = config['feature_store']['work_dir']
231
- chunkdirs = glob.glob(os.path.join(workdir, 'chunksize_*'))
232
- chunkdirs.sort()
233
- list_of_chunksize = [int(chunkdir.split('_')[-1]) for chunkdir in chunkdirs]
234
- # print(list_of_chunksize)
235
- jsonobj = {}
236
- for chunkdir in chunkdirs:
237
- k_dir = glob.glob(os.path.join(chunkdir, 'cluster_features','cluster_features_*'))
238
- k_dir.sort()
239
- list_of_k = [int(k.split('_')[-1]) for k in k_dir]
240
- jsonobj[int(chunkdir.split('_')[-1])] = list_of_k
241
-
242
-
243
- new_options = [f"chunksize:{chunksize}, k:{k}" for chunksize in list_of_chunksize for k in jsonobj[chunksize]]
244
-
245
- return new_options, jsonobj
246
-
247
- @spaces.GPU
248
- def generate_database(chunksize:int,nclusters:str|list[str]):
249
- # 在这里运行生成数据库的函数
250
- repodir, workdir, _ = get_ready('repo_work')
251
- if not os.path.exists(repodir):
252
- return gr.Textbox(label="数据库已生成",value = '请先生成文献库',visible = True)
253
- nclusters = [int(i) for i in nclusters]
254
- # 文献库和数据库的覆盖删除逻辑待定
255
- # 理论上 文献库只能生成一次 所以每次生成文献库都要删除之前的文献库和数据库
256
- # 数据库可以根据文献库多次生成 暂不做删除 目前没有节省算力的逻辑 重复计算后覆盖 以后优化
257
- # 不同的chunksize和nclusters会放在不同的文件夹下 不会互相覆盖
258
- # if os.path.exists(workdir):
259
- # shutil.rmtree(workdir)
260
-
261
- cache = CacheRetriever(config_path=CONFIG_PATH)
262
- fs_init = FeatureStore(embeddings=cache.embeddings,
263
- reranker=cache.reranker,
264
- chunk_size=chunksize,
265
- n_clusters=nclusters,
266
- config_path=CONFIG_PATH)
267
-
268
- # walk all files in repo dir
269
- file_opr = FileOperation()
270
- files = file_opr.scan_dir(repo_dir=repodir)
271
- fs_init.initialize(files=files, work_dir=workdir,file_opr=file_opr)
272
- file_opr.summarize(files)
273
- del fs_init
274
- cache.pop('default')
275
- texts, _ = update_database_info()
276
- return gr.Textbox(label="数据库概况",value = '\n'.join(texts) ,visible = True)
277
-
278
- def delete_database():
279
- _, workdir, _ = get_ready('repo_work')
280
- if os.path.exists(workdir):
281
- shutil.rmtree(workdir)
282
- return gr.Textbox(label="数据库概况",lines =3,value = '数据库已删除',visible = True)
283
-
284
- def update_database_textbox():
285
- texts, _ = update_database_info()
286
- if texts == []:
287
- return gr.Textbox(label="数据库概况",value = '目前还没有数据库',visible = True)
288
- else:
289
- return gr.Textbox(label="数据库概况",value = '\n'.join(texts),visible = True)
290
-
291
- def update_chunksize_dropdown():
292
- _, jsonobj = update_database_info()
293
- return gr.Dropdown(choices= jsonobj.keys())
294
-
295
- def update_ncluster_dropdown(chunksize:int):
296
- _, jsonobj = update_database_info()
297
- nclusters = jsonobj[chunksize]
298
- return gr.Dropdown(choices= nclusters)
299
-
300
- @spaces.GPU
301
- def annotation(n,chunksize:int,nclusters:int,remote_ornot:bool):
302
- '''
303
- use llm to annotate cluster
304
- n: percentage of clusters to annotate
305
- '''
306
- query = 'annotation'
307
- if remote_ornot:
308
- backend = 'remote'
309
- else:
310
- backend = 'local'
311
-
312
- clusterdir, samples, assistant, theme = get_ready('annotation',chunksize,nclusters)
313
- new_obj_list = []
314
- n = round(n * len(samples.keys()))
315
- for cluster_no in random.sample(samples.keys(), n):
316
- chunk = '\n'.join(samples[cluster_no]['samples'][:10])
317
-
318
- code, reply, cluster_no = assistant.annotate_cluster(
319
- theme = theme,
320
- cluster_no=cluster_no,
321
- chunk=chunk,
322
- history=[],
323
- groupname='',
324
- backend=backend)
325
- references = f"cluster_no: {cluster_no}"
326
- new_obj = {
327
- 'cluster_no': cluster_no,
328
- 'chunk': chunk,
329
- 'annotation': reply
330
- }
331
- new_obj_list.append(new_obj)
332
- logger.info(f'{code}, {query}, {reply}, {references}')
333
-
334
- with open(os.path.join(clusterdir, 'annotation.jsonl'), 'a') as f:
335
- json.dump(new_obj, f, ensure_ascii=False)
336
- f.write('\n')
337
-
338
- return '\n\n'.join([obj['annotation'] for obj in new_obj_list])
339
-
340
- @spaces.GPU
341
- def inspiration(annotation:str,chunksize:int,nclusters:int,remote_ornot:bool):
342
- query = 'inspiration'
343
- if remote_ornot:
344
- backend = 'remote'
345
- else:
346
- backend = 'local'
347
-
348
- clusterdir, annoresult, assistant, theme = get_ready('inspiration',chunksize,nclusters)
349
- new_obj_list = []
350
-
351
- if annotation is not None: # if the user wants to get inspiration from specific clusters only
352
- annoresult = [obj for obj in annoresult if obj['annotation'] in [txt.strip() for txt in annotation.split('\n')]]
353
-
354
- for index in random.sample(range(len(annoresult)), min(5, len(annoresult))):
355
- cluster_no = annoresult[index]['cluster_no']
356
- chunks = annoresult[index]['annotation']
357
-
358
- code, reply = assistant.getinspiration(
359
- theme = theme,
360
- annotations = chunks,
361
- history=[],
362
- groupname='',backend=backend)
363
- new_obj = {
364
- 'inspiration': reply,
365
- 'cluster_no': cluster_no
366
- }
367
- new_obj_list.append(new_obj)
368
- logger.info(f'{code}, {query}, {cluster_no},{reply}')
369
-
370
- with open(os.path.join(clusterdir, 'inspiration.jsonl'), 'a') as f:
371
- json.dump(new_obj, f, ensure_ascii=False)
372
- with open(os.path.join(clusterdir, 'inspiration.txt'), 'a') as f:
373
- f.write(f'{reply}\n')
374
-
375
- return '\n\n'.join(list(set([obj['inspiration'] for obj in new_obj_list])))
376
-
377
-
378
- def getpmcurls(references):
379
- urls = []
380
- for ref in references:
381
- if ref.startswith('PMC'):
382
-
383
- refid = ref.replace('.txt','')
384
- urls.append(f'https://www.ncbi.nlm.nih.gov/pmc/articles/{refid}/')
385
- else:
386
- urls.append(ref)
387
- return urls
388
-
389
- @spaces.GPU
390
- def summarize_text(query,chunksize:int,remote_ornot:bool):
391
- if remote_ornot:
392
- backend = 'remote'
393
- else:
394
- backend = 'local'
395
-
396
- assistant,_ = get_ready('summarize',chunksize=chunksize,k=None)
397
- code, reply, references = assistant.generate(query=query,
398
- history=[],
399
- groupname='',backend = backend)
400
-
401
- logger.info(f'{code}, {query}, {reply}, {references}')
402
- urls = getpmcurls(references)
403
- mds = '\n'.join([f'[{ref}]({url})' for ref,url in zip(references,urls)])
404
- return reply, gr.Markdown(label="参考文献",value = mds)
405
-
406
- def main_interface():
407
- with gr.Blocks() as demo:
408
- with gr.Row():
409
- gr.Markdown(
410
- """
411
- # 医学文献综述助手 (又名 不想看文献)
412
- """
413
- )
414
-
415
- with gr.Tab("模型服务配置"):
416
- gr.Markdown("""
417
- #### 配置模型服务 🛠️
418
-
419
- 1. **是否使用远程大模型**
420
- - 勾选此项,如果你想使用远程的大模型服务。
421
- - 如果不勾选,将默认使用本地模型服务。
422
-
423
- 2. **API配置**
424
- - 配置大模型提供商和API,确保模型服务能够正常运行。
425
- - 提供商选择:kimi、deepseek、zhipuai、gpt。
426
- - 输入您的API密钥和选择对应模型。
427
- - 点击“保存配置”按钮以保存您的设置。
428
-
429
- 📝 **备注**:请参考[如何使用]('https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md')获取更多信息。
430
-
431
- """)
432
-
433
- remote_ornot = gr.Checkbox(label="是否使用远程大模型")
434
- with gr.Accordion("API配置", open=True):
435
- apimd = gr.Markdown("[如何配置API]('https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md')",visible=False)
436
- remote_company = gr.Dropdown(["kimi", "deepseek", "zhipuai",'gpt'],
437
- label="选择大模型提供商",interactive=False,visible=False)
438
- api = gr.Textbox(label="您的API",lines = 1,interactive=False,visible=False)
439
- model = gr.Dropdown([],label="选择模型",interactive=False,visible=False)
440
-
441
- confirm_button = gr.Button("保存配置")
442
-
443
- remote_ornot.change(update_remote_buttons, inputs=[remote_ornot],outputs=[apimd,remote_company,api,model])
444
- remote_company.change(udate_model_dropdown, inputs=[remote_company],outputs=[model])
445
- confirm_button.click(update_remote_config, inputs=[remote_ornot,remote_company,api,model],outputs=[confirm_button])
446
-
447
-
448
- with gr.Tab("文献查找+数据库生成"):
449
- gr.Markdown("""
450
- #### 查找文献 📚
451
-
452
- 1. **输入关键词批量PubMed PMC文献**
453
- - 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
454
- - 设置查找数量(0-1000)。
455
- - 点击“搜索PubMed PMC”按钮进行文献查找。
456
-
457
- 2. **上传PDF**
458
- - 通过“上传PDF”按钮上传您已有的PDF文献文件。
459
-
460
- 3. **更新文献库情况 删除文献库**
461
- - 点击“更新文献库情况”按钮,查看当前文献库的概况。
462
- - 如果需要重置或删除现有文献库,点击“删除文献库”按钮。
463
-
464
-
465
- #### 生成数据库 🗂️
466
-
467
- 1. **设置数据库构建参数 生成数据库**
468
- - 选择块大小(Chunk Size)和聚类数(Number of Clusters)。
469
- - 提供选项用于选择合适的块大小和聚类数。
470
- - 点击“生成数据库”按钮开始数据库生成过程。
471
-
472
- 2. **更新数据库情况 删除数据库**
473
- - 点击“更新数据库情况”按钮,查看当前数据库的概况。
474
- - 点击“删除数据库”按钮移除现有数据库。
475
-
476
- 📝 **备注**:请参考[如何选择数据库构建参数]('https://github.com/jabberwockyang/MedicalReviewAgent/tree/main')获取更多信息。
477
- """)
478
- with gr.Row(equal_height=True):
479
- with gr.Column(scale=1):
480
- input_keys = gr.Textbox(label="感兴趣的关键词",
481
- lines = 5)
482
- retmax = gr.Slider(
483
- minimum=0,
484
- maximum=1000,
485
- value=500,
486
- interactive=True,
487
- label="查多少",
488
- )
489
- generate_repo_button = gr.Button("搜索PubMed PMC")
490
- with gr.Column(scale=2):
491
- file_output = gr.File(scale=2)
492
- upload_button = gr.UploadButton("上传PDF",
493
- file_types=[".pdf",".csv",".doc"],
494
- file_count="multiple",scale=0)
495
-
496
- with gr.Row(equal_height=True):
497
- with gr.Column(scale=0):
498
- delete_repo_button = gr.Button("删除文献库")
499
- update_repo_button = gr.Button("更新文献库情况")
500
- with gr.Column(scale=2):
501
-
502
- repo_summary =gr.Textbox(label= '文献库概况', value="目前还没有文献库")
503
-
504
- generate_repo_button.click(generate_articles_repo,
505
- inputs=[input_keys,retmax],
506
- outputs = [repo_summary])
507
-
508
-
509
- delete_repo_button.click(delete_articles_repo, inputs=None,
510
- outputs = repo_summary)
511
- update_repo_button.click(update_repo, inputs=None,
512
- outputs = repo_summary)
513
- upload_button.upload(upload_file, upload_button, file_output)
514
-
515
- with gr.Accordion("数据库构建参数", open=True):
516
- gr.Markdown("[如何选择数据库构建参数]('https://github.com/jabberwockyang/MedicalReviewAgent/tree/main')")
517
- chunksize = gr.Slider(label="Chunk Size",
518
- info= 'How long you want the chunk to be?',
519
- minimum=128, maximum=4096,value=1024,step=1,
520
- interactive=True)
521
- ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
522
- # default=["20", "50", '100'],
523
- label="Number of Clusters",
524
- info="How many Clusters you want to generate")
525
-
526
- with gr.Row():
527
- gene_database_button = gr.Button("生成数据库")
528
- delete_database_button = gr.Button("删除数据库")
529
- update_database_button = gr.Button("更新数据库情况")
530
-
531
- database_summary = gr.Textbox(label="数据库概况",lines = 1,value="目前还没有数据库")
532
-
533
-
534
- gene_database_button.click(generate_database, inputs=[chunksize,ncluster],
535
- outputs = database_summary)
536
-
537
- update_database_button.click(update_database_textbox,inputs=None,
538
- outputs = [database_summary])
539
-
540
- delete_database_button.click(delete_database, inputs=None,
541
- outputs = database_summary)
542
- with gr.Tab("写综述"):
543
- gr.Markdown("""
544
- #### 写综述 ✍️
545
-
546
- 1. **更新数据库情况**
547
- - 点击“更新数据库情况”按钮,确保使用最新的数据库信息。
548
-
549
- 2. **选择块大小和聚类数**
550
- - 从下拉菜单中选择合适的块大小和聚类数。
551
-
552
- 3. **抽样标注文章聚类**
553
- - 设置抽样标注比例(0-1)。
554
- - 点击“抽样标注文章聚类”按钮开始标注过程。
555
-
556
- 4. **获取灵感**
557
- - 如果不知道写什么,点击“获取灵感”按钮。
558
- - 系统将基于标注的文章聚类提供相应的综述子问题。
559
-
560
- 5. **写综述**
561
- - 输入您想写的内容或主题。
562
- - 点击“写综述”按钮,生成综述文本。
563
-
564
- 6. **查看生成结果**
565
- - 生成的综述文本将显示在“看看”文本框中。
566
- - 参考文献将显示在“参考文献”框中。
567
-
568
- 📝 **备注**:可以尝试不同的参数进行标注和灵感获取,有助于提高综述的质量和相关性。
569
- """)
570
-
571
- with gr.Accordion("聚类标注相关参数", open=True):
572
- with gr.Row():
573
- update_options = gr.Button("更新数据库情况", scale=0)
574
- chunksize = gr.Dropdown([], label="选择块大小", scale=0)
575
- nclusters = gr.Dropdown([], label="选择聚类数", scale=0)
576
- ntoread = gr.Slider(
577
- minimum=0,maximum=1,value=0.5,
578
- interactive=True,
579
- label="抽样标注比例",
580
- )
581
-
582
- annotation_button = gr.Button("抽样标注文章聚类")
583
- annotation_output = gr.Textbox(label="文章聚类标注/片段摘要",
584
- lines = 5,
585
- interactive= True,
586
- show_copy_button=True)
587
- inspiration_button = gr.Button("获取灵感")
588
- inspiration_output = gr.Textbox(label="灵光一现",
589
- lines = 5,
590
- show_copy_button=True)
591
-
592
-
593
- query = gr.Textbox(label="想写什么")
594
-
595
- write_button = gr.Button("写综述")
596
- output_text = gr.Textbox(label="看看",lines=10)
597
- output_references = gr.Markdown(label="参考文献")
598
-
599
- update_options.click(update_chunksize_dropdown,
600
- outputs=[chunksize])
601
-
602
- chunksize.change(update_ncluster_dropdown,
603
- inputs=[chunksize],
604
- outputs= [nclusters])
605
-
606
- annotation_button.click(annotation,
607
- inputs = [ntoread, chunksize, nclusters,remote_ornot],
608
- outputs=[annotation_output])
609
-
610
- inspiration_button.click(inspiration,
611
- inputs= [annotation_output, chunksize, nclusters,remote_ornot],
612
- outputs=[inspiration_output])
613
-
614
- write_button.click(summarize_text,
615
- inputs=[query, chunksize,remote_ornot],
616
- outputs =[output_text,output_references])
617
-
618
- demo.launch(share=False, server_name='0.0.0.0', debug=True,show_error=True,allowed_paths=['img_0.jpg'])
619
-
620
- # start service
621
- if __name__ == '__main__':
622
- args = parse_args()
623
- # copy config from config-bak
624
- shutil.copy('config-bak.ini', args.config_path) # yyj
625
- CONFIG_PATH = args.config_path
626
-
627
- if args.standalone is True:
628
- # hybrid llm serve
629
- server_ready = Value('i', 0)
630
- server_process = Process(target=llm_serve,
631
- args=(args.config_path, server_ready))
632
- server_process.start()
633
- while True:
634
- if server_ready.value == 0:
635
- logger.info('waiting for server to be ready..')
636
- time.sleep(3)
637
- elif server_ready.value == 1:
638
- break
639
- else:
640
- logger.error('start local LLM server failed, quit.')
641
- raise Exception('local LLM path')
642
- logger.info('Hybrid LLM Server start.')
643
-
644
- main_interface()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/config-bak-checkpoint.ini DELETED
@@ -1,63 +0,0 @@
1
- [feature_store]
2
- reject_throttle = 0
3
- embedding_model_path = "maidalun1020/bce-embedding-base_v1"
4
- reranker_model_path = "maidalun1020/bce-reranker-base_v1"
5
- repo_dir = "repodir"
6
- work_dir = "workdir"
7
- n_clusters = [20, 50]
8
- chunk_size = 1024
9
-
10
- [web_search]
11
- x_api_key = "${YOUR-API-KEY}"
12
- domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.com", "stackoverflow.com", "juejin.cn", "zhuanlan.zhihu.com", "www.cnblogs.com"]
13
- save_dir = "logs/web_search_result"
14
-
15
- [llm]
16
- enable_local = 1
17
- enable_remote = 1
18
- client_url = "http://127.0.0.1:8888/inference"
19
-
20
- [llm.server]
21
- local_llm_path = "Qwen/Qwen1.5-7B-Chat"
22
- local_llm_max_text_length = 32000
23
- local_llm_bind_port = 8888
24
- remote_type = ""
25
- remote_api_key = ""
26
- remote_llm_max_text_length = 32000
27
- remote_llm_model = ""
28
- rpm = 500
29
-
30
- [worker]
31
- enable_sg_search = 0
32
- save_path = "logs/work.txt"
33
-
34
- [worker.time]
35
- start = "00:00:00"
36
- end = "23:59:59"
37
- has_weekday = 1
38
-
39
- [sg_search]
40
- binary_src_path = "/usr/local/bin/src"
41
- src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"
42
-
43
- [sg_search.opencompass]
44
- github_repo_id = "open-compass/opencompass"
45
- introduction = "用于评测大型语言模型(LLM). 它提供了完整的开源可复现的评测框架,支持大语言模型、多模态模型的一站式评测,基于分布式技术,对大参数量模型亦能实现高效评测。评测方向汇总为知识、语言、理解、推理、考试五大能力维度,整合集纳了超过70个评测数据集,合计提供了超过40万个模型评测问题,并提供长文本、安全、代码3类大模型特色技术能力评测。"
46
-
47
- [sg_search.lmdeploy]
48
- github_repo_id = "internlm/lmdeploy"
49
- introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"
50
-
51
- [frontend]
52
- type = "none"
53
- webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
54
- message_process_policy = "immediate"
55
-
56
- [frontend.lark_group]
57
- app_id = "cli_a53a34dcb778500e"
58
- app_secret = "2ajhg1ixSvlNm1bJkH4tJhPfTCsGGHT1"
59
- encrypt_key = "abc"
60
- verification_token = "def"
61
-
62
- [frontend.wechat_personal]
63
- bind_port = 9527
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/packages-checkpoint.txt DELETED
@@ -1,8 +0,0 @@
1
- apt-get install libgl1-mesa-glx
2
- cd /root && mkdir models
3
- cd /root/models
4
-
5
- # login required
6
- huggingface-cli download Qwen/Qwen1.5-7B-Chat --local-dir /root/models/Qwen1.5-7B-Chat
7
- huggingface-cli download maidalun1020/bce-embedding-base_v1 --local-dir /root/models/bce-embedding-base_v1
8
- huggingface-cli download maidalun1020/bce-reranker-base_v1 --local-dir /root/models/bce-reranker-base_v1