Spaces:
Running
Running
wangrongsheng
committed on
Commit
·
e55d43b
1
Parent(s):
1328244
Upload 2 files
Browse files- app.py +108 -117
- optimizeOpenAI.py +226 -0
app.py
CHANGED
@@ -13,7 +13,8 @@ import gradio
|
|
13 |
import markdown
|
14 |
import json
|
15 |
import tiktoken
|
16 |
-
|
|
|
17 |
def parse_text(text):
|
18 |
lines = text.split("\n")
|
19 |
for i,line in enumerate(lines):
|
@@ -30,26 +31,47 @@ def parse_text(text):
|
|
30 |
lines[i] = '<br/>'+line.replace(" ", " ")
|
31 |
return "".join(lines)
|
32 |
|
33 |
-
def get_response(system, context, myKey, raw = False):
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
|
47 |
-
|
|
|
|
|
48 |
try:
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
51 |
except:
|
52 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
class Paper:
|
55 |
def __init__(self, path, title='', url='', abs='', authers=[], sl=[]):
|
@@ -303,8 +325,9 @@ class Reader:
|
|
303 |
def __init__(self, key_word='', query='', filter_keys='',
|
304 |
root_path='./',
|
305 |
gitee_key='',
|
306 |
-
sort=arxiv.SortCriterion.SubmittedDate, user_name='defualt', language='cn',
|
307 |
-
self.
|
|
|
308 |
self.user_name = user_name # 读者姓名
|
309 |
self.key_word = key_word # 读者感兴趣的关键词
|
310 |
self.query = query # 读者输入的搜索查询
|
@@ -435,7 +458,7 @@ class Reader:
|
|
435 |
|
436 |
return image_url
|
437 |
|
438 |
-
def summary_with_chat(self, paper_list
|
439 |
htmls = []
|
440 |
utoken = 0
|
441 |
ctoken = 0
|
@@ -451,7 +474,7 @@ class Reader:
|
|
451 |
text += list(paper.section_text_dict.values())[0]
|
452 |
#max_token = 2500 * 4
|
453 |
#text = text[:max_token]
|
454 |
-
chat_summary_text, utoken1, ctoken1, ttoken1 = self.chat_summary(text=text
|
455 |
htmls.append(chat_summary_text)
|
456 |
|
457 |
# TODO 往md文档中插入论文里的像素最大的一张图片,这个方案可以弄的更加智能一些:
|
@@ -469,7 +492,7 @@ class Reader:
|
|
469 |
# methods
|
470 |
method_text += paper.section_text_dict[method_key]
|
471 |
text = summary_text + "\n<Methods>:\n" + method_text
|
472 |
-
chat_method_text, utoken2, ctoken2, ttoken2 = self.chat_method(text=text
|
473 |
htmls.append(chat_method_text)
|
474 |
else:
|
475 |
chat_method_text = ''
|
@@ -492,7 +515,7 @@ class Reader:
|
|
492 |
text = summary_text + "\n <Conclusion>:\n" + conclusion_text
|
493 |
else:
|
494 |
text = summary_text
|
495 |
-
chat_conclusion_text, utoken3, ctoken3, ttoken3 = self.chat_conclusion(text=text
|
496 |
htmls.append(chat_conclusion_text)
|
497 |
htmls.append("\n")
|
498 |
# token统计
|
@@ -507,24 +530,20 @@ class Reader:
|
|
507 |
"cost": str(cost),
|
508 |
}
|
509 |
md_text = "\n".join(htmls)
|
510 |
-
|
511 |
return markdown.markdown(md_text), pos_count
|
512 |
|
513 |
|
514 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
515 |
stop=tenacity.stop_after_attempt(5),
|
516 |
reraise=True)
|
517 |
-
def chat_conclusion(self, text
|
518 |
-
openai.api_key = key
|
519 |
conclusion_prompt_token = 650
|
520 |
text_token = len(self.encoding.encode(text))
|
521 |
clip_text_index = int(len(text)*(self.max_token_num-conclusion_prompt_token)/text_token)
|
522 |
-
clip_text = text[:clip_text_index]
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
{"role": "assistant", "content": "This is the <summary> and <conclusion> part of an English literature, where <summary> you have already summarized, but <conclusion> part, I need your help to summarize the following questions:"+clip_text}, # 背景知识,可以参考OpenReview的审稿流程
|
527 |
-
{"role": "user", "content": """
|
528 |
8. Make the following summary.Be sure to use Chinese answers (proper nouns need to be marked in English).
|
529 |
- (1):What is the significance of this piece of work?
|
530 |
- (2):Summarize the strengths and weaknesses of this article in three dimensions: innovation point, performance, and workload.
|
@@ -535,42 +554,26 @@ class Reader:
|
|
535 |
- (2):Innovation point: xxx; Performance: xxx; Workload: xxx;\n
|
536 |
|
537 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
538 |
-
"""
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
messages=messages,
|
544 |
-
temperature=temperature, # What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
|
545 |
-
top_p=p # An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
|
546 |
)
|
547 |
-
|
548 |
-
result
|
549 |
-
for choice in response.choices:
|
550 |
-
result += choice.message.content
|
551 |
-
#print("prompt_token_used:", response.usage.prompt_tokens,
|
552 |
-
# "completion_token_used:", response.usage.completion_tokens,
|
553 |
-
# "total_token_used:", response.usage.total_tokens)
|
554 |
-
#print("response_time:", response.response_ms/1000.0, 's')
|
555 |
-
usage_token = response.usage.prompt_tokens
|
556 |
-
com_token = response.usage.completion_tokens
|
557 |
-
total_token = response.usage.total_tokens
|
558 |
-
|
559 |
-
return result, usage_token, com_token, total_token
|
560 |
|
561 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
562 |
stop=tenacity.stop_after_attempt(5),
|
563 |
reraise=True)
|
564 |
-
def chat_method(self, text
|
565 |
-
openai.api_key = key
|
566 |
method_prompt_token = 650
|
567 |
text_token = len(self.encoding.encode(text))
|
568 |
clip_text_index = int(len(text)*(self.max_token_num-method_prompt_token)/text_token)
|
569 |
-
clip_text = text[:clip_text_index]
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
{"role": "user", "content": """
|
574 |
7. Describe in detail the methodological idea of this article. Be sure to use Chinese answers (proper nouns need to be marked in English). For example, its steps are.
|
575 |
- (1):...
|
576 |
- (2):...
|
@@ -584,42 +587,26 @@ class Reader:
|
|
584 |
....... \n\n
|
585 |
|
586 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
587 |
-
"""
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
temperature=temperature, # What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
|
593 |
-
top_p=p # An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
|
594 |
)
|
595 |
-
|
596 |
-
result
|
597 |
-
for choice in response.choices:
|
598 |
-
result += choice.message.content
|
599 |
-
print("method_result:\n", result)
|
600 |
-
#print("prompt_token_used:", response.usage.prompt_tokens,
|
601 |
-
# "completion_token_used:", response.usage.completion_tokens,
|
602 |
-
# "total_token_used:", response.usage.total_tokens)
|
603 |
-
#print("response_time:", response.response_ms/1000.0, 's')
|
604 |
-
usage_token = response.usage.prompt_tokens
|
605 |
-
com_token = response.usage.completion_tokens
|
606 |
-
total_token = response.usage.total_tokens
|
607 |
-
|
608 |
-
return result, usage_token, com_token, total_token
|
609 |
|
610 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
611 |
stop=tenacity.stop_after_attempt(5),
|
612 |
reraise=True)
|
613 |
-
def chat_summary(self, text
|
614 |
-
openai.api_key = key
|
615 |
summary_prompt_token = 1000
|
616 |
text_token = len(self.encoding.encode(text))
|
617 |
clip_text_index = int(len(text)*(self.max_token_num-summary_prompt_token)/text_token)
|
618 |
clip_text = text[:clip_text_index]
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
{"role": "user", "content": """
|
623 |
1. Mark the title of the paper (with Chinese translation)
|
624 |
2. list all the authors' names (use English)
|
625 |
3. mark the first author's affiliation (output Chinese translation only)
|
@@ -643,29 +630,14 @@ class Reader:
|
|
643 |
- (4):xxx.\n\n
|
644 |
|
645 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not have too much repetitive information, numerical values using the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.
|
646 |
-
"""
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
-
messages=messages,
|
652 |
-
temperature=temperature, # What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
|
653 |
-
top_p=p # An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
|
654 |
)
|
655 |
-
|
656 |
-
result
|
657 |
-
for choice in response.choices:
|
658 |
-
result += choice.message.content
|
659 |
-
print("summary_result:\n", result)
|
660 |
-
#print("prompt_token_used:", response.usage.prompt_tokens,
|
661 |
-
# "completion_token_used:", response.usage.completion_tokens,
|
662 |
-
# "total_token_used:", response.usage.total_tokens)
|
663 |
-
#print("response_time:", response.response_ms/1000.0, 's')
|
664 |
-
usage_token = response.usage.prompt_tokens
|
665 |
-
com_token = response.usage.completion_tokens
|
666 |
-
total_token = response.usage.total_tokens
|
667 |
-
|
668 |
-
return result, usage_token, com_token, total_token
|
669 |
|
670 |
def export_to_markdown(self, text, file_name, mode='w'):
|
671 |
# 使用markdown模块的convert方法,将文本转换为html格式
|
@@ -681,10 +653,16 @@ class Reader:
|
|
681 |
print(f"Query: {self.query}")
|
682 |
print(f"Sort: {self.sort}")
|
683 |
|
684 |
-
def upload_pdf(
|
685 |
# 检查两个输入都不为空
|
686 |
-
|
|
|
|
|
|
|
|
|
|
|
687 |
return "两个输入都不能为空,请输入字符并上传 PDF 文件!"
|
|
|
688 |
# 判断PDF文件
|
689 |
#if file and file.name.split(".")[-1].lower() != "pdf":
|
690 |
# return '请勿上传非 PDF 文件!'
|
@@ -692,22 +670,29 @@ def upload_pdf(key, text, model_name, p, temperature, file):
|
|
692 |
section_list = text.split(',')
|
693 |
paper_list = [Paper(path=file, sl=section_list)]
|
694 |
# 创建一个Reader对象
|
695 |
-
|
696 |
-
|
|
|
697 |
return cost, sum_info
|
698 |
|
699 |
api_title = "api-key可用验证"
|
700 |
api_description = '''<div align='left'>
|
|
|
701 |
<img src='https://visitor-badge.laobi.icu/badge?page_id=https://huggingface.co/spaces/wangrongsheng/ChatPaper'>
|
|
|
702 |
<img align='right' src='https://i.328888.xyz/2023/03/12/vH9dU.png' width="150">
|
|
|
703 |
Use ChatGPT to summary the papers.Star our Github [🌟ChatPaper](https://github.com/kaixindelele/ChatPaper) .
|
|
|
704 |
💗如果您觉得我们的项目对您有帮助,还请您给我们一些鼓励!💗
|
|
|
705 |
🔴请注意:千万不要用于严肃的学术场景,只能用于论文阅读前的初筛!
|
|
|
706 |
</div>
|
707 |
'''
|
708 |
|
709 |
api_input = [
|
710 |
-
gradio.inputs.Textbox(label="请输入你的
|
711 |
]
|
712 |
api_gui = gradio.Interface(fn=valid_apikey, inputs=api_input, outputs="text", title=api_title, description=api_description)
|
713 |
|
@@ -715,20 +700,26 @@ api_gui = gradio.Interface(fn=valid_apikey, inputs=api_input, outputs="text", ti
|
|
715 |
title = "ChatPaper"
|
716 |
# 描述
|
717 |
description = '''<div align='left'>
|
|
|
718 |
<img src='https://visitor-badge.laobi.icu/badge?page_id=https://huggingface.co/spaces/wangrongsheng/ChatPaper'>
|
|
|
719 |
<img align='right' src='https://i.328888.xyz/2023/03/12/vH9dU.png' width="150">
|
|
|
720 |
Use ChatGPT to summary the papers.Star our Github [🌟ChatPaper](https://github.com/kaixindelele/ChatPaper) .
|
|
|
721 |
💗如果您觉得我们的项目对您有帮助,还请您给我们一些鼓励!💗
|
|
|
722 |
🔴请注意:千万不要用于严肃的学术场景,只能用于论文阅读前的初筛!
|
|
|
723 |
</div>
|
724 |
'''
|
725 |
# 创建Gradio界面
|
726 |
ip = [
|
727 |
-
gradio.inputs.Textbox(label="请输入你的
|
728 |
gradio.inputs.Textbox(label="请输入论文大标题索引(用英文逗号隔开,必填)", default="'Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References'"),
|
729 |
gradio.inputs.Radio(choices=["gpt-3.5-turbo", "gpt-3.5-turbo-0301"], default="gpt-3.5-turbo", label="Select model"),
|
730 |
gradio.inputs.Slider(minimum=-0, maximum=1.0, default=1.0, step=0.05, label="Top-p (nucleus sampling)"),
|
731 |
-
gradio.inputs.Slider(minimum=-0, maximum=5.0, default=
|
732 |
gradio.inputs.File(label="请上传论文PDF(必填)")
|
733 |
]
|
734 |
|
|
|
13 |
import markdown
|
14 |
import json
|
15 |
import tiktoken
|
16 |
+
import concurrent.futures
|
17 |
+
from optimizeOpenAI import chatPaper
|
18 |
def parse_text(text):
|
19 |
lines = text.split("\n")
|
20 |
for i,line in enumerate(lines):
|
|
|
31 |
lines[i] = '<br/>'+line.replace(" ", " ")
|
32 |
return "".join(lines)
|
33 |
|
34 |
+
# def get_response(system, context, myKey, raw = False):
|
35 |
+
# openai.api_key = myKey
|
36 |
+
# response = openai.ChatCompletion.create(
|
37 |
+
# model="gpt-3.5-turbo",
|
38 |
+
# messages=[system, *context],
|
39 |
+
# )
|
40 |
+
# openai.api_key = ""
|
41 |
+
# if raw:
|
42 |
+
# return response
|
43 |
+
# else:
|
44 |
+
# message = response["choices"][0]["message"]["content"]
|
45 |
+
# message_with_stats = f'{message}'
|
46 |
+
# return message, parse_text(message_with_stats)
|
47 |
|
48 |
+
valid_api_keys = []
|
49 |
+
|
50 |
+
def api_key_check(api_key):
    """Return ``api_key`` if the chat API accepts it, otherwise ``None``.

    A single-key ``chatPaper`` client is built and its
    ``check_api_available()`` probe is issued.  Any error raised while
    building the client or probing is treated as "key not usable", so one
    bad key never aborts validation of the others.

    Args:
        api_key: The API key string to probe.

    Returns:
        The same ``api_key`` when the probe succeeds, else ``None``.
    """
    try:
        chat = chatPaper([api_key])
        return api_key if chat.check_api_available() else None
    except Exception:
        # Deliberately best-effort, but catch Exception rather than using a
        # bare ``except`` so KeyboardInterrupt / SystemExit still propagate.
        return None
|
59 |
+
|
60 |
+
def valid_apikey(api_keys):
    """Validate a comma-separated string of API keys concurrently.

    Spaces are removed, empty fragments (empty input, ``"a,,b"``, trailing
    commas) are dropped, and repeated keys are probed only once.  Each
    remaining key is checked with ``api_key_check`` on a thread pool.

    Keys that pass are merged into the module-level ``valid_api_keys``
    list without creating duplicates.  The returned message describes only
    the keys submitted in *this* call, so a submission made up entirely of
    bad keys is reported as invalid even if earlier calls stored good keys.

    Args:
        api_keys: Comma-separated API keys as typed by the user.

    Returns:
        A user-facing status string.
    """
    global valid_api_keys

    # dict.fromkeys de-duplicates while preserving the order typed.
    candidates = list(dict.fromkeys(
        key for key in api_keys.replace(' ', '').split(',') if key
    ))
    # Log only how many keys are being probed; the keys themselves are secrets.
    print("checking {} api-key(s)".format(len(candidates)))

    usable = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [executor.submit(api_key_check, key) for key in candidates]
        for future in concurrent.futures.as_completed(pending):
            checked = future.result()
            if checked:
                usable.append(checked)

    # Remember newly validated keys without storing the same key twice.
    for key in usable:
        if key not in valid_api_keys:
            valid_api_keys.append(key)

    if usable:
        return "有效的api-key一共有{}个,分别是:{}, 现在可以提交你的paper".format(len(usable), usable)
    return "无效的api-key"
|
74 |
+
|
75 |
|
76 |
class Paper:
|
77 |
def __init__(self, path, title='', url='', abs='', authers=[], sl=[]):
|
|
|
325 |
def __init__(self, key_word='', query='', filter_keys='',
|
326 |
root_path='./',
|
327 |
gitee_key='',
|
328 |
+
sort=arxiv.SortCriterion.SubmittedDate, user_name='defualt', language='cn', api_keys:list = [], model_name="gpt-3.5-turbo", p=1.0, temperature=1.0):
|
329 |
+
self.api_keys = api_keys
|
330 |
+
self.chatPaper = chatPaper( api_keys = self.api_keys, apiTimeInterval=10 , temperature=temperature,top_p=p,model_name=model_name) #openAI api封装
|
331 |
self.user_name = user_name # 读者姓名
|
332 |
self.key_word = key_word # 读者感兴趣的关键词
|
333 |
self.query = query # 读者输入的搜索查询
|
|
|
458 |
|
459 |
return image_url
|
460 |
|
461 |
+
def summary_with_chat(self, paper_list):
|
462 |
htmls = []
|
463 |
utoken = 0
|
464 |
ctoken = 0
|
|
|
474 |
text += list(paper.section_text_dict.values())[0]
|
475 |
#max_token = 2500 * 4
|
476 |
#text = text[:max_token]
|
477 |
+
chat_summary_text, utoken1, ctoken1, ttoken1 = self.chat_summary(text=text)
|
478 |
htmls.append(chat_summary_text)
|
479 |
|
480 |
# TODO 往md文档中插入论文里的像素最大的一张图片,这个方案可以弄的更加智能一些:
|
|
|
492 |
# methods
|
493 |
method_text += paper.section_text_dict[method_key]
|
494 |
text = summary_text + "\n<Methods>:\n" + method_text
|
495 |
+
chat_method_text, utoken2, ctoken2, ttoken2 = self.chat_method(text=text)
|
496 |
htmls.append(chat_method_text)
|
497 |
else:
|
498 |
chat_method_text = ''
|
|
|
515 |
text = summary_text + "\n <Conclusion>:\n" + conclusion_text
|
516 |
else:
|
517 |
text = summary_text
|
518 |
+
chat_conclusion_text, utoken3, ctoken3, ttoken3 = self.chat_conclusion(text=text)
|
519 |
htmls.append(chat_conclusion_text)
|
520 |
htmls.append("\n")
|
521 |
# token统计
|
|
|
530 |
"cost": str(cost),
|
531 |
}
|
532 |
md_text = "\n".join(htmls)
|
|
|
533 |
return markdown.markdown(md_text), pos_count
|
534 |
|
535 |
|
536 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
537 |
stop=tenacity.stop_after_attempt(5),
|
538 |
reraise=True)
|
539 |
+
def chat_conclusion(self, text):
|
|
|
540 |
conclusion_prompt_token = 650
|
541 |
text_token = len(self.encoding.encode(text))
|
542 |
clip_text_index = int(len(text)*(self.max_token_num-conclusion_prompt_token)/text_token)
|
543 |
+
clip_text = text[:clip_text_index]
|
544 |
+
self.chatPaper.reset(convo_id="chatConclusion",system_prompt="You are a reviewer in the field of ["+self.key_word+"] and you need to critically review this article")
|
545 |
+
self.chatPaper.add_to_conversation(convo_id="chatConclusion", role="assistant", message="This is the <summary> and <conclusion> part of an English literature, where <summary> you have already summarized, but <conclusion> part, I need your help to summarize the following questions:"+clip_text)# 背景知识,可以参考OpenReview的审稿流程
|
546 |
+
content = """
|
|
|
|
|
547 |
8. Make the following summary.Be sure to use Chinese answers (proper nouns need to be marked in English).
|
548 |
- (1):What is the significance of this piece of work?
|
549 |
- (2):Summarize the strengths and weaknesses of this article in three dimensions: innovation point, performance, and workload.
|
|
|
554 |
- (2):Innovation point: xxx; Performance: xxx; Workload: xxx;\n
|
555 |
|
556 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
557 |
+
"""
|
558 |
+
result = self.chatPaper.ask(
|
559 |
+
prompt = content,
|
560 |
+
role="user",
|
561 |
+
convo_id="chatConclusion",
|
|
|
|
|
|
|
562 |
)
|
563 |
+
print(result)
|
564 |
+
return result[0], result[1], result[2], result[3]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
565 |
|
566 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
567 |
stop=tenacity.stop_after_attempt(5),
|
568 |
reraise=True)
|
569 |
+
def chat_method(self, text):
|
|
|
570 |
method_prompt_token = 650
|
571 |
text_token = len(self.encoding.encode(text))
|
572 |
clip_text_index = int(len(text)*(self.max_token_num-method_prompt_token)/text_token)
|
573 |
+
clip_text = text[:clip_text_index]
|
574 |
+
self.chatPaper.reset(convo_id="chatMethod",system_prompt="You are a researcher in the field of ["+self.key_word+"] who is good at summarizing papers using concise statements")# chatgpt 角色
|
575 |
+
self.chatPaper.add_to_conversation(convo_id="chatMethod", role="assistant", message=str("This is the <summary> and <Method> part of an English document, where <summary> you have summarized, but the <Methods> part, I need your help to read and summarize the following questions."+clip_text))
|
576 |
+
content= """
|
|
|
577 |
7. Describe in detail the methodological idea of this article. Be sure to use Chinese answers (proper nouns need to be marked in English). For example, its steps are.
|
578 |
- (1):...
|
579 |
- (2):...
|
|
|
587 |
....... \n\n
|
588 |
|
589 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
590 |
+
"""
|
591 |
+
result = self.chatPaper.ask(
|
592 |
+
prompt = content,
|
593 |
+
role="user",
|
594 |
+
convo_id="chatMethod",
|
|
|
|
|
595 |
)
|
596 |
+
print(result)
|
597 |
+
return result[0], result[1], result[2], result[3]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
598 |
|
599 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
600 |
stop=tenacity.stop_after_attempt(5),
|
601 |
reraise=True)
|
602 |
+
def chat_summary(self, text):
|
|
|
603 |
summary_prompt_token = 1000
|
604 |
text_token = len(self.encoding.encode(text))
|
605 |
clip_text_index = int(len(text)*(self.max_token_num-summary_prompt_token)/text_token)
|
606 |
clip_text = text[:clip_text_index]
|
607 |
+
self.chatPaper.reset(convo_id="chatSummary",system_prompt="You are a researcher in the field of ["+self.key_word+"] who is good at summarizing papers using concise statements")
|
608 |
+
self.chatPaper.add_to_conversation(convo_id="chatSummary", role="assistant", message=str("This is the title, author, link, abstract and introduction of an English document. I need your help to read and summarize the following questions: "+clip_text))
|
609 |
+
content= """
|
|
|
610 |
1. Mark the title of the paper (with Chinese translation)
|
611 |
2. list all the authors' names (use English)
|
612 |
3. mark the first author's affiliation (output Chinese translation only)
|
|
|
630 |
- (4):xxx.\n\n
|
631 |
|
632 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not have too much repetitive information, numerical values using the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.
|
633 |
+
"""
|
634 |
+
result = self.chatPaper.ask(
|
635 |
+
prompt = content,
|
636 |
+
role="user",
|
637 |
+
convo_id="chatSummary",
|
|
|
|
|
|
|
638 |
)
|
639 |
+
print(result)
|
640 |
+
return result[0], result[1], result[2], result[3]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
641 |
|
642 |
def export_to_markdown(self, text, file_name, mode='w'):
|
643 |
# 使用markdown模块的convert方法,将文本转换为html格式
|
|
|
653 |
print(f"Query: {self.query}")
|
654 |
print(f"Sort: {self.sort}")
|
655 |
|
656 |
+
def upload_pdf(api_keys, text, model_name, p, temperature, file):
|
657 |
# 检查两个输入都不为空
|
658 |
+
api_key_list = None
|
659 |
+
if api_keys:
|
660 |
+
api_key_list = api_keys.split(',')
|
661 |
+
elif not api_keys and valid_api_keys!=[]:
|
662 |
+
api_key_list = valid_api_keys
|
663 |
+
if not text or not file or not api_key_list:
|
664 |
return "两个输入都不能为空,请输入字符并上传 PDF 文件!"
|
665 |
+
|
666 |
# 判断PDF文件
|
667 |
#if file and file.name.split(".")[-1].lower() != "pdf":
|
668 |
# return '请勿上传非 PDF 文件!'
|
|
|
670 |
section_list = text.split(',')
|
671 |
paper_list = [Paper(path=file, sl=section_list)]
|
672 |
# 创建一个Reader对象
|
673 |
+
print(api_key_list)
|
674 |
+
reader = Reader(api_keys=api_key_list, model_name=model_name, p=p, temperature=temperature)
|
675 |
+
sum_info, cost = reader.summary_with_chat(paper_list=paper_list) # type: ignore
|
676 |
return cost, sum_info
|
677 |
|
678 |
api_title = "api-key可用验证"
|
679 |
api_description = '''<div align='left'>
|
680 |
+
|
681 |
<img src='https://visitor-badge.laobi.icu/badge?page_id=https://huggingface.co/spaces/wangrongsheng/ChatPaper'>
|
682 |
+
|
683 |
<img align='right' src='https://i.328888.xyz/2023/03/12/vH9dU.png' width="150">
|
684 |
+
|
685 |
Use ChatGPT to summary the papers.Star our Github [🌟ChatPaper](https://github.com/kaixindelele/ChatPaper) .
|
686 |
+
|
687 |
💗如果您觉得我们的项目对您有帮助,还请您给我们一些鼓励!💗
|
688 |
+
|
689 |
🔴请注意:千万不要用于严肃的学术场景,只能用于论文阅读前的初筛!
|
690 |
+
|
691 |
</div>
|
692 |
'''
|
693 |
|
694 |
api_input = [
|
695 |
+
gradio.inputs.Textbox(label="请输入你的API-key(必填, 多个API-key请用英文逗号隔开)", default="", type='password')
|
696 |
]
|
697 |
api_gui = gradio.Interface(fn=valid_apikey, inputs=api_input, outputs="text", title=api_title, description=api_description)
|
698 |
|
|
|
700 |
title = "ChatPaper"
|
701 |
# 描述
|
702 |
description = '''<div align='left'>
|
703 |
+
|
704 |
<img src='https://visitor-badge.laobi.icu/badge?page_id=https://huggingface.co/spaces/wangrongsheng/ChatPaper'>
|
705 |
+
|
706 |
<img align='right' src='https://i.328888.xyz/2023/03/12/vH9dU.png' width="150">
|
707 |
+
|
708 |
Use ChatGPT to summary the papers.Star our Github [🌟ChatPaper](https://github.com/kaixindelele/ChatPaper) .
|
709 |
+
|
710 |
💗如果您觉得我们的项目对您有帮助,还请您给我们一些鼓励!💗
|
711 |
+
|
712 |
🔴请注意:千万不要用于严肃的学术场景,只能用于论文阅读前的初筛!
|
713 |
+
|
714 |
</div>
|
715 |
'''
|
716 |
# 创建Gradio界面
|
717 |
ip = [
|
718 |
+
gradio.inputs.Textbox(label="请输入你的API-key(必填, 多个API-key请用英文逗号隔开),不需要空格", default="", type='password'),
|
719 |
gradio.inputs.Textbox(label="请输入论文大标题索引(用英文逗号隔开,必填)", default="'Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References'"),
|
720 |
gradio.inputs.Radio(choices=["gpt-3.5-turbo", "gpt-3.5-turbo-0301"], default="gpt-3.5-turbo", label="Select model"),
|
721 |
gradio.inputs.Slider(minimum=-0, maximum=1.0, default=1.0, step=0.05, label="Top-p (nucleus sampling)"),
|
722 |
+
gradio.inputs.Slider(minimum=-0, maximum=5.0, default=0.5, step=0.5, label="Temperature"),
|
723 |
gradio.inputs.File(label="请上传论文PDF(必填)")
|
724 |
]
|
725 |
|
optimizeOpenAI.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
A simple wrapper for the official ChatGPT API
|
3 |
+
"""
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import threading
|
7 |
+
import time
|
8 |
+
import requests
|
9 |
+
import tiktoken
|
10 |
+
from typing import Generator
|
11 |
+
from queue import PriorityQueue as PQ
|
12 |
+
import json
|
13 |
+
import os
|
14 |
+
import time
|
15 |
+
ENCODER = tiktoken.get_encoding("gpt2")
|
16 |
+
class chatPaper:
|
17 |
+
"""
|
18 |
+
Official ChatGPT API
|
19 |
+
"""
|
20 |
+
def __init__(
    self,
    api_keys: list,
    proxy = None,
    api_proxy = None,
    max_tokens: int = 4000,
    temperature: float = 0.5,
    top_p: float = 1.0,
    model_name: str = "gpt-3.5-turbo",
    reply_count: int = 1,
    system_prompt = "You are ChatPaper, A paper reading bot",
    lastAPICallTime = time.time()-100,
    apiTimeInterval = 20,
) -> None:
    """Set up the HTTP session, the API-key pool and sampling settings.

    Args:
        api_keys: Keys to rotate through; each is queued with the same
            initial ``lastAPICallTime`` stamp.
        proxy: If truthy, used for both ``http`` and ``https`` traffic of
            the session.
        api_proxy: Accepted but not used by this method.
        max_tokens: Token budget; the system prompt must fit within it.
        temperature: Default sampling temperature sent with requests.
        top_p: Default nucleus-sampling value sent with requests.
        model_name: Model identifier sent with requests.
        reply_count: Default number of completions requested (``n``).
        system_prompt: Default system message for new conversations.
        lastAPICallTime: Initial "last used" stamp for every key.  Note
            the default expression is evaluated once, when the function is
            defined, not on each construction.
        apiTimeInterval: Minimum seconds between two uses of one key.

    Raises:
        Exception: If the system prompt's token count exceeds
            ``max_tokens``.
    """
    # Request defaults.
    self.model_name = model_name
    self.system_prompt = system_prompt
    self.apiTimeInterval = apiTimeInterval

    self.session = requests.Session()

    # Priority queue of (last_used_timestamp, key) pairs.
    key_pool = PQ()
    for api_key in api_keys:
        key_pool.put((lastAPICallTime, api_key))
    self.api_keys = key_pool

    self.proxy = proxy
    if self.proxy:
        self.session.proxies = {
            scheme: self.proxy for scheme in ("http", "https")
        }

    self.max_tokens = max_tokens
    self.temperature = temperature
    self.top_p = top_p
    self.reply_count = reply_count
    self.decrease_step = 250
    self.conversation = {}

    prompt_tokens = self.token_str(self.system_prompt)
    if prompt_tokens > self.max_tokens:
        raise Exception("System prompt is too long")

    self.lock = threading.Lock()
|
57 |
+
|
58 |
+
def get_api_key(self):
    """Return the key with the oldest "last used" stamp, after its cool-down.

    The key pool holds ``(last_used_timestamp, key)`` pairs, so the entry
    taken is the one used longest ago.  The method sleeps for whatever
    ``_calculate_delay`` reports, then re-queues the key stamped with the
    current time.  The whole sequence, including the sleep, runs under
    ``self.lock``.

    Returns:
        The API key string.

    Raises:
        RuntimeError: If the pool is empty.  Without this check the
            blocking ``get()`` would wait forever while holding the lock.
    """
    with self.lock:
        if self.api_keys.empty():
            raise RuntimeError("No API key available")
        entry = self.api_keys.get()
        key = entry[1]
        time.sleep(self._calculate_delay(entry))
        self.api_keys.put((time.time(), key))
        return key
|
65 |
+
|
66 |
+
def _calculate_delay(self, apiKey):
    """Return how many seconds to wait before ``apiKey`` may be used again.

    Args:
        apiKey: A ``(last_used_timestamp, key)`` pair from the key pool.

    Returns:
        The part of ``self.apiTimeInterval`` that has not yet elapsed
        since the key's timestamp, or ``0`` once the interval has passed.
    """
    remaining = self.apiTimeInterval - (time.time() - apiKey[0])
    return remaining if remaining > 0 else 0
|
72 |
+
|
73 |
+
def add_to_conversation(self, message: str, role: str, convo_id: str = "default"):
    """Append one message to a conversation, creating it if needed.

    Args:
        message: Text stored under the ``"content"`` key.
        role: Value stored under the ``"role"`` key.
        convo_id: Conversation to append to; if it does not exist yet,
            ``self.reset(convo_id)`` is called first.
    """
    entry = {"role": role, "content": message}
    if convo_id not in self.conversation:
        self.reset(convo_id)
    thread = self.conversation[convo_id]
    thread.append(entry)
|
77 |
+
|
78 |
+
def __truncate_conversation(self, convo_id: str = "default"):
    """Shrink the newest message so the conversation fits ``max_tokens``.

    Steps:
      1. Take the last message of the conversation as the query.
      2. If the query alone exceeds the budget, cut it to
         ``1.5 * max_tokens`` characters, then keep removing
         ``decrease_step`` characters from its end until it fits.
      3. If the earlier messages alone exceed the budget, call
         ``self.conversation_summary`` for this conversation.
      4. Keep removing ``decrease_step`` characters from the query until
         earlier messages plus query fit, or the query is empty.
      5. Store the trimmed query back as the last message.

    Trimming removes ``decrease_step`` characters per pass and stops on an
    empty query.  The previous slice ``query[:decrease_step]`` kept only
    the first ``decrease_step`` characters and, being idempotent, could
    loop forever when the text still did not fit.

    Args:
        convo_id: Identifier of the conversation to truncate.
    """
    last_dialog = self.conversation[convo_id][-1]
    query = str(last_dialog['content'])

    if len(ENCODER.encode(query)) > self.max_tokens:
        # Cheap first cut by character count before token-accurate trimming.
        query = query[:int(1.5*self.max_tokens)]
    while query and len(ENCODER.encode(query)) > self.max_tokens:
        query = query[:-self.decrease_step]

    # Set the query aside and measure the earlier messages on their own.
    self.conversation[convo_id] = self.conversation[convo_id][:-1]
    full_conversation = "\n".join(
        str(x["content"]) for x in self.conversation[convo_id]
    )
    if len(ENCODER.encode(full_conversation)) > self.max_tokens:
        self.conversation_summary(convo_id=convo_id)

    # The earlier messages no longer change, so build their text once
    # (newest first, as before) instead of on every trimming pass.
    earlier_text = ""
    for x in self.conversation[convo_id]:
        earlier_text = str(x["content"]) + "\n" + earlier_text
    while query and len(ENCODER.encode(earlier_text + query)) > self.max_tokens:
        query = query[:-self.decrease_step]

    last_dialog['content'] = str(query)
    self.conversation[convo_id].append(last_dialog)
|
102 |
+
|
103 |
+
def ask_stream(
    self,
    prompt: str,
    role: str = "user",
    convo_id: str = "default",
    **kwargs,
) -> Generator:
    """Send ``prompt`` to the chat-completions endpoint and stream the reply.

    The prompt is appended to the conversation as a ``"user"`` message,
    the conversation is truncated to the token budget, and the whole
    conversation is posted with ``stream=True``.

    Args:
        prompt: Text of the new user message.
        role: Sent as the request's ``"user"`` field; it is not used as
            the role of the stored message.
        convo_id: Conversation to extend; created via ``reset`` if absent.
        **kwargs: Optional per-call overrides: ``api_key``,
            ``temperature``, ``top_p`` and ``n``.

    Yields:
        Each ``content`` fragment found in the first choice's ``delta``.

    Raises:
        Exception: If the HTTP status is not 200.
    """
    if convo_id not in self.conversation:
        self.reset(convo_id=convo_id)
    self.add_to_conversation(prompt, "user", convo_id=convo_id)
    self.__truncate_conversation(convo_id=convo_id)
    apiKey = self.get_api_key()
    response = self.session.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {kwargs.get('api_key', apiKey)}"},
        json={
            "model": self.model_name,
            "messages": self.conversation[convo_id],
            "stream": True,
            # kwargs
            "temperature": kwargs.get("temperature", self.temperature),
            "top_p": kwargs.get("top_p", self.top_p),
            "n": kwargs.get("n", self.reply_count),
            "user": role,
        },
        stream=True,
    )
    try:
        if response.status_code != 200:
            raise Exception(
                f"Error: {response.status_code} {response.reason} {response.text}",
            )
        for line in response.iter_lines():
            if not line:
                continue
            text = line.decode("utf-8")
            # Only "data:" lines carry payloads; skip anything else rather
            # than slicing blindly and feeding garbage to json.loads.
            if not text.startswith("data:"):
                continue
            payload = text[len("data:"):].strip()
            if payload == "[DONE]":
                break
            resp: dict = json.loads(payload)
            choices = resp.get("choices")
            if not choices:
                continue
            delta = choices[0].get("delta")
            if not delta:
                continue
            if "content" in delta:
                yield delta["content"]
    finally:
        # Release the streamed connection even on an error status, an
        # early [DONE], or a consumer that abandons the generator.
        response.close()
|
151 |
+
def ask(self, prompt: str, role: str = "user", convo_id: str = "default", **kwargs):
    """
    Non-streaming ask

    Consumes `ask_stream` completely, stores the reply in the conversation
    and reports token counts.

    :param prompt: text to send
    :param role: passed through to `ask_stream`
    :param convo_id: key of the conversation to use
    :param kwargs: passed through to `ask_stream`
    :return: tuple `(full_response, usage_token, com_token, total_token)`:
        the reply text, the token count of `prompt` alone, the token count
        of the reply, and the token count of the whole conversation after
        the reply has been stored
    """
    chunks = self.ask_stream(
        prompt=prompt,
        role=role,
        convo_id=convo_id,
        **kwargs,
    )
    full_response: str = "".join(chunks)
    # The reply comes from the model, so it is stored as an "assistant"
    # message. Storing it under `role` (default "user") made every answer
    # look like another user turn in later requests and summaries.
    self.add_to_conversation(full_response, "assistant", convo_id=convo_id)
    usage_token = self.token_str(prompt)
    com_token = self.token_str(full_response)
    total_token = self.token_cost(convo_id=convo_id)
    return full_response, usage_token, com_token, total_token
|
167 |
+
|
168 |
+
def check_api_available(self):
    """
    Check whether the chat completions endpoint accepts the current key

    Sends a minimal fixed conversation and looks only at the HTTP status.

    :return: True when the status code is 200, otherwise False
    """
    # Prefer `engine` when the instance defines it (previous behaviour) and
    # fall back to `model_name`, the attribute ask_stream uses for requests.
    model = getattr(self, "engine", None) or self.model_name
    response = self.session.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {self.get_api_key()}"},
        json={
            "model": model,
            "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "print A"}],
            "stream": True,
            # kwargs
            "temperature": self.temperature,
            "top_p": self.top_p,
            "n": self.reply_count,
            "user": "user",
        },
        stream=True,
    )
    try:
        return response.status_code == 200
    finally:
        # The body is never read; close the streamed response so the
        # connection is released instead of lingering.
        response.close()
|
188 |
+
def reset(self, convo_id: str = "default", system_prompt = None):
    """
    Start the conversation `convo_id` over

    The conversation is replaced by a single system message. A truthy
    `system_prompt` is used for it; otherwise the instance's own
    `system_prompt` is used.
    """
    chosen_prompt = system_prompt if system_prompt else self.system_prompt
    opening_message = dict(role="system", content=str(chosen_prompt))
    self.conversation[convo_id] = [opening_message]
|
195 |
+
def conversation_summary(self, convo_id: str = "default"):
    """
    Replace the conversation `convo_id` with a model-written summary

    The messages are flattened into a transcript ("User : ..." for user
    messages, "ChatGpt : ..." for every other role), the model is asked to
    summarize it in the scratch conversation 'conversationSummary', and the
    conversation is rebuilt as: system prompt, a user request for a
    summary, and the summary as the assistant reply.

    :param convo_id: key of the conversation to summarize
    :return: the new message list stored under `convo_id`
    """
    transcript = "".join(
        f"{'User' if message['role'] == 'user' else 'ChatGpt'} : {message['content']}\n"
        for message in self.conversation[convo_id]
    )
    instruction = "Your goal is to summarize the provided conversation in English. Your summary should be concise and focus on the key information to facilitate better dialogue for the large language model.Ensure that you include all necessary details and relevant information while still reducing the length of the conversation as much as possible. Your summary should be clear and easily understandable for the ChatGpt model providing a comprehensive and concise summary of the conversation."
    # A non-positive step would make the trimming loops below spin forever.
    step = max(self.decrease_step, 1)

    # Drop the oldest part of the transcript until the request fits. The
    # overflow is measured on the full prompt so the first cut is never a
    # negative offset (which would throw away nearly the whole transcript).
    overflow = self.token_str(instruction + "\n" + transcript) - self.max_tokens
    if overflow > 0:
        transcript = transcript[overflow:]
        while transcript and self.token_str(instruction + "\n" + transcript) > self.max_tokens:
            transcript = transcript[step:]
    # The instruction has no placeholder, so the transcript is appended;
    # otherwise the model would be asked to summarize nothing.
    prompt = instruction + "\n" + transcript

    self.reset(convo_id='conversationSummary')
    reply = self.ask(prompt, convo_id='conversationSummary')
    # ask() returns (text, usage_token, com_token, total_token); only the
    # text belongs in the conversation.
    summary = reply[0] if isinstance(reply, tuple) else reply
    summary = str(summary)
    while summary and self.token_str(summary) > self.max_tokens:
        summary = summary[:-step]
    self.reset(convo_id='conversationSummary',system_prompt='Summariaze our diaglog')
    self.conversation[convo_id] = [
        {"role": "system", "content": self.system_prompt},
        {"role": "user", "content": "Summariaze our diaglog"},
        {"role": 'assistant', "content": summary},
    ]
    return self.conversation[convo_id]
|
221 |
+
def token_cost(self,convo_id: str = "default"):
    """
    Count the tokens of the conversation `convo_id`

    The `content` of every message is joined with newlines and the result
    is encoded with the module-level ENCODER; the number of tokens is returned.
    """
    messages = self.conversation[convo_id]
    joined_text = "\n".join(message["content"] for message in messages)
    token_ids = ENCODER.encode(joined_text)
    return len(token_ids)
|
223 |
+
def token_str(self,content:str):
    """
    Count the tokens ENCODER produces for `content`
    """
    token_ids = ENCODER.encode(content)
    return len(token_ids)
|
225 |
+
def main():
    """
    Placeholder entry point; does nothing and returns None
    """
    return None
|