Read HTML version of papers instead of just the abstract
Files changed:
- src/action.py (+1 -1)
- src/app.py (+2 -2)
- src/download_new_papers.py (+21 -1)
- src/relevancy.py (+28 -11)
- src/utils.py (+6 -3)
src/action.py  CHANGED

@@ -247,7 +247,7 @@ def generate_body(topic, categories, interest, threshold):
         papers,
         query={"interest": interest},
         threshold_score=threshold,
-        num_paper_in_prompt=
+        num_paper_in_prompt=2,
     )
     body = "<br><br>".join(
         [
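Note: this call (and the two in src/app.py below) now packs only 2 papers per prompt, because each paper also carries up to 8,000 characters of HTML body text from src/download_new_papers.py. A rough budget sketch, assuming the common ~4-characters-per-token heuristic and an assumed 2,000 characters of per-paper overhead for title, authors, abstract, and instructions (neither constant comes from the repo):

    CHARS_PER_TOKEN = 4       # rough heuristic for English text (assumption)
    CONTENT_CHARS = 8000      # crawl_html_version truncates each paper's body to 8,000 chars
    OVERHEAD_CHARS = 2000     # assumed: title, authors, abstract, prompt instructions
    CONTEXT_WINDOW = 16_385   # gpt-3.5-turbo-16k

    def estimated_tokens(num_paper_in_prompt: int) -> int:
        prompt = num_paper_in_prompt * (CONTENT_CHARS + OVERHEAD_CHARS) // CHARS_PER_TOKEN
        completion = 1024 * num_paper_in_prompt  # max_tokens set in src/relevancy.py
        return prompt + completion

    for n in (1, 2, 4, 8):
        print(n, estimated_tokens(n), estimated_tokens(n) <= CONTEXT_WINDOW)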
src/app.py  CHANGED

@@ -81,7 +81,7 @@ def sample(email, topic, physics_topic, categories, interest):
             papers,
             query={"interest": interest},
             threshold_score=0,
-            num_paper_in_prompt=
+            num_paper_in_prompt=2)
         return "\n\n".join([paper["summarized_text"] for paper in relevancy])
     else:
         return "\n\n".join(f"Title: {paper['title']}\nAuthors: {paper['authors']}" for paper in papers)
@@ -127,7 +127,7 @@ def test(email, topic, physics_topic, categories, interest, key):
             papers,
             query={"interest": interest},
             threshold_score=7,
-            num_paper_in_prompt=
+            num_paper_in_prompt=2)
         body = "<br><br>".join([f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}' for paper in relevancy])
         if hallucination:
             body = "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>" + body
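Note: both sample() and test() hand num_paper_in_prompt=2 down to generate_relevance_score, which presumably walks the paper list in slices of that size, one API request per slice. A hypothetical sketch of that slicing pattern (score_batch is a stand-in name, not a function in this repo):

    from typing import Iterable, List

    def iter_batches(papers: List[dict], num_paper_in_prompt: int) -> Iterable[List[dict]]:
        # Yield consecutive slices; the last one may be shorter than num_paper_in_prompt.
        for start in range(0, len(papers), num_paper_in_prompt):
            yield papers[start:start + num_paper_in_prompt]

    # With num_paper_in_prompt=2, nine papers become five requests:
    # for batch in iter_batches(papers, 2):
    #     score_batch(batch)  # hypothetical: encode_prompt(...) + utils.openai_completion(...)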
src/download_new_papers.py  CHANGED

@@ -1,5 +1,7 @@
 # encoding: utf-8
 import os
+from urllib.error import HTTPError
+
 import tqdm
 from bs4 import BeautifulSoup as bs
 import urllib.request
@@ -8,6 +10,21 @@ import datetime
 import pytz
 
 #Linh - add new def crawl_html_version(html_link) here
+def crawl_html_version(html_link):
+    main_content = []
+    try:
+        html = urllib.request.urlopen(html_link)
+    except HTTPError as e:
+        return ["None"]
+    soup = bs(html)
+    content = soup.find('div', attrs={'class': 'ltx_page_content'})
+    para_list = content.find_all("div", attrs={'class': 'ltx_para'})
+
+    for each in para_list:
+        main_content.append(each.text.strip())
+    return ' '.join(main_content)[:8000]
+    #if len(main_content >)
+    #return ''.join(main_content) if len(main_content) < 20000 else ''.join(main_content[:20000])
 def _download_new_papers(field_abbr):
     NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new' # https://arxiv.org/list/cs/new
     page = urllib.request.urlopen(NEW_SUB_URL)
@@ -30,13 +47,14 @@ def _download_new_papers(field_abbr):
         paper_number = dt_list[i].text.strip().split(" ")[2].split(":")[-1]
         paper['main_page'] = arxiv_base + paper_number
         paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
-        paper['html'] = arxiv_html + paper_number + "v1"
+        #paper['html'] = arxiv_html + paper_number + "v1"
 
         paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title: ", "").strip()
         paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text \
             .replace("Authors:\n", "").replace("\n", "").strip()
         paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects: ", "").strip()
         paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
+        paper['content'] = crawl_html_version(arxiv_html + paper_number + "v1")
         new_paper_list.append(paper)
 
 
@@ -64,3 +82,5 @@ def get_papers(field_abbr, limit=None):
                 return results
             results.append(json.loads(line))
     return results
+
+#crawl_html_version("https://arxiv.org/html/2404.11972v1")
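Note: crawl_html_version as committed returns a string on success but a list (["None"]) on HTTPError, calls bs() without an explicit parser (which triggers bs4's GuessedAtParserWarning), and raises AttributeError when an /html/<id>v1 page is served but contains no ltx_page_content div. A more defensive variant might look like the sketch below; this is illustrative only, not what the commit ships:

    from urllib.error import HTTPError, URLError
    import urllib.request
    from bs4 import BeautifulSoup as bs

    def crawl_html_version_safe(html_link, max_chars=8000):
        """Fetch arXiv's HTML rendering and return up to max_chars of body text ('' on failure)."""
        try:
            html = urllib.request.urlopen(html_link, timeout=30)
        except (HTTPError, URLError):
            return ""  # consistent return type instead of ["None"]
        soup = bs(html, "html.parser")  # explicit parser
        content = soup.find("div", attrs={"class": "ltx_page_content"})
        if content is None:  # page served, but no LaTeXML body to extract
            return ""
        paras = [p.text.strip() for p in content.find_all("div", attrs={"class": "ltx_para"})]
        return " ".join(paras)[:max_chars]

Returning an empty string keeps paper['content'] a plain string, which is what encode_prompt in src/relevancy.py expects when it formats the Content field.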
src/relevancy.py  CHANGED

@@ -23,15 +23,16 @@ def encode_prompt(query, prompt_papers):
     prompt += query['interest']
 
     for idx, task_dict in enumerate(prompt_papers):
-        (title, authors, abstract) = task_dict["title"], task_dict["authors"], task_dict["abstract"]
+        (title, authors, abstract, content) = task_dict["title"], task_dict["authors"], task_dict["abstract"], task_dict["content"]
         if not title:
             raise
         prompt += f"###\n"
         prompt += f"{idx + 1}. Title: {title}\n"
         prompt += f"{idx + 1}. Authors: {authors}\n"
         prompt += f"{idx + 1}. Abstract: {abstract}\n"
+        prompt += f"{idx + 1}. Content: {content}\n"
     prompt += f"\n Generate response:\n1."
-    print(prompt)
+    #print(prompt)
     return prompt
 
 
@@ -42,18 +43,34 @@ def is_json(myjson):
         return False
     return True
 
-
-def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
+def post_process_chat_gpt_response(paper_data, response, threshold_score=7):
     selected_data = []
     if response is None:
         return []
     json_items = response['message']['content'].replace("\n\n", "\n").split("\n")
     pattern = r"^\d+\. |\\"
     import pprint
+
+    def try_loads(line):
+        try:
+            return json.loads(re.sub(pattern, "", line))
+        except json.JSONDecodeError:
+            return None
+    score_items = []
     try:
-        score_items = [
-            json.loads(re.sub(pattern, "", line))
-            for line in json_items if (is_json(line) and "relevancy score" in line.lower())]
+        # score_items = [
+        #     json.loads(re.sub(pattern, "", line))
+        #     for line in json_items if (is_json(line) and "relevancy score" in line.lower())]
+        for line in json_items:
+            if is_json(line) and "relevancy score" in line.lower():
+                score_items.append(json.loads(re.sub(pattern, "", line)))
+        #elif
+
+        # score_items = [
+        #     loaded_json
+        #     for line in json_items if (is_json(line) and "relevancy score" in line.lower())
+        #     for loaded_json in [try_loads(line)] if loaded_json is not None
+        # ]
     except Exception as e:
         pprint.pprint([re.sub(pattern, "", line) for line in json_items if "relevancy score" in line.lower()])
     try:
@@ -105,7 +122,7 @@ def generate_relevance_score(
     query,
     model_name="gpt-3.5-turbo-16k",
     threshold_score=7,
-    num_paper_in_prompt=
+    num_paper_in_prompt=1,
     temperature=0.4,
     top_p=1.0,
     sorting=True
@@ -121,7 +138,7 @@
     decoding_args = utils.OpenAIDecodingArguments(
         temperature=temperature,
         n=1,
-        max_tokens=
+        max_tokens=1024*num_paper_in_prompt, # The response for each paper should be less than 128 tokens.
         top_p=top_p,
     )
     request_start = time.time()
@@ -153,8 +170,8 @@ def run_all_day_paper(
     date=None,
     data_dir="../data",
    model_name="gpt-3.5-turbo-16k",
-    threshold_score=
-    num_paper_in_prompt=
+    threshold_score=7,
+    num_paper_in_prompt=2,
     temperature=0.4,
     top_p=1.0
 ):
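Note: the commented-out comprehension kept next to the new loop shows a stricter way to parse the model's per-paper JSON lines using the added try_loads helper. Pulled out as a standalone function, that approach looks roughly like this (a sketch assembled from the commit's own commented code, not something the commit enables):

    import json
    import re

    PATTERN = r"^\d+\. |\\"  # strip leading "1. "-style indices and stray backslashes

    def parse_score_lines(raw_lines):
        """Return the JSON objects among raw_lines that mention a relevancy score."""
        def try_loads(line):
            try:
                return json.loads(re.sub(PATTERN, "", line))
            except json.JSONDecodeError:
                return None

        return [
            loaded
            for line in raw_lines if "relevancy score" in line.lower()
            for loaded in [try_loads(line)] if loaded is not None
        ]

    lines = ['1. {"Relevancy score": 8, "Reasons for match": "..."}', "not json"]
    print(parse_score_lines(lines))  # -> [{'Relevancy score': 8, 'Reasons for match': '...'}]

Unlike the committed loop, which tests is_json on the raw line, this validates each line only after the index prefix is stripped, and it parses every candidate line exactly once.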
src/utils.py  CHANGED

@@ -24,7 +24,8 @@ if openai_org is not None:
 
 @dataclasses.dataclass
 class OpenAIDecodingArguments(object):
-    max_tokens: int = 1800
+    #max_tokens: int = 1800
+    max_tokens: int = 4800
     temperature: float = 0.2
     top_p: float = 1.0
     n: int = 1
@@ -39,7 +40,7 @@ def openai_completion(
     prompts, #: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
     decoding_args: OpenAIDecodingArguments,
     model_name="text-davinci-003",
-    sleep_time=
+    sleep_time=15,
     batch_size=1,
     max_instances=sys.maxsize,
     max_batches=sys.maxsize,
@@ -96,10 +97,11 @@ def openai_completion(
     ):
         batch_decoding_args = copy.deepcopy(decoding_args) # cloning the decoding_args
 
-        backoff =
+        backoff = 5
 
         while True:
             try:
+                time.sleep(3)
                 shared_kwargs = dict(
                     model=model_name,
                     **batch_decoding_args.__dict__,
@@ -134,6 +136,7 @@ def openai_completion(
                 backoff -= 1
                 logging.warning("Hit request rate limit; retrying...")
                 time.sleep(sleep_time) # Annoying rate limit on requests.
+                continue
 
     if return_text:
         completions = [completion.text for completion in completions]
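Note: taken together, sleep_time=15, backoff = 5, the unconditional time.sleep(3) before every request, and the added continue give a fixed-interval retry against rate limits. A common alternative is exponential backoff with jitter; a self-contained sketch of that pattern (retry_with_backoff and flaky_request are illustrative names, not part of this repo):

    import logging
    import random
    import time

    def retry_with_backoff(fn, max_retries=5, base_delay=2.0, max_delay=60.0):
        """Call fn(), retrying on failure with exponentially growing, jittered sleeps."""
        for attempt in range(max_retries):
            try:
                return fn()
            except Exception as exc:  # in practice, catch the API's rate-limit error specifically
                if attempt == max_retries - 1:
                    raise
                delay = min(max_delay, base_delay * 2 ** attempt) * random.uniform(0.5, 1.5)
                logging.warning("Request failed (%s); retrying in %.1fs", exc, delay)
                time.sleep(delay)

    # Usage sketch, wrapping whatever call openai_completion makes per batch:
    # completion = retry_with_backoff(lambda: flaky_request(prompt_batch))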