linhkid91 committed
Commit fc0e67e · Parent(s): 5d885c4

Read html version of papers instead of just abstract

Files changed (5)
  1. src/action.py +1 -1
  2. src/app.py +2 -2
  3. src/download_new_papers.py +21 -1
  4. src/relevancy.py +28 -11
  5. src/utils.py +6 -3
src/action.py CHANGED
@@ -247,7 +247,7 @@ def generate_body(topic, categories, interest, threshold):
         papers,
         query={"interest": interest},
         threshold_score=threshold,
-        num_paper_in_prompt=20,
+        num_paper_in_prompt=2,
     )
     body = "<br><br>".join(
         [
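The same reduction appears in every caller below: each paper now carries up to 8,000 characters of crawled body text (see src/download_new_papers.py), so far fewer papers fit in a gpt-3.5-turbo-16k prompt than when only abstracts were sent. A rough budget check, as a sketch — the 4-characters-per-token ratio is an assumption, not something measured in this repo:

    # Rough context-budget estimate for gpt-3.5-turbo-16k (16,385-token window).
    CONTEXT_TOKENS = 16_385
    CHARS_PER_TOKEN = 4  # common rule of thumb, assumed here

    def papers_that_fit(content_chars=8_000, response_tokens=1_024, overhead_tokens=500):
        """Estimate how many full-text papers fit in one prompt."""
        paper_tokens = content_chars // CHARS_PER_TOKEN    # ~2,000 tokens of body text
        budget = CONTEXT_TOKENS - overhead_tokens          # leave room for instructions
        return budget // (paper_tokens + response_tokens)  # body plus its 1024-token reply

    print(papers_that_fit())  # ~5; the commit settles on 2, leaving headroom for titles/abstracts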
src/app.py CHANGED
@@ -81,7 +81,7 @@ def sample(email, topic, physics_topic, categories, interest):
             papers,
             query={"interest": interest},
             threshold_score=0,
-            num_paper_in_prompt=4)
+            num_paper_in_prompt=2)
         return "\n\n".join([paper["summarized_text"] for paper in relevancy])
     else:
         return "\n\n".join(f"Title: {paper['title']}\nAuthors: {paper['authors']}" for paper in papers)
@@ -127,7 +127,7 @@ def test(email, topic, physics_topic, categories, interest, key):
         papers,
         query={"interest": interest},
         threshold_score=7,
-        num_paper_in_prompt=8)
+        num_paper_in_prompt=2)
     body = "<br><br>".join([f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}' for paper in relevancy])
     if hallucination:
         body = "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>" + body
src/download_new_papers.py CHANGED
@@ -1,5 +1,7 @@
 # encoding: utf-8
 import os
+from urllib.error import HTTPError
+
 import tqdm
 from bs4 import BeautifulSoup as bs
 import urllib.request
@@ -8,6 +10,21 @@ import datetime
 import pytz

 #Linh - add new def crawl_html_version(html_link) here
+def crawl_html_version(html_link):
+    # Fetch the arXiv HTML rendering of a paper and return its body text.
+    main_content = []
+    try:
+        html = urllib.request.urlopen(html_link)
+    except HTTPError:
+        return "None"  # no HTML rendering exists for this paper
+    soup = bs(html, "html.parser")
+    content = soup.find('div', attrs={'class': 'ltx_page_content'})
+    if content is None:
+        return "None"
+    para_list = content.find_all("div", attrs={'class': 'ltx_para'})
+    for each in para_list:
+        main_content.append(each.text.strip())
+    return ' '.join(main_content)[:8000]  # cap content to keep prompts bounded
 def _download_new_papers(field_abbr):
     NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new'  # https://arxiv.org/list/cs/new
     page = urllib.request.urlopen(NEW_SUB_URL)
@@ -30,13 +47,14 @@ def _download_new_papers(field_abbr):
         paper_number = dt_list[i].text.strip().split(" ")[2].split(":")[-1]
         paper['main_page'] = arxiv_base + paper_number
         paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
-        paper['html'] = arxiv_html + paper_number + "v1"
+        #paper['html'] = arxiv_html + paper_number + "v1"

         paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title: ", "").strip()
         paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text \
             .replace("Authors:\n", "").replace("\n", "").strip()
         paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects: ", "").strip()
         paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
+        paper['content'] = crawl_html_version(arxiv_html + paper_number + "v1")
         new_paper_list.append(paper)
@@ -64,3 +82,5 @@ def get_papers(field_abbr, limit=None):
             return results
         results.append(json.loads(line))
     return results
+
+#crawl_html_version("https://arxiv.org/html/2404.11972v1")
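arXiv serves an HTML rendering only for some recent submissions, so callers of crawl_html_version should expect the "None" sentinel. A small usage sketch — the fallback logic and the import path are illustrative, not part of this commit:

    # Hypothetical caller: prefer the crawled full text, fall back to the abstract.
    from src.download_new_papers import crawl_html_version  # module path assumed

    paper = {
        "abstract": "We propose ...",
        "content": crawl_html_version("https://arxiv.org/html/2404.11972v1"),
    }
    # Use the full body only when the crawl actually returned one.
    text_for_prompt = paper["content"] if paper["content"] not in ("", "None") else paper["abstract"]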
src/relevancy.py CHANGED
@@ -23,15 +23,16 @@ def encode_prompt(query, prompt_papers):
     prompt += query['interest']

     for idx, task_dict in enumerate(prompt_papers):
-        (title, authors, abstract) = task_dict["title"], task_dict["authors"], task_dict["abstract"]
+        (title, authors, abstract, content) = task_dict["title"], task_dict["authors"], task_dict["abstract"], task_dict["content"]
         if not title:
             raise
         prompt += f"###\n"
         prompt += f"{idx + 1}. Title: {title}\n"
         prompt += f"{idx + 1}. Authors: {authors}\n"
         prompt += f"{idx + 1}. Abstract: {abstract}\n"
+        prompt += f"{idx + 1}. Content: {content}\n"
     prompt += f"\n Generate response:\n1."
-    print(prompt)
+    #print(prompt)
     return prompt
@@ -42,18 +43,25 @@ def is_json(myjson):
         return False
     return True

-
-def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
+def post_process_chat_gpt_response(paper_data, response, threshold_score=7):
     selected_data = []
     if response is None:
         return []
     json_items = response['message']['content'].replace("\n\n", "\n").split("\n")
     pattern = r"^\d+\. |\\"
     import pprint
+
+    def try_loads(line):
+        # Tolerant JSON parse: return None instead of raising on a bad line.
+        try:
+            return json.loads(re.sub(pattern, "", line))
+        except json.JSONDecodeError:
+            return None
+    score_items = []
     try:
-        score_items = [
-            json.loads(re.sub(pattern, "", line))
-            for line in json_items if (is_json(line) and "relevancy score" in line.lower())]
+        for line in json_items:
+            if is_json(line) and "relevancy score" in line.lower():
+                score_items.append(json.loads(re.sub(pattern, "", line)))
     except Exception as e:
         pprint.pprint([re.sub(pattern, "", line) for line in json_items if "relevancy score" in line.lower()])
         try:
@@ -105,7 +122,7 @@ def generate_relevance_score(
     query,
     model_name="gpt-3.5-turbo-16k",
     threshold_score=7,
-    num_paper_in_prompt=8,
+    num_paper_in_prompt=1,
     temperature=0.4,
     top_p=1.0,
     sorting=True
@@ -121,7 +138,7 @@ def generate_relevance_score(
     decoding_args = utils.OpenAIDecodingArguments(
         temperature=temperature,
         n=1,
-        max_tokens=128*num_paper_in_prompt,  # The response for each paper should be less than 128 tokens.
+        max_tokens=1024*num_paper_in_prompt,  # The response for each paper should be less than 1024 tokens.
         top_p=top_p,
     )
     request_start = time.time()
@@ -153,8 +170,8 @@ def run_all_day_paper(
     date=None,
     data_dir="../data",
     model_name="gpt-3.5-turbo-16k",
-    threshold_score=8,
-    num_paper_in_prompt=8,
+    threshold_score=7,
+    num_paper_in_prompt=2,
     temperature=0.4,
     top_p=1.0
 ):
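post_process_chat_gpt_response assumes the model emits one JSON object per paper, optionally prefixed with an ordinal such as "1. ", which the pattern regex strips before json.loads. A minimal round-trip sketch — the sample response line is invented for illustration:

    import json
    import re

    pattern = r"^\d+\. |\\"  # strip a leading "1. " ordinal and stray backslashes

    # Invented example of one line of model output for a single paper.
    line = '1. {"Relevancy score": 8, "Reasons for match": "Covers LLM evaluation."}'

    if "relevancy score" in line.lower():
        item = json.loads(re.sub(pattern, "", line))
        print(item["Relevancy score"])  # 8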
src/utils.py CHANGED
@@ -24,7 +24,8 @@ if openai_org is not None:

 @dataclasses.dataclass
 class OpenAIDecodingArguments(object):
-    max_tokens: int = 1800
+    #max_tokens: int = 1800
+    max_tokens: int = 4800
     temperature: float = 0.2
     top_p: float = 1.0
     n: int = 1
@@ -39,7 +40,7 @@ def openai_completion(
     prompts, #: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
     decoding_args: OpenAIDecodingArguments,
     model_name="text-davinci-003",
-    sleep_time=2,
+    sleep_time=15,
     batch_size=1,
     max_instances=sys.maxsize,
     max_batches=sys.maxsize,
@@ -96,10 +97,11 @@ def openai_completion(
 ):
     batch_decoding_args = copy.deepcopy(decoding_args)  # cloning the decoding_args

-    backoff = 3
+    backoff = 5

     while True:
         try:
+            time.sleep(3)
             shared_kwargs = dict(
                 model=model_name,
                 **batch_decoding_args.__dict__,
@@ -134,6 +136,7 @@ def openai_completion(
             backoff -= 1
             logging.warning("Hit request rate limit; retrying...")
             time.sleep(sleep_time)  # Annoying rate limit on requests.
+            continue

     if return_text:
         completions = [completion.text for completion in completions]
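Taken together, the utils.py changes slow the request loop down: a 3-second pause before every attempt, a 15-second cool-down after a rate-limit error, five retries instead of three, and a continue so a failed batch is actually retried rather than falling through to the result handling. A stripped-down sketch of that control flow — call_api is a placeholder, and the broad except stands in for OpenAI's rate-limit error:

    import logging
    import time

    def completion_with_retries(call_api, backoff=5, sleep_time=15):
        """Retry a rate-limited API call, mirroring the loop in openai_completion."""
        while True:
            try:
                time.sleep(3)           # brief pause before every attempt
                return call_api()       # placeholder for the real OpenAI request
            except Exception:           # the real code catches OpenAI's rate-limit error
                if backoff <= 0:
                    raise
                backoff -= 1
                logging.warning("Hit request rate limit; retrying...")
                time.sleep(sleep_time)  # longer cool-down after an explicit rate limit
                continue                # loop back instead of falling through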