linhkid91 committed
Commit fc0e67e · Parent(s): 5d885c4

Read html version of papers instead of just abstract

Files changed (5)
  1. src/action.py +1 -1
  2. src/app.py +2 -2
  3. src/download_new_papers.py +21 -1
  4. src/relevancy.py +28 -11
  5. src/utils.py +6 -3
src/action.py CHANGED
@@ -247,7 +247,7 @@ def generate_body(topic, categories, interest, threshold):
         papers,
         query={"interest": interest},
         threshold_score=threshold,
-        num_paper_in_prompt=20,
+        num_paper_in_prompt=2,
     )
     body = "<br><br>".join(
         [
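The same reduction appears in every caller below: each paper now carries up to 8,000 characters of crawled body text (see src/download_new_papers.py), so far fewer papers fit in a gpt-3.5-turbo-16k prompt than when only abstracts were sent. A rough budget check, as a sketch — the 4-characters-per-token ratio is an assumption, not something measured in this repo:

    # Rough context-budget estimate for gpt-3.5-turbo-16k (16,385-token window).
    CONTEXT_TOKENS = 16_385
    CHARS_PER_TOKEN = 4  # common rule of thumb, assumed here

    def papers_that_fit(content_chars=8_000, response_tokens=1_024, overhead_tokens=500):
        """Estimate how many full-text papers fit in one prompt."""
        paper_tokens = content_chars // CHARS_PER_TOKEN    # ~2,000 tokens of body text
        budget = CONTEXT_TOKENS - overhead_tokens          # leave room for instructions
        return budget // (paper_tokens + response_tokens)  # body plus its 1024-token reply

    print(papers_that_fit())  # ~5; the commit settles on 2, leaving headroom for titles/abstracts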
src/app.py CHANGED
@@ -81,7 +81,7 @@ def sample(email, topic, physics_topic, categories, interest):
             papers,
             query={"interest": interest},
             threshold_score=0,
-            num_paper_in_prompt=4)
+            num_paper_in_prompt=2)
         return "\n\n".join([paper["summarized_text"] for paper in relevancy])
     else:
         return "\n\n".join(f"Title: {paper['title']}\nAuthors: {paper['authors']}" for paper in papers)
@@ -127,7 +127,7 @@ def test(email, topic, physics_topic, categories, interest, key):
         papers,
         query={"interest": interest},
         threshold_score=7,
-        num_paper_in_prompt=8)
+        num_paper_in_prompt=2)
     body = "<br><br>".join([f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}' for paper in relevancy])
     if hallucination:
         body = "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>" + body
src/download_new_papers.py CHANGED
@@ -1,5 +1,7 @@
 # encoding: utf-8
 import os
+from urllib.error import HTTPError
+
 import tqdm
 from bs4 import BeautifulSoup as bs
 import urllib.request
@@ -8,6 +10,21 @@ import datetime
 import pytz

 #Linh - add new def crawl_html_version(html_link) here
+def crawl_html_version(html_link):
+    # Fetch the arXiv HTML rendering of a paper and return its body text.
+    main_content = []
+    try:
+        html = urllib.request.urlopen(html_link)
+    except HTTPError:
+        return "None"  # no HTML rendering exists for this paper
+    soup = bs(html, "html.parser")
+    content = soup.find('div', attrs={'class': 'ltx_page_content'})
+    if content is None:
+        return "None"
+    para_list = content.find_all("div", attrs={'class': 'ltx_para'})
+    for each in para_list:
+        main_content.append(each.text.strip())
+    return ' '.join(main_content)[:8000]  # cap content to keep prompts bounded
 def _download_new_papers(field_abbr):
     NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new'  # https://arxiv.org/list/cs/new
     page = urllib.request.urlopen(NEW_SUB_URL)
@@ -30,13 +47,14 @@ def _download_new_papers(field_abbr):
         paper_number = dt_list[i].text.strip().split(" ")[2].split(":")[-1]
         paper['main_page'] = arxiv_base + paper_number
         paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
-        paper['html'] = arxiv_html + paper_number + "v1"
+        #paper['html'] = arxiv_html + paper_number + "v1"

         paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title: ", "").strip()
         paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text \
             .replace("Authors:\n", "").replace("\n", "").strip()
         paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects: ", "").strip()
         paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
+        paper['content'] = crawl_html_version(arxiv_html + paper_number + "v1")
         new_paper_list.append(paper)
@@ -64,3 +82,5 @@ def get_papers(field_abbr, limit=None):
             return results
         results.append(json.loads(line))
     return results
+
+#crawl_html_version("https://arxiv.org/html/2404.11972v1")
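arXiv serves an HTML rendering only for some recent submissions, so callers of crawl_html_version should expect the "None" sentinel. A small usage sketch — the fallback logic and the import path are illustrative, not part of this commit:

    # Hypothetical caller: prefer the crawled full text, fall back to the abstract.
    from src.download_new_papers import crawl_html_version  # module path assumed

    paper = {
        "abstract": "We propose ...",
        "content": crawl_html_version("https://arxiv.org/html/2404.11972v1"),
    }
    # Use the full body only when the crawl actually returned one.
    text_for_prompt = paper["content"] if paper["content"] not in ("", "None") else paper["abstract"]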
src/relevancy.py CHANGED
@@ -23,15 +23,16 @@ def encode_prompt(query, prompt_papers):
     prompt += query['interest']

     for idx, task_dict in enumerate(prompt_papers):
-        (title, authors, abstract) = task_dict["title"], task_dict["authors"], task_dict["abstract"]
+        (title, authors, abstract, content) = task_dict["title"], task_dict["authors"], task_dict["abstract"], task_dict["content"]
         if not title:
             raise
         prompt += f"###\n"
         prompt += f"{idx + 1}. Title: {title}\n"
         prompt += f"{idx + 1}. Authors: {authors}\n"
         prompt += f"{idx + 1}. Abstract: {abstract}\n"
+        prompt += f"{idx + 1}. Content: {content}\n"
     prompt += f"\n Generate response:\n1."
-    print(prompt)
+    #print(prompt)
     return prompt
@@ -42,18 +43,25 @@ def is_json(myjson):
         return False
     return True

-
-def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
+def post_process_chat_gpt_response(paper_data, response, threshold_score=7):
     selected_data = []
     if response is None:
         return []
     json_items = response['message']['content'].replace("\n\n", "\n").split("\n")
     pattern = r"^\d+\. |\\"
     import pprint
+
+    def try_loads(line):
+        # Tolerant JSON parse: return None instead of raising on a bad line.
+        try:
+            return json.loads(re.sub(pattern, "", line))
+        except json.JSONDecodeError:
+            return None
+    score_items = []
     try:
-        score_items = [
-            json.loads(re.sub(pattern, "", line))
-            for line in json_items if (is_json(line) and "relevancy score" in line.lower())]
+        for line in json_items:
+            if is_json(line) and "relevancy score" in line.lower():
+                score_items.append(json.loads(re.sub(pattern, "", line)))
     except Exception as e:
         pprint.pprint([re.sub(pattern, "", line) for line in json_items if "relevancy score" in line.lower()])
         try:
@@ -105,7 +122,7 @@ def generate_relevance_score(
     query,
     model_name="gpt-3.5-turbo-16k",
     threshold_score=7,
-    num_paper_in_prompt=8,
+    num_paper_in_prompt=1,
     temperature=0.4,
     top_p=1.0,
     sorting=True
@@ -121,7 +138,7 @@ def generate_relevance_score(
     decoding_args = utils.OpenAIDecodingArguments(
         temperature=temperature,
         n=1,
-        max_tokens=128*num_paper_in_prompt,  # The response for each paper should be less than 128 tokens.
+        max_tokens=1024*num_paper_in_prompt,  # The response for each paper should be less than 1024 tokens.
         top_p=top_p,
     )
     request_start = time.time()
@@ -153,8 +170,8 @@ def run_all_day_paper(
     date=None,
     data_dir="../data",
     model_name="gpt-3.5-turbo-16k",
-    threshold_score=8,
-    num_paper_in_prompt=8,
+    threshold_score=7,
+    num_paper_in_prompt=2,
     temperature=0.4,
     top_p=1.0
 ):
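post_process_chat_gpt_response assumes the model emits one JSON object per paper, optionally prefixed with an ordinal such as "1. ", which the pattern regex strips before json.loads. A minimal round-trip sketch — the sample response line is invented for illustration:

    import json
    import re

    pattern = r"^\d+\. |\\"  # strip a leading "1. " ordinal and stray backslashes

    # Invented example of one line of model output for a single paper.
    line = '1. {"Relevancy score": 8, "Reasons for match": "Covers LLM evaluation."}'

    if "relevancy score" in line.lower():
        item = json.loads(re.sub(pattern, "", line))
        print(item["Relevancy score"])  # 8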
src/utils.py CHANGED
@@ -24,7 +24,8 @@ if openai_org is not None:

 @dataclasses.dataclass
 class OpenAIDecodingArguments(object):
-    max_tokens: int = 1800
+    #max_tokens: int = 1800
+    max_tokens: int = 4800
     temperature: float = 0.2
     top_p: float = 1.0
     n: int = 1
@@ -39,7 +40,7 @@ def openai_completion(
     prompts, #: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
     decoding_args: OpenAIDecodingArguments,
     model_name="text-davinci-003",
-    sleep_time=2,
+    sleep_time=15,
     batch_size=1,
     max_instances=sys.maxsize,
     max_batches=sys.maxsize,
@@ -96,10 +97,11 @@ def openai_completion(
 ):
     batch_decoding_args = copy.deepcopy(decoding_args)  # cloning the decoding_args

-    backoff = 3
+    backoff = 5

     while True:
         try:
+            time.sleep(3)
             shared_kwargs = dict(
                 model=model_name,
                 **batch_decoding_args.__dict__,
@@ -134,6 +136,7 @@ def openai_completion(
             backoff -= 1
             logging.warning("Hit request rate limit; retrying...")
             time.sleep(sleep_time)  # Annoying rate limit on requests.
+            continue

     if return_text:
         completions = [completion.text for completion in completions]
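Taken together, the utils.py changes slow the request loop down: a 3-second pause before every attempt, a 15-second cool-down after a rate-limit error, five retries instead of three, and a continue so a failed batch is actually retried rather than falling through to the result handling. A stripped-down sketch of that control flow — call_api is a placeholder, and the broad except stands in for OpenAI's rate-limit error:

    import logging
    import time

    def completion_with_retries(call_api, backoff=5, sleep_time=15):
        """Retry a rate-limited API call, mirroring the loop in openai_completion."""
        while True:
            try:
                time.sleep(3)           # brief pause before every attempt
                return call_api()       # placeholder for the real OpenAI request
            except Exception:           # the real code catches OpenAI's rate-limit error
                if backoff <= 0:
                    raise
                backoff -= 1
                logging.warning("Hit request rate limit; retrying...")
                time.sleep(sleep_time)  # longer cool-down after an explicit rate limit
                continue                # loop back instead of falling through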