# Hugging Face Space file view; last commit by davanstrien (HF staff):
# "increase returned papers to 7" (1ee33b8)
import gradio as gr
import requests
from cachetools import cached, TTLCache
from bs4 import BeautifulSoup
from httpx import Client
import json
from pathlib import Path
from huggingface_hub import CommitScheduler
from dotenv import load_dotenv
import os
from functools import lru_cache
from typing import Tuple
# Load variables from a local .env file (if present) into the environment
# before the token is read on the next line.
load_dotenv()
# Hugging Face access token; None when the HF_TOKEN variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
# Time-to-live passed to the TTLCache decorators used further down the file.
CACHE_TIME = 60 * 60 * 6 # 6 hours
# NOTE(review): this httpx client is not referenced anywhere in the visible
# part of the file (HTTP calls go through `requests`) - confirm before removing.
client = Client()
# Dataset repository that receives the logged comments.
REPO_ID = "librarian-bots/paper-recommendations-v2"
# Periodically commits the local "comments" folder to "data" in REPO_ID.
# log_comments() takes scheduler.lock while writing into that folder.
# NOTE(review): `every` is presumably an interval in minutes - confirm against
# the huggingface_hub CommitScheduler documentation.
scheduler = CommitScheduler(
repo_id=REPO_ID,
repo_type="dataset",
folder_path="comments",
path_in_repo="data",
every=5,
token=HF_TOKEN,
)
def parse_arxiv_id_from_paper_url(url):
    """Return the paper ID held in the last path segment of a paper URL.

    Surrounding whitespace, a query string or fragment, and trailing slashes
    are ignored, so ``https://huggingface.co/papers/2309.12307/`` and
    ``https://huggingface.co/papers/2309.12307?foo=bar`` both yield
    ``"2309.12307"``. A bare ID (no slashes) is returned unchanged.

    Args:
        url (str): Paper URL, or a bare paper ID.

    Returns:
        str: The final non-empty path segment, or "" if there is none.
    """
    # Drop the fragment first, then the query string, so neither leaks into the ID.
    path = url.strip().split("#", 1)[0].split("?", 1)[0]
    # Without rstrip a trailing "/" would make the last segment empty.
    return path.rstrip("/").split("/")[-1]
@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(semantic_scholar_id: str):
    """Fetch recommended papers for one paper from the Semantic Scholar API.

    Results are memoised per ``semantic_scholar_id`` in a TTL cache
    (500 entries, ``CACHE_TIME`` lifetime).

    Args:
        semantic_scholar_id (str): Paper identifier sent as the single
            positive example, e.g. ``"ArXiv:2309.12307"``.

    Returns:
        The ``recommendedPapers`` value of the JSON response; each entry is
        requested with the ``externalIds``, ``title`` and ``year`` fields.

    Raises:
        gr.Error: If the request fails or times out, the body is not valid
            JSON, or the response has no ``recommendedPapers`` key.
    """
    try:
        r = requests.post(
            "https://api.semanticscholar.org/recommendations/v1/papers/",
            json={
                "positivePaperIds": [semantic_scholar_id],
            },
            # The limit is twice filter_recommendations' default of 7,
            # presumably to leave room for results dropped by the arXiv filter.
            params={"fields": "externalIds,title,year", "limit": 14},
            # Without a timeout a stalled connection would block the worker forever.
            timeout=30,
        )
        return r.json()["recommendedPapers"]
    except (KeyError, ValueError, requests.exceptions.RequestException) as e:
        # ValueError covers invalid JSON bodies; RequestException covers
        # connection errors and timeouts. All surface as one UI error.
        raise gr.Error(
            "Error getting recommendations, if this is a new paper it may not yet have"
            " been indexed by Semantic Scholar."
        ) from e
def filter_recommendations(recommendations, max_paper_count=7):
    """Keep only recommendations that have an arXiv ID, capped at a maximum.

    Entries whose ``externalIds`` is missing, ``None`` or lacks an ``ArXiv``
    value are skipped instead of raising.

    Args:
        recommendations: Iterable of recommendation dicts.
        max_paper_count (int): Maximum number of entries to return.

    Returns:
        list: The first ``max_paper_count`` arXiv-backed entries, in their
        original order.
    """
    # include only arxiv papers
    arxiv_papers = [
        r
        for r in recommendations
        if (r.get("externalIds") or {}).get("ArXiv") is not None
    ]
    # Slicing already copes with lists shorter than the cap.
    return arxiv_papers[:max_paper_count]
@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_paper_title_from_arxiv_id(arxiv_id):
    """Look up a paper's title through the Hugging Face papers API.

    Results are memoised per ``arxiv_id`` in a TTL cache (500 entries,
    ``CACHE_TIME`` lifetime).

    Args:
        arxiv_id: Paper ID appended to ``https://huggingface.co/api/papers/``.

    Returns:
        The ``title`` field of the JSON response.

    Raises:
        gr.Error: If the request fails or times out, or the response has no
            usable ``title``; the original exception is chained as the cause.
    """
    try:
        response = requests.get(
            f"https://huggingface.co/api/papers/{arxiv_id}",
            # Without a timeout a stalled connection would block the worker forever.
            timeout=30,
        )
        return response.json()["title"]
    except Exception as e:
        # Build the message once so the log line and the UI error agree. The
        # original UI message lacked the f-prefix and showed raw placeholders.
        message = f"Error getting paper title for {arxiv_id}: {e}"
        print(message)
        raise gr.Error(message) from e
def format_recommendation_into_markdown(arxiv_id, recommendations):
    """Render recommendations as a markdown bullet list of Hugging Face paper links.

    Args:
        arxiv_id: ID of the paper the recommendations are for. Currently
            unused; kept so existing callers keep working.
        recommendations: Iterable of dicts with ``title``,
            ``externalIds["ArXiv"]`` and, optionally, ``year``.

    Returns:
        str: A header line followed by one ``* [title](url) (year)`` bullet
        per recommendation. The ``(year)`` suffix is omitted when the year is
        missing or ``None``, so "(None)" is never rendered.
    """
    header = "The following papers were recommended by the Semantic Scholar API \n\n"
    bullets = []
    for r in recommendations:
        hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
        year = r.get("year")
        year_suffix = f" ({year})" if year is not None else ""
        bullets.append(f"* [{r['title']}]({hub_paper_url}){year_suffix}\n")
    return header + "".join(bullets)
def format_comment(result: str):
    """Wrap recommendation markdown in the Librarian Bot's intro and footer text.

    Args:
        result (str): The markdown body to embed.

    Returns:
        str: The intro sentence, ``result``, then three footer paragraphs
        (thumbs-up request, link to the Space, and the tagging hint).
    """
    intro = "This is an automated message from the [Librarian Bot](https://huggingface.co/librarian-bots). I found the following papers similar to this paper. \n\n"
    footer_paragraphs = (
        "\n\n Please give a thumbs up to this comment if you found it helpful!",
        "\n\n If you want recommendations for any Paper on Hugging Face checkout [this](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) Space",
        "\n\n You can directly ask Librarian Bot for paper recommendations by tagging it in a comment: `@librarian-bot recommend`",
    )
    return intro + result + "".join(footer_paragraphs)
def post_comment(
    paper_url: str, comment: str, comment_id: str | None = None, token: str = HF_TOKEN
) -> Tuple[bool, str]:
    """
    Post a comment on a paper or a reply to a comment using the Hugging Face API.

    Request failures are not raised: they are printed and reported through the
    return value.

    Args:
        paper_url (str): The URL of the paper to post the comment on.
        comment (str): The text of the comment or reply to post.
        comment_id (str, optional): The ID of the comment to reply to. If provided (and non-empty), the function will post a reply to the specified comment. Defaults to None.
        token (str, optional): The authentication token to use for the API request. Defaults to HF_TOKEN.

    Returns:
        Tuple[bool, str]: A tuple containing two elements:
            - bool: True if the API answered with status 201, False otherwise (including network errors and timeouts).
            - str: The ID of the posted comment or reply if the response carried one, an empty string otherwise.
    """
    # Use the shared helper so the paper ID is parsed the same way everywhere.
    paper_id = parse_arxiv_id_from_paper_url(paper_url)
    kind = "reply" if comment_id else "comment"
    if comment_id:
        url = f"https://huggingface.co/api/papers/{paper_id}/comment/{comment_id}/reply"
        print(f"Replying to comment {comment_id}")
        gr.Info(f"Replying to comment {comment_id}")
    else:
        url = f"https://huggingface.co/api/papers/{paper_id}/comment"
        print(f"Posting comment for {paper_url}")
        gr.Info(f"Posting comment for {paper_url}")
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    try:
        response = requests.post(
            url,
            json={"comment": comment},
            headers=headers,
            # Without a timeout a stalled connection would block the worker forever.
            timeout=30,
        )
    except requests.exceptions.RequestException as e:
        print(f"Error posting {kind} for {paper_url}: {e}")
        return False, ""

    if response.status_code != 201:
        print(
            f"Failed to post {kind} for {paper_url}. Status code: {response.status_code}"
        )
        print(f"Response text: {response.text}")
        return False, ""

    try:
        posted_comment_id = response.json().get("id", "")
    except ValueError:
        # The post itself succeeded (201); an unparsable body only costs us the ID.
        posted_comment_id = ""

    if comment_id:
        print(
            f"Reply posted successfully to comment {comment_id} for {paper_url}. Reply ID: {posted_comment_id}"
        )
    else:
        print(
            f"Comment posted successfully for {paper_url}. Comment ID: {posted_comment_id}"
        )
    return True, posted_comment_id
# @lru_cache(maxsize=500)
# def is_comment_from_librarian_bot(html: str) -> bool:
# """
# Checks if the given HTML contains a comment from the librarian-bot.
# Args:
# html (str): The HTML content to check.
# Returns:
# bool: True if a comment from the librarian-bot is found, False otherwise.
# """
# soup = BeautifulSoup(html, "lxml")
# librarian_bot_links = soup.find_all("a", string="librarian-bot")
# return any(librarian_bot_links)
def check_if_lib_bot_comment_exists(paper_url: str) -> Tuple[bool, str]:
    """
    Check if a comment or reply from the librarian-bot exists for a given paper URL using the Hugging Face API.

    No exception escapes this function: any error is printed and reported as
    ``(True, "")`` so that callers err on the side of not posting.

    Args:
        paper_url (str): The URL of the paper to check for librarian-bot comments.

    Returns:
        Tuple[bool, str]: A tuple containing two elements:
            - bool: True if a comment or reply from the librarian-bot is found, or if the check itself raised an error; False otherwise (including a non-200 response).
            - str: The ID of the top-level comment that was written by the bot or contains its reply; an empty string when nothing was found or the check failed.
    """
    try:
        # Use the shared helper so the paper ID is parsed the same way everywhere.
        paper_id = parse_arxiv_id_from_paper_url(paper_url)
        url = f"https://huggingface.co/api/papers/{paper_id}/?field=comments"
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            print(
                f"Failed to retrieve comments for {paper_url}. Status code: {response.status_code}"
            )
            # NOTE(review): a non-200 answer reports "no bot comment", so the
            # caller goes on to post, whereas an exception below blocks posting.
            # Confirm that this asymmetry is intended.
            return False, ""

        comments = response.json().get("comments") or []
        for comment in comments:
            # Authors of the comment itself and of each reply. `or` guards
            # against explicit nulls in the payload.
            authors = [comment.get("author")]
            authors.extend(reply.get("author") for reply in comment.get("replies") or [])
            if any((author or {}).get("name") == "librarian-bot" for author in authors):
                # Replies are reported through their parent comment's ID.
                return True, comment.get("id")
        return False, ""
    except Exception as e:
        print(f"Error checking if comment exists for {paper_url}: {e}")
        return True, ""  # default to not posting comment
def log_comments(paper_url: str, comment: str):
    """
    Write a posted comment to ``comments/<paper_id>.json``, once per paper.

    Nothing is written when a file for the paper already exists. The write
    happens while holding ``scheduler.lock``.

    Args:
        paper_url (str): The URL of the paper; its last path segment names the file.
        comment (str): The comment text to store alongside the URL.

    Returns:
        None
    """
    paper_id = paper_url.split("/")[-1]
    target = Path(f"comments/{paper_id}.json")
    # An existing record is left alone.
    if target.exists():
        return
    record = {"paper_url": paper_url, "comment": comment}
    with scheduler.lock, open(target, "w") as handle:
        json.dump(record, handle)
def return_recommendations(
    url: str, comment_id: str | None, post_to_paper: bool = True
) -> str:
    """Build markdown recommendations for a paper and optionally post them.

    Args:
        url (str): Paper URL; its last path segment is used as the arXiv ID.
        comment_id (str | None): When truthy, the recommendations are posted
            as a reply to this comment; otherwise as a new comment.
        post_to_paper (bool): When False, nothing is posted and the markdown
            is simply returned.

    Returns:
        str: The formatted recommendation list (without the bot's intro and
        footer), regardless of whether posting happened or succeeded.
    """
    paper_id = parse_arxiv_id_from_paper_url(url)
    candidates = get_recommendations_from_semantic_scholar(f"ArXiv:{paper_id}")
    markdown = format_recommendation_into_markdown(
        paper_id, filter_recommendations(candidates)
    )

    if not post_to_paper:
        return markdown

    body = format_comment(markdown)

    # Never post twice on the same paper.
    already_commented, bot_comment_id = check_if_lib_bot_comment_exists(url)
    if already_commented:
        gr.Info(
            f"Librarian-bot already commented on this paper. Comment ID: {bot_comment_id}. No further action will be taken."
        )
        return markdown

    if comment_id:
        # Reply to the given comment.
        posted, new_id = post_comment(url, body, comment_id, token=HF_TOKEN)
        success_note = f"Posted reply to comment {new_id}"
    else:
        # No target comment: post a new top-level comment.
        posted, new_id = post_comment(url, body, token=HF_TOKEN)
        success_note = f"Posted new comment {new_id}"

    if posted:
        log_comments(url, body)
        gr.Info(success_note)
    else:
        gr.Info("Failed to post comment")
    return markdown
# Heading and introductory text passed to the Gradio interface below.
title = "Semantic Scholar Paper Recommender"
description = (
"Paste a link to a paper on Hugging Face Papers and get recommendations for similar"
" papers from Semantic Scholar. **Note**: Some papers may not have recommendations"
" yet if they are new or have not been indexed by Semantic Scholar."
)
# Example rows, one value per input component in order: paper URL, comment ID,
# and the "post to paper" checkbox. The checkbox is False in every example, so
# return_recommendations skips its posting branch for them.
examples = [
["https://huggingface.co/papers/2309.12307", None, False],
["https://huggingface.co/papers/2211.10086", None, False],
]
# Wire return_recommendations to the UI: three inputs matching its
# (url, comment_id, post_to_paper) parameters, output rendered as Markdown.
interface = gr.Interface(
return_recommendations,
[
# Paper URL.
gr.Textbox(lines=1),
# Comment ID to reply to; hidden in the UI (visible=False) and labelled as API-only.
gr.Textbox(None, lines=1, label="Comment ID (only for API)", visible=False),
# Created with False, so posting is opt-in from the UI.
gr.Checkbox(False, label="Post recommendations to Paper page?"),
],
gr.Markdown(),
examples=examples,
title=title,
description=description,
)
# Both calls run at module level, i.e. as soon as this file is executed.
interface.queue()
interface.launch()