"""Fetch DOI, PDF link, citation, and title for a paper identified by a PMC or arXiv ID."""

import json
import xml.etree.ElementTree as ET

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
}


def fetch_pmc_doi(pmc_id):
    """Resolve a PMC ID to a DOI URL via the NCBI ID converter; return None if the lookup fails."""
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={pmc_id}&format=json"
    response = requests.get(url, headers=HEADERS).json()
    if response['status'] == 'ok':
        doi = response['records'][0]['doi']
        return f"https://doi.org/{doi}"
    return None


def fetch_pmc_pdf(pmc_id):
    """Return a direct PDF URL for a PMC article, or None if no PDF link can be found."""
    oa_url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
    record = ET.fromstring(requests.get(oa_url, headers=HEADERS).content)
    if record.find(".//error") is None:
        # The OA service returned a record; its <link> element carries an FTP URL to the PDF.
        href = record.find("records/record/link").attrib['href']
        return href.replace('ftp://', 'http://')
    # OA lookup failed (e.g. the article is not in the open-access subset), so scrape the article page.
    article_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}/"
    page = BeautifulSoup(requests.get(article_url, headers=HEADERS).content, 'html.parser')
    link = page.find("a", {"data-ga-label": "pdf_download_desktop"}, href=True)
    if link:
        return article_url + link['href']
    return None


def fetch_arxiv_doi(arxiv_id):
    """Scrape the arXiv abstract page and return the paper's DOI link text."""
    page_url = f"https://arxiv.org/abs/{arxiv_id}"
    page_content = requests.get(page_url, headers=HEADERS).content
    page_data = BeautifulSoup(page_content, 'html.parser')
    doi = page_data.find('td', {'class': "tablecell arxivdoi"}).find('a', {'id': 'arxiv-doi-link'}).text
    return doi


def fetch_citation_title(doi):
    """Fetch the APA-style citation and the work's title for a DOI from the CiteAs API."""
    url = f"https://api.citeas.org/product/{doi}"
    response = requests.get(url)
    response.raise_for_status()
    payload = response.json()
    cite = None
    for citation in payload['citations']:
        if citation['style_shortname'] == 'apa':
            cite = citation['citation']
    title = payload.get('name') or None
    return cite, title


def fetch_paper_data(paper_id):
    """Collect DOI, title, PDF URL, and citation for a PMC or arXiv ID; return the result as JSON."""
    data = {}
    try:
        if paper_id.startswith('PMC'):
            doi = fetch_pmc_doi(paper_id)
            pdf_url = fetch_pmc_pdf(paper_id)
        else:
            doi = fetch_arxiv_doi(paper_id)
            pdf_url = f"https://arxiv.org/pdf/{paper_id}"
        if doi and pdf_url:
            citation, title = fetch_citation_title(doi)
            citation = citation.replace('\n', ' ').strip()
            title = title.replace('\n', ' ').strip()
            data['status'] = 'success'
            data['data'] = {
                'doi': doi,
                'title': title,
                'pdf_url': pdf_url,
                'citation': citation,
            }
        else:
            data['status'] = 'error'
    except Exception as e:
        data['status'] = 'error'
        print(str(e))
    return json.dumps(data, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    data = fetch_paper_data('2501.06029')
    print(data)