raannakasturi committed
Commit 908a89c · 1 Parent(s): 246c816

Add Gradio app for fetching research paper data and implement fetching functions

__pycache__/fetch_paper_data.cpython-312.pyc ADDED
Binary file (3.89 kB).
 
app.py ADDED
@@ -0,0 +1,32 @@
+import gradio as gr
+from fetch_paper_data import fetch_paper_data
+
+theme = gr.themes.Soft(
+    primary_hue="purple",
+    secondary_hue="cyan",
+    neutral_hue="slate",
+    font=[
+        gr.themes.GoogleFont('Syne'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins')
+    ],
+)
+
+def clear_data(id, raw_data):
+    id = None
+    raw_data = None
+    return id, raw_data
+
+with gr.Blocks(theme=theme, title="Fetch Research Paper Data") as app:
+    with gr.Row():
+        with gr.Column():
+            id = gr.Textbox(label="Enter PMCID or arXiv ID", placeholder="PMC1234567 or 1234.56789")
+            with gr.Row():
+                fetch_data_btn = gr.Button(value="Fetch Data")
+                reset_btn = gr.Button(value="Reset")
+            raw_data = gr.Textbox(lines=15, label="Raw Data", interactive=False, show_copy_button=True)
+    fetch_data_btn.click(fn=fetch_paper_data, inputs=[id], outputs=[raw_data], api_name="fetch_paper_data")
+    reset_btn.click(fn=clear_data, inputs=[id, raw_data], outputs=[id, raw_data])
+
+app.queue(default_concurrency_limit=25).launch(max_threads=5000)
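
Because the Fetch Data button is registered with api_name="fetch_paper_data", the same endpoint can also be called programmatically. A minimal sketch using gradio_client, assuming the app is running locally on Gradio's default port (the URL is an assumption, not part of this commit):

from gradio_client import Client

# Assumed local URL; point this at the deployed Space instead if applicable.
client = Client("http://127.0.0.1:7860/")

# Invokes the endpoint exposed via api_name="fetch_paper_data" and
# returns the JSON string built by fetch_paper_data().
result = client.predict("PMC8391798", api_name="/fetch_paper_data")
print(result)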
fetch_arxiv_data.py → fetch_paper_data.py RENAMED
@@ -1,5 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
+from xml.etree import ElementTree as ET
+import json
 
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
@@ -12,6 +14,17 @@ def fetch_pmc_doi(pmc_id):
     doi = response['records'][0]['doi']
     return f"https://doi.org/{doi}"
 
+def fetch_pmc_pdf(pmc_id):
+    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
+    response = requests.get(url, headers=HEADERS).content
+    records = ET.fromstring(response).find('records').findall('record')
+    for record in records:
+        if record.attrib['id'] == pmc_id:
+            pdf_url = record.find('link').attrib['href']
+            return pdf_url.replace('ftp://', 'https://')
+    else:
+        return None
+
 def fetch_arxiv_doi(arxiv_id):
     page_url = f"https://arxiv.org/abs/{arxiv_id}"
     page_content = requests.get(page_url, headers=HEADERS).content
@@ -23,14 +36,25 @@ def fetch_citation(doi):
     citation_content = requests.get(doi, headers={ 'User-Agent':HEADERS['User-Agent'], 'Accept': 'text/x-bibliography; style=apa'}).content
     return citation_content.decode('utf-8')
 
-def generate_citation(id):
-    if id.startswith('PMC'):
-        doi = fetch_pmc_doi(id)
-    else:
-        doi = fetch_arxiv_doi(id)
-    citation = fetch_citation(doi).replace('\n', ' ').replace('<i>', '').replace('</i>', '').strip()
-    return citation
+def fetch_paper_data(id):
+    data = {}
+    try:
+        if id.startswith('PMC'):
+            doi = fetch_pmc_doi(id)
+            pdf_url = fetch_pmc_pdf(id)
+        else:
+            doi = fetch_arxiv_doi(id)
+            pdf_url = f"https://arxiv.org/pdf/{id}"
+        citation = fetch_citation(doi).replace('\n', ' ').strip()
+        data['status'] = 'success'
+        data['doi'] = doi
+        data['pdf_url'] = pdf_url
+        data['citation'] = citation
+    except Exception as e:
+        data['status'] = 'error'
+        print(str(e))
+    return json.dumps(data, indent=4, ensure_ascii=False)
 
 if __name__ == '__main__':
-    citation = generate_citation('2105.03824')
+    citation = fetch_paper_data('PMC8391798')
     print(citation)
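
For reference, a successful call to fetch_paper_data returns a pretty-printed JSON string of the following shape; the values below are placeholders, not actual output of this commit:

{
    "status": "success",
    "doi": "https://doi.org/<doi>",
    "pdf_url": "<resolved PMC OA or arXiv PDF URL>",
    "citation": "<APA-formatted citation string>"
}

On failure the function swallows the exception, prints it to stdout, and returns only {"status": "error"}.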