Commit 908a89c
Parent(s): 246c816
Add Gradio app for fetching research paper data and implement fetching functions
__pycache__/fetch_paper_data.cpython-312.pyc    ADDED
Binary file (3.89 kB)
app.py    ADDED
@@ -0,0 +1,32 @@
+import gradio as gr
+from fetch_paper_data import fetch_paper_data
+
+theme = gr.themes.Soft(
+    primary_hue="purple",
+    secondary_hue="cyan",
+    neutral_hue="slate",
+    font=[
+        gr.themes.GoogleFont('Syne'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins')
+    ],
+)
+
+def clear_data(id, raw_data):
+    id = None
+    raw_data = None
+    return id, raw_data
+
+with gr.Blocks(theme=theme, title="Fetch Research Paper Data") as app:
+    with gr.Row():
+        with gr.Column():
+            id = gr.Textbox(label="Enter PMCID or arXiv ID", placeholder="PMC1234567 or 1234.56789")
+            with gr.Row():
+                fetch_data_btn = gr.Button(value="Fetch Data")
+                reset_btn = gr.Button(value="Reset")
+        raw_data = gr.Textbox(lines=15, label="Raw Data", interactive=False, show_copy_button=True)
+    fetch_data_btn.click(fn=fetch_paper_data, inputs=[id], outputs=[raw_data], api_name="fetch_paper_data")
+    reset_btn.click(fn=clear_data, inputs=[id, raw_data], outputs=[id, raw_data])
+
+app.queue(default_concurrency_limit=25).launch(max_threads=5000)
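The click handler above registers the endpoint name "fetch_paper_data", so the running app can also be called programmatically. A minimal sketch (not part of the commit), assuming the app is running at Gradio's default local address and the gradio_client package is installed:

from gradio_client import Client

# Connect to the locally running app; the address is an assumption, not from the commit.
client = Client("http://127.0.0.1:7860")

# Call the endpoint registered via api_name="fetch_paper_data" with a PMCID.
result = client.predict("PMC8391798", api_name="/fetch_paper_data")
print(result)  # JSON string containing status, doi, pdf_url, and citation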
fetch_arxiv_data.py → fetch_paper_data.py    RENAMED
@@ -1,5 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
+from xml.etree import ElementTree as ET
+import json
 
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
@@ -12,6 +14,17 @@ def fetch_pmc_doi(pmc_id):
     doi = response['records'][0]['doi']
     return f"https://doi.org/{doi}"
 
+def fetch_pmc_pdf(pmc_id):
+    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
+    response = requests.get(url, headers=HEADERS).content
+    records = ET.fromstring(response).find('records').findall('record')
+    for record in records:
+        if record.attrib['id'] == pmc_id:
+            pdf_url = record.find('link').attrib['href']
+            return pdf_url.replace('ftp://', 'https://')
+    else:
+        return None
+
 def fetch_arxiv_doi(arxiv_id):
     page_url = f"https://arxiv.org/abs/{arxiv_id}"
     page_content = requests.get(page_url, headers=HEADERS).content
@@ -23,14 +36,25 @@ def fetch_citation(doi):
     citation_content = requests.get(doi, headers={ 'User-Agent':HEADERS['User-Agent'], 'Accept': 'text/x-bibliography; style=apa'}).content
     return citation_content.decode('utf-8')
 
-def
-
-
-
-
-
-
+def fetch_paper_data(id):
+    data = {}
+    try:
+        if id.startswith('PMC'):
+            doi = fetch_pmc_doi(id)
+            pdf_url = fetch_pmc_pdf(id)
+        else:
+            doi = fetch_arxiv_doi(id)
+            pdf_url = f"https://arxiv.org/pdf/{id}"
+        citation = fetch_citation(doi).replace('\n', ' ').strip()
+        data['status'] = 'success'
+        data['doi'] = doi
+        data['pdf_url'] = pdf_url
+        data['citation'] = citation
+    except Exception as e:
+        data['status'] = 'error'
+        print(str(e))
+    return json.dumps(data, indent=4, ensure_ascii=False)
 
 if __name__ == '__main__':
-    citation =
+    citation = fetch_paper_data('PMC8391798')
     print(citation)
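Since fetch_paper_data returns a JSON string built with json.dumps, a caller can parse it back into a dict. A minimal usage sketch (not part of the commit), reusing the PMCID from the __main__ block and assuming requests and beautifulsoup4 are installed:

import json
from fetch_paper_data import fetch_paper_data

# fetch_paper_data returns a JSON string; parse it to inspect individual fields.
result = json.loads(fetch_paper_data('PMC8391798'))
if result['status'] == 'success':
    print(result['doi'])
    print(result['pdf_url'])
    print(result['citation'])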