raannakasturi committed
Commit 908a89c · 1 Parent(s): 246c816

Add Gradio app for fetching research paper data and implement fetching functions

__pycache__/fetch_paper_data.cpython-312.pyc ADDED
Binary file (3.89 kB).
 
app.py ADDED
@@ -0,0 +1,32 @@
+import gradio as gr
+from fetch_paper_data import fetch_paper_data
+
+theme = gr.themes.Soft(
+    primary_hue="purple",
+    secondary_hue="cyan",
+    neutral_hue="slate",
+    font=[
+        gr.themes.GoogleFont('Syne'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins')
+    ],
+)
+
+def clear_data(id, raw_data):
+    id = None
+    raw_data = None
+    return id, raw_data
+
+with gr.Blocks(theme=theme, title="Fetch Research Paper Data") as app:
+    with gr.Row():
+        with gr.Column():
+            id = gr.Textbox(label="Enter PMCID or arXiv ID", placeholder="PMC1234567 or 1234.56789")
+            with gr.Row():
+                fetch_data_btn = gr.Button(value="Fetch Data")
+                reset_btn = gr.Button(value="Reset")
+            raw_data = gr.Textbox(lines=15, label="Raw Data", interactive=False, show_copy_button=True)
+    fetch_data_btn.click(fn=fetch_paper_data, inputs=[id], outputs=[raw_data], api_name="fetch_paper_data")
+    reset_btn.click(fn=clear_data, inputs=[id, raw_data], outputs=[id, raw_data])
+
+app.queue(default_concurrency_limit=25).launch(max_threads=5000)
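
Because the Fetch Data button is registered with api_name="fetch_paper_data", the same endpoint can also be called programmatically. A minimal sketch using gradio_client, assuming the app is running locally on Gradio's default port (the URL is an assumption, not part of this commit):

from gradio_client import Client

# Assumed local URL; point this at the deployed Space instead if applicable.
client = Client("http://127.0.0.1:7860/")

# Invokes the endpoint exposed via api_name="fetch_paper_data" and
# returns the JSON string built by fetch_paper_data().
result = client.predict("PMC8391798", api_name="/fetch_paper_data")
print(result)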
fetch_arxiv_data.py → fetch_paper_data.py RENAMED
@@ -1,5 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
+from xml.etree import ElementTree as ET
+import json
 
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
@@ -12,6 +14,17 @@ def fetch_pmc_doi(pmc_id):
     doi = response['records'][0]['doi']
     return f"https://doi.org/{doi}"
 
+def fetch_pmc_pdf(pmc_id):
+    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
+    response = requests.get(url, headers=HEADERS).content
+    records = ET.fromstring(response).find('records').findall('record')
+    for record in records:
+        if record.attrib['id'] == pmc_id:
+            pdf_url = record.find('link').attrib['href']
+            return pdf_url.replace('ftp://', 'https://')
+    else:
+        return None
+
 def fetch_arxiv_doi(arxiv_id):
     page_url = f"https://arxiv.org/abs/{arxiv_id}"
     page_content = requests.get(page_url, headers=HEADERS).content
@@ -23,14 +36,25 @@ def fetch_citation(doi):
     citation_content = requests.get(doi, headers={ 'User-Agent':HEADERS['User-Agent'], 'Accept': 'text/x-bibliography; style=apa'}).content
     return citation_content.decode('utf-8')
 
-def generate_citation(id):
-    if id.startswith('PMC'):
-        doi = fetch_pmc_doi(id)
-    else:
-        doi = fetch_arxiv_doi(id)
-    citation = fetch_citation(doi).replace('\n', ' ').replace('<i>', '').replace('</i>', '').strip()
-    return citation
+def fetch_paper_data(id):
+    data = {}
+    try:
+        if id.startswith('PMC'):
+            doi = fetch_pmc_doi(id)
+            pdf_url = fetch_pmc_pdf(id)
+        else:
+            doi = fetch_arxiv_doi(id)
+            pdf_url = f"https://arxiv.org/pdf/{id}"
+        citation = fetch_citation(doi).replace('\n', ' ').strip()
+        data['status'] = 'success'
+        data['doi'] = doi
+        data['pdf_url'] = pdf_url
+        data['citation'] = citation
+    except Exception as e:
+        data['status'] = 'error'
+        print(str(e))
+    return json.dumps(data, indent=4, ensure_ascii=False)
 
 if __name__ == '__main__':
-    citation = generate_citation('2105.03824')
+    citation = fetch_paper_data('PMC8391798')
     print(citation)
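
For reference, a successful call to fetch_paper_data returns a pretty-printed JSON string of the following shape; the values below are placeholders, not actual output of this commit:

{
    "status": "success",
    "doi": "https://doi.org/<doi>",
    "pdf_url": "<resolved PMC OA or arXiv PDF URL>",
    "citation": "<APA-formatted citation string>"
}

On failure the function swallows the exception, prints it to stdout, and returns only {"status": "error"}.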