wzkariampuzha committed on
Commit
c52403d
·
1 Parent(s): c14e35e

Update classify_abs.py

Browse files
Files changed (1) hide show
  1. classify_abs.py +60 -61
classify_abs.py CHANGED
@@ -291,70 +291,69 @@ def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:
291
  searchterm_list = list(searchterm_list)
292
  #maxResults is multiplied by a little bit because sometimes the results returned is more than maxResults
293
  percent_by_step = 1/(maxResults*1.05)
294
- API_Loading = st.spinner("Gathering PubMed IDs...")
295
- PMIDs_bar = st.progress(0)
296
- for dz in searchterm_list:
297
- term = ''
298
- dz_words = dz.split()
299
- for word in dz_words:
300
- term += word + '%20'
301
- query = term[:-3]
302
-
303
- url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
304
- r = requests.get(url)
305
- root = ET.fromstring(r.content)
306
-
307
- for result in root.iter('IdList'):
308
- if len(pmids) >= maxResults:
309
- break
310
- pmidlist = [pmid.text for pmid in result.iter('Id')]
311
- pmids.update(pmidlist)
312
- PMIDs_bar.progress(min(round(len(pmids)*percent_by_step,1),1.0))
313
-
314
- url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
315
- r = requests.get(url)
316
- root = ET.fromstring(r.content)
317
-
318
- for result in root.iter('result'):
319
- if len(pmids) >= maxResults:
320
- break
321
- pmidlist = [pmid.text for pmid in result.iter('id')]
322
- if len(pmidlist) > 0:
323
- pmid = pmidlist[0]
324
- if pmid[0].isdigit():
325
- pmids.add(pmid)
326
- PMIDs_bar.progress(min(round(len(pmids)*percent_by_step,1),1.0))
327
- PMIDs_bar.empty()
328
 
329
- API_Loading = st.spinner("Found "+str(len(pmids))+" PMIDs. Gathering Abstracts and Filtering...")
330
- abstracts_bar = st.progress(0)
331
- percent_by_step = 1/(maxResults)
332
- if filtering !='none' or filtering !='strict':
333
- filter_terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))
334
-
335
- for pmid in pmids:
336
- abstract = PMID_getAb(pmid)
337
- if len(abstract)>5:
338
- #do filtering here
339
- if filtering == 'strict':
340
- uncased_ab = abstract.lower()
341
- for term in searchterm_list:
342
- if term.lower() in uncased_ab:
343
- pmid_abs[pmid] = abstract
344
- abstracts_bar.progress(min(round(len(pmid_abs)*percent_by_step,1),1.0))
345
- break
346
- elif filtering =='none':
347
- pmid_abs[pmid] = abstract
348
- abstracts_bar.progress(min(round(len(pmid_abs)*percent_by_step,1),1.0))
349
-
350
- #Default filtering is 'lenient'.
351
- else:
352
- #Else and if are separated for readability and to better understand logical flow.
353
- if set(filter_terms).intersection(set(word_tokenize(abstract))):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  pmid_abs[pmid] = abstract
355
  abstracts_bar.progress(min(round(len(pmid_abs)*percent_by_step,1),1.0))
356
- abstracts_bar.empty()
357
- API_Loading.empty()
 
 
 
 
 
 
358
  st.success('Found '+str(len(pmids))+' PMIDs. Gathered '+str(len(pmid_abs))+' Relevant Abstracts. Classifying and extracting epidemiology information...')
359
 
360
  return pmid_abs, (len(pmids),len(pmid_abs))
 
291
  searchterm_list = list(searchterm_list)
292
  #maxResults is multiplied by a little bit because sometimes the results returned is more than maxResults
293
  percent_by_step = 1/(maxResults*1.05)
294
+ with st.spinner("Gathering PubMed IDs..."):
295
+ PMIDs_bar = st.progress(0)
296
+ for dz in searchterm_list:
297
+ term = ''
298
+ dz_words = dz.split()
299
+ for word in dz_words:
300
+ term += word + '%20'
301
+ query = term[:-3]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
+ url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
304
+ r = requests.get(url)
305
+ root = ET.fromstring(r.content)
306
+
307
+ for result in root.iter('IdList'):
308
+ if len(pmids) >= maxResults:
309
+ break
310
+ pmidlist = [pmid.text for pmid in result.iter('Id')]
311
+ pmids.update(pmidlist)
312
+ PMIDs_bar.progress(min(round(len(pmids)*percent_by_step,1),1.0))
313
+
314
+ url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
315
+ r = requests.get(url)
316
+ root = ET.fromstring(r.content)
317
+
318
+ for result in root.iter('result'):
319
+ if len(pmids) >= maxResults:
320
+ break
321
+ pmidlist = [pmid.text for pmid in result.iter('id')]
322
+ if len(pmidlist) > 0:
323
+ pmid = pmidlist[0]
324
+ if pmid[0].isdigit():
325
+ pmids.add(pmid)
326
+ PMIDs_bar.progress(min(round(len(pmids)*percent_by_step,1),1.0))
327
+ PMIDs_bar.empty()
328
+
329
+ with st.spinner("Found "+str(len(pmids))+" PMIDs. Gathering Abstracts and Filtering..."):
330
+ abstracts_bar = st.progress(0)
331
+ percent_by_step = 1/(maxResults)
332
+ if filtering !='none' or filtering !='strict':
333
+ filter_terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))
334
+
335
+ for pmid in pmids:
336
+ abstract = PMID_getAb(pmid)
337
+ if len(abstract)>5:
338
+ #do filtering here
339
+ if filtering == 'strict':
340
+ uncased_ab = abstract.lower()
341
+ for term in searchterm_list:
342
+ if term.lower() in uncased_ab:
343
+ pmid_abs[pmid] = abstract
344
+ abstracts_bar.progress(min(round(len(pmid_abs)*percent_by_step,1),1.0))
345
+ break
346
+ elif filtering =='none':
347
  pmid_abs[pmid] = abstract
348
  abstracts_bar.progress(min(round(len(pmid_abs)*percent_by_step,1),1.0))
349
+
350
+ #Default filtering is 'lenient'.
351
+ else:
352
+ #Else and if are separated for readability and to better understand logical flow.
353
+ if set(filter_terms).intersection(set(word_tokenize(abstract))):
354
+ pmid_abs[pmid] = abstract
355
+ abstracts_bar.progress(min(round(len(pmid_abs)*percent_by_step,1),1.0))
356
+ abstracts_bar.empty()
357
  st.success('Found '+str(len(pmids))+' PMIDs. Gathered '+str(len(pmid_abs))+' Relevant Abstracts. Classifying and extracting epidemiology information...')
358
 
359
  return pmid_abs, (len(pmids),len(pmid_abs))