wzkariampuzha committed on
Commit
e517955
·
1 Parent(s): d5406d4

Update classify_abs.py

Browse files
Files changed (1) hide show
  1. classify_abs.py +61 -80
classify_abs.py CHANGED
@@ -277,103 +277,84 @@ def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int
277
 
278
  return pmid_abs
279
 
 
280
  def streamlist_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
281
- #set of all pmids
282
  pmids = set()
283
 
284
- #dictionary {pmid:abstract}
285
  pmid_abs = {}
286
 
287
- #type validation, allows string or list input
288
  if type(searchterm_list)!=list:
289
  if type(searchterm_list)==str:
290
  searchterm_list = [searchterm_list]
291
  else:
292
  searchterm_list = list(searchterm_list)
293
 
294
- my_bar = st.progress(0)
295
- percent_by_step = 100/maxResults
296
 
297
- #gathers pmids into a set first
298
- for dz in searchterm_list:
299
- term = ''
300
- dz_words = dz.split()
301
- for word in dz_words:
302
- term += word + '%20'
303
- query = term[:-3]
304
-
305
- ## get pmid results from searching for disease name through PubMed API
306
- url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
307
- r = requests.get(url)
308
- root = ET.fromstring(r.content)
309
-
310
- # loop over resulting articles
311
- for result in root.iter('IdList'):
312
- if len(pmids) >= maxResults:
313
- break
314
- pmidlist = [pmid.text for pmid in result.iter('Id')]
315
- pmids.update(pmidlist)
316
-
317
- ## get results from searching for disease name through EBI API
318
- url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
319
- r = requests.get(url)
320
- root = ET.fromstring(r.content)
321
 
322
- # loop over resulting articles
323
- for result in root.iter('result'):
324
- if len(pmids) >= maxResults:
325
- break
326
- pmidlist = [pmid.text for pmid in result.iter('id')]
327
- #can also gather abstract and title here but for some reason did not work as intended the first time. Optimize in future versions to reduce latency.
328
- if len(pmidlist) > 0:
329
- pmid = pmidlist[0]
330
- if pmid[0].isdigit():
331
- pmids.add(pmid)
332
 
333
- #Construct sets for filtering (right before adding abstract to pmid_abs
334
- # The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
335
- #if filtering is 'lenient' or default
336
- if filtering !='none' or filtering !='strict':
337
- filter_terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))
338
- '''
339
- # The above is equivalent to this but uses less memory and may be faster:
340
- #create a single string of the terms within the searchterm_list
341
- joined = ' '.join(searchterm_list)
342
- #remove commas
343
- comma_gone = re.sub(',','',joined)
344
- #split the string into list of words and convert list into a Pythonic set
345
- split = set(comma_gone.split())
346
- #remove the STOPWORDS from the set of key words
347
- key_words = split.difference(STOPWORDS)
348
- #create a new set of the list members in searchterm_list
349
- search_set = set(searchterm_list)
350
- #join the two sets
351
- terms = search_set.union(key_words)
352
- #if any word(s) in the abstract intersect with any of these terms then the abstract is good to go.
353
- '''
354
 
355
- ## get abstracts from EBI PMID API and output a dictionary
356
- for pmid in pmids:
357
- abstract = PMID_getAb(pmid)
358
- if len(abstract)>5:
359
- #do filtering here
360
- if filtering == 'strict':
361
- uncased_ab = abstract.lower()
362
- for term in searchterm_list:
363
- if term.lower() in uncased_ab:
364
- pmid_abs[pmid] = abstract
365
- break
366
- elif filtering =='none':
367
- pmid_abs[pmid] = abstract
368
-
369
- #Default filtering is 'lenient'.
370
- else:
371
- #Else and if are separated for readability and to better understand logical flow.
372
- if set(filter_terms).intersection(set(word_tokenize(abstract))):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  pmid_abs[pmid] = abstract
 
374
 
375
-
376
- print('Found',len(pmids),'PMIDs. Gathered',len(pmid_abs),'Relevant Abstracts.')
 
 
 
 
 
 
377
 
378
  return pmid_abs
379
 
 
277
 
278
  return pmid_abs
279
 
280
# This is a Streamlit version of search_getAbs. Refer to search_getAbs for documentation.
def streamlist_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
    """Search PubMed and Europe PMC for the given terms and return relevant abstracts.

    Progress is reported through two Streamlit progress bars: one while PMIDs
    are being collected and one while abstracts are fetched and filtered.

    Args:
        searchterm_list: A single search term, or an iterable of terms.
            Non-string terms (e.g. ints) are converted with ``str``.
        maxResults: Soft cap on the number of PMIDs to collect. Collection for
            a source stops once the cap is reached, but a whole batch of IDs is
            added at a time, so the final count can exceed this value.
        filtering: ``'strict'`` keeps an abstract only if one of the search
            terms occurs in it (case-insensitive substring match);
            ``'none'`` keeps every abstract; any other value is treated as
            ``'lenient'`` and keeps an abstract if any of its tokens matches a
            search term or a non-stopword word taken from the search terms.

    Returns:
        A dictionary mapping PMID to abstract text for the abstracts kept.
    """
    # Local import: the top-of-file import block is not guaranteed to have it.
    from urllib.parse import quote

    def _fraction(done, total):
        # st.progress rejects floats outside [0.0, 1.0]; clamp and avoid
        # dividing by zero when there is nothing to do.
        return min(1.0, done / total) if total > 0 else 1.0

    pmids = set()
    pmid_abs = {}

    # Accept a bare string or any iterable, and normalise every term to str so
    # the string operations below also work for integer terms.
    if isinstance(searchterm_list, str):
        searchterm_list = [searchterm_list]
    else:
        searchterm_list = [str(term) for term in searchterm_list]

    pmids_bar = st.progress(0)

    for dz in searchterm_list:
        # Collapse whitespace, then percent-encode so characters such as
        # '&' or '#' inside a term cannot corrupt the request URL.
        query = quote(' '.join(dz.split()))

        # PMIDs from the NCBI E-utilities esearch endpoint.
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
        r = requests.get(url)
        root = ET.fromstring(r.content)

        for result in root.iter('IdList'):
            if len(pmids) >= maxResults:
                break
            pmids.update(pmid.text for pmid in result.iter('Id'))
        pmids_bar.progress(_fraction(len(pmids), maxResults))

        # PMIDs from the Europe PMC search endpoint.
        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
        r = requests.get(url)
        root = ET.fromstring(r.content)

        for result in root.iter('result'):
            if len(pmids) >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('id')]
            if len(pmidlist) > 0:
                pmid = pmidlist[0]
                # Only keep ids that start with a digit; other ids are skipped.
                if pmid[0].isdigit():
                    pmids.add(pmid)
        pmids_bar.progress(_fraction(len(pmids), maxResults))

    pmids_bar.progress(1.0)
    st.success(f'Found {len(pmids)} PMIDs. Gathering Abstracts and Filtering...')

    abstracts_bar = st.progress(0)

    # The filter-term set is only needed by the default ('lenient') mode: the
    # search terms themselves plus every non-stopword word they contain.
    filter_terms = None
    if filtering not in ('none', 'strict'):
        words = ' '.join(searchterm_list).replace(',', '').split()
        filter_terms = set(searchterm_list).union(set(words).difference(STOPWORDS))

    total = len(pmids)
    for done, pmid in enumerate(pmids, start=1):
        abstract = PMID_getAb(pmid)
        if len(abstract) > 5:
            if filtering == 'strict':
                uncased_ab = abstract.lower()
                keep = any(term.lower() in uncased_ab for term in searchterm_list)
            elif filtering == 'none':
                keep = True
            else:
                # Default filtering is 'lenient'.
                keep = not filter_terms.isdisjoint(word_tokenize(abstract))
            if keep:
                pmid_abs[pmid] = abstract
        # Progress tracks PMIDs processed, so the bar always reaches 100%.
        abstracts_bar.progress(_fraction(done, total))

    st.success(f'Found {len(pmids)} PMIDs. Gathered {len(pmid_abs)} Relevant Abstracts.')

    return pmid_abs
360