Spaces:
Running
Running
wzkariampuzha
committed on
Commit
·
0bc8dab
1
Parent(s):
e7caceb
Update extract_abs.py
Browse files- extract_abs.py +47 -0
extract_abs.py
CHANGED
@@ -19,6 +19,7 @@ import json
|
|
19 |
import codecs
|
20 |
from unidecode import unidecode
|
21 |
from collections import OrderedDict
|
|
|
22 |
from typing import (
|
23 |
Dict,
|
24 |
List,
|
@@ -275,6 +276,52 @@ def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering
|
|
275 |
|
276 |
print(len(results),'abstracts classified as epidemiological.')
|
277 |
return results.sort_values('EPI_PROB', ascending=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
|
279 |
#Identical to search_term_extraction, except it returns a JSON object instead of a df
|
280 |
def API_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
|
|
|
19 |
import codecs
|
20 |
from unidecode import unidecode
|
21 |
from collections import OrderedDict
|
22 |
+
import streamlit as st
|
23 |
from typing import (
|
24 |
Dict,
|
25 |
List,
|
|
|
276 |
|
277 |
print(len(results),'abstracts classified as epidemiological.')
|
278 |
return results.sort_values('EPI_PROB', ascending=False)
|
279 |
+
|
280 |
+
#Returns a Pandas dataframe
|
281 |
+
def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                         NER_pipeline:Any, entity_classes:Union[Set[str],List[str]], #for biobert extraction
                         extract_diseases:bool, GARD_dict:Dict[str,str], max_length:int, #for disease extraction
                         classify_model_vars:Tuple[Any,Any,Any,Any,Any]) -> Any: #for classification
    """Search for abstracts, classify them, and extract entities, reporting progress through Streamlit.

    Args:
        search_term: Term passed to ``autosearch`` together with ``GARD_dict``
            to obtain the list of terms that are actually searched.
        maxResults: Forwarded to ``classify_abs.search_getAbs``.
        filtering: Forwarded to ``classify_abs.search_getAbs``.
        NER_pipeline: Callable applied to every sentence of an abstract that
            was classified as epidemiological.
        entity_classes: Entity labels; ``order_labels`` turns them into the
            trailing output columns.
        extract_diseases: When true, the ``IDS`` and ``DIS`` columns are part
            of the output and the flag is forwarded to ``parse_info``.
        GARD_dict: Forwarded to ``autosearch`` and ``parse_info``.
        max_length: Forwarded to ``parse_info``.
        classify_model_vars: Forwarded to ``classify_abs.getTextPredictions``.

    Returns:
        A pandas DataFrame with one row per abstract that was classified as
        epidemiological and yielded a non-empty extraction, sorted by
        ``EPI_PROB`` in descending order. The frame is empty (but still has
        the expected columns) when nothing qualified.
    """
    #Format of Output
    ordered_labels = order_labels(entity_classes)
    if extract_diseases:
        columns = ['PMID', 'ABSTRACT','EPI_PROB','IsEpi','IDS','DIS']+ordered_labels
    else:
        columns = ['PMID', 'ABSTRACT','EPI_PROB','IsEpi']+ordered_labels

    ##Check to see if search term maps to anything in the GARD dictionary, if so it pulls up all synonyms for the search
    search_term_list = autosearch(search_term, GARD_dict)
    if len(search_term_list)>1:
        st.write("SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR: "+ str(search_term_list))
    else:
        st.write("SEARCHING FOR: "+ str(search_term_list))

    #Gather title+abstracts into a dictionary {pmid:abstract}
    pmid_abs = classify_abs.search_getAbs(search_term_list, maxResults, filtering)
    total = len(pmid_abs)
    st.write("GATHERED " +str(total)+" PubMed IDs.")

    my_bar = st.progress(0)
    #Rows are collected in a list and turned into a DataFrame once at the end:
    #DataFrame.append copied the whole frame on every call and no longer exists in pandas 2.x.
    rows = []
    #An empty search result simply skips the loop, so there is no division by zero.
    for done, (pmid, abstract) in enumerate(pmid_abs.items(), start=1):
        epi_prob, isEpi = classify_abs.getTextPredictions(abstract, classify_model_vars)
        if isEpi:
            #Preprocessing Functions for Extraction
            sentences = str2sents(abstract)
            model_outputs = [NER_pipeline(sent) for sent in sentences]
            extraction = parse_info(sentences, model_outputs, entity_classes, extract_diseases, GARD_dict, max_length)
            if extraction:
                extraction.update({'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi})
                rows.append(extraction)
        #st.progress only accepts an int in [0, 100] or a float in [0.0, 1.0];
        #integer percent keeps every update inside the valid range.
        my_bar.progress(min(100, (100*done)//total))

    if rows:
        found = pd.DataFrame(rows)
        #Keep the declared column order first; keys outside it go last, as append used to do.
        extra_columns = [col for col in found.columns if col not in columns]
        results = found.reindex(columns=columns+extra_columns)
    else:
        results = pd.DataFrame(columns=columns)

    st.write(len(results),'abstracts classified as epidemiological.')
    return results.sort_values('EPI_PROB', ascending=False)
|
325 |
|
326 |
#Identical to search_term_extraction, except it returns a JSON object instead of a df
|
327 |
def API_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
|