import argparse
import os.path

import bananompy  # used by getFirstFamilyName; this import was missing
import pandas as pd
import requests
from pygbif import occurrences as occ
from tqdm import tqdm

tqdm.pandas()


def getFirstFamilyName(recordedBy):
    """Parse a single recordedBy string with bananompy and return the family
    name of the first collector, or None if parsing fails."""
    firstFamilyName = None
    parsed = bananompy.parse(recordedBy)
    try:
        firstFamilyName = parsed[0]['parsed'][0]['family']
    except (KeyError, IndexError, TypeError):
        pass
    return firstFamilyName


def getFirstFamilyNames(recordedBy_l):
    """POST a batch of recordedBy strings to the Bionomia parse endpoint and
    return a dict mapping each original string to the first collector's
    family name (None where parsing fails)."""
    bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
    data = dict()
    data['names'] = '\r\n'.join(recordedBy_l)
    r = requests.post(bionomia_parse_endpoint_url, data=data)
    parsed_results = r.json()
    results = dict()
    for parsed_result in parsed_results:
        try:
            results[parsed_result['original']] = parsed_result['parsed'][0]['family']
        except (KeyError, IndexError, TypeError):
            results[parsed_result['original']] = None
    return results


def getFirstFamilyNameBulk(df, recordedByColName="recordedBy",
                           firstFamilyNameColName="recordedBy_first_familyname",
                           batchsize=500):
    """Parse the recordedBy column of df in batches of batchsize via the
    Bionomia API, adding a column holding the first collector's family name."""
    results = dict()
    recordedBy_l = []
    for s in tqdm(df[recordedByColName].values):
        if len(recordedBy_l) == batchsize:
            # Batch is full: send it
            results.update(getFirstFamilyNames(recordedBy_l))
            # Clear for next iteration
            recordedBy_l = []
        recordedBy_l.append(s)
    # Send any remaining partial batch
    if len(recordedBy_l) > 0:
        results.update(getFirstFamilyNames(recordedBy_l))
    df[firstFamilyNameColName] = df[recordedByColName].map(results)
    return df


GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV = 'https://api.gbif.org/v1/occurrence/download/describe/simpleCsv'
GBIF_DOWNLOAD_DESCRIBE_URL_DWCA = 'https://api.gbif.org/v1/occurrence/download/describe/dwca'


def getGbifDownloadColumnNames(download_format):
    """Fetch from the GBIF download-describe API the column names used by the
    given download format ('SIMPLE_CSV' or 'DWCA')."""
    column_names = None
    if download_format == 'SIMPLE_CSV':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV)
        columns_metadata = r.json()
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['fields']]
    elif download_format == 'DWCA':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_DWCA)
        columns_metadata = r.json()
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['verbatim']['fields']]
    return column_names


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir")
    parser.add_argument("download_id")
    parser.add_argument("-c", "--createcols", action='store_true')
    parser.add_argument("-l", "--limit", type=int)
    parser.add_argument("outputfilename")
    args = parser.parse_args()

    # Determine the format of the datafile by accessing the download metadata
    # from the GBIF API
    gbif_metadata = occ.download_meta(key=args.download_id)
    download_format = gbif_metadata['request']['format']

    # The GBIF download format determines:
    # (1) the columns in the download, SIMPLE_CSV being a much more restricted
    #     set of columns than DWCA
    # (2) the name of the occurrence data file:
    #     SIMPLE_CSV: '[download_id].csv', DWCA: 'occurrence.txt'
    inputfilename = None
    column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
    column_names = None
    if download_format == 'SIMPLE_CSV':
        inputfilename = '{}.csv'.format(args.download_id)
        column_names = column_names_simple_csv
    elif download_format == 'DWCA':
        inputfilename = 'occurrence.txt'
        # Restrict the DWCA columns to those also present in SIMPLE_CSV
        column_names_dwca = getGbifDownloadColumnNames('DWCA')
        column_names = [column_name for column_name in column_names_dwca
                        if column_name in column_names_simple_csv]

    df = pd.read_csv(os.path.join(args.data_dir, inputfilename),
                     encoding='utf8',
                     keep_default_na=False,
                     on_bad_lines='skip',
                     sep='\t',
                     usecols=column_names,
                     nrows=args.limit)

    if args.createcols:
        # Extract unique recordedBy values so each distinct string is only
        # parsed once
        df_rb = df[['recordedBy']].drop_duplicates()
        df_rb = getFirstFamilyNameBulk(df_rb)
        # Alternative: parse row-by-row with bananompy (slower than the
        # batched parsing above)
        #df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
        # Apply back to main dataframe
        df = pd.merge(left=df, right=df_rb,
                      left_on='recordedBy', right_on='recordedBy', how='left')
        # Add column holding collector name and number. Note that with
        # keep_default_na=False, empty recordNumber cells are read as ''
        # rather than NaN, so test for empty strings as well as nulls.
        mask = (df.recordNumber.notnull()) & (df.recordNumber != '')
        df.loc[mask, 'collectorNameAndNumber'] = df[mask].apply(
            lambda row: '{} {}'.format(row['recordedBy_first_familyname'], row['recordNumber']),
            axis=1)

    df.to_csv(os.path.join(args.data_dir, args.outputfilename), index=False, sep=',')
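# A minimal usage sketch (the script filename and argument values below are
# assumptions, not part of the original; the flags match the argparse setup
# above):
#
#   python gbif_recordedby_parse.py ./data <download_id> -c -l 100000 output.csv
#
# This would read the occurrence file for <download_id> from ./data, limit
# processing to the first 100000 rows, add the recordedBy_first_familyname
# and collectorNameAndNumber columns (-c), and write the result to
# ./data/output.csv.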