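"""Prepare an occurrence data file from a GBIF download for onward use.

Reads the occurrence file of a completed GBIF download (SIMPLE_CSV or DWCA format),
optionally parses recordedBy strings with Bionomia to extract the first collector's
family name, builds a collectorNameAndNumber column from that family name and the
recordNumber, and writes the result as a comma-separated CSV.

Example invocation, assuming the download files have been unpacked into ./data
(the script filename and download key below are illustrative placeholders):

    python prepare_gbif_download.py ./data 0012345-230224095556074 occurrences_prepared.csv -c
"""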
import argparse
import os.path

import bananompy  # Bionomia client used by getFirstFamilyName
import pandas as pd
import requests
from pygbif import occurrences as occ
from tqdm import tqdm

tqdm.pandas()
def getFirstFamilyName(recordedBy):
    """Return the family name of the first person parsed from a recordedBy string, or None."""
    firstFamilyName = None
    parsed = bananompy.parse(recordedBy)
    try:
        firstFamilyName = parsed[0]['parsed'][0]['family']
    except (IndexError, KeyError, TypeError):
        pass
    return firstFamilyName
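# Illustrative only: getFirstFamilyName("Smith, J.; Jones, A.") would be expected to
# return "Smith" when the Bionomia parser recognises the first collector; the exact
# output depends on the parser.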
def getFirstFamilyNames(recordedBy_l):
    """Parse a batch of recordedBy strings via the Bionomia parse endpoint.

    Returns a dict mapping each original recordedBy string to the family name of
    the first person parsed from it (or None if nothing could be parsed).
    """
    bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
    data = dict()
    # The endpoint accepts multiple names separated by newlines
    data['names'] = '\r\n'.join(recordedBy_l)
    r = requests.post(bionomia_parse_endpoint_url, data=data)
    parsed_results = r.json()
    results = dict()
    for parsed_result in parsed_results:
        try:
            results[parsed_result['original']] = parsed_result['parsed'][0]['family']
        except (IndexError, KeyError, TypeError):
            results[parsed_result['original']] = None
    return results
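# Illustrative sketch only: a call such as
#   getFirstFamilyNames(["Smith, J.; Jones, A.", ""])
# is expected to return a mapping like
#   {"Smith, J.; Jones, A.": "Smith", "": None}
# with the exact values depending on the Bionomia name parser.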
def getFirstFamilyNameBulk(df,
                           recordedByColName="recordedBy",
                           firstFamilyNameColName="recordedBy_first_familyname",
                           batchsize=500):
    """Add a first-collector family name column to df by parsing its recordedBy values in batches."""
    results = dict()
    recordedBy_l = []
    for s in tqdm(df[recordedByColName].values):
        if len(recordedBy_l) == batchsize:
            # Batch is full: send it to the Bionomia parse endpoint
            results.update(getFirstFamilyNames(recordedBy_l))
            # Clear for next iteration
            recordedBy_l = []
        recordedBy_l.append(s)
    # Send any remaining values as a final, partial batch
    if len(recordedBy_l) > 0:
        results.update(getFirstFamilyNames(recordedBy_l))
    df[firstFamilyNameColName] = df[recordedByColName].map(results)
    return df
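# Illustrative usage sketch (hypothetical dataframe):
#   df_collectors = pd.DataFrame({'recordedBy': ['Smith, J.', 'Jones, A.; Brown, B.']})
#   df_collectors = getFirstFamilyNameBulk(df_collectors)
#   # df_collectors now carries a 'recordedBy_first_familyname' column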
GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV = 'https://api.gbif.org/v1/occurrence/download/describe/simpleCsv'
GBIF_DOWNLOAD_DESCRIBE_URL_DWCA = 'https://api.gbif.org/v1/occurrence/download/describe/dwca'
def getGbifDownloadColumnNames(download_format):
    """Return the ordered column names of a GBIF download in the given format (SIMPLE_CSV or DWCA)."""
    column_names = None
    if download_format == 'SIMPLE_CSV':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV)
        columns_metadata = r.json()
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['fields']]
    elif download_format == 'DWCA':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_DWCA)
        columns_metadata = r.json()
        # DWCA downloads are described via their verbatim occurrence file columns
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['verbatim']['fields']]
    return column_names
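# Illustrative only: getGbifDownloadColumnNames('SIMPLE_CSV') is expected to return
# the ordered simple-CSV column list published by the GBIF describe endpoint
# (e.g. beginning with 'gbifID'); the exact columns are fetched at run time.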
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir")
    parser.add_argument("download_id")
    parser.add_argument("-c", "--createcols", action='store_true')
    parser.add_argument("-l", "--limit", type=int)
    parser.add_argument("outputfilename")
    args = parser.parse_args()

    # Determine the format of the datafile by accessing download metadata from the GBIF API
    gbif_metadata = occ.download_meta(key=args.download_id)
    download_format = gbif_metadata['request']['format']

    # The GBIF download format determines:
    # (1) the columns in the download, SIMPLE_CSV being a much more restricted set
    #     of columns than DWCA
    # (2) the name of the occurrence data file, SIMPLE_CSV: '[download_id].csv',
    #     DWCA: 'occurrence.txt'
    inputfilename = None
    column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
    column_names = None
    if download_format == 'SIMPLE_CSV':
        inputfilename = '{}.csv'.format(args.download_id)
        column_names = column_names_simple_csv
    elif download_format == 'DWCA':
        inputfilename = 'occurrence.txt'
        # Restrict the wider DWCA column set to the columns also present in SIMPLE_CSV
        column_names_dwca = getGbifDownloadColumnNames('DWCA')
        column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]

    df = pd.read_csv(os.path.join(args.data_dir, inputfilename),
                     encoding='utf8',
                     keep_default_na=False,
                     on_bad_lines='skip',
                     sep='\t',
                     usecols=column_names,
                     nrows=args.limit)

    if args.createcols:
        # Extract unique recordedBy values so each distinct string is only parsed once
        df_rb = df[['recordedBy']].drop_duplicates()
        df_rb = getFirstFamilyNameBulk(df_rb)
        #df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
        # Apply back to main dataframe
        df = pd.merge(left=df, right=df_rb, on='recordedBy', how='left')
        # Add column holding collector name and number
        mask = (df.recordNumber.notnull())
        df.loc[mask, 'collectorNameAndNumber'] = df[mask].apply(
            lambda row: '{} {}'.format(row['recordedBy_first_familyname'], row['recordNumber']), axis=1)

    df.to_csv(os.path.join(args.data_dir, args.outputfilename), index=False, sep=',')