Spaces:
Runtime error
Runtime error
File size: 2,801 Bytes
abf8c49 2cc6a74 424fd94 abf8c49 2cc6a74 7875bd2 2cc6a74 7875bd2 2cc6a74 abf8c49 bc6bc45 90c51f5 abf8c49 e57384a cbab2c9 424fd94 2cc6a74 424fd94 80bafd5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import argparse
import pandas as pd
import requests
from tqdm import tqdm
# Register tqdm's pandas integration (provides progress-bar variants such as
# `progress_apply` on pandas objects).
tqdm.pandas()
def getFirstFamilyName(recordedBy):
    """Return the family name of the first person parsed from a collector string.

    The string is parsed with ``bananompy.parse`` and the ``family`` entry of
    the first parsed name in the first result is returned.

    NOTE(review): ``bananompy`` is not imported in the visible part of this
    file, so calling this function as-is raises ``NameError``. Confirm the
    import before using it; the script's main path goes through
    ``getFirstFamilyNameBulk`` instead.

    Args:
        recordedBy: Raw collector string to parse.

    Returns:
        The first family name, or ``None`` when the parse result does not
        contain one.
    """
    parsed = bananompy.parse(recordedBy)
    try:
        return parsed[0]['parsed'][0]['family']
    except (IndexError, KeyError, TypeError):
        # Best-effort lookup: an empty or differently shaped parse result
        # means "no family name", not a failure. Only lookup errors are
        # swallowed so that interrupts and real bugs still surface.
        return None
def getFirstFamilyNames(recordedBy_l, timeout=60):
    """Parse a batch of collector strings via the Bionomia parse endpoint.

    All strings are sent in a single POST request, joined with CRLF line
    breaks, and the JSON response is turned into a lookup table.

    Args:
        recordedBy_l: List of collector strings to parse.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        Dict mapping each ``original`` string reported by the service to the
        family name of its first parsed person, or ``None`` when the service
        returned no usable parse for it.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    # Nothing to parse: skip the network round-trip entirely.
    if len(recordedBy_l) == 0:
        return {}

    # post to bionomia
    bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
    payload = {'names': '\r\n'.join(recordedBy_l)}
    r = requests.post(bionomia_parse_endpoint_url, data=payload, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    r.raise_for_status()

    results = {}
    for parsed_result in r.json():
        original = parsed_result['original']
        try:
            results[original] = parsed_result['parsed'][0]['family']
        except (IndexError, KeyError, TypeError):
            # No parsed person (or no family part) for this string.
            results[original] = None
    return results
def getFirstFamilyNameBulk(df,
                           recordedByColName="recordedBy",
                           firstFamilyNameColName="recordedBy_first_familyname",
                           batchsize=500):
    """Add a first-family-name column to ``df`` using batched parse requests.

    The values of ``recordedByColName`` are sent to ``getFirstFamilyNames`` in
    chunks of ``batchsize``. The combined lookup is then mapped back onto the
    column and stored in ``firstFamilyNameColName``.

    Args:
        df: DataFrame holding the collector strings; it is modified in place.
        recordedByColName: Name of the column with the collector strings.
        firstFamilyNameColName: Name of the column to write the result to.
        batchsize: Number of strings sent per request.

    Returns:
        The same DataFrame, with the new column added.
    """
    family_by_name = {}
    pending = []
    for name in tqdm(df[recordedByColName].values):
        if len(pending) != batchsize:
            # Still room in the current batch
            pending.append(name)
            continue
        # Batch is full: send it, then start the next one with this value
        family_by_name.update(getFirstFamilyNames(pending))
        pending = [name]

    # Send whatever is left over after the loop
    if pending:
        family_by_name.update(getFirstFamilyNames(pending))

    df[firstFamilyNameColName] = df[recordedByColName].map(family_by_name)
    return df
if __name__ == '__main__':
    # Command-line entry point: read a tab-separated input file, optionally
    # add the derived collector columns, and write a comma-separated output.
    parser = argparse.ArgumentParser()
    parser.add_argument("inputfile")
    parser.add_argument("-c", "--createcols", action='store_true')
    parser.add_argument("-l", "--limit", type=int)
    parser.add_argument("outputfile")
    args = parser.parse_args()

    # keep_default_na=False keeps blank cells as empty strings instead of NaN;
    # malformed lines are skipped rather than aborting the read.
    df = pd.read_csv(args.inputfile,
                     encoding='utf8',
                     keep_default_na=False,
                     on_bad_lines='skip',
                     sep='\t',
                     nrows=args.limit)

    if args.createcols:
        # Report missing input columns as a usage error, not a traceback.
        missing_cols = [col for col in ('recordedBy', 'recordNumber')
                        if col not in df.columns]
        if missing_cols:
            parser.error('input file is missing required column(s): {}'.format(
                ', '.join(missing_cols)))

        # Extract unique recordedBy values so each one is parsed only once.
        # The explicit copy gives the helper its own frame to add a column to.
        df_rb = df[['recordedBy']].drop_duplicates().copy()
        df_rb = getFirstFamilyNameBulk(df_rb)
        # Apply back to main dataframe
        df = pd.merge(left=df, right=df_rb, on='recordedBy', how='left')

        # Add column holding collector name and number.
        # Blank cells are read as '' (see keep_default_na above), so notnull()
        # alone would match every row; also require a non-blank record number
        # and a parsed family name so values like "None 123" or "Smith " are
        # never produced.
        family = df['recordedBy_first_familyname']
        number = df['recordNumber']
        has_number = number.notnull() & (number.astype(str).str.strip() != '')
        mask = family.notnull() & has_number
        combined = family.astype(str) + ' ' + number.astype(str)
        # Rows outside the mask are left empty (NaN) in the new column.
        df['collectorNameAndNumber'] = combined.where(mask)

    df.to_csv(args.outputfile, index=False, sep=',')