import argparse
import os.path

import bananompy
import pandas as pd
import requests
from pygbif import occurrences as occ
from tqdm import tqdm

tqdm.pandas()

def getFirstFamilyName(recordedBy):
    """Parse a single recordedBy string with bananompy and return the
    family name of the first collector, or None if parsing fails."""
    firstFamilyName = None
    parsed = bananompy.parse(recordedBy)
    try:
        firstFamilyName = parsed[0]['parsed'][0]['family']
    except (IndexError, KeyError, TypeError):
        pass
    return firstFamilyName
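# Illustrative sketch of the parse structure relied on above (assumed; the
# exact payload comes from the Bionomia parse service): for a string like
# "Smith, J.; Jones, P." the result is expected to look roughly like
#   [{'original': 'Smith, J.; Jones, P.',
#     'parsed': [{'family': 'Smith', ...}, {'family': 'Jones', ...}]}]
# so getFirstFamilyName("Smith, J.; Jones, P.") would return 'Smith'.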

def getFirstFamilyNames(recordedBy_l):
    """Parse a list of recordedBy strings in a single POST to the Bionomia
    parse endpoint. Returns a dict mapping each original string to the
    family name of its first collector (or None if parsing fails)."""
    bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
    data = dict()
    data['names'] = '\r\n'.join(recordedBy_l)
    r = requests.post(bionomia_parse_endpoint_url, data=data)
    parsed_results = r.json()
    results = dict()
    for parsed_result in parsed_results:
        try:
            results[parsed_result['original']] = parsed_result['parsed'][0]['family']
        except (IndexError, KeyError, TypeError):
            results[parsed_result['original']] = None
    return results
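# Example usage (illustrative; actual output depends on the live Bionomia
# parse service):
#
#   getFirstFamilyNames(["Smith, J.", "Jones, P. & Brown, A."])
#   # -> {'Smith, J.': 'Smith', 'Jones, P. & Brown, A.': 'Jones'}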

def getFirstFamilyNameBulk(df,
                           recordedByColName="recordedBy",
                           firstFamilyNameColName="recordedBy_first_familyname",
                           batchsize=500):
    """Look up first-collector family names for every row in df, batching
    requests to the Bionomia parse endpoint in groups of batchsize."""
    results = dict()
    recordedBy_l = []
    for s in tqdm(df[recordedByColName].values):
        if len(recordedBy_l) == batchsize:
            # Buffer is full: send this batch to Bionomia
            results.update(getFirstFamilyNames(recordedBy_l))
            # Clear the buffer for the next batch
            recordedBy_l = []
        recordedBy_l.append(s)
    # Send any remaining names in a final, partial batch
    if len(recordedBy_l) > 0:
        results.update(getFirstFamilyNames(recordedBy_l))
    df[firstFamilyNameColName] = df[recordedByColName].map(results)
    return df
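# Example usage (illustrative): a small dataframe of collector strings
# gains a 'recordedBy_first_familyname' column:
#
#   df_demo = pd.DataFrame({'recordedBy': ['Smith, J.', 'Jones, P.']})
#   df_demo = getFirstFamilyNameBulk(df_demo, batchsize=2)
#   # df_demo['recordedBy_first_familyname'] -> ['Smith', 'Jones'],
#   # assuming Bionomia parses both strings successfully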

# GBIF endpoints describing the columns present in each download format
GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV = 'https://api.gbif.org/v1/occurrence/download/describe/simpleCsv'
GBIF_DOWNLOAD_DESCRIBE_URL_DWCA = 'https://api.gbif.org/v1/occurrence/download/describe/dwca'

def getGbifDownloadColumnNames(download_format):
    """Retrieve the column names GBIF uses for a download of the given
    format ('SIMPLE_CSV' or 'DWCA') from the describe endpoints above."""
    column_names = None
    if download_format == 'SIMPLE_CSV':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV)
        columns_metadata = r.json()
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['fields']]
    elif download_format == 'DWCA':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_DWCA)
        columns_metadata = r.json()
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['verbatim']['fields']]
    return column_names
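# Example usage (illustrative; the actual list is whatever the GBIF
# describe endpoint returns at call time):
#
#   getGbifDownloadColumnNames('SIMPLE_CSV')
#   # -> ['gbifID', 'datasetKey', ..., 'recordedBy', 'recordNumber', ...]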


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", help="Directory containing the downloaded GBIF data file")
    parser.add_argument("download_id", help="GBIF download key")
    parser.add_argument("-c", "--createcols", action='store_true',
                        help="Create derived collector name columns")
    parser.add_argument("-l", "--limit", type=int,
                        help="Read at most this many rows")
    parser.add_argument("outputfilename", help="Name of the CSV file to write")
    args = parser.parse_args()

    # Determine the format of the datafile by reading the download metadata
    # from the GBIF API
    gbif_metadata = occ.download_meta(key=args.download_id)
    download_format = gbif_metadata['request']['format']
    # The GBIF download format determines:
    # (1) the columns in the download, SIMPLE_CSV being a much more
    #     restricted set of columns than DWCA
    # (2) the name of the occurrence data file:
    #     SIMPLE_CSV: '[download_id].csv', DWCA: 'occurrence.txt'
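    # Illustrative shape of the gbif_metadata dict retrieved above
    # (assumed; only request['format'] is relied on here):
    #   {'key': '<download_id>', 'status': 'SUCCEEDED',
    #    'request': {'format': 'SIMPLE_CSV', ...}, ...}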
    inputfilename = None
    column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
    column_names = None
    if download_format == 'SIMPLE_CSV':
        inputfilename = '{}.csv'.format(args.download_id)
        column_names = column_names_simple_csv
    elif download_format == 'DWCA':
        inputfilename = 'occurrence.txt'
        column_names_dwca = getGbifDownloadColumnNames('DWCA')
        # Restrict the DWCA columns to those also present in SIMPLE_CSV so
        # both formats yield the same working set of columns
        column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
        
    df = pd.read_csv(os.path.join(args.data_dir, inputfilename),
                     encoding='utf8',
                     keep_default_na=False,
                     on_bad_lines='skip',
                     sep='\t',
                     usecols=column_names,
                     nrows=args.limit)
    if args.createcols:
        # Extract unique recordedBy values so each distinct collector
        # string is only parsed once
        df_rb = df[['recordedBy']].drop_duplicates()
        df_rb = getFirstFamilyNameBulk(df_rb)
        # Per-row alternative (slower, one API call per name):
        # df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
        # Apply back to the main dataframe
        df = pd.merge(left=df, right=df_rb, on='recordedBy', how='left')
        # Add a column holding collector family name and record number.
        # With keep_default_na=False, missing record numbers are read as
        # empty strings rather than NaN, so test for those explicitly
        mask = (df.recordNumber != '')
        df.loc[mask, 'collectorNameAndNumber'] = df[mask].apply(
            lambda row: '{} {}'.format(row['recordedBy_first_familyname'], row['recordNumber']), axis=1)
    df.to_csv(os.path.join(args.data_dir, args.outputfilename), index=False, sep=',')
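# Example invocation (illustrative; the script name and download key are
# placeholders):
#
#   python process_gbif_download.py /path/to/data 0001234-XXXXXXXXXXXXXXX output.csv -c -l 10000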