File size: 1,319 Bytes
948faf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import argparse
import pandas as pd
import bananompy
from tqdm import tqdm
tqdm.pandas()

def getFirstFamilyName(s):
    """Return the family name of the first person parsed from ``s``.

    ``s`` is handed to ``bananompy.parse`` and the value found at
    ``parsed[0]['parsed'][0]['family']`` in its result is returned.

    Args:
        s: Name string to parse (in this script, a ``recordedBy`` value).

    Returns:
        The family name, or ``None`` when ``s`` is missing or blank, or
        when the parser result does not contain that entry.
    """
    # Missing cells arrive from pandas as None or float NaN (NaN != NaN);
    # there is no name to parse, so skip the parser call entirely.
    if s is None or (isinstance(s, float) and s != s):
        return None
    if isinstance(s, str) and not s.strip():
        return None

    parsed = bananompy.parse(s)
    try:
        return parsed[0]['parsed'][0]['family']
    except (IndexError, KeyError, TypeError):
        # Empty result list, absent key, or a None where a container was
        # expected: treat all of these as "no family name found".
        return None

def addCollectorNameAndNumber(df):
    """Add a ``collectorNameAndNumber`` column to ``df``.

    The value is ``"<recordedBy_first_familyname> <recordNumber>"`` for rows
    where both source columns are non-null, and missing (NaN) otherwise, so
    no placeholder text such as ``"None 123"`` is ever produced.

    Args:
        df: DataFrame with ``recordedBy_first_familyname`` and
            ``recordNumber`` columns.

    Returns:
        The same DataFrame object, with the new column added in place.
    """
    family_names = df['recordedBy_first_familyname']
    record_numbers = df['recordNumber']
    has_both = family_names.notna() & record_numbers.notna()
    # Vectorised concatenation; unlike a row-wise apply this also behaves
    # when no row qualifies or the frame is empty.
    combined = family_names.astype(str) + ' ' + record_numbers.astype(str)
    df['collectorNameAndNumber'] = combined.where(has_both)
    return df


def main():
    """Read the input CSV, derive collector columns, write the output CSV.

    Command-line arguments: ``inputfile`` (CSV to read) and ``outputfile``
    (CSV to write). Two columns are added: ``recordedBy_first_familyname``
    and ``collectorNameAndNumber``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("inputfile")
    parser.add_argument("outputfile")
    args = parser.parse_args()

    df = pd.read_csv(args.inputfile,
                     encoding='utf8',
                     keep_default_na=False,
                     na_values=['NONE', ''],
                     on_bad_lines='skip',
                     # Keep record numbers as text: a numeric column with
                     # blanks would otherwise be read as float ("123.0").
                     dtype={'recordNumber': str},
                     sep=',')

    # Parse each distinct recordedBy value only once; missing values are
    # dropped so the parser is never called on NaN.
    df_rb = df[['recordedBy']].dropna().drop_duplicates()
    df_rb['recordedBy_first_familyname'] = df_rb['recordedBy'].progress_apply(getFirstFamilyName)

    # Apply back to main dataframe; rows without a match get a missing name.
    df = pd.merge(left=df, right=df_rb, on='recordedBy', how='left')

    # Add column holding collector name and number
    df = addCollectorNameAndNumber(df)
    df.to_csv(args.outputfile, index=False, sep=',')


if __name__ == '__main__':
    main()