gbifocc-datasette / tab2csv.py
Nicky Nicolson
Add limit arg and test install ruby gem for name parsing
cbab2c9
raw
history blame
1.49 kB
import argparse
import pandas as pd
import bananompy
from tqdm import tqdm
# Register tqdm's pandas integration so that pandas objects gain the
# `progress_apply` method used further down in this script.
tqdm.pandas()
def getFirstFamilyName(s):
    """Return the family name of the first person parsed from *s*.

    The string is passed to ``bananompy.parse`` and the family name is read
    from ``parsed[0]['parsed'][0]['family']``.

    Args:
        s: Raw name string to parse (the script passes ``recordedBy`` values).

    Returns:
        The first parsed family name, or ``None`` when the parse result does
        not have the expected structure.

    Note:
        Exceptions raised by ``bananompy.parse`` itself are not caught here.
    """
    parsed = bananompy.parse(s)
    try:
        return parsed[0]['parsed'][0]['family']
    except (KeyError, IndexError, TypeError):
        # The parse result has no usable name (empty sequence, missing key,
        # or a non-subscriptable value). Report "no family name" rather than
        # aborting the run; unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit and unrelated errors propagate.
        return None
if __name__ == '__main__':
    # Command-line entry point: read a tab-separated file, optionally add
    # derived collector columns, and write the result as comma-separated CSV.
    parser = argparse.ArgumentParser()
    parser.add_argument("inputfile", help="tab-separated input file")
    parser.add_argument("-c", "--createcols", action='store_true',
                        help="add recordedBy_first_familyname and "
                             "collectorNameAndNumber columns")
    # `type` must be a callable. Passing the string 'int' makes add_argument
    # raise ValueError ("'int' is not callable") before anything is parsed.
    parser.add_argument("-l", "--limit", type=int,
                        help="maximum number of rows to read")
    parser.add_argument("outputfile", help="CSV output file")
    args = parser.parse_args()

    # keep_default_na=False keeps empty fields as empty strings instead of
    # NaN, and malformed lines are skipped. When --limit is not given,
    # args.limit is None and nrows=None reads the whole file.
    df = pd.read_csv(args.inputfile,
                     encoding='utf8',
                     keep_default_na=False,
                     on_bad_lines='skip',
                     sep='\t',
                     nrows=args.limit)

    if args.createcols:
        # Extract unique recordedBy values so each distinct name string is
        # parsed only once.
        df_rb = df[['recordedBy']].drop_duplicates()
        df_rb['recordedBy_first_familyname'] = (
            df_rb.recordedBy.progress_apply(getFirstFamilyName)
        )
        # Apply back to main dataframe
        df = pd.merge(left=df, right=df_rb,
                      left_on='recordedBy', right_on='recordedBy',
                      how='left')
        # Add column holding collector name and number
        # NOTE(review): because the file is read with keep_default_na=False,
        # blank recordNumber fields presumably arrive as '' rather than NaN,
        # so this mask likely selects every row - confirm whether rows with
        # an empty record number should be excluded.
        mask = df.recordNumber.notnull()
        df.loc[mask, 'collectorNameAndNumber'] = df[mask].apply(
            lambda row: '{} {}'.format(row['recordedBy_first_familyname'],
                                       row['recordNumber']),
            axis=1,
        )

    # The output is written whether or not the extra columns were created.
    df.to_csv(args.outputfile, index=False, sep=',')