Nicky Nicolson committed on
Commit
948faf7
1 Parent(s): 7a26d32

Add step to extract collector name & create col for reconciliation

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. Dockerfile +2 -1
  3. extractcollectorname.py +36 -0
  4. metadata.json +1 -1
  5. requirements.txt +2 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ env
2
+ data
Dockerfile CHANGED
@@ -17,7 +17,8 @@ RUN ls -l /data
17
  RUN unzip /data/gbif-occs.zip -d /data
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
20
- RUN python tab2csv.py /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
 
21
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
22
  RUN ls -l /code
23
  RUN sqlite-utils tables /code/gbifocc.db --counts
 
17
  RUN unzip /data/gbif-occs.zip -d /data
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
20
+ RUN python tab2csv.py /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
21
+ RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
22
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
23
  RUN ls -l /code
24
  RUN sqlite-utils tables /code/gbifocc.db --counts
extractcollectorname.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import pandas as pd
3
+ import bananompy
4
+ from tqdm import tqdm
5
+ tqdm.pandas()
6
+
7
def getFirstFamilyName(s):
    """Return the family name of the first person parsed from *s*.

    The string is handed to ``bananompy.parse`` and the value found at
    ``parsed[0]['parsed'][0]['family']`` is returned.

    Args:
        s: A raw ``recordedBy`` value. Non-string values (e.g. ``None`` or
            the NaN pandas uses for missing cells) and blank strings are
            treated as "no name".

    Returns:
        The ``family`` entry of the first parsed name, or ``None`` when the
        input is missing/blank or the parse result does not have the
        expected shape.
    """
    # Missing cells arrive as NaN (a float); there is nothing to parse, so
    # skip the parser call entirely.
    if not isinstance(s, str) or not s.strip():
        return None

    # NOTE(review): bananompy.parse presumably calls a remote name-parsing
    # service; errors raised by the call itself are left to propagate, as
    # in the original code where the call sat outside the try block.
    parsed = bananompy.parse(s)
    try:
        return parsed[0]['parsed'][0]['family']
    except (KeyError, IndexError, TypeError):
        # Result was empty, None, or missing the expected keys.
        return None
15
+
16
def addCollectorNameAndNumber(df):
    """Add a ``collectorNameAndNumber`` column to *df* and return *df*.

    The new column holds ``"<family name> <record number>"`` built from the
    ``recordedBy_first_familyname`` and ``recordNumber`` columns. It is only
    populated for rows where BOTH source values are present; every other
    row is left null, so no ``"None 123"`` / ``"nan 123"`` strings are
    produced. The column is always created, even for an empty frame.

    Args:
        df: DataFrame containing ``recordedBy_first_familyname`` and
            ``recordNumber`` columns. It is modified in place.

    Returns:
        The same DataFrame, with ``collectorNameAndNumber`` added.
    """
    has_both = (df['recordNumber'].notnull()
                & df['recordedBy_first_familyname'].notnull())
    combined = (df['recordedBy_first_familyname'].astype(str)
                + ' '
                + df['recordNumber'].astype(str))
    # where() keeps the combined text only where both parts exist and
    # leaves the remaining rows null.
    df['collectorNameAndNumber'] = combined.where(has_both)
    return df


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("inputfile")
    parser.add_argument("outputfile")
    args = parser.parse_args()

    df = pd.read_csv(args.inputfile,
                     encoding='utf8',
                     keep_default_na=False,
                     na_values=['NONE', ''],
                     on_bad_lines='skip',
                     # Keep record numbers as text so a numeric-looking
                     # column is not read as float and rendered "123.0".
                     dtype={'recordNumber': str},
                     sep=',')
    # Parse each distinct recordedBy value only once
    df_rb = df[['recordedBy']].drop_duplicates()
    df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
    # Apply the parsed family names back to the main dataframe
    df = pd.merge(left=df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
    # Add column holding collector name and number
    df = addCollectorNameAndNumber(df)
    df.to_csv(args.outputfile, index=False, sep=',')
metadata.json CHANGED
@@ -7,7 +7,7 @@
7
  "plugins": {
8
  "datasette-reconcile": {
9
  "id_field": "gbifID",
10
- "name_field": "recordedBy",
11
  "type_field": "basisOfRecord",
12
  "type_default": [{
13
  "id": "basisOfRecord",
 
7
  "plugins": {
8
  "datasette-reconcile": {
9
  "id_field": "gbifID",
10
+ "name_field": "collectorNameAndNumber",
11
  "type_field": "basisOfRecord",
12
  "type_default": [{
13
  "id": "basisOfRecord",
requirements.txt CHANGED
@@ -3,3 +3,5 @@ datasette-reconcile
3
  sqlite-utils
4
  csvs-to-sqlite
5
  pandas==1.5.3
 
 
 
3
  sqlite-utils
4
  csvs-to-sqlite
5
  pandas==1.5.3
6
+ bananompy
7
+ tqdm