Nicky Nicolson committed on
Commit
cbab2c9
1 Parent(s): 6e64108

Add limit arg and test install ruby gem for name parsing

Browse files
Files changed (2) hide show
  1. Dockerfile +9 -1
  2. tab2csv.py +3 -1
Dockerfile CHANGED
@@ -17,8 +17,16 @@ RUN ls -l /data
17
  RUN unzip /data/gbif-occs.zip -d /data
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
 
 
 
 
 
 
 
 
20
  #COPY ./extractcollectorname.py /code/extractcollectorname.py
21
- RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
22
  #RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
23
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
24
  RUN ls -l /code
 
17
  RUN unzip /data/gbif-occs.zip -d /data
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
20
+
21
+ # Setup to parse collector names using Bionomia utils (reqs Ruby)
22
+ # Install ruby
23
+ RUN \
24
+ apt-get update && \
25
+ apt-get install -y ruby
26
+ RUN gem install dwc_agent
27
+
28
  #COPY ./extractcollectorname.py /code/extractcollectorname.py
29
+ RUN python tab2csv.py --limit 1000 --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
30
  #RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
31
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
32
  RUN ls -l /code
tab2csv.py CHANGED
@@ -17,6 +17,7 @@ if __name__ == '__main__':
17
  parser = argparse.ArgumentParser()
18
  parser.add_argument("inputfile")
19
  parser.add_argument("-c","--createcols", action='store_true')
 
20
  parser.add_argument("outputfile")
21
  args = parser.parse_args()
22
 
@@ -24,7 +25,8 @@ if __name__ == '__main__':
24
  encoding='utf8',
25
  keep_default_na=False,
26
  on_bad_lines='skip',
27
- sep='\t')
 
28
  if args.createcols:
29
  # Extract unique recordedBy values
30
  df_rb = df[['recordedBy']].drop_duplicates()
 
17
  parser = argparse.ArgumentParser()
18
  parser.add_argument("inputfile")
19
  parser.add_argument("-c","--createcols", action='store_true')
20
+ parser.add_argument("-l","--limit", type=int)
21
  parser.add_argument("outputfile")
22
  args = parser.parse_args()
23
 
 
25
  encoding='utf8',
26
  keep_default_na=False,
27
  on_bad_lines='skip',
28
+ sep='\t',
29
+ nrows=args.limit)
30
  if args.createcols:
31
  # Extract unique recordedBy values
32
  df_rb = df[['recordedBy']].drop_duplicates()