Spaces:
Runtime error
Runtime error
Nicky Nicolson
committed on
Commit
•
cbab2c9
1
Parent(s):
6e64108
Add limit arg and test install ruby gem for name parsing
Browse files
- Dockerfile +9 -1
- tab2csv.py +3 -1
Dockerfile
CHANGED
@@ -17,8 +17,16 @@ RUN ls -l /data
|
|
17 |
RUN unzip /data/gbif-occs.zip -d /data
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
#COPY ./extractcollectorname.py /code/extractcollectorname.py
|
21 |
-
RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
|
22 |
#RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
23 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
24 |
RUN ls -l /code
|
|
|
17 |
RUN unzip /data/gbif-occs.zip -d /data
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
20 |
+
|
21 |
+
# Setup to parse collector names using Bionomia utils (reqs Ruby)
|
22 |
+
# Install ruby
|
23 |
+
RUN \
|
24 |
+
apt-get update && \
|
25 |
+
apt-get install -y ruby
|
26 |
+
RUN gem install dwc_agent
|
27 |
+
|
28 |
#COPY ./extractcollectorname.py /code/extractcollectorname.py
|
29 |
+
RUN python tab2csv.py --limit 1000 --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
|
30 |
#RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
31 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
32 |
RUN ls -l /code
|
tab2csv.py
CHANGED
@@ -17,6 +17,7 @@ if __name__ == '__main__':
|
|
17 |
parser = argparse.ArgumentParser()
|
18 |
parser.add_argument("inputfile")
|
19 |
parser.add_argument("-c","--createcols", action='store_true')
|
|
|
20 |
parser.add_argument("outputfile")
|
21 |
args = parser.parse_args()
|
22 |
|
@@ -24,7 +25,8 @@ if __name__ == '__main__':
|
|
24 |
encoding='utf8',
|
25 |
keep_default_na=False,
|
26 |
on_bad_lines='skip',
|
27 |
-
sep='\t'
|
|
|
28 |
if args.createcols:
|
29 |
# Extract unique recordedBy values
|
30 |
df_rb = df[['recordedBy']].drop_duplicates()
|
|
|
17 |
parser = argparse.ArgumentParser()
|
18 |
parser.add_argument("inputfile")
|
19 |
parser.add_argument("-c","--createcols", action='store_true')
|
20 |
+
# Optional row cap. argparse needs a callable for `type`; the string 'int' is
# rejected with "ValueError: 'int' is not callable" when the argument is added.
# When the flag is omitted, args.limit is None.
parser.add_argument("-l", "--limit", type=int)
|
21 |
parser.add_argument("outputfile")
|
22 |
args = parser.parse_args()
|
23 |
|
|
|
25 |
encoding='utf8',
|
26 |
keep_default_na=False,
|
27 |
on_bad_lines='skip',
|
28 |
+
sep='\t',
|
29 |
+
nrows=args.limit)
|
30 |
if args.createcols:
|
31 |
# Extract unique recordedBy values
|
32 |
df_rb = df[['recordedBy']].drop_duplicates()
|