Spaces:
Runtime error
Runtime error
Nicky Nicolson
committed on
Commit
•
2cc6a74
1
Parent(s):
4d53d1c
Modified name parsing to use bionomia directly
Browse files- Dockerfile +7 -7
- tab2csv.py +38 -4
Dockerfile
CHANGED
@@ -18,15 +18,15 @@ RUN unzip /data/gbif-occs.zip -d /data
|
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
RUN \
|
24 |
-
apt-get update && \
|
25 |
-
apt-get install -y ruby
|
26 |
-
RUN gem install dwc_agent
|
27 |
|
28 |
#COPY ./extractcollectorname.py /code/extractcollectorname.py
|
29 |
-
RUN python tab2csv.py --
|
30 |
#RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
31 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
32 |
RUN ls -l /code
|
|
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
20 |
|
21 |
+
## Setup to parse collector names using Bionomia utils (reqs Ruby)
|
22 |
+
## Install ruby
|
23 |
+
#RUN \
|
24 |
+
# apt-get update && \
|
25 |
+
# apt-get install -y ruby
|
26 |
+
#RUN gem install dwc_agent
|
27 |
|
28 |
#COPY ./extractcollectorname.py /code/extractcollectorname.py
|
29 |
+
RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
|
30 |
#RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
31 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
32 |
RUN ls -l /code
|
tab2csv.py
CHANGED
@@ -1,18 +1,51 @@
|
|
1 |
import argparse
|
2 |
import pandas as pd
|
3 |
-
import
|
4 |
from tqdm import tqdm
|
5 |
tqdm.pandas()
|
6 |
|
7 |
-
def getFirstFamilyName(
|
8 |
firstFamilyName = None
|
9 |
-
parsed = bananompy.parse(
|
10 |
try:
|
11 |
firstFamilyName = parsed[0]['parsed'][0]['family']
|
12 |
except:
|
13 |
pass
|
14 |
return firstFamilyName
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
if __name__ == '__main__':
|
17 |
parser = argparse.ArgumentParser()
|
18 |
parser.add_argument("inputfile")
|
@@ -30,7 +63,8 @@ if __name__ == '__main__':
|
|
30 |
if args.createcols:
|
31 |
# Extract unique recordedBy values
|
32 |
df_rb = df[['recordedBy']].drop_duplicates()
|
33 |
-
df_rb
|
|
|
34 |
# Apply back to main dataframe
|
35 |
df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
|
36 |
# Add column holding collector name and number
|
|
|
1 |
import argparse
|
2 |
import pandas as pd
|
3 |
+
import requests
|
4 |
from tqdm import tqdm
|
5 |
tqdm.pandas()
|
6 |
|
7 |
+
def getFirstFamilyName(recordedBy):
    """Return the family name of the first collector parsed from *recordedBy*.

    Args:
        recordedBy: a raw collector string (Darwin Core ``recordedBy`` value).

    Returns:
        The first parsed family name, or None if parsing yields no family.

    NOTE(review): this revision replaced ``import bananompy`` with
    ``import requests`` (see the diff header), so ``bananompy`` is no longer
    in scope and calling this function will raise NameError. It appears to be
    dead code superseded by getFirstFamilyNameBulk — confirm before use.
    """
    firstFamilyName = None
    parsed = bananompy.parse(recordedBy)
    try:
        firstFamilyName = parsed[0]['parsed'][0]['family']
    except (KeyError, IndexError, TypeError):
        # Parser returned no usable family component — fall through to None.
        pass
    return firstFamilyName
|
15 |
|
16 |
+
def getFirstFamilyNames(recordedBy_l):
    """Parse a batch of collector strings via the Bionomia parse API.

    POSTs the strings (CRLF-joined, as the API expects) to the Bionomia
    ``parse.json`` endpoint and maps each original string to the family name
    of its first parsed agent.

    Args:
        recordedBy_l: list of raw ``recordedBy`` strings to parse.

    Returns:
        dict mapping each original string to its first family name, or None
        when no family name could be extracted for that string.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the API does not respond within the timeout.
    """
    bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
    data = dict()
    data['names'] = '\r\n'.join(recordedBy_l)
    # Timeout so a hung API call cannot stall the whole Docker build;
    # raise_for_status surfaces HTTP errors instead of json-decoding junk.
    r = requests.post(bionomia_parse_endpoint_url, data=data, timeout=60)
    r.raise_for_status()
    parsed_results = r.json()
    results = dict()
    for parsed_result in parsed_results:
        try:
            family = parsed_result['parsed'][0]['family']
        except (KeyError, IndexError, TypeError):
            # No parsed agent / no family component for this name.
            family = None
        results[parsed_result['original']] = family
    return results
|
30 |
+
|
31 |
+
def getFirstFamilyNameBulk(df,
                           recordedByColName="recordedBy",
                           firstFamilyNameColName="recordedBy_first_familyname",
                           batchsize=500):
    """Annotate *df* with the first collector family name per recordedBy value.

    Sends the values of ``recordedByColName`` to the Bionomia parse API in
    batches of ``batchsize`` (via getFirstFamilyNames), then writes the
    looked-up family names into a new column ``firstFamilyNameColName``.

    Args:
        df: pandas DataFrame holding the collector strings.
        recordedByColName: column to read raw collector strings from.
        firstFamilyNameColName: column to create with the parsed family names.
        batchsize: number of strings per API request.

    Returns:
        The same DataFrame, with the new column added.
    """
    lookup = dict()
    pending = []
    for value in tqdm(df[recordedByColName].values):
        pending.append(value)
        if len(pending) == batchsize:
            # Batch is full — parse it and start collecting the next one.
            lookup.update(getFirstFamilyNames(pending))
            pending = []
    if pending:
        # Flush the final partial batch.
        lookup.update(getFirstFamilyNames(pending))
    df[firstFamilyNameColName] = df[recordedByColName].map(lookup)
    return df
|
48 |
+
|
49 |
if __name__ == '__main__':
|
50 |
parser = argparse.ArgumentParser()
|
51 |
parser.add_argument("inputfile")
|
|
|
63 |
if args.createcols:
|
64 |
# Extract unique recordedBy values
|
65 |
df_rb = df[['recordedBy']].drop_duplicates()
|
66 |
+
df_rb = getFirstFamilyNameBulk(df_rb)
|
67 |
+
#df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
|
68 |
# Apply back to main dataframe
|
69 |
df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
|
70 |
# Add column holding collector name and number
|