Nicky Nicolson committed
Commit e6f931e
1 Parent(s): d40c9b3

Initial revision

Files changed (6)
  1. .gitignore +2 -0
  2. Dockerfile +28 -0
  3. getDownloadMetadata.py +23 -0
  4. metadata.json +5 -0
  5. requirements.txt +5 -0
  6. tab2csv.py +25 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ data/
+ env/
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.11
+
+ # The GBIF dataset ID is set as a Space variable
+ # By default it is a download of all Solanum preserved specimen records (c. 600K)
+ ARG GBIF_DATASET_ID
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Download GBIF citation records and prepare them for use with datasette
+ RUN mkdir /data
+ ADD https://api.gbif.org/v1/literature/export?format=TSV&gbifDatasetKey=${GBIF_DATASET_ID} /data/gbif-citations.tsv
+ COPY ./tab2csv.py /code/tab2csv.py
+ COPY ./getDownloadMetadata.py /code/getDownloadMetadata.py
+
+ RUN python tab2csv.py --createcols /data/gbif-citations.tsv /data/gbif-citations.csv
+ RUN csvs-to-sqlite /data/gbif-citations.csv /code/gbifcit.db
+ RUN ls -l /code
+ RUN sqlite-utils tables /code/gbifcit.db --counts
+ RUN chmod 755 /code/gbifcit.db
+
+ COPY ./metadata.json /code/metadata.json
+ RUN python getDownloadMetadata.py --dataset_id=${GBIF_DATASET_ID} /code/metadata.json /code/metadata.json
+
+ CMD ["datasette", "/code/gbifcit.db", "-m", "/code/metadata.json", "--host", "0.0.0.0", "--port", "7860"]
getDownloadMetadata.py ADDED
@@ -0,0 +1,23 @@
+ import argparse
+ import json
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument("inputfile")
+     parser.add_argument("--dataset_id", type=str)
+     parser.add_argument("outputfile")
+
+     args = parser.parse_args()
+
+     # Read the datasette metadata template
+     datasette_metadata = None
+     with open(args.inputfile, 'r') as f_in:
+         datasette_metadata = json.load(f_in)
+
+     # Link back to the GBIF literature search for this dataset
+     source_url = 'https://www.gbif.org/resource/search?contentType=literature&gbifDatasetKey={}'.format(args.dataset_id)
+     datasette_metadata['source_url'] = source_url
+
+     datasette_metadata_json = json.dumps(datasette_metadata, indent=4)
+     with open(args.outputfile, 'w') as f_out:
+         f_out.write(datasette_metadata_json)
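Note that the Dockerfile passes the same path (/code/metadata.json) as both inputfile and outputfile, so the metadata is rewritten in place during the build. A minimal sketch of the transformation the script applies, with a hypothetical dataset key ("source" and "source_url" are standard datasette metadata keys, rendered as a linked source credit):

    import json

    dataset_id = "hypothetical-gbif-dataset-key"  # passed via --dataset_id in the real build

    with open("metadata.json") as f:
        metadata = json.load(f)

    # Same URL template as getDownloadMetadata.py uses
    metadata["source_url"] = (
        "https://www.gbif.org/resource/search"
        "?contentType=literature&gbifDatasetKey={}".format(dataset_id)
    )
    print(json.dumps(metadata, indent=4))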
metadata.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "title": "GBIF-monitored citations of RBG Kew herbarium data",
+     "description": "This is a datasette instance containing GBIF-monitored citations of RBG Kew herbarium data. It can be used to browse citations and to chart summaries by year and quarter.",
+     "source": "Global Biodiversity Information Facility (GBIF)"
+ }
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ datasette
+ datasette-vega
+ sqlite-utils
+ csvs-to-sqlite
+ pandas==1.5.3
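The pandas pin to 1.5.3 is presumably for compatibility with csvs-to-sqlite, which was written against the pandas 1.x API; the other packages are left unpinned.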
tab2csv.py ADDED
@@ -0,0 +1,25 @@
+ import argparse
+ import pandas as pd
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument("inputfile")
+     parser.add_argument("-c", "--createcols", action='store_true')
+     parser.add_argument("-l", "--limit", type=int)
+     parser.add_argument("outputfile")
+     args = parser.parse_args()
+
+     # Read the tab-separated GBIF literature export
+     date_columns = ['discovered', 'published', 'added']
+     df = pd.read_csv(args.inputfile,
+                      encoding='utf8',
+                      keep_default_na=False,
+                      on_bad_lines='skip',
+                      sep='\t',
+                      nrows=args.limit,
+                      parse_dates=date_columns)
+     # Optionally derive quarter columns (e.g. 2023Q1) for charting
+     if args.createcols:
+         for date_column in date_columns:
+             df[date_column + '_q'] = pd.PeriodIndex(df[date_column], freq='Q')
+     df.to_csv(args.outputfile, index=False, sep=',')
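The quarter columns created by --createcols come from pandas' PeriodIndex with quarterly frequency. A minimal sketch of that derivation on made-up dates:

    import pandas as pd

    # Made-up sample standing in for a date column from the GBIF export
    df = pd.DataFrame({"published": pd.to_datetime(["2022-01-15", "2023-07-01"])})

    # Same derivation as tab2csv.py --createcols
    df["published_q"] = pd.PeriodIndex(df["published"], freq="Q")
    print(df)
    #    published published_q
    # 0 2022-01-15      2022Q1
    # 1 2023-07-01      2023Q3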