Nicky Nicolson committed • e6f931e
Parent(s): d40c9b3

Initial revision

Files changed:
- .gitignore +2 -0
- Dockerfile +28 -0
- getDownloadMetadata.py +21 -0
- metadata.json +5 -0
- requirements.txt +5 -0
- tab2csv.py +23 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+data/
+env/
Dockerfile
ADDED
@@ -0,0 +1,28 @@
+FROM python:3.11
+
+# Dataset ID is set as a Space variable
+# By default it is a download of all Solanum preserved specimen records (c600K)
+ARG GBIF_DATASET_ID=$GBIF_DATASET_ID
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+# Download GBIF literature citations and prepare for use with datasette
+RUN mkdir /data
+ADD https://api.gbif.org/v1/literature/export?format=TSV&gbifDatasetKey=${GBIF_DATASET_ID} /data/gbif-citations.tsv
+COPY ./tab2csv.py /code/tab2csv.py
+
+RUN python tab2csv.py --createcols /data/gbif-citations.tsv /data/gbif-citations.csv
+RUN csvs-to-sqlite /data/gbif-citations.csv /code/gbifcit.db
+RUN ls -l /code
+RUN sqlite-utils tables /code/gbifcit.db --counts
+RUN chmod 755 /code/gbifcit.db
+
+COPY ./metadata.json /code/metadata.json
+COPY ./getDownloadMetadata.py /code/getDownloadMetadata.py
+RUN python getDownloadMetadata.py --dataset_id=${GBIF_DATASET_ID} /code/metadata.json /code/metadata.json
+
+CMD ["datasette", "/code/gbifcit.db", "-m", "/code/metadata.json", "--host", "0.0.0.0", "--port", "7860"]
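The dataset key reaches the build through the GBIF_DATASET_ID build argument; on Hugging Face Spaces this is populated from the Space's variables at build time. For a local test the same value can be supplied by hand with docker build --build-arg GBIF_DATASET_ID=<dataset-key> -t gbifcit . and docker run -p 7860:7860 gbifcit, where the gbifcit tag is illustrative and <dataset-key> stands in for a real GBIF dataset key. Because ADD fetches the literature export while the image builds, the citation data is baked into the image: refreshing it means rebuilding.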
getDownloadMetadata.py
ADDED
@@ -0,0 +1,21 @@
+import argparse
+import json
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("inputfile")
+    parser.add_argument("--dataset_id", type=str)
+    parser.add_argument("outputfile")
+
+    args = parser.parse_args()
+
+    datasette_metadata = None
+    with open(args.inputfile, 'r') as f_in:
+        datasette_metadata = json.load(f_in)
+
+    source_url = 'https://www.gbif.org/resource/search?contentType=literature&gbifDatasetKey={}'.format(args.dataset_id)
+    datasette_metadata['source_url'] = source_url
+
+    datasette_metadata_json = json.dumps(datasette_metadata, indent=4)
+    with open(args.outputfile, 'w') as f_out:
+        f_out.write(datasette_metadata_json)
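Note that the Dockerfile invokes this script with the same path, /code/metadata.json, as both inputfile and outputfile; the in-place rewrite is safe because the input is fully read and closed before the output is opened for writing. The script's only change is to add a source_url entry of the form https://www.gbif.org/resource/search?contentType=literature&gbifDatasetKey=<dataset-key> (placeholder key shown), which datasette displays alongside the static source entry as a link back to GBIF's literature search.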
metadata.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "title": "GBIF-monitored citations of RBG Kew herbarium data",
+    "description": "This is a datasette instance containing GBIF-monitored citations of RBG Kew herbarium data. It can be used to browse citations and to chart summaries by year and quarter.",
+    "source": "Global Biodiversity Information Facility (GBIF)"
+}
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+datasette
+datasette-vega
+sqlite-utils
+csvs-to-sqlite
+pandas==1.5.3
tab2csv.py
ADDED
@@ -0,0 +1,23 @@
+import argparse
+import pandas as pd
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("inputfile")
+    parser.add_argument("-c", "--createcols", action='store_true')
+    parser.add_argument("-l", "--limit", type=int)
+    parser.add_argument("outputfile")
+    args = parser.parse_args()
+
+    date_columns = ['discovered', 'published', 'added']
+    df = pd.read_csv(args.inputfile,
+                     encoding='utf8',
+                     keep_default_na=False,
+                     on_bad_lines='skip',
+                     sep='\t',
+                     nrows=args.limit,
+                     parse_dates=date_columns)
+    if args.createcols:
+        for date_column in date_columns:
+            df[date_column + '_q'] = pd.PeriodIndex(df[date_column], freq='Q')
+    df.to_csv(args.outputfile, index=False, sep=',')
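The --createcols option is what enables the by-quarter summaries promised in metadata.json: each date column gains a companion _q column holding its calendar quarter. A minimal sketch with synthetic dates (not real rows from the GBIF export) showing the effect of the PeriodIndex call:

    import pandas as pd

    # Two made-up publication dates; the real script derives the same kind of
    # _q column for each of discovered, published and added.
    df = pd.DataFrame({"published": pd.to_datetime(["2021-02-14", "2023-11-01"])})
    df["published_q"] = pd.PeriodIndex(df["published"], freq="Q")
    print(df)
    #    published published_q
    # 0 2021-02-14      2021Q1
    # 1 2023-11-01      2023Q4

When the frame is written with to_csv, the periods serialize as plain strings such as 2021Q1, giving datasette-vega a ready-made categorical column to chart on.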