File size: 4,633 Bytes
b40e563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import io
import boto3
import requests
import numpy as np
import polars as pl
from PIL import Image
from botocore.config import Config
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# S3 client used by get_sample to presign download URLs for sample images.
# The client is created at import time and pinned to the us-east-1 region.
my_config = Config(
    region_name='us-east-1'
)
s3_client = boto3.client('s3', config=my_config)

# Base URL for EOL pages; an EOL page ID is appended to form the full link.
EOL_URL = "https://eol.org/pages/"
# Taxonomic ranks ordered from highest to lowest. The `rank` arguments in this
# module are indices into this list, and each entry is used as a column name
# when filtering the sample DataFrame.
RANKS = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]

def get_sample(df, pred_taxon, rank, timeout=10):
    '''
    Function to retrieve a sample image of the predicted taxon and EOL page link for more info.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.
    timeout : float or tuple, optional
        Timeout in seconds passed to `requests.get` when downloading the image,
        so a stalled download cannot block the caller forever. Defaults to 10.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display; None if no sample exists
        or the lookup/download failed.
    eol_page : str
        HTML snippet linking to the EOL page for the taxon (may be a lower rank,
        e.g., species sample). When img is None this is a plain-text message
        explaining why no sample could be shown.
    '''
    logger.info(f"Getting sample for taxon: {pred_taxon} at rank: {rank}")
    try:
        filepath, eol_page_id, full_name, is_exact = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        # Boundary for the caller: report the failure as a message rather than raising.
        logger.exception(f"Error retrieving sample data: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
    if filepath is None:
        logger.warning(f"No sample image found for taxon: {pred_taxon}")
        return None, f"Sorry, our EOL images do not include {pred_taxon}."

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url('get_object',
                                                   Params={'Bucket': 'treeoflife-10m-sample-images',
                                                           'Key': filepath}
                                                   )
        img_resp = requests.get(img_src, timeout=timeout)
        # Surface HTTP failures (e.g. 403/404) directly instead of handing an
        # error body to PIL, which would fail with a misleading decode error.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))
        full_eol_url = EOL_URL + eol_page_id
        if is_exact:
            eol_page = f"<p>Check out the EOL entry for {pred_taxon} to learn more: <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        else:
            eol_page = f"<p>Check out an example EOL entry within {pred_taxon} to learn more: {full_name} <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        logger.info(f"Successfully retrieved sample image and EOL page for {pred_taxon}")
        return img, eol_page
    except Exception as e:
        logger.exception(f"Error retrieving sample image: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

def _join_taxon_names(values):
    '''Join rank names with single spaces, skipping null or empty entries.'''
    return " ".join(str(value) for value in values if value is not None and value != "")


def get_sample_data(df, pred_taxon, rank):
    '''
    Function to randomly select a sample individual of the given taxon and provide its EOL page ID and name.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image, as space-separated rank names
        ordered like RANKS (one name per rank up to and including `rank`).
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str or None
        Filepath of selected sample image for predicted taxon; None if no row matches.
    eol_page_id : str
        EOL page ID associated with the selected sample for more information
        (np.nan if no row matches).
    full_name : str
        Full taxonomic name of the selected sample (empty string if no row matches).
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon contains fewer space-separated names than rank + 1.
    '''
    # Split once; each position lines up with the rank at the same index in RANKS.
    taxon_parts = pred_taxon.split(" ")
    for idx in range(rank + 1):
        df = df.filter(pl.col(RANKS[idx]) == taxon_parts[idx])

    if df.shape[0] == 0:
        return None, np.nan, "", False

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in RANKS[rank + 1:]:
        exact_df = exact_df.filter((pl.col(lower_rank).is_null()) | (pl.col(lower_rank) == ""))

    is_exact = exact_df.shape[0] > 0
    if is_exact:
        df_filtered = exact_df.sample()
        # Lower ranks are all empty here, so the name stops at the chosen rank.
        name_columns = RANKS[:rank + 1]
    else:
        # If no exact matches, return any entry with the specified rank
        df_filtered = df.sample()
        name_columns = RANKS

    # Some lower ranks may be null or "" even in a non-exact row; skip them so
    # the join cannot raise TypeError on None or leave stray double spaces.
    full_name = _join_taxon_names(df_filtered.select(name_columns).row(0))
    return df_filtered["file_path"][0], df_filtered["eol_page_id"].cast(pl.String)[0], full_name, is_exact