svincoff committed on
Commit ffaff91 · 1 Parent(s): 1e6a1f0

adding utility files used throughout FusOn-pLM training and benchmarking

fuson_plm/utils/README.md ADDED
@@ -0,0 +1,3 @@
+ This folder contains common functions for data cleaning, clustering, train-test splitting, visualization, embedding, and logging.
+
+ The functions in these scripts are used throughout the repository for training the main model, FusOn-pLM, as well as for benchmarks.
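
For orientation, a minimal sketch of how these modules are pulled in elsewhere in the repository (the log path is illustrative, not part of this commit):

from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.constants import VALID_AAS

# Redirect stdout to a log file while working, as the training and benchmarking scripts do
with open_logfile("example.log"):
    log_update(f"Loaded {len(VALID_AAS)} valid amino acid characters")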
fuson_plm/utils/__init__.py ADDED
File without changes
fuson_plm/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (149 Bytes)
fuson_plm/utils/__pycache__/clustering.cpython-310.pyc ADDED
Binary file (4.87 kB)
fuson_plm/utils/__pycache__/constants.cpython-310.pyc ADDED
Binary file (2.48 kB)
fuson_plm/utils/__pycache__/data_cleaning.cpython-310.pyc ADDED
Binary file (4.45 kB)
fuson_plm/utils/__pycache__/embedding.cpython-310.pyc ADDED
Binary file (5.13 kB)
fuson_plm/utils/__pycache__/logging.cpython-310.pyc ADDED
Binary file (3.31 kB)
fuson_plm/utils/__pycache__/splitting.cpython-310.pyc ADDED
Binary file (6.95 kB)
fuson_plm/utils/__pycache__/visualizing.cpython-310.pyc ADDED
Binary file (13.4 kB)
fuson_plm/utils/clustering.py ADDED
@@ -0,0 +1,139 @@
+ import pandas as pd
+ import os
+ import subprocess
+ import sys
+ from Bio import SeqIO
+ import shutil
+ from fuson_plm.utils.logging import open_logfile, log_update
+ 
+ def ensure_mmseqs_in_path(mmseqs_dir):
+     """
+     Checks if MMseqs2 is in the PATH. If it's not, adds it. MMseqs2 will not run if this is not done correctly.
+ 
+     Args:
+         mmseqs_dir (str): Directory containing MMseqs2 binaries
+     """
+     mmseqs_bin = os.path.join(mmseqs_dir, 'mmseqs')
+ 
+     # Check if mmseqs is already in PATH
+     if shutil.which('mmseqs') is None:
+         # Export the MMseqs2 directory to PATH
+         os.environ['PATH'] = f"{mmseqs_dir}:{os.environ['PATH']}"
+         log_update(f"\tAdded {mmseqs_dir} to PATH")
+ 
+ def process_fasta(fasta_path):
+     """Reads a fasta file into a dictionary mapping sequence ID to sequence."""
+     fasta_sequences = SeqIO.parse(open(fasta_path), 'fasta')
+     d = {}
+     for fasta in fasta_sequences:
+         id, sequence = fasta.id, str(fasta.seq)
+         d[id] = sequence
+     return d
+ 
+ def analyze_clustering_result(input_fasta: str, tsv_path: str):
+     """
+     Joins MMseqs2 cluster assignments back to their sequences.
+ 
+     Args:
+         input_fasta (str): path to input fasta file
+         tsv_path (str): path to the cluster .tsv file produced by MMseqs2
+     """
+     # Process input fasta
+     input_d = process_fasta(input_fasta)
+ 
+     # Process clusters.tsv
+     clusters = pd.read_csv(f'{tsv_path}', sep='\t', header=None)
+     clusters = clusters.rename(columns={
+         0: 'representative seq_id',
+         1: 'member seq_id'
+     })
+ 
+     clusters['representative seq'] = clusters['representative seq_id'].apply(lambda seq_id: input_d[seq_id])
+     clusters['member seq'] = clusters['member seq_id'].apply(lambda seq_id: input_d[seq_id])
+ 
+     # Sort them so that splitting results are reproducible
+     clusters = clusters.sort_values(by=['representative seq_id', 'member seq_id'], ascending=True).reset_index(drop=True)
+ 
+     return clusters
+ 
+ def make_fasta(sequences: dict, fasta_path: str):
+     """
+     Makes a fasta file from sequences, where the key is the header and the value is the sequence.
+ 
+     Args:
+         sequences (dict): A dictionary where the key is the header and the value is the sequence.
+         fasta_path (str): The path where the fasta file should be written.
+ 
+     Returns:
+         str: The path to the fasta file.
+     """
+     with open(fasta_path, 'w') as f:
+         for header, sequence in sequences.items():
+             f.write(f'>{header}\n{sequence}\n')
+ 
+     return fasta_path
+ 
+ def run_mmseqs_clustering(input_fasta, output_dir, min_seq_id=0.3, c=0.8, cov_mode=0, cluster_mode=0, path_to_mmseqs='fuson_plm/mmseqs'):
+     """
+     Runs MMseqs2 clustering using the easy-cluster module.
+ 
+     Args:
+         input_fasta (str): path to input fasta file, formatted >header\nsequence\n>header\nsequence...
+         output_dir (str): path to output dir for clustering results
+         min_seq_id (float): number in [0,1] representing --min-seq-id in the cluster command
+         c (float): number in [0,1] representing -c in the cluster command
+         cov_mode (int): number 0, 1, 2, or 3 representing --cov-mode in the cluster command
+         cluster_mode (int): number 0, 1, or 2 representing --cluster-mode in the cluster command
+     """
+     # Get mmseqs dir
+     log_update("\nRunning MMseqs clustering...")
+     mmseqs_dir = os.path.join(path_to_mmseqs[0:path_to_mmseqs.index('/mmseqs')], 'mmseqs/bin')
+ 
+     # Ensure MMseqs2 is in the PATH
+     ensure_mmseqs_in_path(mmseqs_dir)
+ 
+     # Define paths for MMseqs2
+     mmseqs_bin = "mmseqs"  # Ensure this is in your PATH or provide the full path to the mmseqs binary
+ 
+     # Create the output directory
+     os.makedirs(output_dir, exist_ok=True)
+ 
+     # Run MMseqs2 easy-cluster
+     cmd_easy_cluster = [
+         mmseqs_bin, "easy-cluster", input_fasta, os.path.join(output_dir, "mmseqs"), output_dir,
+         "--min-seq-id", str(min_seq_id),
+         "-c", str(c),
+         "--cov-mode", str(cov_mode),
+         "--cluster-mode", str(cluster_mode),
+         "--dbtype", "1"
+     ]
+ 
+     # Write the command to a log file
+     log_update("\n\tCommand entered to MMseqs2:")
+     log_update("\t" + " ".join(cmd_easy_cluster) + "\n")
+ 
+     subprocess.run(cmd_easy_cluster, check=True)
+ 
+     log_update(f"Clustering completed. Results are in {output_dir}")
+ 
+ def cluster_summary(clusters: pd.DataFrame):
+     """
+     Summarizes how many clusters were formed, how big they are, etc.
+     """
+     grouped_clusters = clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id': 'member count'})
+     assert len(grouped_clusters) == len(clusters['representative seq_id'].unique())  # make sure number of cluster reps = # grouped clusters
+ 
+     total_seqs = sum(grouped_clusters['member count'])
+     log_update(f"Created {len(grouped_clusters)} clusters of {total_seqs} sequences")
+     log_update(f"\t{len(grouped_clusters.loc[grouped_clusters['member count']==1])} clusters of size 1")
+     csize1_seqs = sum(grouped_clusters[grouped_clusters['member count']==1]['member count'])
+     log_update(f"\t\tsequences: {csize1_seqs} ({round(100*csize1_seqs/total_seqs, 2)}%)")
+ 
+     log_update(f"\t{len(grouped_clusters.loc[grouped_clusters['member count']>1])} clusters of size > 1")
+     csizeg1_seqs = sum(grouped_clusters[grouped_clusters['member count']>1]['member count'])
+     log_update(f"\t\tsequences: {csizeg1_seqs} ({round(100*csizeg1_seqs/total_seqs, 2)}%)")
+     log_update(f"\tlargest cluster: {max(grouped_clusters['member count'])}")
+ 
+     log_update("\nCluster size breakdown below...")
+ 
+     value_counts = grouped_clusters['member count'].value_counts().reset_index().rename(columns={'index': 'cluster size (n_members)', 'member count': 'n_clusters'})
+     log_update(value_counts.sort_values(by='cluster size (n_members)', ascending=True).to_string(index=False))
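
A minimal end-to-end sketch of how these helpers compose (the sequences, paths, and threshold below are illustrative; run_mmseqs_clustering assumes the repository's bundled MMseqs2 binaries under fuson_plm/mmseqs/bin and is run from the repository root). MMseqs2's easy-cluster writes its cluster table to <prefix>_cluster.tsv, hence the mmseqs_cluster.tsv path:

from fuson_plm.utils.clustering import (make_fasta, run_mmseqs_clustering,
                                        analyze_clustering_result, cluster_summary)

# Toy input: two near-identical sequences and one unrelated sequence
seqs = {"seq1": "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
        "seq2": "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVA",
        "seq3": "GDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLH"}
fasta_path = make_fasta(seqs, "toy_input.fasta")

run_mmseqs_clustering(fasta_path, "toy_clusters", min_seq_id=0.3)
clusters = analyze_clustering_result(fasta_path, "toy_clusters/mmseqs_cluster.tsv")
cluster_summary(clusters)  # expect seq1/seq2 clustered together, seq3 alone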
fuson_plm/utils/constants.py ADDED
@@ -0,0 +1,108 @@
+ # Data Cleaning Parameters
+ # TCGA abbreviations for cancer. From https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
+ TCGA_CODES = {
+     'LAML': 'Acute Myeloid Leukemia',
+     'ACC': 'Adrenocortical carcinoma',
+     'BLCA': 'Bladder Urothelial Carcinoma',
+     'LGG': 'Brain Lower Grade Glioma',
+     'BRCA': 'Breast invasive carcinoma',
+     'CESC': 'Cervical squamous cell carcinoma and endocervical adenocarcinoma',
+     'CHOL': 'Cholangiocarcinoma',
+     'LCML': 'Chronic Myelogenous Leukemia',
+     'COAD': 'Colon adenocarcinoma',
+     'CNTL': 'Controls',
+     'ESCA': 'Esophageal carcinoma',
+     'FPPP': 'FFPE Pilot Phase II',
+     'GBM': 'Glioblastoma multiforme',
+     'HNSC': 'Head and Neck squamous cell carcinoma',
+     'KICH': 'Kidney Chromophobe',
+     'KIRC': 'Kidney renal clear cell carcinoma',
+     'KIRP': 'Kidney renal papillary cell carcinoma',
+     'LIHC': 'Liver hepatocellular carcinoma',
+     'LUAD': 'Lung adenocarcinoma',
+     'LUSC': 'Lung squamous cell carcinoma',
+     'DLBC': 'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma',
+     'MESO': 'Mesothelioma',
+     'MISC': 'Miscellaneous',
+     'OV': 'Ovarian serous cystadenocarcinoma',
+     'PAAD': 'Pancreatic adenocarcinoma',
+     'PCPG': 'Pheochromocytoma and Paraganglioma',
+     'PRAD': 'Prostate adenocarcinoma',
+     'READ': 'Rectum adenocarcinoma',
+     'SARC': 'Sarcoma',
+     'SKCM': 'Skin Cutaneous Melanoma',
+     'STAD': 'Stomach adenocarcinoma',
+     'TGCT': 'Testicular Germ Cell Tumors',
+     'THYM': 'Thymoma',
+     'THCA': 'Thyroid carcinoma',
+     'UCS': 'Uterine Carcinosarcoma',
+     'UCEC': 'Uterine Corpus Endometrial Carcinoma',
+     'UVM': 'Uveal Melanoma'
+ }
+ 
+ FODB_CODES = {
+     'ACC': 'Adenoid cystic carcinoma',
+     'ALL': 'Acute Lymphoid Leukemia',
+     'AML': 'Acute Myeloid Leukemia',
+     'BALL': 'B-cell acute lymphoblastic leukemia',
+     'BLCA': 'Bladder Urothelial Carcinoma',
+     'BRCA': 'Breast invasive carcinoma',
+     'CESC': 'Cervical squamous cell carcinoma and endocervical adenocarcinoma',
+     'CHOL': 'Cholangiocarcinoma',
+     'EPD': 'Ependymoma',
+     'HGG': 'High-grade glioma',
+     'HNSC': 'Head and Neck squamous cell carcinoma',
+     'KIRC': 'Kidney renal clear cell carcinoma',
+     'LGG': 'Low-grade glioma',
+     'LUAD': 'Lung adenocarcinoma',
+     'LUSC': 'Lung squamous cell carcinoma',
+     'MEL': 'Melanoma',
+     'MESO': 'Mesothelioma',
+     'NBL': 'Neuroblastoma',
+     'OS': 'Osteosarcoma',
+     'OV': 'Ovarian serous cystadenocarcinoma',
+     'PCPG': 'Pheochromocytoma and Paraganglioma',
+     'PRAD': 'Prostate adenocarcinoma',
+     'READ': 'Rectum adenocarcinoma',
+     'RHB': 'Rhabdomyosarcoma',
+     'SARC': 'Sarcoma',
+     'STAD': 'Stomach adenocarcinoma',
+     'TALL': 'T-cell acute lymphoblastic leukemia',
+     'THYM': 'Thymoma',
+     'UCEC': 'Uterine Corpus Endometrial Carcinoma',
+     'UCS': 'Uterine Carcinosarcoma',
+     'UVM': 'Uveal Melanoma',
+     'WLM': 'Wilms tumor'
+ }
+ 
+ # The 20 canonical amino acid one-letter codes
+ VALID_AAS = {'A', 'R', 'N', 'D', 'C',
+              'E', 'Q', 'G', 'H', 'I',
+              'L', 'K', 'M', 'F', 'P',
+              'S', 'T', 'W', 'Y', 'V'}
+ 
+ # Characters that suggest a cell holds a delimited list rather than a single value
+ DELIMITERS = {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}
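
A quick illustrative check of how these constants are consumed downstream (values printed in the comments follow directly from the tables above):

from fuson_plm.utils.constants import TCGA_CODES, FODB_CODES, DELIMITERS

print(TCGA_CODES['LAML'])  # 'Acute Myeloid Leukemia'
print(FODB_CODES['AML'])   # 'Acute Myeloid Leukemia' - same disease, different code table
print(',' in DELIMITERS)   # True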
fuson_plm/utils/data_cleaning.py ADDED
@@ -0,0 +1,126 @@
+ import pandas as pd
+ import numpy as np
+ from fuson_plm.utils.logging import log_update
+ 
+ def clean_rows_and_cols(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Deletes empty rows and columns
+ 
+     Args:
+         df (pd.DataFrame): input DataFrame to be cleaned
+ 
+     Returns:
+         pd.DataFrame: cleaned DataFrame
+     """
+     # Delete rows with no data
+     log_update(f"\trow cleaning...\n\t\toriginal # rows: {len(df)}")
+     log_update("\t\tdropping rows where all entries are np.nan...")
+     df = df.dropna(how='all')
+     log_update(f"\t\tnew # rows: {len(df)}")
+ 
+     # Delete columns with no data
+     log_update(f"\tcolumn cleaning...\n\t\toriginal # columns: {len(df.columns)}")
+     log_update("\t\tdropping columns where all entries are np.nan...")
+     df = df.dropna(axis=1, how='all')
+     log_update(f"\t\tnew # columns: {len(df.columns)}")
+     log_update(f"\t\tcolumn names: {','.join(list(df.columns))}")
+ 
+     return df
+ 
+ def check_columns_for_listlike(df: pd.DataFrame, cols_of_interest: list, delimiters: set):
+     """
+     Checks if a column contains any listlike items
+ 
+     Args:
+         df (pd.DataFrame): DataFrame to be investigated
+         cols_of_interest (list): columns in df to be investigated for list-containing potential
+         delimiters (set): set of potential delimiting strings to search for. A column with any of these strings is considered listlike.
+ 
+     Returns:
+         dict: dictionary containing a set {} of all delimiters found in each column
+             e.g., { 'col1': {',',';'},
+                     'col2': {'|'} }
+     """
+     # return the delimiters/listlike things found for each column
+     return_dict = {}
+ 
+     log_update("\tchecking if any of our columns of interest look listlike (contain list objects or delimiters)...")
+     for col in cols_of_interest:
+         unique_col = list(df[col].value_counts().index)
+         listlike = any([check_item_for_listlike(x, delimiters) for x in unique_col])
+ 
+         if listlike:
+             # collect the delimiter sets found per entry; non-string entries return a dtype rather than a set, so only union the sets
+             found_delims = df[col].apply(lambda x: check_item_for_listlike(x, delimiters)).to_list()
+             unique_found_delims = set()
+             for x in found_delims:
+                 if isinstance(x, set):
+                     unique_found_delims = unique_found_delims.union(x)
+ 
+             return_dict[col] = unique_found_delims
+         else:
+             return_dict[col] = False
+ 
+         # display the return dict
+         log_update(f"\t\tcolumn name: {col}\tlistlike: {return_dict[col]}")
+ 
+     return return_dict
+ 
+ def check_item_for_listlike(x, delimiters: set):
+     """
+     Checks if an item looks like it contains a list of items, rather than an individual item, based on string delimiters.
+ 
+     Args:
+         x: the item to check. Any dtype.
+         delimiters: a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}
+ 
+     Returns:
+         If x is a string: the set (may be empty) of delimiters contained in the string
+         If x is not a string: the dtype of x
+     """
+     if isinstance(x, str):
+         return find_delimiters(x, delimiters)
+     else:
+         if x is None:
+             # if it's None, it's not listlike, it's just empty. return an empty set because it has no delimiters.
+             return set()
+         if type(x) == float:
+             # if it's nan, it's not listlike, it's just empty. return an empty set because it has no delimiters.
+             if np.isnan(x):
+                 return set()
+         return type(x)
+ 
+ def find_delimiters(seq: str, delimiters: set) -> set:
+     """
+     Find and return a set of delimiters in a sequence. Helper method for check_item_for_listlike.
+ 
+     Args:
+         seq (str): The sequence you wish to search for delimiters.
+         delimiters (set): a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}
+ 
+     Returns:
+         set: A set of characters in the sequence that are also in the set of delimiters (empty if none are found).
+     """
+     unique_chars = set(seq)  # set of all characters in the sequence; unique_chars = {A, C} for protein="AAACCC"
+     overlap = delimiters.intersection(unique_chars)
+ 
+     return overlap  # empty set if no delimiters were found
+ 
+ def find_invalid_chars(seq: str, valid_chars: set) -> set:
+     """
+     Find and return a set of invalid characters in a sequence.
+ 
+     Args:
+         seq (str): The sequence you wish to search for invalid characters.
+         valid_chars (set): A set of valid characters.
+ 
+     Returns:
+         set: A set of characters in the sequence that are not in the set of valid characters.
+     """
+     unique_chars = set(seq)  # set of all characters in the sequence; unique_chars = {A, C} for protein="AAACCC"
+ 
+     if unique_chars.issubset(valid_chars):  # e.g. unique_chars = {A,C}, and {A,C} is a subset of valid_chars
+         return set()
+     else:  # e.g. unique_chars = {A,X}. {A,X} is not a subset of valid_chars because X is not in valid_chars
+         return unique_chars.difference(valid_chars)  # e.g. {A,X} - valid_chars = {X}
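
A small illustrative run on a toy frame (the column names and values are made up; DELIMITERS and VALID_AAS come from constants.py above):

import pandas as pd
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars

# One clean sequence column and one delimiter-containing ID column
df = pd.DataFrame({'seq': ['MKTA', 'GSHM'], 'ids': ['P1,P2', 'P3']})
check_columns_for_listlike(df, ['seq', 'ids'], DELIMITERS)  # flags 'ids' with {','}, 'seq' as False
print(find_invalid_chars('MKTAX', VALID_AAS))               # {'X'}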
fuson_plm/utils/embedding.py ADDED
@@ -0,0 +1,193 @@
+ import pickle
+ import torch
+ from transformers import EsmModel, AutoTokenizer
+ from transformers import T5Tokenizer, T5EncoderModel
+ import logging
+ from fuson_plm.utils.logging import log_update
+ 
+ 
+ def redump_pickle_dictionary(pickle_path):
+     """
+     Loads a pickle dictionary and redumps it in its location. This allows a clean reset for a pickle built with 'ab+'.
+     """
+     entries = {}
+     # Load one by one
+     with open(pickle_path, 'rb') as f:
+         while True:
+             try:
+                 entry = pickle.load(f)
+                 entries.update(entry)
+             except EOFError:
+                 break  # End of file reached
+             except Exception as e:
+                 print(f"An error occurred: {e}")
+                 break
+     # Redump
+     with open(pickle_path, 'wb') as f:
+         pickle.dump(entries, f)
+ 
+ def load_esm2_type(esm_type, device=None):
+     """
+     Loads ESM-2 of a specified version (e.g. esm2_t33_650M_UR50D)
+     """
+     # Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
+     logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
+ 
+     if device is None:
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         print(f"Using device: {device}")
+ 
+     model = EsmModel.from_pretrained(f"facebook/{esm_type}")
+     tokenizer = AutoTokenizer.from_pretrained(f"facebook/{esm_type}")
+ 
+     model.to(device)
+     model.eval()  # disables dropout for deterministic results
+ 
+     return model, tokenizer, device
+ 
+ def load_prott5():
+     # Initialize tokenizer and model
+     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+     tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
+     model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc")
+     if device == torch.device('cpu'):
+         model.to(torch.float32)
+     model.to(device)
+     return model, tokenizer, device
+ 
+ def get_esm_embeddings(model, tokenizer, sequences, device, average=True, print_updates=False, savepath=None, save_at_end=False, max_length=None):
+     """
+     Compute ESM embeddings.
+ 
+     Args:
+         model: ESM-2 model to embed with
+         tokenizer: matching ESM-2 tokenizer
+         sequences: list of amino acid sequences to embed
+         device: torch device to run inference on
+         average: if True, the per-residue embeddings are averaged into one vector per sequence
+         print_updates: if True, log progress after each sequence
+         savepath: if savepath is not None, the embeddings will be saved there. It must be a pickle (.pkl)
+         save_at_end: if True, dump the whole dictionary once at the end instead of appending per sequence
+         max_length: tokenizer truncation length; defaults to the longest sequence + 2 (BOS, EOS)
+     """
+     # Correct save path to pickle if necessary
+     if savepath is not None:
+         if savepath[-4::] != '.pkl': savepath += '.pkl'
+ 
+     # If no max length was passed, just set it to the maximum in the dataset
+     max_seq_len = max([len(s) for s in sequences])
+     if max_length is None: max_length = max_seq_len + 2  # +2 for BOS, EOS
+ 
+     # Initialize an empty dict to store the ESM embeddings
+     embedding_dict = {}
+     # Iterate through the seqs
+     for i in range(len(sequences)):
+         sequence = sequences[i]
+         # Get the embeddings
+         with torch.no_grad():
+             inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
+             inputs = {k: v.to(device) for k, v in inputs.items()}
+ 
+             outputs = model(**inputs)
+             embedding = outputs.last_hidden_state
+ 
+         # remove extra dimension
+         embedding = embedding.squeeze(0)
+         # remove BOS and EOS tokens
+         embedding = embedding[1:-1, :]
+ 
+         # Convert embeddings to numpy array
+         embedding = embedding.cpu().numpy()
+ 
+         # Average (if necessary)
+         if average:
+             embedding = embedding.mean(0)
+ 
+         # Add to dictionary
+         embedding_dict[sequence] = embedding
+ 
+         # Save individual embedding (if necessary)
+         if not(savepath is None) and not(save_at_end):
+             with open(savepath, 'ab+') as f:
+                 d = {sequence: embedding}
+                 pickle.dump(d, f)
+ 
+         # Print update (if necessary)
+         if print_updates: log_update(f"sequence {i+1}: {sequence[0:10]}...")
+ 
+     # Dump all at once at the end (if necessary)
+     if not(savepath is None):
+         # If saving for the first time, just dump it
+         if save_at_end:
+             with open(savepath, 'wb') as f:
+                 pickle.dump(embedding_dict, f)
+         # If we've been saving all along and made it here without crashing, correct the pickle file so it can be loaded nicely
+         else:
+             redump_pickle_dictionary(savepath)
+ 
+     # Return the dictionary
+     return embedding_dict
+ 
+ def get_prott5_embeddings(model, tokenizer, sequences, device, average=True, print_updates=False, savepath=None, save_at_end=False, max_length=None):
+     # Correct save path to pickle if necessary
+     if savepath is not None:
+         if savepath[-4::] != '.pkl': savepath += '.pkl'
+ 
+     # If no max length was passed, just set it to the maximum in the dataset
+     max_seq_len = max([len(s) for s in sequences])
+     if max_length is None: max_length = max_seq_len + 2  # +2 for BOS, EOS
+ 
+     # the ProtT5 tokenizer requires that there are spaces between residues
+     spaced_sequences = [' '.join(list(seq)) for seq in sequences]  # Spaces between residues for Prot-T5 tokenizer
+ 
+     # Store embeddings here
+     embedding_dict = {}
+ 
+     for i in range(0, len(spaced_sequences)):
+         spaced_sequence = spaced_sequences[i]  # get current sequence
+         seq = spaced_sequence.replace(" ", "")
+ 
+         with torch.no_grad():
+             inputs = tokenizer(spaced_sequence, return_tensors="pt", add_special_tokens=True, truncation=True, max_length=max_length)  # shouldn't have to pad because batch size is 1
+             inputs = {k: v.to(device) for k, v in inputs.items()}
+ 
+             # Pass through the model with no gradient to get embeddings
+             embedding_repr = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+ 
+         # Process the embedding
+         seq_length = len(seq)  # length of the sequence after the spaces are removed
+         embedding = embedding_repr.last_hidden_state.squeeze(0)  # remove batch dimension
+         embedding = embedding[0:-1]  # remove EOS token (there is no BOS token)
+         embedding = embedding.cpu().numpy()  # put on CPU and numpy
+         embedding_log = f"\tembedding shape: {embedding.shape}"
+         # MAKE SURE the embedding lengths are right with an assert. We expect embedding dimension 1024, and sequence length to match the real sequence length
+         assert embedding.shape[1] == 1024
+         assert embedding.shape[0] == seq_length
+ 
+         # Average (if necessary)
+         if average:
+             dim_before = embedding.shape
+             embedding = embedding.mean(0)
+             embedding_log = f"\tembedding shape before avg: {dim_before}\tafter avg: {embedding.shape}"
+ 
+         # Add the embedding to the dictionary
+         embedding_dict[seq] = embedding
+ 
+         # Save individual embedding (if necessary)
+         if not(savepath is None) and not(save_at_end):
+             with open(savepath, 'ab+') as f:
+                 d = {seq: embedding}
+                 pickle.dump(d, f)
+ 
+         if print_updates: log_update(f"sequence {i+1}: {seq[0:10]}...{embedding_log}\t seq len: {seq_length}")
+ 
+     # Dump all at once at the end (if necessary)
+     if not(savepath is None):
+         # If saving for the first time, just dump it
+         if save_at_end:
+             with open(savepath, 'wb') as f:
+                 pickle.dump(embedding_dict, f)
+         # If we've been saving all along and made it here without crashing, correct the pickle file so it can be loaded nicely
+         else:
+             redump_pickle_dictionary(savepath)
+ 
+     # Return the dictionary
+     return embedding_dict
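
A minimal sketch of the intended call pattern (the sequences are toy examples; the first call downloads facebook/esm2_t33_650M_UR50D from the Hugging Face hub, so network access and the model weights are assumed):

from fuson_plm.utils.embedding import load_esm2_type, get_esm_embeddings

model, tokenizer, device = load_esm2_type("esm2_t33_650M_UR50D")
seqs = ["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", "GDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLH"]
embs = get_esm_embeddings(model, tokenizer, seqs, device, average=True,
                          savepath="toy_embeddings.pkl", save_at_end=True)
print(embs[seqs[0]].shape)  # (1280,): one averaged vector per sequence for the 650M model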
fuson_plm/utils/logging.py ADDED
@@ -0,0 +1,116 @@
+ from datetime import datetime
+ from contextlib import contextmanager
+ import sys
+ import pytz
+ import os
+ 
+ class CustomParams:
+     """
+     Class for custom parameters where dictionary elements can be accessed as attributes
+     """
+     def __init__(self, **kwargs):
+         self.__dict__.update(kwargs)
+ 
+     def print_config(self, indent=''):
+         for attr, value in self.__dict__.items():
+             print(f"{indent}{attr}: {value}")
+ 
+ def log_update(text: str):
+     """
+     Logs input text by printing it; if stdout has been redirected with open_logfile, the text goes to that file.
+ 
+     Args:
+         text (str): the text to be logged
+     """
+     print(text)  # print the text
+     sys.stdout.flush()  # flush to automatically update the output file
+ 
+ @contextmanager
+ def open_logfile(log_path, mode='w'):
+     """
+     Opens a log file for real-time logging of the most important updates; stdout is redirected to it for the duration of the block.
+     """
+     log_file = open(log_path, mode)  # open
+     original_stdout = sys.stdout  # save original stdout
+     sys.stdout = log_file  # redirect stdout to log_file
+     try:
+         yield log_file
+     finally:
+         sys.stdout = original_stdout
+         log_file.close()
+ 
+ @contextmanager
+ def open_errfile(log_path, mode='w'):
+     """
+     Redirects stderr (error messages) to a separate log file.
+     """
+     log_file = open(log_path, mode)  # open the error log file for writing
+     original_stderr = sys.stderr  # save original stderr
+     sys.stderr = log_file  # redirect stderr to log_file
+     try:
+         yield log_file
+     finally:
+         sys.stderr = original_stderr  # restore original stderr
+         log_file.close()  # close the error log file
+ 
+ def print_configpy(module):
+     """
+     Prints all the configurations in a config.py file
+     """
+     log_update("All configurations:")
+     # Iterate over attributes
+     for attribute in dir(module):
+         # Filter out built-in attributes and methods
+         if not attribute.startswith("__"):
+             value = getattr(module, attribute)
+             log_update(f"\t{attribute}: {value}")
+ 
+ def get_local_time(timezone_str='US/Eastern'):
+     """
+     Get the current time in the specified timezone.
+ 
+     Args:
+         timezone_str (str): The timezone to retrieve time for. Defaults to 'US/Eastern'.
+ 
+     Returns:
+         str: The formatted current time in the specified timezone.
+     """
+     try:
+         timezone = pytz.timezone(timezone_str)
+     except pytz.UnknownTimeZoneError:
+         return f"Unknown timezone: {timezone_str}"
+ 
+     current_datetime = datetime.now(pytz.utc).astimezone(timezone)
+     return current_datetime.strftime('%m-%d-%Y-%H:%M:%S')
+ 
+ def get_local_date_yr(timezone_str='US/Eastern'):
+     """
+     Get the current date in the specified timezone.
+ 
+     Args:
+         timezone_str (str): The timezone to retrieve the date for. Defaults to 'US/Eastern'.
+ 
+     Returns:
+         str: The formatted current date (month_day_year) in the specified timezone.
+     """
+     try:
+         timezone = pytz.timezone(timezone_str)
+     except pytz.UnknownTimeZoneError:
+         return f"Unknown timezone: {timezone_str}"
+ 
+     current_datetime = datetime.now(pytz.utc).astimezone(timezone)
+     return current_datetime.strftime('%m_%d_%Y')
+ 
+ def find_fuson_plm_directory():
+     """
+     Constructs a path backwards to the fuson_plm directory so we don't have to use absolute paths (helps for docker containers)
+     """
+     current_dir = os.path.abspath(os.getcwd())
+ 
+     while True:
+         if 'fuson_plm' in os.listdir(current_dir):
+             return os.path.join(current_dir, 'fuson_plm')
+         parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
+         if parent_dir == current_dir:  # If we've reached the root directory
+             raise FileNotFoundError("fuson_plm directory not found.")
+         current_dir = parent_dir
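
A short sketch of the logging pattern (the filename and config values are illustrative):

from fuson_plm.utils.logging import open_logfile, log_update, CustomParams, get_local_time

config = CustomParams(batch_size=8, learning_rate=3e-4)  # hypothetical training settings

# Everything printed inside the block lands in run.log, not the console
with open_logfile("run.log"):
    log_update(f"Run started at {get_local_time()}")
    config.print_config(indent='\t')  # print_config uses print(), so it is redirected too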
fuson_plm/utils/splitting.py ADDED
@@ -0,0 +1,206 @@
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from fuson_plm.utils.logging import log_update
+ 
+ def split_clusters_train_test(X, y, benchmark_cluster_reps=[], random_state=1, test_size=0.20):
+     # split with random state fixed for reproducible results
+     log_update(f"\tPerforming split: all clusters -> train clusters ({round(1-test_size,3)}) and test clusters ({test_size})")
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
+ 
+     # add benchmark representatives back to X_test
+     log_update(f"\tManually adding {len(benchmark_cluster_reps)} clusters containing benchmark seqs into X_test")
+     X_test += benchmark_cluster_reps
+ 
+     # assert no duplicates within the train or test sets (there shouldn't be, if the input data was clean)
+     assert len(X_train) == len(set(X_train))
+     assert len(X_test) == len(set(X_test))
+ 
+     return {
+         'X_train': X_train,
+         'X_test': X_test
+     }
+ 
+ def split_clusters_train_val_test(X, y, benchmark_cluster_reps=[], random_state_1=1, random_state_2=1, test_size_1=0.20, test_size_2=0.50):
+     # split with random states fixed for reproducible results
+     log_update(f"\tPerforming first split: all clusters -> train clusters ({round(1-test_size_1,3)}) and other ({test_size_1})")
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_1, random_state=random_state_1)
+     log_update(f"\tPerforming second split: other -> val clusters ({round(1-test_size_2,3)}) and test clusters ({test_size_2})")
+     X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_size_2, random_state=random_state_2)
+ 
+     # add benchmark representatives back to X_test
+     log_update(f"\tManually adding {len(benchmark_cluster_reps)} clusters containing benchmark seqs into X_test")
+     X_test += benchmark_cluster_reps
+ 
+     # assert no duplicates within the train, val, or test sets (there shouldn't be, if the input data was clean)
+     assert len(X_train) == len(set(X_train))
+     assert len(X_val) == len(set(X_val))
+     assert len(X_test) == len(set(X_test))
+ 
+     return {
+         'X_train': X_train,
+         'X_val': X_val,
+         'X_test': X_test
+     }
+ 
+ def split_clusters(cluster_representatives: list, val_set=True, benchmark_cluster_reps=[], random_state_1=1, random_state_2=1, test_size_1=0.20, test_size_2=0.50):
+     """
+     Cluster-splitting method amenable to either train-test or train-val-test.
+     For train-val-test, there are two splits.
+     """
+     log_update("\nPerforming splits...")
+     # Approx. 80/10/10 split with the default sizes
+     X = [x for x in cluster_representatives if not(x in benchmark_cluster_reps)]  # X, for splitting, does NOT include benchmark reps. We'll add these clusters to test.
+     y = [0]*len(X)  # y is a dummy array here; there are no values.
+ 
+     split_dict = None
+     if val_set:
+         split_dict = split_clusters_train_val_test(X, y, benchmark_cluster_reps=benchmark_cluster_reps,
+                                                    random_state_1=random_state_1, random_state_2=random_state_2,
+                                                    test_size_1=test_size_1, test_size_2=test_size_2)
+     else:
+         split_dict = split_clusters_train_test(X, y, benchmark_cluster_reps=benchmark_cluster_reps,
+                                                random_state=random_state_1,
+                                                test_size=test_size_1)
+ 
+     return split_dict
+ 
+ def check_split_validity(train_clusters, val_clusters, test_clusters, benchmark_sequences=None):
+     """
+     Args:
+         train_clusters (pd.DataFrame): cluster membership table for the train set
+         val_clusters (pd.DataFrame): cluster membership table for the validation set (optional - can pass None if there is no validation set)
+         test_clusters (pd.DataFrame): cluster membership table for the test set
+         benchmark_sequences (list): sequences that must appear only in the test set (optional)
+     """
+ 
+     # Make grouped versions of these DataFrames for size analysis
+     train_clustersgb = train_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id': 'member count'})
+     if val_clusters is not None:
+         val_clustersgb = val_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id': 'member count'})
+     if test_clusters is not None:
+         test_clustersgb = test_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id': 'member count'})
+ 
+     # Calculate stats - clusters
+     n_train_clusters = len(train_clustersgb)
+     n_val_clusters, n_test_clusters = 0, 0
+     if val_clusters is not None:
+         n_val_clusters = len(val_clustersgb)
+     if test_clusters is not None:
+         n_test_clusters = len(test_clustersgb)
+     n_clusters = n_train_clusters + n_val_clusters + n_test_clusters
+ 
+     assert len(train_clusters['representative seq_id'].unique()) == len(train_clustersgb)
+     if val_clusters is not None:
+         assert len(val_clusters['representative seq_id'].unique()) == len(val_clustersgb)
+     if test_clusters is not None:
+         assert len(test_clusters['representative seq_id'].unique()) == len(test_clustersgb)
+ 
+     train_cluster_pcnt = round(100*n_train_clusters/n_clusters, 2)
+     if val_clusters is not None:
+         val_cluster_pcnt = round(100*n_val_clusters/n_clusters, 2)
+     if test_clusters is not None:
+         test_cluster_pcnt = round(100*n_test_clusters/n_clusters, 2)
+ 
+     # Calculate stats - proteins
+     n_train_proteins = len(train_clusters)
+     n_val_proteins, n_test_proteins = 0, 0
+     if val_clusters is not None:
+         n_val_proteins = len(val_clusters)
+     if test_clusters is not None:
+         n_test_proteins = len(test_clusters)
+     n_proteins = n_train_proteins + n_val_proteins + n_test_proteins
+ 
+     assert len(train_clusters) == sum(train_clustersgb['member count'])
+     if val_clusters is not None:
+         assert len(val_clusters) == sum(val_clustersgb['member count'])
+     if test_clusters is not None:
+         assert len(test_clusters) == sum(test_clustersgb['member count'])
+ 
+     train_protein_pcnt = round(100*n_train_proteins/n_proteins, 2)
+     if val_clusters is not None:
+         val_protein_pcnt = round(100*n_val_proteins/n_proteins, 2)
+     if test_clusters is not None:
+         test_protein_pcnt = round(100*n_test_proteins/n_proteins, 2)
+ 
+     # Print results
+     log_update("\nCluster breakdown...")
+     log_update(f"Total clusters = {n_clusters}, total proteins = {n_proteins}")
+     log_update(f"\tTrain set:\n\t\tTotal Clusters = {len(train_clustersgb)} ({train_cluster_pcnt}%)\n\t\tTotal Proteins = {len(train_clusters)} ({train_protein_pcnt}%)")
+     if val_clusters is not None:
+         log_update(f"\tVal set:\n\t\tTotal Clusters = {len(val_clustersgb)} ({val_cluster_pcnt}%)\n\t\tTotal Proteins = {len(val_clusters)} ({val_protein_pcnt}%)")
+     if test_clusters is not None:
+         log_update(f"\tTest set:\n\t\tTotal Clusters = {len(test_clustersgb)} ({test_cluster_pcnt}%)\n\t\tTotal Proteins = {len(test_clusters)} ({test_protein_pcnt}%)")
+ 
+     # Check for overlap in both sequence ID and actual sequence
+     train_protein_ids = set(train_clusters['member seq_id'])
+     train_protein_seqs = set(train_clusters['member seq'])
+     if val_clusters is not None:
+         val_protein_ids = set(val_clusters['member seq_id'])
+         val_protein_seqs = set(val_clusters['member seq'])
+     if test_clusters is not None:
+         test_protein_ids = set(test_clusters['member seq_id'])
+         test_protein_seqs = set(test_clusters['member seq'])
+ 
+     # Print results
+     log_update("\nChecking for overlap...")
+     if (val_clusters is not None) and (test_clusters is not None):
+         log_update(f"\tSequence IDs...\n\t\tTrain-Val Overlap: {len(train_protein_ids.intersection(val_protein_ids))}\n\t\tTrain-Test Overlap: {len(train_protein_ids.intersection(test_protein_ids))}\n\t\tVal-Test Overlap: {len(val_protein_ids.intersection(test_protein_ids))}")
+         log_update(f"\tSequences...\n\t\tTrain-Val Overlap: {len(train_protein_seqs.intersection(val_protein_seqs))}\n\t\tTrain-Test Overlap: {len(train_protein_seqs.intersection(test_protein_seqs))}\n\t\tVal-Test Overlap: {len(val_protein_seqs.intersection(test_protein_seqs))}")
+     if (val_clusters is not None) and (test_clusters is None):
+         log_update(f"\tSequence IDs...\n\t\tTrain-Val Overlap: {len(train_protein_ids.intersection(val_protein_ids))}")
+         log_update(f"\tSequences...\n\t\tTrain-Val Overlap: {len(train_protein_seqs.intersection(val_protein_seqs))}")
+     if (val_clusters is None) and (test_clusters is not None):
+         log_update(f"\tSequence IDs...\n\t\tTrain-Test Overlap: {len(train_protein_ids.intersection(test_protein_ids))}")
+         log_update(f"\tSequences...\n\t\tTrain-Test Overlap: {len(train_protein_seqs.intersection(test_protein_seqs))}")
+ 
+     # Assert no sequence overlap
+     if val_clusters is not None:
+         assert len(train_protein_seqs.intersection(val_protein_seqs)) == 0
+     if test_clusters is not None:
+         assert len(train_protein_seqs.intersection(test_protein_seqs)) == 0
+     if (val_clusters is not None) and (test_clusters is not None):
+         assert len(val_protein_seqs.intersection(test_protein_seqs)) == 0
+ 
+     # Finally, check that benchmark sequences appear only in test - if there are benchmark sequences
+     if not(benchmark_sequences is None):
+         bench_in_train = len(train_clusters.loc[train_clusters['member seq'].isin(benchmark_sequences)]['member seq'].unique())
+         bench_in_val, bench_in_test = 0, 0
+         if val_clusters is not None:
+             bench_in_val = len(val_clusters.loc[val_clusters['member seq'].isin(benchmark_sequences)]['member seq'].unique())
+         if test_clusters is not None:
+             bench_in_test = len(test_clusters.loc[test_clusters['member seq'].isin(benchmark_sequences)]['member seq'].unique())
+ 
+         # Assert this
+         log_update("\nChecking for benchmark sequence presence in test, and absence from train and val...")
+         log_update(f"\tTotal benchmark sequences: {len(benchmark_sequences)}")
+         log_update(f"\tBenchmark sequences in train: {bench_in_train}")
+         if val_clusters is not None:
+             log_update(f"\tBenchmark sequences in val: {bench_in_val}")
+         if test_clusters is not None:
+             log_update(f"\tBenchmark sequences in test: {bench_in_test}")
+         assert bench_in_train == bench_in_val == 0
+         assert bench_in_test == len(benchmark_sequences)
+ 
+ def check_class_distributions(train_df, val_df, test_df, class_col='class'):
+     """
+     Checks class distributions within train, val, and test sets.
+     Expects input dataframes to have a 'sequence' column and a 'class' column
+     """
+     train_vc = pd.DataFrame(train_df[class_col].value_counts()).reset_index().rename(columns={'index': class_col, class_col: 'train_count'})
+     train_vc['train_pct'] = (train_vc['train_count'] / train_vc['train_count'].sum()).round(3)*100
+     if val_df is not None:
+         val_vc = pd.DataFrame(val_df[class_col].value_counts()).reset_index().rename(columns={'index': class_col, class_col: 'val_count'})
+         val_vc['val_pct'] = (val_vc['val_count'] / val_vc['val_count'].sum()).round(3)*100
+     test_vc = pd.DataFrame(test_df[class_col].value_counts()).reset_index().rename(columns={'index': class_col, class_col: 'test_count'})
+     test_vc['test_pct'] = (test_vc['test_count'] / test_vc['test_count'].sum()).round(3)*100
+     # concatenate so the distributions can be compared side by side
+     if val_df is not None:
+         compare = pd.concat([train_vc, val_vc, test_vc], axis=1)
+         compare['train-val diff'] = (compare['train_pct'] - compare['val_pct']).apply(lambda x: abs(x))
+         compare['val-test diff'] = (compare['val_pct'] - compare['test_pct']).apply(lambda x: abs(x))
+     else:
+         compare = pd.concat([train_vc, test_vc], axis=1)
+         compare['train-test diff'] = (compare['train_pct'] - compare['test_pct']).apply(lambda x: abs(x))
+ 
+     compare_str = compare.to_string(index=False)
+     compare_str = "\t" + compare_str.replace("\n", "\n\t")
+     log_update(f"\nClass distribution:\n{compare_str}")
fuson_plm/utils/visualizing.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import matplotlib.font_manager as fm
3
+ from matplotlib.font_manager import FontProperties
4
+ from scipy.stats import entropy
5
+ from sklearn.manifold import TSNE
6
+ import pickle
7
+ import pandas as pd
8
+ import os
9
+ import numpy as np
10
+ from fuson_plm.utils.logging import log_update, find_fuson_plm_directory
11
+
12
+ def set_font():
13
+ # Load and set the font
14
+ fuson_plm_dir = find_fuson_plm_directory()
15
+
16
+ # Paths for regular, bold, italic fonts
17
+ regular_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-Regular.ttf')
18
+ bold_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-Bold.ttf')
19
+ italic_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-Italic.ttf')
20
+ bold_italic_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-BoldItalic.ttf')
21
+
22
+ # Load the font properties
23
+ regular_font = FontProperties(fname=regular_font_path)
24
+ bold_font = FontProperties(fname=bold_font_path)
25
+ italic_font = FontProperties(fname=italic_font_path)
26
+ bold_italic_font = FontProperties(fname=bold_italic_font_path)
27
+
28
+ # Add the fonts to the font manager
29
+ fm.fontManager.addfont(regular_font_path)
30
+ fm.fontManager.addfont(bold_font_path)
31
+ fm.fontManager.addfont(italic_font_path)
32
+ fm.fontManager.addfont(bold_italic_font_path)
33
+
34
+ # Set the font family globally to Ubuntu
35
+ plt.rcParams['font.family'] = regular_font.get_name()
36
+
37
+ # Set the fonts for math text (like for labels) to use the loaded Ubuntu fonts
38
+ plt.rcParams['mathtext.fontset'] = 'custom'
39
+ plt.rcParams['mathtext.rm'] = regular_font.get_name()
40
+ plt.rcParams['mathtext.it'] = f'{italic_font.get_name()}'
41
+ plt.rcParams['mathtext.bf'] = f'{bold_font.get_name()}'
42
+
43
+ global default_color_map
44
+ default_color_map = {
45
+ 'train': '#0072B2',
46
+ 'val': '#009E73',
47
+ 'test': '#E69F00'
48
+ }
49
+
50
+ def get_avg_embeddings_for_tsne(train_sequences=None, val_sequences=None, test_sequences=None, embedding_path='fuson_db_embeddings/fuson_db_esm2_t33_650M_UR50D_avg_embeddings.pkl'):
51
+ if train_sequences is None: train_sequences = []
52
+ if val_sequences is None: val_sequences = []
53
+ if test_sequences is None: test_sequences = []
54
+
55
+ embeddings = {}
56
+
57
+ try:
58
+ with open(embedding_path, 'rb') as f:
59
+ embeddings = pickle.load(f)
60
+
61
+ train_embeddings = [v for k, v in embeddings.items() if k in train_sequences]
62
+ val_embeddings = [v for k, v in embeddings.items() if k in val_sequences]
63
+ test_embeddings = [v for k, v in embeddings.items() if k in test_sequences]
64
+
65
+ return train_embeddings, val_embeddings, test_embeddings
66
+ except:
67
+ print("could not open embeddings")
68
+
69
+
70
+ def calculate_aa_composition(sequences):
71
+ composition = {}
72
+ total_length = sum([len(seq) for seq in sequences])
73
+
74
+ for seq in sequences:
75
+ for aa in seq:
76
+ if aa in composition:
77
+ composition[aa] += 1
78
+ else:
79
+ composition[aa] = 1
80
+
81
+ # Convert counts to relative frequency
82
+ for aa in composition:
83
+ composition[aa] /= total_length
84
+
85
+ return composition
86
+
87
+ def calculate_shannon_entropy(sequence):
88
+ """
89
+ Calculate the Shannon entropy for a given sequence.
90
+
91
+ Args:
92
+ sequence (str): A sequence of characters (e.g., amino acids or nucleotides).
93
+
94
+ Returns:
95
+ float: Shannon entropy value.
96
+ """
97
+ bases = set(sequence)
98
+ counts = [sequence.count(base) for base in bases]
99
+ return entropy(counts, base=2)
100
+
101
+ def visualize_splits_hist(train_lengths=None, val_lengths=None, test_lengths=None, colormap=None, savepath=f'splits/length_distributions.png', axes=None):
102
+ """
103
+ Works to plot train, val, test; train, val; or train, test
104
+ """
105
+ set_font()
106
+ if colormap is None: colormap=default_color_map
107
+
108
+ log_update('\nMaking histogram of length distributions')
109
+
110
+ # Get index for test plot
111
+ val_plot_index, test_plot_index, total_plots = 1, 2, 3
112
+ if val_lengths is None:
113
+ val_plot_index = None
114
+ test_plot_index-= 1
115
+ total_plots-=1
116
+ if test_lengths is None:
117
+ test_plot_index = None
118
+ total_plots-=1
119
+
120
+ # Create a figure and axes with 1 row and 3 columns
121
+ fig_individual, axes_individual = plt.subplots(1, total_plots, figsize=(6*total_plots, 6))
122
+
123
+ # Set axes list
124
+ axes_list = [axes_individual] if axes is None else [axes_individual, axes]
125
+
126
+ # Unpack the labels and titles
127
+ xlabel, ylabel = ['Sequence Length (AA)', 'Frequency']
128
+
129
+ for cur_axes in axes_list:
130
+ # Plot the first histogram
131
+ cur_axes[0].hist(train_lengths, bins=20, edgecolor='k',color=colormap['train'])
132
+ cur_axes[0].set_xlabel(xlabel)
133
+ cur_axes[0].set_ylabel(ylabel)
134
+ cur_axes[0].set_title(f'Train Set Length Distribution (n={len(train_lengths)})')
135
+ cur_axes[0].grid(True)
136
+ cur_axes[0].set_axisbelow(True)
137
+
138
+ # Plot the second histogram
139
+ if not(val_plot_index is None):
140
+ cur_axes[val_plot_index].hist(val_lengths, bins=20, edgecolor='k',color=colormap['val'])
141
+ cur_axes[val_plot_index].set_xlabel(xlabel)
142
+ cur_axes[val_plot_index].set_ylabel(ylabel)
143
+ cur_axes[val_plot_index].set_title(f'Validation Set Length Distribution (n={len(val_lengths)})')
144
+ cur_axes[val_plot_index].grid(True)
145
+ cur_axes[val_plot_index].set_axisbelow(True)
146
+
147
+ # Plot the third histogram
148
+ if not(test_plot_index is None):
149
+ cur_axes[test_plot_index].hist(test_lengths, bins=20, edgecolor='k',color=colormap['test'])
150
+ cur_axes[test_plot_index].set_xlabel(xlabel)
151
+ cur_axes[test_plot_index].set_ylabel(ylabel)
152
+ cur_axes[test_plot_index].set_title(f'Test Set Length Distribution (n={len(test_lengths)})')
153
+ cur_axes[test_plot_index].grid(True)
154
+ cur_axes[test_plot_index].set_axisbelow(True)
155
+
156
+ # Adjust layout
157
+ fig_individual.set_tight_layout(True)
158
+
159
+ # Save the figure
160
+ fig_individual.savefig(savepath)
161
+ log_update(f"\tSaved figure to {savepath}")
162
+
163
+ def visualize_splits_scatter(train_clusters=None, val_clusters=None, test_clusters=None, benchmark_cluster_reps=None, colormap=None, savepath='splits/scatterplot.png', axes=None):
164
+ set_font()
165
+ if colormap is None: colormap=default_color_map
166
+
167
+ # Create a figure and axes with 1 row and 3 columns
168
+ fig_individual, axes_individual = plt.subplots(figsize=(18, 6))
169
+
170
+ # Set axes list
171
+ axes_list = [axes_individual] if axes is None else [axes_individual, axes]
172
+
173
+ log_update("\nMaking scatterplot with distribution of cluster sizes across train, test, and val")
174
+ # Make grouped versions of these DataFrames for size analysis
175
+ train_clustersgb = train_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id':'member count'})
176
+ if not(val_clusters is None):
177
+ val_clustersgb = val_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id':'member count'})
178
+ if not(test_clusters is None):
179
+ test_clustersgb = test_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id':'member count'})
180
+ # Isolate benchmark-containing clusters so their contribution can be plotted separately
181
+ total_test_proteins = sum(test_clustersgb['member count'])
182
+ if not(benchmark_cluster_reps is None):
183
+ test_clustersgb['benchmark cluster'] = test_clustersgb['representative seq_id'].isin(benchmark_cluster_reps)
184
+ benchmark_clustersgb = test_clustersgb.loc[test_clustersgb['benchmark cluster']].reset_index(drop=True)
185
+ test_clustersgb = test_clustersgb.loc[test_clustersgb['benchmark cluster']==False].reset_index(drop=True)
186
+
187
+ # Convert them to value counts
188
+ train_clustersgb = train_clustersgb['member count'].value_counts().reset_index().rename(columns={'index':'cluster size (n_members)','member count': 'n_clusters'})
189
+ if not(val_clusters is None):
190
+ val_clustersgb = val_clustersgb['member count'].value_counts().reset_index().rename(columns={'index':'cluster size (n_members)','member count': 'n_clusters'})
191
+ if not(test_clusters is None):
192
+ test_clustersgb = test_clustersgb['member count'].value_counts().reset_index().rename(columns={'index':'cluster size (n_members)','member count': 'n_clusters'})
193
+ if not(benchmark_cluster_reps is None):
194
+ benchmark_clustersgb = benchmark_clustersgb['member count'].value_counts().reset_index().rename(columns={'index':'cluster size (n_members)','member count': 'n_clusters'})
195
+
196
+ # Get the percentage of each dataset that's made of each cluster size
197
+ train_clustersgb['n_proteins'] = train_clustersgb['cluster size (n_members)']*train_clustersgb['n_clusters'] # proteins per cluster * n clusters = # proteins
198
+ train_clustersgb['percent_proteins'] = train_clustersgb['n_proteins']/sum(train_clustersgb['n_proteins'])
199
+ if not(val_clusters is None):
200
+ val_clustersgb['n_proteins'] = val_clustersgb['cluster size (n_members)']*val_clustersgb['n_clusters']
201
+ val_clustersgb['percent_proteins'] = val_clustersgb['n_proteins']/sum(val_clustersgb['n_proteins'])
202
+ if not(test_clusters is None):
203
+ test_clustersgb['n_proteins'] = test_clustersgb['cluster size (n_members)']*test_clustersgb['n_clusters']
204
+ test_clustersgb['percent_proteins'] = test_clustersgb['n_proteins']/total_test_proteins
205
+ if not(benchmark_cluster_reps is None):
206
+ benchmark_clustersgb['n_proteins'] = benchmark_clustersgb['cluster size (n_members)']*benchmark_clustersgb['n_clusters']
207
+ benchmark_clustersgb['percent_proteins'] = benchmark_clustersgb['n_proteins']/total_test_proteins
208
+
209
+ # Specially mark the benchmark clusters because these can't be reallocated
210
+ for ax in axes_list:
211
+ ax.plot(train_clustersgb['cluster size (n_members)'],train_clustersgb['percent_proteins'],linestyle='None',marker='.',color=colormap['train'],label='train')
212
+ if not(val_clusters is None):
213
+ ax.plot(val_clustersgb['cluster size (n_members)'],val_clustersgb['percent_proteins'],linestyle='None',marker='.',color=colormap['val'],label='val')
214
+ if not(test_clusters is None):
215
+ ax.plot(test_clustersgb['cluster size (n_members)'],test_clustersgb['percent_proteins'],linestyle='None',marker='.',color=colormap['test'],label='test')
216
+ if not(benchmark_cluster_reps is None):
217
+ ax.plot(benchmark_clustersgb['cluster size (n_members)'],benchmark_clustersgb['percent_proteins'],
218
+ marker='o',
219
+ linestyle='None',
220
+ markerfacecolor=colormap['test'], # fill same as test
221
+ markeredgecolor='black', # outline black
222
+ markeredgewidth=1.5,
223
+ label='benchmark'
224
+ )
225
+ ax.set(ylabel='Percentage of Proteins in Dataset',xlabel='cluster_size')
226
+ ax.legend()
227
+
228
+ # save the figure
229
+ fig_individual.set_tight_layout(True)
230
+ fig_individual.savefig(savepath)
231
+ log_update(f"\tSaved figure to {savepath}")
232
+
233
+
234
+ def visualize_splits_tsne(train_sequences=None, val_sequences=None, test_sequences=None, colormap=None, esm_type="esm2_t33_650M_UR50D", embedding_path="fuson_db_embeddings/fuson_db_esm2_t33_650M_UR50D_avg_embeddings.pkl", savepath='splits/tsne_plot.png',axes=None):
235
+ set_font()
236
+
237
+ if colormap is None: colormap=default_color_map
238
+
239
+ """
240
+ Generate a t-SNE plot of embeddings for train, test, and validation.
241
+ """
242
+ log_update('\nMaking t-SNE plot of train, val, and test embeddings')
243
+ # Create a figure and axes with 1 row and 3 columns
244
+ fig_individual, axes_individual = plt.subplots(figsize=(18, 6))
245
+
246
+ # Set axes list
247
+ axes_list = [axes_individual] if axes is None else [axes_individual, axes]
248
+
249
+ # Combine the embeddings into one array
250
+ train_embeddings, val_embeddings, test_embeddings = get_avg_embeddings_for_tsne(train_sequences=train_sequences,
251
+ val_sequences=val_sequences,
252
+ test_sequences=test_sequences, embedding_path=embedding_path)
253
+ if not(val_embeddings is None) and not(test_embeddings is None):
254
+ embeddings = np.concatenate([train_embeddings, val_embeddings, test_embeddings])
255
+ labels = ['train'] * len(train_embeddings) + ['val'] * len(val_embeddings) + ['test'] * len(test_embeddings)
256
+ if not(val_embeddings is None) and (test_embeddings is None):
257
+ embeddings = np.concatenate([train_embeddings, val_embeddings])
258
+ labels = ['train'] * len(train_embeddings) + ['val'] * len(val_embeddings)
259
+ if (val_embeddings is None) and not(test_embeddings is None):
260
+ embeddings = np.concatenate([train_embeddings, test_embeddings])
261
+ labels = ['train'] * len(train_embeddings) + ['test'] * len(test_embeddings)
262
+
263
+ # Perform t-SNE
264
+ tsne = TSNE(n_components=2, random_state=42)
265
+ tsne_results = tsne.fit_transform(embeddings)
266
+
267
+ # Convert t-SNE results into a DataFrame
268
+ tsne_df = pd.DataFrame(data=tsne_results, columns=['TSNE_1', 'TSNE_2'])
269
+ tsne_df['label'] = labels
270
+
271
+ for ax in axes_list:
272
+ # Scatter plot for each set
273
+ for label, color in colormap.items():
274
+ subset = tsne_df[tsne_df['label'] == label].reset_index(drop=True)
275
+ ax.scatter(subset['TSNE_1'], subset['TSNE_2'], c=color, label=label.capitalize(), alpha=0.6)
276
+
277
+ ax.set_title(f't-SNE of {esm_type} Embeddings')
278
+ ax.set_xlabel('t-SNE Dimension 1')
279
+ ax.set_ylabel('t-SNE Dimension 2')
280
+ ax.legend()
281
+ ax.grid(True)
282
+
283
+ # Save the figure if savepath is provided
284
+ fig_individual.set_tight_layout(True)
285
+ fig_individual.savefig(savepath)
286
+ log_update(f"\tSaved figure to {savepath}")
287
+
288
+ def visualize_splits_shannon_entropy(train_sequences=None, val_sequences=None, test_sequences=None, colormap=None, savepath='splits/shannon_entropy_plot.png', axes=None):
+     """
+     Generate Shannon entropy histograms for the train, validation, and test sets.
+     """
+     set_font()
+     # Get subplot indices for the val and test plots; drop them if those sets are absent
+     val_plot_index, test_plot_index, total_plots = 1, 2, 3
+     if val_sequences is None:
+         val_plot_index = None
+         test_plot_index -= 1
+         total_plots -= 1
+     if test_sequences is None:
+         test_plot_index = None
+         total_plots -= 1
+ 
+     if colormap is None: colormap = default_color_map
+     # Create a standalone figure with 1 row and total_plots columns
+     fig_individual, axes_individual = plt.subplots(1, total_plots, figsize=(6*total_plots, 6))
+ 
+     # Plot on the standalone axes, and also on the caller-provided axes if given
+     axes_list = [axes_individual] if axes is None else [axes_individual, axes]
+ 
+     log_update('\nMaking histogram of Shannon Entropy distributions')
+     train_entropy = [calculate_shannon_entropy(seq) for seq in train_sequences]
+     if val_plot_index is not None:
+         val_entropy = [calculate_shannon_entropy(seq) for seq in val_sequences]
+     if test_plot_index is not None:
+         test_entropy = [calculate_shannon_entropy(seq) for seq in test_sequences]
+ 
+     for ax in axes_list:
+         ax[0].hist(train_entropy, bins=20, edgecolor='k', color=colormap['train'])
+         ax[0].set_title(f'Train Set (n={len(train_entropy)})')
+         ax[0].set_xlabel('Shannon Entropy')
+         ax[0].set_ylabel('Frequency')
+         ax[0].grid(True)
+         ax[0].set_axisbelow(True)
+ 
+         if val_plot_index is not None:
+             ax[val_plot_index].hist(val_entropy, bins=20, edgecolor='k', color=colormap['val'])
+             ax[val_plot_index].set_title(f'Validation Set (n={len(val_entropy)})')
+             ax[val_plot_index].set_xlabel('Shannon Entropy')
+             ax[val_plot_index].grid(True)
+             ax[val_plot_index].set_axisbelow(True)
+ 
+         if test_plot_index is not None:
+             ax[test_plot_index].hist(test_entropy, bins=20, edgecolor='k', color=colormap['test'])
+             ax[test_plot_index].set_title(f'Test Set (n={len(test_entropy)})')
+             ax[test_plot_index].set_xlabel('Shannon Entropy')
+             ax[test_plot_index].grid(True)
+             ax[test_plot_index].set_axisbelow(True)
+ 
+     fig_individual.set_tight_layout(True)
+     fig_individual.savefig(savepath)
+     log_update(f"\tSaved figure to {savepath}")
+ 
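`calculate_shannon_entropy` is imported from elsewhere in `fuson_plm/utils` and is not part of this diff; as a reference, a minimal sketch of the standard formulation it presumably follows (entropy in bits over the residue frequency distribution of one sequence):

```python
# Hypothetical reference sketch -- the real implementation lives elsewhere in fuson_plm/utils
from collections import Counter
import math

def shannon_entropy_sketch(sequence: str) -> float:
    """Shannon entropy (bits) of the amino acid distribution in one sequence."""
    counts = Counter(sequence)
    n = len(sequence)
    return -sum((c / n) * math.log2(c / n) for c in counts.values())
```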
+ def visualize_splits_aa_composition(train_sequences=None, val_sequences=None, test_sequences=None, colormap=None, savepath='splits/aa_comp.png', axes=None):
+     """
+     Generate a bar plot of amino acid composition across the train, validation, and test sets.
+     """
+     set_font()
+     if colormap is None: colormap = default_color_map
+ 
+     # Create a standalone figure with a single axes
+     fig_individual, axes_individual = plt.subplots(figsize=(18, 6))
+ 
+     # Plot on the standalone axes, and also on the caller-provided axes if given
+     axes_list = [axes_individual] if axes is None else [axes_individual, axes]
+ 
+     log_update('\nMaking bar plot of AA composition across each set')
+     train_comp = calculate_aa_composition(train_sequences)
+     if val_sequences is not None:
+         val_comp = calculate_aa_composition(val_sequences)
+     if test_sequences is not None:
+         test_comp = calculate_aa_composition(test_sequences)
+ 
+     # Create DataFrame; at least one of val/test must be provided
+     if val_sequences is not None and test_sequences is not None:
+         comp_df = pd.DataFrame([train_comp, val_comp, test_comp], index=['train', 'val', 'test']).T
+     elif val_sequences is not None:
+         comp_df = pd.DataFrame([train_comp, val_comp], index=['train', 'val']).T
+     elif test_sequences is not None:
+         comp_df = pd.DataFrame([train_comp, test_comp], index=['train', 'test']).T
+     else:
+         raise ValueError("Must provide at least one of val_sequences or test_sequences")
+     colors = [colormap[col] for col in comp_df.columns]
+ 
+     # Plotting
+     for ax in axes_list:
+         comp_df.plot(kind='bar', color=colors, ax=ax)
+         ax.set_title('Amino Acid Composition Across Datasets')
+         ax.set_xlabel('Amino Acid')
+         ax.set_ylabel('Relative Frequency')
+ 
+     fig_individual.set_tight_layout(True)
+     fig_individual.savefig(savepath)
+     log_update(f"\tSaved figure to {savepath}")
+ 
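`calculate_aa_composition` is likewise defined elsewhere in `fuson_plm/utils`; the plot above implies it returns a mapping from amino acid to relative frequency over a whole set of sequences, along the lines of this hedged sketch:

```python
# Hypothetical reference sketch -- the real implementation lives elsewhere in fuson_plm/utils
from collections import Counter

def aa_composition_sketch(sequences: list[str]) -> dict[str, float]:
    """Relative frequency of each amino acid across all sequences in a set."""
    counts = Counter("".join(sequences))
    total = sum(counts.values())
    return {aa: count / total for aa, count in sorted(counts.items())}
```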
+ ### Outer methods for visualizing splits
+ def visualize_splits(train_clusters=None, val_clusters=None, test_clusters=None, benchmark_cluster_reps=None, train_color='#0072B2', val_color='#009E73', test_color='#E69F00', esm_embeddings_path=None, onehot_embeddings_path=None):
+     colormap = {
+         'train': train_color,
+         'val': val_color,
+         'test': test_color
+     }
+     valid_entry = False
+     # Dispatch to the appropriate plotting routine based on which sets were provided,
+     # forwarding the embedding paths so the t-SNE panel can be drawn when available
+     if train_clusters is not None and val_clusters is not None and test_clusters is not None:
+         visualize_train_val_test_splits(train_clusters, val_clusters, test_clusters, benchmark_cluster_reps=benchmark_cluster_reps, colormap=colormap, esm_embeddings_path=esm_embeddings_path, onehot_embeddings_path=onehot_embeddings_path)
+         valid_entry = True
+     if train_clusters is not None and val_clusters is None and test_clusters is not None:
+         visualize_train_test_splits(train_clusters, test_clusters, benchmark_cluster_reps=benchmark_cluster_reps, colormap=colormap, esm_embeddings_path=esm_embeddings_path, onehot_embeddings_path=onehot_embeddings_path)
+         valid_entry = True
+     if train_clusters is not None and val_clusters is not None and test_clusters is None:
+         visualize_train_val_splits(train_clusters, val_clusters, benchmark_cluster_reps=benchmark_cluster_reps, colormap=colormap, esm_embeddings_path=esm_embeddings_path, onehot_embeddings_path=onehot_embeddings_path)
+         valid_entry = True
+ 
+     if not valid_entry: raise ValueError("Must pass train clusters and at least one of val or test clusters")
+ 
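A hedged usage sketch of the dispatcher (the CSV paths are illustrative; the cluster DataFrames are expected to carry `member seq` columns, as produced by the clustering utilities earlier in `fuson_plm/utils`):

```python
import pandas as pd
from fuson_plm.utils.visualizing import visualize_splits

# Hypothetical paths; real cluster tables come from the MMseqs2 clustering + splitting steps
train_clusters = pd.read_csv('splits/train_clusters.csv')
test_clusters = pd.read_csv('splits/test_clusters.csv')

# Omitting val_clusters dispatches to visualize_train_test_splits;
# passing an existing embeddings pickle enables the t-SNE panel
visualize_splits(
    train_clusters=train_clusters,
    test_clusters=test_clusters,
    esm_embeddings_path='fuson_db_embeddings/fuson_db_esm2_t33_650M_UR50D_avg_embeddings.pkl',
)
```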
+ def visualize_train_val_test_splits(train_clusters, val_clusters, test_clusters, benchmark_cluster_reps=None, colormap=None, esm_embeddings_path=None, onehot_embeddings_path=None):
+     if colormap is None: colormap = default_color_map
+     # Add length column
+     train_clusters['member length'] = train_clusters['member seq'].str.len()
+     val_clusters['member length'] = val_clusters['member seq'].str.len()
+     test_clusters['member length'] = test_clusters['member seq'].str.len()
+ 
+     # Prepare lengths and seqs for plotting
+     train_lengths = train_clusters['member length'].tolist()
+     val_lengths = val_clusters['member length'].tolist()
+     test_lengths = test_clusters['member length'].tolist()
+     train_sequences = train_clusters['member seq'].tolist()
+     val_sequences = val_clusters['member seq'].tolist()
+     test_sequences = test_clusters['member seq'].tolist()
+ 
+     # Create a combined figure with 3 rows and 3 columns
+     set_font()
+     fig_combined, axs = plt.subplots(3, 3, figsize=(24, 18))
+ 
+     # Make the visualization plots for saving TOGETHER on the combined figure
+     visualize_splits_hist(train_lengths=train_lengths,
+                           val_lengths=val_lengths,
+                           test_lengths=test_lengths,
+                           colormap=colormap, axes=axs[0])
+     visualize_splits_shannon_entropy(train_sequences=train_sequences,
+                                      val_sequences=val_sequences,
+                                      test_sequences=test_sequences,
+                                      colormap=colormap, axes=axs[1])
+     visualize_splits_scatter(train_clusters=train_clusters,
+                              val_clusters=val_clusters,
+                              test_clusters=test_clusters,
+                              benchmark_cluster_reps=benchmark_cluster_reps,
+                              colormap=colormap, axes=axs[2, 0])
+     visualize_splits_aa_composition(train_sequences=train_sequences,
+                                     val_sequences=val_sequences,
+                                     test_sequences=test_sequences,
+                                     colormap=colormap, axes=axs[2, 1])
+     if esm_embeddings_path is not None and os.path.exists(esm_embeddings_path):
+         visualize_splits_tsne(train_sequences=train_sequences,
+                               val_sequences=val_sequences,
+                               test_sequences=test_sequences,
+                               embedding_path=esm_embeddings_path,
+                               colormap=colormap, axes=axs[2, 2])
+     else:
+         # Leave the last subplot blank
+         axs[2, 2].axis('off')
+ 
+     plt.tight_layout()
+     fig_combined.savefig('splits/combined_plot.png')
+     log_update("\nSaved combined figure to splits/combined_plot.png")
+ 
+ def visualize_train_test_splits(train_clusters, test_clusters, benchmark_cluster_reps=None, colormap=None, esm_embeddings_path=None, onehot_embeddings_path=None):
+     if colormap is None: colormap = default_color_map
+     # Add length column
+     train_clusters['member length'] = train_clusters['member seq'].str.len()
+     test_clusters['member length'] = test_clusters['member seq'].str.len()
+ 
+     # Prepare lengths and seqs for plotting
+     train_lengths = train_clusters['member length'].tolist()
+     test_lengths = test_clusters['member length'].tolist()
+     train_sequences = train_clusters['member seq'].tolist()
+     test_sequences = test_clusters['member seq'].tolist()
+ 
+     # Create a combined figure: 4 rows x 2 columns if there is a t-SNE plot, 3 x 2 otherwise
+     if esm_embeddings_path is not None and os.path.exists(esm_embeddings_path):
+         set_font()
+         fig_combined, axs = plt.subplots(4, 2, figsize=(18, 36))
+         visualize_splits_tsne(train_sequences=train_sequences,
+                               val_sequences=None,
+                               test_sequences=test_sequences,
+                               embedding_path=esm_embeddings_path,
+                               colormap=colormap, axes=axs[3, 0])
+         axs[-1, 1].axis('off')
+     else:
+         set_font()
+         fig_combined, axs = plt.subplots(3, 2, figsize=(18, 18))
+ 
+     # Make the remaining visualization plots for saving TOGETHER on the combined figure
+     visualize_splits_hist(train_lengths=train_lengths,
+                           val_lengths=None,
+                           test_lengths=test_lengths,
+                           colormap=colormap, axes=axs[0])
+     visualize_splits_shannon_entropy(train_sequences=train_sequences,
+                                      val_sequences=None,
+                                      test_sequences=test_sequences,
+                                      colormap=colormap, axes=axs[1])
+     visualize_splits_scatter(train_clusters=train_clusters,
+                              val_clusters=None,
+                              test_clusters=test_clusters,
+                              benchmark_cluster_reps=benchmark_cluster_reps,
+                              colormap=colormap, axes=axs[2, 0])
+     visualize_splits_aa_composition(train_sequences=train_sequences,
+                                     val_sequences=None,
+                                     test_sequences=test_sequences,
+                                     colormap=colormap, axes=axs[2, 1])
+ 
+     plt.tight_layout()
+     fig_combined.savefig('splits/combined_plot.png')
+     log_update("\nSaved combined figure to splits/combined_plot.png")
+ 
+ def visualize_train_val_splits(train_clusters, val_clusters, benchmark_cluster_reps=None, colormap=None, esm_embeddings_path=None, onehot_embeddings_path=None):
+     if colormap is None: colormap = default_color_map
+     # Add length column
+     train_clusters['member length'] = train_clusters['member seq'].str.len()
+     val_clusters['member length'] = val_clusters['member seq'].str.len()
+ 
+     # Prepare lengths and seqs for plotting
+     train_lengths = train_clusters['member length'].tolist()
+     val_lengths = val_clusters['member length'].tolist()
+     train_sequences = train_clusters['member seq'].tolist()
+     val_sequences = val_clusters['member seq'].tolist()
+ 
+     # Create a combined figure: 4 rows x 2 columns if there is a t-SNE plot, 3 x 2 otherwise
+     if esm_embeddings_path is not None and os.path.exists(esm_embeddings_path):
+         set_font()
+         fig_combined, axs = plt.subplots(4, 2, figsize=(18, 36))
+         visualize_splits_tsne(train_sequences=train_sequences,
+                               val_sequences=val_sequences,
+                               test_sequences=None,
+                               embedding_path=esm_embeddings_path,
+                               colormap=colormap, axes=axs[3, 0])
+         axs[-1, 1].axis('off')
+     else:
+         set_font()
+         fig_combined, axs = plt.subplots(3, 2, figsize=(18, 18))
+ 
+     # Make the remaining visualization plots for saving TOGETHER on the combined figure
+     visualize_splits_hist(train_lengths=train_lengths,
+                           val_lengths=val_lengths,
+                           test_lengths=None,
+                           colormap=colormap, axes=axs[0])
+     visualize_splits_shannon_entropy(train_sequences=train_sequences,
+                                      val_sequences=val_sequences,
+                                      test_sequences=None,
+                                      colormap=colormap, axes=axs[1])
+     visualize_splits_scatter(train_clusters=train_clusters,
+                              val_clusters=val_clusters,
+                              test_clusters=None,
+                              benchmark_cluster_reps=benchmark_cluster_reps,
+                              colormap=colormap, axes=axs[2, 0])
+     visualize_splits_aa_composition(train_sequences=train_sequences,
+                                     val_sequences=val_sequences,
+                                     test_sequences=None,
+                                     colormap=colormap, axes=axs[2, 1])
+ 
+     plt.tight_layout()
+     fig_combined.savefig('splits/combined_plot.png')
+     log_update("\nSaved combined figure to splits/combined_plot.png")