File size: 5,625 Bytes
ffaff91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import pandas as pd
import os
import subprocess
import sys
from Bio import SeqIO
import shutil
from fuson_plm.utils.logging import open_logfile, log_update
def ensure_mmseqs_in_path(mmseqs_dir):
    """
    Checks if MMseqs2 is in the PATH. If it's not, add it. MMseqs2 will not run if this is not done correctly.

    The directory is prepended to the PATH of the current process only when no
    `mmseqs` executable can already be resolved; otherwise PATH is left untouched.

    Args:
        mmseqs_dir (str): Directory containing MMseqs2 binaries
    """
    # Nothing to do when an mmseqs executable is already resolvable
    if shutil.which('mmseqs') is not None:
        return

    # Use the platform's separator (':' on POSIX, ';' on Windows) and tolerate an
    # unset PATH. An empty PATH gets no trailing separator, because an empty PATH
    # entry is interpreted as the current directory.
    current_path = os.environ.get('PATH', '')
    if current_path:
        os.environ['PATH'] = f"{mmseqs_dir}{os.pathsep}{current_path}"
    else:
        os.environ['PATH'] = mmseqs_dir
    log_update(f"\tAdded {mmseqs_dir} to PATH")
def process_fasta(fasta_path):
    """
    Reads a fasta file into a dictionary of sequences.

    Each record contributes one entry: the key is the record's `id` and the value is
    `str(record.seq)`. If the same id appears more than once, the last record wins.

    Args:
        fasta_path (str): path to the fasta file
    Returns:
        dict: mapping of record id -> sequence string
    """
    # The context manager closes the handle even if parsing raises partway through
    with open(fasta_path) as handle:
        return {record.id: str(record.seq) for record in SeqIO.parse(handle, 'fasta')}
def analyze_clustering_result(input_fasta: str, tsv_path: str):
    """
    Combines cluster assignments from a TSV file with the sequences from the input fasta.

    Args:
        input_fasta (str): path to input fasta file
        tsv_path (str): path to a headerless, tab-separated file whose first column holds
            the representative seq_id and whose second column holds the member seq_id
    Returns:
        pd.DataFrame: one row per TSV row, including the columns 'representative seq_id',
            'member seq_id', 'representative seq' and 'member seq', sorted by
            representative seq_id and then member seq_id, with a fresh 0..n-1 index
    Raises:
        KeyError: if a seq_id in the TSV is not a header in input_fasta
    """
    # Process input fasta
    input_d = process_fasta(input_fasta)

    # Process clusters.tsv
    # Read every ID as a literal string: without dtype=str, all-numeric IDs are parsed as
    # integers, and without keep_default_na=False, IDs such as "NA" become NaN. Either
    # case would make the lookups into the string-keyed fasta dictionary fail.
    clusters = pd.read_csv(tsv_path, sep='\t', header=None, dtype=str, keep_default_na=False)
    clusters = clusters.rename(columns={
        0: 'representative seq_id',
        1: 'member seq_id',
    })

    # Direct indexing (rather than dict.get) so an unknown ID raises instead of
    # silently producing a missing sequence
    clusters['representative seq'] = clusters['representative seq_id'].apply(lambda seq_id: input_d[seq_id])
    clusters['member seq'] = clusters['member seq_id'].apply(lambda seq_id: input_d[seq_id])

    # Sort them so that splitting results are reproducible
    clusters = clusters.sort_values(
        by=['representative seq_id', 'member seq_id'], ascending=True
    ).reset_index(drop=True)

    return clusters
def make_fasta(sequences: dict, fasta_path: str):
    """
    Writes the given sequences to a fasta file, one two-line record per dictionary entry.

    Each entry becomes a '>' + header line followed by a line holding the sequence.
    Entries are written in the dictionary's iteration order, and an existing file at
    fasta_path is overwritten.

    Args:
        sequences (dict): A dictionary where the key is the header and the value is the sequence.
        fasta_path (str): Where to write the fasta file.
    Returns:
        str: The path to the fasta file.
    """
    records = (f'>{name}\n{seq}\n' for name, seq in sequences.items())
    with open(fasta_path, 'w') as out_file:
        out_file.writelines(records)
    return fasta_path
def run_mmseqs_clustering(input_fasta, output_dir, min_seq_id=0.3, c=0.8, cov_mode=0, cluster_mode=0, path_to_mmseqs='fuson_plm/mmseqs'):
    """
    Runs MMSeqs2 clustering using easycluster module

    Args:
        input_fasta (str): path to input fasta file, formatted as alternating '>header' and sequence lines
        output_dir (str): path to output dir for clustering results; created if missing. It is also
            passed to MMseqs2 as the final positional argument of easy-cluster.
        min_seq_id (float): number [0,1] representing --min-seq-id in cluster command
        c (float): number [0,1] representing -c in cluster command
        cov_mode (int): number 0, 1, 2, or 3 representing --cov-mode in cluster command
        cluster_mode (int): number 0, 1, or 2 representing --cluster-mode in cluster command
        path_to_mmseqs (str): path containing '/mmseqs'. Everything before the first '/mmseqs'
            is kept and 'mmseqs/bin' is appended to locate the directory of MMseqs2 binaries.
    Raises:
        ValueError: if path_to_mmseqs does not contain '/mmseqs'
        subprocess.CalledProcessError: if the mmseqs command exits with a non-zero status
    """
    log_update("\nRunning MMSeqs clustering...")

    # Get mmseqs dir
    marker = '/mmseqs'
    marker_pos = path_to_mmseqs.find(marker)
    if marker_pos == -1:
        # Same exception type a bare str.index() would raise, but with an actionable message
        raise ValueError(
            f"path_to_mmseqs must contain '{marker}' (got {path_to_mmseqs!r})"
        )
    mmseqs_dir = os.path.join(path_to_mmseqs[:marker_pos], 'mmseqs/bin')

    # Ensure MMseqs2 is in the PATH
    ensure_mmseqs_in_path(mmseqs_dir)

    # The binary is invoked by name, so it is resolved through PATH
    mmseqs_bin = "mmseqs"

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Run MMseqs2 easy-cluster
    cmd_easy_cluster = [
        mmseqs_bin, "easy-cluster", input_fasta, os.path.join(output_dir, "mmseqs"), output_dir,
        "--min-seq-id", str(min_seq_id),
        "-c", str(c),
        "--cov-mode", str(cov_mode),
        "--cluster-mode", str(cluster_mode),
        "--dbtype", "1",
    ]

    # Write the command to a log file
    log_update("\n\tCommand entered to MMSeqs2:")
    log_update("\t" + " ".join(cmd_easy_cluster) + "\n")

    # Argument list (no shell); check=True turns a failed run into an exception
    subprocess.run(cmd_easy_cluster, check=True)

    log_update(f"Clustering completed. Results are in {output_dir}")
def cluster_summary(clusters: pd.DataFrame):
    """
    Summarizes how many clusters were formed, how big they are, etc ...

    Everything is reported through log_update; nothing is returned.

    Args:
        clusters (pd.DataFrame): must contain the columns 'representative seq_id' and
            'member seq_id', one row per (representative, member) pair
    """
    # One row per representative, with the number of (non-null) members it has
    grouped_clusters = (
        clusters.groupby('representative seq_id')['member seq_id']
        .count()
        .reset_index()
        .rename(columns={'member seq_id': 'member count'})
    )
    assert len(grouped_clusters) == len(clusters['representative seq_id'].unique())  # make sure number of cluster reps = # grouped clusters

    member_counts = grouped_clusters['member count']
    total_seqs = int(member_counts.sum())
    log_update(f"Created {len(grouped_clusters)} clusters of {total_seqs} sequences")

    # With no sequences the percentages below would divide by zero, and there is
    # no breakdown to report
    if total_seqs == 0:
        return

    is_singleton = member_counts == 1
    log_update(f"\t{len(grouped_clusters.loc[is_singleton])} clusters of size 1")
    csize1_seqs = int(member_counts[is_singleton].sum())
    log_update(f"\t\tsequences: {csize1_seqs} ({round(100*csize1_seqs/total_seqs, 2)}%)")

    is_multi = member_counts > 1
    log_update(f"\t{len(grouped_clusters.loc[is_multi])} clusters of size > 1")
    csizeg1_seqs = int(member_counts[is_multi].sum())
    log_update(f"\t\tsequences: {csizeg1_seqs} ({round(100*csizeg1_seqs/total_seqs, 2)}%)")

    log_update(f"\tlargest cluster: {int(member_counts.max())}")

    log_update("\nCluster size breakdown below...")
    # Name the index and the values explicitly: the column names that
    # value_counts().reset_index() produces differ between pandas 1.x and 2.x
    value_counts = (
        member_counts.value_counts()
        .rename_axis('cluster size (n_members)')
        .reset_index(name='n_clusters')
    )
    log_update(value_counts.sort_values(by='cluster size (n_members)', ascending=True).to_string(index=False))