# Significant contribution from Ben Fry
import copy
import os.path
import pickle
import random
from multiprocessing import Pool

import numpy as np
import pandas as pd
import torch
from rdkit import Chem
from rdkit.Chem import AllChem, MolFromSmiles
from scipy.spatial.distance import pdist, squareform
from torch_geometric.data import Dataset, HeteroData
from torch_geometric.utils import subgraph
from tqdm import tqdm

from datasets.constants import aa_to_cg_indices, amino_acid_smiles, cg_rdkit_indices
from datasets.parse_chi import aa_long2short, atom_order
from datasets.process_mols import new_extract_receptor_structure, get_lig_graph, generate_conformer
from utils.torsion import get_transformation_mask


def read_strings_from_txt(path):
    # every line will be one element of the returned list
    with open(path) as file:
        lines = file.readlines()
        return [line.rstrip() for line in lines]


def compute_num_ca_neighbors(coords, cg_coords, idx, is_valid_bb_node, max_dist=5, buffer_residue_num=7):
    """
    Counts the number of residues with heavy atoms within max_dist (Angstroms) of this sidechain
    that are not within +/- buffer_residue_num of it in primary sequence.
    From Ben's code. Note: Gabriele removed the chain_index.
    """
    # Extract coordinates of all residues in the protein.
    bb_coords = coords

    # Compute the indices whose interactions we should not count.
    excluded_neighbors = [idx - x for x in reversed(range(0, buffer_residue_num + 1)) if (idx - x) >= 0]
    excluded_neighbors.extend([idx + x for x in range(1, buffer_residue_num + 1)])

    # Create indices of an N x M distance matrix where N is num BB nodes and M is num CG nodes.
    e_idx = torch.stack([
        torch.arange(bb_coords.shape[0]).unsqueeze(-1).expand((-1, cg_coords.shape[0])).flatten(),
        torch.arange(cg_coords.shape[0]).unsqueeze(0).expand((bb_coords.shape[0], -1)).flatten()
    ])

    # Expand bb_coords and cg_coords into the same dimensionality.
    bb_coords_exp = bb_coords[e_idx[0]]
    cg_coords_exp = cg_coords[e_idx[1]].unsqueeze(1)

    # Every row is the distance of the chemical group to each atom in a backbone coordinate frame.
    bb_exp_idces, _ = (torch.cdist(bb_coords_exp, cg_coords_exp).squeeze(-1) < max_dist).nonzero(as_tuple=True)
    bb_idces_within_thresh = torch.unique(e_idx[0][bb_exp_idces])

    # Only count residues that are not the origin residue or adjacent to it in primary sequence
    # and are valid backbone residues (fully resolved coordinate frame).
    bb_idces_within_thresh = bb_idces_within_thresh[
        ~torch.isin(bb_idces_within_thresh, torch.tensor(excluded_neighbors))
        & is_valid_bb_node[bb_idces_within_thresh]
    ]
    return len(bb_idces_within_thresh)
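
# Editor's sketch (not part of the original pipeline): a minimal, self-contained call to
# compute_num_ca_neighbors on random placeholder coordinates, documenting the expected
# shapes: (R, 14, 3) all-atom coordinates and an (R,) boolean backbone-validity mask.
def _example_num_ca_neighbors():
    coords = torch.randn(50, 14, 3)                       # 50 residues, 14 atom slots each
    is_valid_bb_node = torch.ones(50, dtype=torch.bool)   # pretend every frame is resolved
    cg_coords = coords[20][:5]                            # 5 atoms of residue 20 as the chemical group
    return compute_num_ca_neighbors(coords, cg_coords, idx=20, is_valid_bb_node=is_valid_bb_node)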

def identify_valid_vandermers(args):
    """
    Constructs a tensor with the number of contacts of each residue that can be sampled from for
    chemical groups. By using every sidechain as a chemical group, we can load the actual chemical
    groups at training time. Once divided by their sum, the counts can be used as sampling
    probabilities.
    """
    complex_graph, max_dist, buffer_residue_num = args

    # Constructs a mask tracking whether each index is a valid coordinate frame / residue label to train over.
    # is_in_residue_vocabulary = torch.tensor([x in aa_short2long for x in data['seq']]).bool()
    coords, seq = complex_graph.coords, complex_graph.seq
    is_valid_bb_node = (coords[:, :4].isnan().sum(dim=(1, 2)) == 0).bool()  # * is_in_residue_vocabulary

    valid_cg_idces = []
    for idx, aa in enumerate(seq):
        if aa not in aa_to_cg_indices:
            valid_cg_idces.append(0)
        else:
            indices = aa_to_cg_indices[aa]
            cg_coordinates = coords[idx][indices]

            # skip chemical group residues that aren't fully resolved
            if torch.any(cg_coordinates.isnan()).item():
                valid_cg_idces.append(0)
                continue

            nbr_count = compute_num_ca_neighbors(coords, cg_coordinates, idx, is_valid_bb_node,
                                                 max_dist=max_dist, buffer_residue_num=buffer_residue_num)
            valid_cg_idces.append(nbr_count)

    return complex_graph.name, torch.tensor(valid_cg_idces)


def fast_identify_valid_vandermers(coords, seq, max_dist=5, buffer_residue_num=7):
    # large offset used to push NaN and excluded entries far beyond max_dist
    offset = 10000 + max_dist
    R = coords.shape[0]
    coords = coords.numpy().reshape(-1, 3)

    # compute pairwise distances between the closest atoms of every residue pair
    pdist_mat = squareform(pdist(coords))
    pdist_mat = pdist_mat.reshape((R, 14, R, 14))
    pdist_mat = np.nan_to_num(pdist_mat, nan=offset)
    pdist_mat = np.min(pdist_mat, axis=(1, 3))

    # exclude self-distances and residues within buffer_residue_num in primary sequence
    pdist_mat = pdist_mat + np.diag(np.ones(len(seq)) * offset)
    for i in range(1, buffer_residue_num + 1):
        pdist_mat += np.diag(np.ones(len(seq) - i) * offset, k=i) + np.diag(np.ones(len(seq) - i) * offset, k=-i)

    # get the number of residues that are within max_dist of each other
    nbr_count = np.sum(pdist_mat < max_dist, axis=1)
    return torch.tensor(nbr_count)


def compute_cg_features(aa, aa_smile):
    """
    Given an amino acid and a SMILES string, returns the stacked tensor of chemical group atom
    encodings. The order of the output tensor rows corresponds to the index at which the atoms
    appear in aa_to_cg_indices from constants.
    """
    # Handle any residues that we don't have chemical groups for (ex: GLY if not using bb_cnh and bb_cco).
    aa_short = aa_long2short[aa]
    if aa_short not in aa_to_cg_indices:
        return None

    # Create an rdkit molecule from the SMILES string.
    mol = Chem.MolFromSmiles(aa_smile)

    complex_graph = HeteroData()
    get_lig_graph(mol, complex_graph)

    atoms_to_keep = torch.tensor([i for i, _ in cg_rdkit_indices[aa].items()]).long()
    complex_graph['ligand', 'ligand'].edge_index, complex_graph['ligand', 'ligand'].edge_attr = \
        subgraph(atoms_to_keep, complex_graph['ligand', 'ligand'].edge_index,
                 complex_graph['ligand', 'ligand'].edge_attr, relabel_nodes=True)
    complex_graph['ligand'].x = complex_graph['ligand'].x[atoms_to_keep]

    edge_mask, mask_rotate = get_transformation_mask(complex_graph)
    complex_graph['ligand'].edge_mask = torch.tensor(edge_mask)
    complex_graph['ligand'].mask_rotate = mask_rotate
    return complex_graph
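
# Editor's sketch (not part of the original pipeline): PDBSidechain.__init__ below builds its
# chemical-group lookup essentially like this, keyed by one-letter residue codes; amino acids
# without a chemical-group definition come back as None from compute_cg_features.
def _example_cg_lookup():
    return {aa_long2short[aa]: compute_cg_features(aa, aa_smile)
            for aa, aa_smile in amino_acid_smiles.items()}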

class PDBSidechain(Dataset):
    def __init__(self, root, transform=None, cache_path='data/cache', split='train', limit_complexes=0,
                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, remove_hs=True, all_atoms=False,
                 atom_radius=5, atom_max_neighbors=None, sequences_to_embeddings=None, knn_only_graph=True,
                 multiplicity=1, vandermers_max_dist=5, vandermers_buffer_residue_num=7, vandermers_min_contacts=5,
                 remove_second_segment=False, merge_clusters=1, vandermers_extraction=True, add_random_ligand=False):
        super(PDBSidechain, self).__init__(root, transform)
        assert remove_hs == True, "not implemented yet"
        self.root = root
        self.split = split
        self.limit_complexes = limit_complexes
        self.receptor_radius = receptor_radius
        self.knn_only_graph = knn_only_graph
        self.multiplicity = multiplicity
        self.c_alpha_max_neighbors = c_alpha_max_neighbors
        self.num_workers = num_workers
        self.sequences_to_embeddings = sequences_to_embeddings
        self.remove_second_segment = remove_second_segment
        self.merge_clusters = merge_clusters
        self.vandermers_extraction = vandermers_extraction
        self.add_random_ligand = add_random_ligand
        self.all_atoms = all_atoms
        self.atom_radius = atom_radius
        self.atom_max_neighbors = atom_max_neighbors

        if vandermers_extraction:
            self.cg_node_feature_lookup_dict = {aa_long2short[aa]: compute_cg_features(aa, aa_smile)
                                                for aa, aa_smile in amino_acid_smiles.items()}

        self.cache_path = os.path.join(cache_path, f'PDB3_limit{self.limit_complexes}_INDEX{self.split}'
                                                   f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
                                       + ('' if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
                                       + ('' if not self.knn_only_graph else '_knnOnly'))

        self.read_split()

        if not self.check_all_proteins():
            os.makedirs(self.cache_path, exist_ok=True)
            self.preprocess()

        self.vandermers_max_dist = vandermers_max_dist
        self.vandermers_buffer_residue_num = vandermers_buffer_residue_num
        self.vandermers_min_contacts = vandermers_min_contacts
        self.collect_proteins()

        filtered_proteins = []
        if vandermers_extraction:
            for complex_graph in tqdm(self.protein_graphs):
                if complex_graph.name in self.vandermers and torch.any(self.vandermers[complex_graph.name] >= 10):
                    filtered_proteins.append(complex_graph)
            print(f"Computed vandermers and kept {len(filtered_proteins)} proteins out of {len(self.protein_graphs)}")
        else:
            filtered_proteins = self.protein_graphs

        second_filter = []
        for complex_graph in tqdm(filtered_proteins):
            if sequences_to_embeddings is None or complex_graph.orig_seq in sequences_to_embeddings:
                second_filter.append(complex_graph)
        print(f"Checked embeddings available and kept {len(second_filter)} proteins out of {len(filtered_proteins)}")
        self.protein_graphs = second_filter

        # filter out clusters that have no protein graphs
        self.split_clusters = list(set([g.cluster for g in self.protein_graphs]))
        self.cluster_to_complexes = {c: [] for c in self.split_clusters}
        for p in self.protein_graphs:
            self.cluster_to_complexes[p['cluster']].append(p)
        self.split_clusters = [c for c in self.split_clusters if len(self.cluster_to_complexes[c]) > 0]
        print("Total elements in set", len(self.split_clusters) * self.multiplicity // self.merge_clusters)

        self.name_to_complex = {p.name: p for p in self.protein_graphs}
        self.define_probabilities()

        if self.add_random_ligand:
            # read the CSV with all the SMILES
            with open('data/smiles_list.csv', 'r') as f:
                self.smiles_list = f.readlines()
            self.smiles_list = [s.split(',')[0] for s in self.smiles_list]

    def define_probabilities(self):
        if not self.vandermers_extraction:
            return

        if self.vandermers_min_contacts is not None:
            self.probabilities = torch.arange(1000) - self.vandermers_min_contacts + 1
            self.probabilities[:self.vandermers_min_contacts] = 0
        else:
            with open('data/pdbbind_counts.pkl', 'rb') as f:
                pdbbind_counts = pickle.load(f)

            pdb_counts = torch.ones(1000)
            for contacts in self.vandermers.values():
                pdb_counts.index_add_(0, contacts, torch.ones(contacts.shape))
            print(pdbbind_counts[:30])
            print(pdb_counts[:30])

            self.probabilities = pdbbind_counts / pdb_counts
            self.probabilities[:7] = 0

    def len(self):
        return len(self.split_clusters) * self.multiplicity // self.merge_clusters
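
    # Editor's note (illustrative): with the default vandermers_min_contacts=5,
    # define_probabilities above gives a residue with k contacts the sampling weight
    # max(0, k - 4), e.g. k=5 -> 1 and k=10 -> 6, so more buried sidechains are
    # drawn proportionally more often.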
    def get(self, idx=None, protein=None, smiles=None):
        assert idx is not None or (protein is not None and smiles is not None), \
            "provide either idx or both protein and smiles"

        if protein is None or smiles is None:
            idx = idx % len(self.split_clusters)
            if self.merge_clusters > 1:
                idx = idx * self.merge_clusters
                idx = idx + random.randint(0, self.merge_clusters - 1)
                idx = min(idx, len(self.split_clusters) - 1)
            cluster = self.split_clusters[idx]
            protein_graph = copy.deepcopy(random.choice(self.cluster_to_complexes[cluster]))
        else:
            protein_graph = copy.deepcopy(self.name_to_complex[protein])

        if self.sequences_to_embeddings is not None:
            # print(self.sequences_to_embeddings[protein_graph.orig_seq].shape, len(protein_graph.orig_seq), protein_graph.to_keep.shape)
            if len(protein_graph.orig_seq) != len(self.sequences_to_embeddings[protein_graph.orig_seq]):
                print('problem with ESM embeddings')
                return self.get(random.randint(0, self.len()))
            lm_embeddings = self.sequences_to_embeddings[protein_graph.orig_seq][protein_graph.to_keep]
            protein_graph['receptor'].x = torch.cat([protein_graph['receptor'].x, lm_embeddings], dim=1)

        if self.vandermers_extraction:
            # select the sidechain to remove
            vandermers_contacts = self.vandermers[protein_graph.name]
            vandermers_probs = self.probabilities[vandermers_contacts].numpy()

            if not np.any(vandermers_contacts.numpy() >= 10):
                print('no vandermers >= 10, retrying with a new one')
                return self.get(random.randint(0, self.len()))

            sidechain_idx = np.random.choice(np.arange(len(vandermers_probs)),
                                             p=vandermers_probs / np.sum(vandermers_probs))

            # remove part of the sequence
            residues_to_keep = np.ones(len(protein_graph.seq), dtype=bool)
            residues_to_keep[max(0, sidechain_idx - self.vandermers_buffer_residue_num):
                             min(sidechain_idx + self.vandermers_buffer_residue_num + 1, len(protein_graph.seq))] = False

            if self.remove_second_segment:
                pos_idx = protein_graph['receptor'].pos[sidechain_idx]
                limit_closeness = 10
                far_enough = torch.sum((protein_graph['receptor'].pos - pos_idx[None, :]) ** 2, dim=-1) > limit_closeness ** 2
                vandermers_probs = vandermers_probs * far_enough.float().numpy()
                vandermers_probs[max(0, sidechain_idx - self.vandermers_buffer_residue_num):
                                 min(sidechain_idx + self.vandermers_buffer_residue_num + 1, len(protein_graph.seq))] = 0
                if np.all(vandermers_probs <= 0):
                    print('no second vandermer available, retrying with a new one')
                    return self.get(random.randint(0, self.len()))

                sc2_idx = np.random.choice(np.arange(len(vandermers_probs)),
                                           p=vandermers_probs / np.sum(vandermers_probs))
                residues_to_keep[max(0, sc2_idx - self.vandermers_buffer_residue_num):
                                 min(sc2_idx + self.vandermers_buffer_residue_num + 1, len(protein_graph.seq))] = False

            residues_to_keep = torch.from_numpy(residues_to_keep)
            protein_graph['receptor'].pos = protein_graph['receptor'].pos[residues_to_keep]
            protein_graph['receptor'].x = protein_graph['receptor'].x[residues_to_keep]
            protein_graph['receptor'].side_chain_vecs = protein_graph['receptor'].side_chain_vecs[residues_to_keep]
            protein_graph['receptor', 'rec_contact', 'receptor'].edge_index = \
                subgraph(residues_to_keep, protein_graph['receptor', 'rec_contact', 'receptor'].edge_index,
                         relabel_nodes=True)[0]

            # create the sidechain ligand
            sidechain_aa = protein_graph.seq[sidechain_idx]
            ligand_graph = self.cg_node_feature_lookup_dict[sidechain_aa]
            ligand_graph['ligand'].pos = protein_graph.coords[sidechain_idx][protein_graph.mask[sidechain_idx]]

            for type in ligand_graph.node_types + ligand_graph.edge_types:
                for key, value in ligand_graph[type].items():
                    protein_graph[type][key] = value
            protein_graph['ligand'].orig_pos = protein_graph['ligand'].pos.numpy()

            protein_center = torch.mean(protein_graph['receptor'].pos, dim=0, keepdim=True)
            protein_graph['receptor'].pos = protein_graph['receptor'].pos - protein_center
            protein_graph['ligand'].pos = protein_graph['ligand'].pos - protein_center
            protein_graph.original_center = protein_center
            protein_graph['receptor_name'] = protein_graph.name
        else:
            protein_center = torch.mean(protein_graph['receptor'].pos, dim=0, keepdim=True)
            protein_graph['receptor'].pos = protein_graph['receptor'].pos - protein_center
            protein_graph.original_center = protein_center
            protein_graph['receptor_name'] = protein_graph.name
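        # Editor's note (illustrative): in the vandermers_extraction branch above, with the
        # default buffer_residue_num=7 and e.g. sidechain_idx=20 on a 100-residue chain,
        # residues_to_keep masks out indices 13..27, so the extracted sidechain "ligand"
        # never sees its own local backbone segment in the receptor graph.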
        if self.add_random_ligand:
            if smiles is not None:
                mol = MolFromSmiles(smiles)
                try:
                    generate_conformer(mol)
                except Exception as e:
                    print("failed to generate the given ligand, returning None", e)
                    return None
            else:
                success = False
                while not success:
                    smiles = random.choice(self.smiles_list)
                    mol = MolFromSmiles(smiles)
                    try:
                        success = not generate_conformer(mol)
                    except Exception as e:
                        print(e, "changing ligand")

            lig_graph = HeteroData()
            get_lig_graph(mol, lig_graph)

            edge_mask, mask_rotate = get_transformation_mask(lig_graph)
            lig_graph['ligand'].edge_mask = torch.tensor(edge_mask)
            lig_graph['ligand'].mask_rotate = mask_rotate
            lig_graph['ligand'].smiles = smiles
            lig_graph['ligand'].pos = lig_graph['ligand'].pos - torch.mean(lig_graph['ligand'].pos, dim=0, keepdim=True)

            for type in lig_graph.node_types + lig_graph.edge_types:
                for key, value in lig_graph[type].items():
                    protein_graph[type][key] = value

        for a in ['random_coords', 'coords', 'seq', 'sequence', 'mask', 'rmsd_matching', 'cluster', 'orig_seq',
                  'to_keep', 'chain_ids']:
            if hasattr(protein_graph, a):
                delattr(protein_graph, a)
            if hasattr(protein_graph['receptor'], a):
                delattr(protein_graph['receptor'], a)

        return protein_graph

    def read_split(self):
        # read the CSV file
        df = pd.read_csv(self.root + "/list.csv")
        print("Loaded list CSV file")

        # get the clusters and filter them by split
        if self.split == "train":
            val_clusters = set(read_strings_from_txt(self.root + "/valid_clusters.txt"))
            test_clusters = set(read_strings_from_txt(self.root + "/test_clusters.txt"))
            clusters = df["CLUSTER"].unique()
            clusters = [int(c) for c in clusters if c not in val_clusters and c not in test_clusters]
        elif self.split == "val":
            clusters = [int(s) for s in read_strings_from_txt(self.root + "/valid_clusters.txt")]
        elif self.split == "test":
            clusters = [int(s) for s in read_strings_from_txt(self.root + "/test_clusters.txt")]
        else:
            raise ValueError("Split must be train, val or test")
        print(self.split, "clusters", len(clusters))
        clusters = set(clusters)

        self.chains_in_cluster = []
        complexes_in_cluster = set()
        for chain, cluster in zip(df["CHAINID"], df["CLUSTER"]):
            if cluster not in clusters:
                continue
            # limit to one chain per complex
            if chain[:4] not in complexes_in_cluster:
                self.chains_in_cluster.append((chain, cluster))
                complexes_in_cluster.add(chain[:4])
        print("Filtered chains in cluster", len(self.chains_in_cluster))

        if self.limit_complexes > 0:
            self.chains_in_cluster = self.chains_in_cluster[:self.limit_complexes]

    def check_all_proteins(self):
        for i in range(len(self.chains_in_cluster) // 10000 + 1):
            if not os.path.exists(os.path.join(self.cache_path, f"protein_graphs{i}.pkl")):
                return False
        return True
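
    # Editor's note: the cache produced by preprocess() and consumed by collect_proteins()
    # below is chunked: every 10000 chains yield one protein_graphs{i}.pkl and, when
    # vandermers_extraction is enabled, one vandermers{i}_{max_dist}_{buffer}.pkl in cache_path.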
    def collect_proteins(self):
        self.protein_graphs = []
        self.vandermers = {}
        total_recovered = 0
        print(f'Loading {len(self.chains_in_cluster)} protein graphs.')
        list_indices = list(range(len(self.chains_in_cluster) // 10000 + 1))
        random.shuffle(list_indices)
        for i in list_indices:
            with open(os.path.join(self.cache_path, f"protein_graphs{i}.pkl"), 'rb') as f:
                print(i)
                l = pickle.load(f)
                total_recovered += len(l)
                self.protein_graphs.extend(l)

            if not self.vandermers_extraction:
                continue

            if os.path.exists(os.path.join(self.cache_path, f'vandermers{i}_{self.vandermers_max_dist}_{self.vandermers_buffer_residue_num}.pkl')):
                with open(os.path.join(self.cache_path, f'vandermers{i}_{self.vandermers_max_dist}_{self.vandermers_buffer_residue_num}.pkl'), 'rb') as f:
                    vandermers = pickle.load(f)
                self.vandermers.update(vandermers)
                continue

            vandermers = {}
            if self.num_workers > 1:
                p = Pool(self.num_workers, maxtasksperchild=1)
                p.__enter__()
            with tqdm(total=len(l), desc=f'computing vandermers {i}') as pbar:
                map_fn = p.imap_unordered if self.num_workers > 1 else map
                arguments = zip(l, [self.vandermers_max_dist] * len(l), [self.vandermers_buffer_residue_num] * len(l))
                for t in map_fn(identify_valid_vandermers, arguments):
                    if t is not None:
                        vandermers[t[0]] = t[1]
                    pbar.update()
            if self.num_workers > 1:
                p.__exit__(None, None, None)

            with open(os.path.join(self.cache_path, f'vandermers{i}_{self.vandermers_max_dist}_{self.vandermers_buffer_residue_num}.pkl'), 'wb') as f:
                pickle.dump(vandermers, f)
            self.vandermers.update(vandermers)

        print(f"Kept {len(self.protein_graphs)} proteins out of {len(self.chains_in_cluster)} total")
        return

    def preprocess(self):
        # run the preprocessing in parallel on multiple workers, saving the progress every 10000 proteins
        list_indices = list(range(len(self.chains_in_cluster) // 10000 + 1))
        random.shuffle(list_indices)
        for i in list_indices:
            if os.path.exists(os.path.join(self.cache_path, f"protein_graphs{i}.pkl")):
                continue

            chains_names = self.chains_in_cluster[10000 * i:10000 * (i + 1)]
            protein_graphs = []
            if self.num_workers > 1:
                p = Pool(self.num_workers, maxtasksperchild=1)
                p.__enter__()
            with tqdm(total=len(chains_names), desc=f'loading protein batch {i}/{len(self.chains_in_cluster) // 10000 + 1}') as pbar:
                map_fn = p.imap_unordered if self.num_workers > 1 else map
                for t in map_fn(self.load_chain, chains_names):
                    if t is not None:
                        protein_graphs.append(t)
                    pbar.update()
            if self.num_workers > 1:
                p.__exit__(None, None, None)

            with open(os.path.join(self.cache_path, f"protein_graphs{i}.pkl"), 'wb') as f:
                pickle.dump(protein_graphs, f)

        print("Finished preprocessing and saving protein graphs")

    def load_chain(self, c):
        chain, cluster = c
        if not os.path.exists(self.root + f"/pdb/{chain[1:3]}/{chain}.pt"):
            print("File not found", chain)
            return None

        data = torch.load(self.root + f"/pdb/{chain[1:3]}/{chain}.pt")
        complex_graph = HeteroData()
        complex_graph['name'] = chain
        orig_seq = data["seq"]
        coords = data["xyz"]
        mask = data["mask"].bool()

        # remove residues with NaN backbone coordinates
        to_keep = torch.logical_not(torch.any(torch.isnan(coords[:, :4, 0]), dim=1))
        coords = coords[to_keep]
        seq = ''.join(np.asarray(list(orig_seq))[to_keep.numpy()].tolist())
        mask = mask[to_keep]

        if len(coords) == 0:
            print("All coords were NaN", chain)
            return None

        try:
            new_extract_receptor_structure(seq, coords.numpy(), complex_graph=complex_graph,
                                           neighbor_cutoff=self.receptor_radius, max_neighbors=self.c_alpha_max_neighbors,
                                           knn_only_graph=self.knn_only_graph, all_atoms=self.all_atoms,
                                           atom_cutoff=self.atom_radius, atom_max_neighbors=self.atom_max_neighbors)
        except Exception as e:
            print("Error in extracting receptor", chain)
            print(e)
            return None

        if torch.any(torch.isnan(complex_graph['receptor'].pos)):
            print("NaN in pos receptor", chain)
            return None

        complex_graph.coords = coords
        complex_graph.seq = seq
        complex_graph.mask = mask
        complex_graph.cluster = cluster
        complex_graph.orig_seq = orig_seq
        complex_graph.to_keep = to_keep
        return complex_graph


if __name__ == "__main__":
    dataset = PDBSidechain(root="data/pdb_2021aug02_sample", split="train", multiplicity=1, limit_complexes=150)
    print(len(dataset))
    print(dataset[0])
    for p in dataset:
        print(p)
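
# Editor's note on the expected on-disk layout under `root` (inferred from read_split and
# load_chain above):
#   <root>/list.csv                       CSV with at least CHAINID and CLUSTER columns
#   <root>/valid_clusters.txt             one validation cluster id per line
#   <root>/test_clusters.txt              one test cluster id per line
#   <root>/pdb/<chain[1:3]>/<chain>.pt    torch files with "seq", "xyz" and "mask" entries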