Benchmarking pipeline. Predicts the specific type of the generated membrane protein and the subcellular localization of the generated protein

Browse files

Files changed (9) hide show

.gitattributes +3 -0
benchmarks/DeepLoc/OG_membrane_type_all.csv +3 -0
benchmarks/DeepLoc/cell_localization_predictor.py +137 -0
benchmarks/DeepLoc/cell_localization_test.csv +0 -0
benchmarks/DeepLoc/cell_localization_train_val.csv +3 -0
benchmarks/DeepLoc/membrane_localization_predictor.py +137 -0
benchmarks/DeepLoc/membrane_type_test.csv +0 -0
benchmarks/DeepLoc/membrane_type_train.csv +3 -0
benchmarks/DeepLoc/prep_deeploc_benchmark_data.ipynb +488 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+benchmarks/DeepLoc/cell_localization_train_val.csv filter=lfs diff=lfs merge=lfs -text
+benchmarks/DeepLoc/membrane_type_train.csv filter=lfs diff=lfs merge=lfs -text
+benchmarks/DeepLoc/OG_membrane_type_all.csv filter=lfs diff=lfs merge=lfs -text

benchmarks/DeepLoc/OG_membrane_type_all.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d878da32a06092f880262048e3c1eb692721c274b0a458fcc712a0dcbd80c71
+size 15683507

benchmarks/DeepLoc/cell_localization_predictor.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, Dataset
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from tqdm import tqdm
+from datetime import datetime
+import pandas as pd
+import numpy as np
+import pickle
+import os
+# Hyperparameters dictionary
+path = "/home/a03-sgoel/MDpLM"
+hyperparams = {
+    "batch_size": 1,
+    "learning_rate": 4e-5,
+    "num_epochs": 5,
+    "max_length": 2000,
+    "train_data": path + "/benchmarks/DeepLoc/cell_localization_train_val.csv.csv",
+    "test_data" : path + "/benchmarks/DeepLoc/cell_localization_test.csv",
+    "val_data": "", # None
+    "embeddings_pkl": "", # Need to generate ESM embeddings and save as pkl file
+}
+# Dataset class can load pickle file
+class LocalizationDataset(Dataset):
+    def __init__(self, csv_file, embeddings_pkl, max_length=2000):
+        self.data = pd.read_csv(csv_file)
+        self.max_length = max_length
+        # Map sequences to embeddings
+        with open(embeddings_pkl, 'rb') as f:
+            self.embeddings_dict = pickle.load(f)
+        self.data['embedding'] = self.data['Sequence'].map(self.embeddings_dict)
+        # Ensure sequences and embeddings are of the same length
+        assert len(self.data) == len(self.data['embedding']), "CSV data and embeddings length mismatch"
+        # Create multi-class label list
+        self.data['label'] = self.data.iloc[:, 1:9].value.tolist()
+    def __len__(self):
+        return len(self.data)
+    def __getitem__(self, idx):
+        embeddings = torch.tensor(self.data['embedding'][idx], dtype=torch.float)
+        labels = torch.tensor(self.data['label'][idx], dtype=torch.long)
+        return embeddings, labels
+# Multi-class localization predictor
+class LocalizationPredictor(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super(LocalizationPredictor, self).__init__()
+        self.classifier = nn.Linear(input_dim, num_classes) # 1280 x 8
+    def forward(self, embeddings):
+        avg_embedding = torch.mean(embeddings, dim=0) # Average embedding dimension: 1280
+        logits = self.classifier(avg_embedding)
+        return logits # pass logits of dimension 1x8 (8-class distribution) to CE loss
+# Training function
+def train(model, dataloader, optimizer, criterion, device):
+    model.train()
+    total_loss = 0
+    for embeddings, labels in tqdm(dataloader):
+        embeddings, labels = embeddings.to(device), labels.to(device)
+        optimizer.zero_grad()
+        outputs = model(embeddings)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+    return total_loss / len(dataloader)
+# Evaluation function
+def evaluate(model, dataloader, device):
+    model.eval()
+    preds, true_labels = [], []
+    with torch.no_grad():
+        for embeddings, labels in tqdm(dataloader):
+            embeddings, labels = embeddings.to(device), labels.to(device)
+            outputs = model(embeddings)
+            preds.append(outputs.cpu().numpy())
+            true_labels.append(labels.cpu().numpy())
+    return preds, true_labels
+# Metrics calculation
+def calculate_metrics(preds, labels, threshold=0.5):
+    flat_binary_preds, flat_labels = [], []
+    for pred, label in zip(preds, labels):
+        flat_binary_preds.extend((pred > threshold).astype(int).flatten())
+        flat_labels.extend(label.flatten())
+    flat_binary_preds = np.array(flat_binary_preds)
+    flat_labels = np.array(flat_labels)
+    accuracy = accuracy_score(flat_labels, flat_binary_preds)
+    precision = precision_score(flat_labels, flat_binary_preds, average='macro')
+    recall = recall_score(flat_labels, flat_binary_preds, average='macro')
+    f1 = f1_score(flat_labels, flat_binary_preds, average='macro')
+    return accuracy, precision, recall, f1
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use the GPU when one is available
+train_dataset = LocalizationDataset(hyperparams["train_data"], hyperparams["embeddings_pkl"], max_length=hyperparams["max_length"])
+test_dataset = LocalizationDataset(hyperparams["test_data"], hyperparams["embeddings_pkl"], max_length=hyperparams["max_length"])
+train_dataloader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True) # reshuffled every epoch
+test_dataloader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"], shuffle=False) # fixed order for evaluation
+model = LocalizationPredictor(input_dim=1280, num_classes=8).to(device)
+optimizer = optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])
+criterion = nn.CrossEntropyLoss() # NOTE(review): the dataset yields a label vector per sample, not a class index -- confirm this loss matches the label format
+# Train the model for the configured number of epochs, printing the mean batch loss after each
+for epoch in range(hyperparams["num_epochs"]):
+    train_loss = train(model, train_dataloader, optimizer, criterion, device)
+    print(f"EPOCH {epoch+1}/{hyperparams['num_epochs']}")
+    print(f"TRAIN LOSS: {train_loss:.4f}")
+    print("\n")
+# Evaluate the trained model once on the held-out test dataset
+print("Test set")
+test_preds, test_labels = evaluate(model, test_dataloader, device)
+test_metrics = calculate_metrics(test_preds, test_labels) # tuple: (accuracy, precision, recall, f1)
+print("TEST METRICS:")
+print(f"Accuracy: {test_metrics[0]:.4f}")
+print(f"Precision: {test_metrics[1]:.4f}")
+print(f"Recall: {test_metrics[2]:.4f}")
+print(f"F1 Score: {test_metrics[3]:.4f}")

benchmarks/DeepLoc/cell_localization_test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

benchmarks/DeepLoc/cell_localization_train_val.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29a07b293fed2994a966b70bdcd6bacc59915b8b01fa200cb2b07d8db18384a2
+size 17724293

benchmarks/DeepLoc/membrane_localization_predictor.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, Dataset
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from tqdm import tqdm
+from datetime import datetime
+import pandas as pd
+import numpy as np
+import pickle
+import os
+# Hyperparameters dictionary
+path = "/home/a03-sgoel/MDpLM"
+hyperparams = {
+    "batch_size": 1,
+    "learning_rate": 4e-5,
+    "num_epochs": 5,
+    "max_length": 2000,
+    "train_data": path + "/benchmarks/membrane_type_train.csv",
+    "test_data" : path + "/benchmarks/membrane_type_test.csv",
+    "val_data": "", # none
+    "embeddings_pkl": "" # Need to generate ESM embeddings
+}
+# Dataset class can load pickle file
+class LocalizationDataset(Dataset):
+    def __init__(self, csv_file, embeddings_pkl, max_length=2000):
+        self.data = pd.read_csv(csv_file)
+        self.max_length = max_length
+        # Map sequences to embeddings
+        with open(embeddings_pkl, 'rb') as f:
+            self.embeddings_dict = pickle.load(f)
+        self.data['embedding'] = self.data['Sequence'].map(self.embeddings_dict)
+        # Ensure sequences and embeddings are of the same length
+        assert len(self.data) == len(self.data['embedding']), "CSV data and embeddings length mismatch"
+        # Create multi-class label list
+        self.data['label'] = self.data.iloc[:, 2:7].value.tolist()
+    def __len__(self):
+        return len(self.data)
+    def __getitem__(self, idx):
+        embeddings = torch.tensor(self.data['embedding'][idx], dtype=torch.float)
+        labels = torch.tensor(self.data['label'][idx], dtype=torch.long)
+        return embeddings, labels
+# Multi-class localization predictor
+class LocalizationPredictor(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super(LocalizationPredictor, self).__init__()
+        self.classifier = nn.Linear(input_dim, num_classes) # 1280 x 4
+    def forward(self, embeddings):
+        avg_embedding = torch.mean(embeddings, dim=0) # Average embedding dimension: 1280
+        logits = self.classifier(avg_embedding)
+        return logits # pass logits of dimension 1x4 (4-class distribution) to CE loss
+# Training function
+def train(model, dataloader, optimizer, criterion, device):
+    model.train()
+    total_loss = 0
+    for embeddings, labels in tqdm(dataloader):
+        embeddings, labels = embeddings.to(device), labels.to(device)
+        optimizer.zero_grad()
+        outputs = model(embeddings)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+    return total_loss / len(dataloader)
+# Evaluation function
+def evaluate(model, dataloader, device):
+    model.eval()
+    preds, true_labels = [], []
+    with torch.no_grad():
+        for embeddings, labels in tqdm(dataloader):
+            embeddings, labels = embeddings.to(device), labels.to(device)
+            outputs = model(embeddings)
+            preds.append(outputs.cpu().numpy())
+            true_labels.append(labels.cpu().numpy())
+    return preds, true_labels
+# Metrics calculation
+def calculate_metrics(preds, labels, threshold=0.5):
+    flat_binary_preds, flat_labels = [], []
+    for pred, label in zip(preds, labels):
+        flat_binary_preds.extend((pred > threshold).astype(int).flatten())
+        flat_labels.extend(label.flatten())
+    flat_binary_preds = np.array(flat_binary_preds)
+    flat_labels = np.array(flat_labels)
+    accuracy = accuracy_score(flat_labels, flat_binary_preds)
+    precision = precision_score(flat_labels, flat_binary_preds, average='macro')
+    recall = recall_score(flat_labels, flat_binary_preds, average='macro')
+    f1 = f1_score(flat_labels, flat_binary_preds, average='macro')
+    return accuracy, precision, recall, f1
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use the GPU when one is available
+train_dataset = LocalizationDataset(hyperparams["train_data"], hyperparams["embeddings_pkl"], max_length=hyperparams["max_length"])
+test_dataset = LocalizationDataset(hyperparams["test_data"], hyperparams["embeddings_pkl"], max_length=hyperparams["max_length"])
+train_dataloader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True) # reshuffled every epoch
+test_dataloader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"], shuffle=False) # fixed order for evaluation
+model = LocalizationPredictor(input_dim=1280, num_classes=4).to(device)
+optimizer = optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])
+criterion = nn.CrossEntropyLoss() # NOTE(review): the dataset yields a label vector per sample, not a class index -- confirm this loss matches the label format
+# Train the model for the configured number of epochs, printing the mean batch loss after each
+for epoch in range(hyperparams["num_epochs"]):
+    train_loss = train(model, train_dataloader, optimizer, criterion, device)
+    print(f"EPOCH {epoch+1}/{hyperparams['num_epochs']}")
+    print(f"TRAIN LOSS: {train_loss:.4f}")
+    print("\n")
+# Evaluate the trained model once on the held-out test dataset
+print("Test set")
+test_preds, test_labels = evaluate(model, test_dataloader, device)
+test_metrics = calculate_metrics(test_preds, test_labels) # tuple: (accuracy, precision, recall, f1)
+print("TEST METRICS:")
+print(f"Accuracy: {test_metrics[0]:.4f}")
+print(f"Precision: {test_metrics[1]:.4f}")
+print(f"Recall: {test_metrics[2]:.4f}")
+print(f"F1 Score: {test_metrics[3]:.4f}")

benchmarks/DeepLoc/membrane_type_test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

benchmarks/DeepLoc/membrane_type_train.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16b8eec677afa2de578d04ee1a0fc9582b2f8cfc47622cbd6374309cd6ab96f3
+size 12335695

benchmarks/DeepLoc/prep_deeploc_benchmark_data.ipynb ADDED Viewed

	@@ -0,0 +1,488 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = \"/home/a03-sgoel/mESMerize/benchmarks/DeepLoc\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>ACC</th>\n",
+       "      <th>Kingdom</th>\n",
+       "      <th>Partition</th>\n",
+       "      <th>Peripheral</th>\n",
+       "      <th>Transmembrane</th>\n",
+       "      <th>LipidAnchor</th>\n",
+       "      <th>Soluble</th>\n",
+       "      <th>Sequence</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>I3R9M8</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>I3R9M9</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>Q7ZAG8</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>Q8PZ67</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>Q9YGA6</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28021</th>\n",
+       "      <td>28021</td>\n",
+       "      <td>P86949</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28022</th>\n",
+       "      <td>28022</td>\n",
+       "      <td>P86950</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28023</th>\n",
+       "      <td>28023</td>\n",
+       "      <td>P86951</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28024</th>\n",
+       "      <td>28024</td>\n",
+       "      <td>P86983</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28025</th>\n",
+       "      <td>28025</td>\n",
+       "      <td>P86984</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>28026 rows × 9 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       Unnamed: 0     ACC    Kingdom  Partition  Peripheral  Transmembrane  \\\n",
+       "0               0  I3R9M8    Archaea          0           1              0   \n",
+       "1               1  I3R9M9    Archaea          1           1              0   \n",
+       "2               2  Q7ZAG8    Archaea          2           1              0   \n",
+       "3               3  Q8PZ67    Archaea          0           1              0   \n",
+       "4               4  Q9YGA6    Archaea          0           1              0   \n",
+       "...           ...     ...        ...        ...         ...            ...   \n",
+       "28021       28021  P86949  Eukaryota          0           0              0   \n",
+       "28022       28022  P86950  Eukaryota          0           0              0   \n",
+       "28023       28023  P86951  Eukaryota          0           0              0   \n",
+       "28024       28024  P86983  Eukaryota          3           0              0   \n",
+       "28025       28025  P86984  Eukaryota          4           0              0   \n",
+       "\n",
+       "       LipidAnchor  Soluble                                           Sequence  \n",
+       "0                0        0  MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG...  \n",
+       "1                0        0  MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR...  \n",
+       "2                0        0  MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP...  \n",
+       "3                0        1  MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY...  \n",
+       "4                0        0  MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL...  \n",
+       "...            ...      ...                                                ...  \n",
+       "28021            0        1  MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY...  \n",
+       "28022            0        1  MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG...  \n",
+       "28023            0        1  MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG...  \n",
+       "28024            0        1  MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL...  \n",
+       "28025            0        1  MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL...  \n",
+       "\n",
+       "[28026 rows x 9 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ACC</th>\n",
+       "      <th>Kingdom</th>\n",
+       "      <th>Partition</th>\n",
+       "      <th>Peripheral</th>\n",
+       "      <th>Transmembrane</th>\n",
+       "      <th>LipidAnchor</th>\n",
+       "      <th>Soluble</th>\n",
+       "      <th>Sequence</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>I3R9M8</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>I3R9M9</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Q7ZAG8</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Q8PZ67</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Q9YGA6</td>\n",
+       "      <td>Archaea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28021</th>\n",
+       "      <td>P86949</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28022</th>\n",
+       "      <td>P86950</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28023</th>\n",
+       "      <td>P86951</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28024</th>\n",
+       "      <td>P86983</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28025</th>\n",
+       "      <td>P86984</td>\n",
+       "      <td>Eukaryota</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>28026 rows × 8 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          ACC    Kingdom  Partition  Peripheral  Transmembrane  LipidAnchor  \\\n",
+       "0      I3R9M8    Archaea          0           1              0            0   \n",
+       "1      I3R9M9    Archaea          1           1              0            0   \n",
+       "2      Q7ZAG8    Archaea          2           1              0            0   \n",
+       "3      Q8PZ67    Archaea          0           1              0            0   \n",
+       "4      Q9YGA6    Archaea          0           1              0            0   \n",
+       "...       ...        ...        ...         ...            ...          ...   \n",
+       "28021  P86949  Eukaryota          0           0              0            0   \n",
+       "28022  P86950  Eukaryota          0           0              0            0   \n",
+       "28023  P86951  Eukaryota          0           0              0            0   \n",
+       "28024  P86983  Eukaryota          3           0              0            0   \n",
+       "28025  P86984  Eukaryota          4           0              0            0   \n",
+       "\n",
+       "       Soluble                                           Sequence  \n",
+       "0            0  MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG...  \n",
+       "1            0  MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR...  \n",
+       "2            0  MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP...  \n",
+       "3            1  MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY...  \n",
+       "4            0  MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL...  \n",
+       "...        ...                                                ...  \n",
+       "28021        1  MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY...  \n",
+       "28022        1  MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG...  \n",
+       "28023        1  MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG...  \n",
+       "28024        1  MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL...  \n",
+       "28025        1  MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL...  \n",
+       "\n",
+       "[28026 rows x 8 columns]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(path + \"/OG_membrane_type_all.csv\")\n",
+    "df = df.drop(columns=['Unnamed: 0'])\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = df[df['Partition'] != 4]\n",
+    "test = df[df['Partition'] == 4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train.to_csv(path + \"/membrane_type_train.csv\", index=False)\n",
+    "test.to_csv(path + \"/membrane_type_test.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}