Add alternative methods comparison examples

Files changed (4) hide show

benchmarking/castle_cell_type_annotation.r +80 -0
benchmarking/prepare_datasplits_for_cell_type_annotation.ipynb +288 -0
benchmarking/randomForest_token_classifier_dosageTF_10k.ipynb +0 -0
benchmarking/scDeepsort_train_predict.ipynb +166 -0

benchmarking/castle_cell_type_annotation.r ADDED Viewed

	@@ -0,0 +1,80 @@

+# Usage: Rscript castle_cell_type_annotation.r organ
+#
+# Trains an XGBoost classifier on binned expression features of the training
+# split for one organ, predicts cell types for the test split, and writes the
+# accuracy and macro F1 to "<rootdir><organ>_castle_results_test.csv".
+#
+# parse ordered arguments
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) < 1) {
+  stop("Usage: Rscript castle_cell_type_annotation.r organ", call. = FALSE)
+}
+organ <- args[1]
+suppressPackageStartupMessages(library(scater))
+suppressPackageStartupMessages(library(xgboost))
+suppressPackageStartupMessages(library(igraph))
+# bin edges for discretizing expression: (-1,0], (0,1], (1,6], (6,Inf]
+BREAKS <- c(-1, 0, 1, 6, Inf)
+# number of top-ranked features taken from each of the two rankings below
+nFeatures <- 100
+# fixed seed so the random training subset drawn below is reproducible
+set.seed(1)
+print(paste0("Training ", organ))
+# import training and test data
+# NOTE(review): the data-split notebook saves these files under
+# "<rootdir><organ>/"; make sure rootdir points at the directory holding them.
+rootdir <- "/path/to/data/"
+# count tables are transposed on load so that rows line up with the label rows
+train_counts <- t(as.matrix(read.csv(file = paste0(rootdir, organ, "_filtered_data_train.csv"), row.names = 1)))
+test_counts <- t(as.matrix(read.csv(file = paste0(rootdir, organ, "_filtered_data_test.csv"), row.names = 1)))
+train_celltype <- as.matrix(read.csv(file = paste0(rootdir, organ, "_filtered_celltype_train.csv")))
+test_celltype <- as.matrix(read.csv(file = paste0(rootdir, organ, "_filtered_celltype_test.csv")))
+# select features
+sourceCellTypes <- as.factor(train_celltype[, "Cell_type"])
+ds <- rbind(train_counts, test_counts)
+ds[is.na(ds)] <- 0
+isSource <- c(rep(TRUE, nrow(train_counts)), rep(FALSE, nrow(test_counts)))
+# feature selection only looks at the training rows
+sourceCounts <- ds[isSource, , drop = FALSE]
+# ranking 1: mean expression; ranking 2: normalized mutual information
+# between the binned expression and the cell type labels
+topFeaturesAvg <- colnames(sourceCounts)[order(apply(sourceCounts, 2, mean), decreasing = TRUE)]
+topFeaturesMi <- names(sort(apply(sourceCounts, 2, function(x) {
+  compare(cut(x, breaks = BREAKS), sourceCellTypes, method = "nmi")
+}), decreasing = TRUE))
+selectedFeatures <- union(head(topFeaturesAvg, nFeatures), head(topFeaturesMi, nFeatures))
+# NOTE(review): the diagonal and upper triangle are zeroed, so every column
+# contains a value < 0.9 and any() is always TRUE; this filter therefore keeps
+# all features. all(x < 0.9) may have been intended. Left unchanged so that
+# benchmark results stay comparable.
+tmp <- cor(sourceCounts[, selectedFeatures], method = "pearson")
+tmp[!lower.tri(tmp)] <- 0
+selectedFeatures <- selectedFeatures[apply(tmp, 2, function(x) any(x < 0.9))]
+remove(tmp, sourceCounts)
+# bin expression values and expand features by bins
+dsBins <- apply(ds[, selectedFeatures], 2, cut, breaks = BREAKS)
+nUniq <- apply(dsBins, 2, function(x) { length(unique(x)) })
+# features with a single bin carry no information and are dropped
+ds <- model.matrix(~ ., as.data.frame(dsBins[, nUniq > 1]))
+remove(dsBins, nUniq)
+# train model on a random ~80% subset of the training rows
+train <- runif(sum(isSource)) < 0.8
+trainData <- ds[isSource, , drop = FALSE][train, , drop = FALSE]
+# class ids are the zero-based positions of the training factor levels
+trainLabels <- as.numeric(sourceCellTypes[train]) - 1
+nClasses <- nlevels(sourceCellTypes)
+isMulticlass <- nClasses > 2
+# slightly different setup for multiclass and binary classification
+if (isMulticlass) {
+  xg <- xgboost(data = trainData, label = trainLabels,
+                objective = "multi:softmax", num_class = nClasses,
+                eta = 0.7, nthread = 5, nrounds = 20, verbose = 0,
+                gamma = 0.001, max_depth = 5, min_child_weight = 10)
+} else {
+  # explicit logistic objective so predictions are class probabilities
+  xg <- xgboost(data = trainData, label = trainLabels,
+                objective = "binary:logistic",
+                eta = 0.7, nthread = 5, nrounds = 20, verbose = 0,
+                gamma = 0.001, max_depth = 5, min_child_weight = 10)
+}
+# validate model
+predictedClasses <- predict(xg, ds[!isSource, , drop = FALSE])
+if (!isMulticlass) {
+  # turn probabilities into 0/1 class ids before building the confusion matrix
+  predictedClasses <- as.numeric(predictedClasses > 0.5)
+}
+# encode test labels with the training levels so class ids agree with the model
+testCellTypes <- factor(test_celltype[, "Cell_type"], levels = levels(sourceCellTypes))
+if (anyNA(testCellTypes)) {
+  stop("Test data contains cell types absent from the training data", call. = FALSE)
+}
+trueClasses <- as.numeric(testCellTypes) - 1
+# shared levels keep the confusion matrix square even if a class is never predicted
+classIds <- seq_len(nClasses) - 1
+cm <- as.matrix(table(Actual = factor(trueClasses, levels = classIds),
+                      Predicted = factor(predictedClasses, levels = classIds)))
+n <- sum(cm)
+correct <- diag(cm) # number of correctly classified instances per class
+rowsums <- rowSums(cm) # number of instances per class
+colsums <- colSums(cm) # number of predictions per class
+accuracy <- sum(correct) / n
+# per-class F1 written as 2*TP / (actual + predicted), which equals
+# 2*P*R / (P + R) but is 0 instead of NaN for a class that is never predicted;
+# classes with neither actual nor predicted instances are left out of the mean
+scored <- (rowsums + colsums) > 0
+f1 <- 2 * correct[scored] / (rowsums[scored] + colsums[scored])
+macroF1 <- mean(f1)
+print(paste0(organ, " accuracy: ", accuracy))
+print(paste0(organ, " macroF1: ", macroF1))
+results_df <- data.frame(Accuracy = accuracy, macroF1 = macroF1)
+write.csv(results_df, paste0(rootdir, organ, "_castle_results_test.csv"), row.names = FALSE)

benchmarking/prepare_datasplits_for_cell_type_annotation.ipynb ADDED Viewed

	@@ -0,0 +1,288 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "25107132",
+   "metadata": {},
+   "source": [
+    "### Preparing train and test data splits for cell type annotation application"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "83d8d249-affe-45dd-915e-992b4b35b31a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from tqdm.notebook import tqdm\n",
+    "from collections import Counter\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e3e6a2bf-44c8-4164-9ecd-1686230ea8be",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['pancreas',\n",
+       " 'liver',\n",
+       " 'blood',\n",
+       " 'lung',\n",
+       " 'spleen',\n",
+       " 'placenta',\n",
+       " 'colorectum',\n",
+       " 'kidney',\n",
+       " 'brain']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "rootdir = \"/path/to/data/\"\n",
+    "\n",
+    "# tissue panel: every sub-directory of rootdir whose name lacks \"results\"\n",
+    "dir_list = [\n",
+    "    entry\n",
+    "    for entry in os.listdir(rootdir)\n",
+    "    if \"results\" not in entry and os.path.isdir(os.path.join(rootdir, entry))\n",
+    "]\n",
+    "dir_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "0b205eec-a518-472a-ab90-dd63ef9803cd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filter_pass</th>\n",
+       "      <th>original_cell_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>C_1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>C_2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>C_3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>C_4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>C_5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9590</th>\n",
+       "      <td>1</td>\n",
+       "      <td>C_9591</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9591</th>\n",
+       "      <td>1</td>\n",
+       "      <td>C_9592</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9592</th>\n",
+       "      <td>1</td>\n",
+       "      <td>C_9593</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9593</th>\n",
+       "      <td>1</td>\n",
+       "      <td>C_9594</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9594</th>\n",
+       "      <td>1</td>\n",
+       "      <td>C_9595</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>9595 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      filter_pass original_cell_id\n",
+       "0               0              C_1\n",
+       "1               1              C_2\n",
+       "2               0              C_3\n",
+       "3               1              C_4\n",
+       "4               0              C_5\n",
+       "...           ...              ...\n",
+       "9590            1           C_9591\n",
+       "9591            1           C_9592\n",
+       "9592            1           C_9593\n",
+       "9593            1           C_9594\n",
+       "9594            1           C_9595\n",
+       "\n",
+       "[9595 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# filter_dict holds, per sample, which cell barcodes passed the QC filtering\n",
+    "# applied by Geneformer, so that every method is compared on the same cells\n",
+    "filter_dict_path = f\"{rootdir}deepsort_filter_dict.pickle\"\n",
+    "with open(filter_dict_path, \"rb\") as pickle_file:\n",
+    "    filter_dict = pickle.load(pickle_file)\n",
+    "\n",
+    "# for example:\n",
+    "filter_dict[\"human_Placenta9595_data\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "207e3571-0236-4493-83b3-a89b67b16cb2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "for dir_name in tqdm(dir_list):\n",
+    "\n",
+    "    df = pd.DataFrame()\n",
+    "    ct_df = pd.DataFrame(columns=[\"Cell\",\"Cell_type\"])\n",
+    "\n",
+    "    subrootdir = f\"{rootdir}{dir_name}/\"\n",
+    "    for subdir, dirs, files in os.walk(subrootdir):\n",
+    "        # i (position of the file in the listing) is prefixed to every cell id\n",
+    "        # so that ids coming from different sample files do not collide\n",
+    "        for i, file in enumerate(files):\n",
+    "            if not file.endswith(\"_data.csv\"):\n",
+    "                continue\n",
+    "            file_prefix = file.replace(\"_data.csv\",\"\")\n",
+    "            sample_prefix = file.replace(\".csv\",\"\")\n",
+    "            filter_df = filter_dict[sample_prefix]\n",
+    "            sample_to_analyze = list(filter_df[filter_df[\"filter_pass\"]==1][\"original_cell_id\"])\n",
+    "\n",
+    "            # collect data for each tissue, keeping only cells that passed the filter\n",
+    "            df_i = pd.read_csv(f\"{subrootdir}{file}\", index_col=0)\n",
+    "            df_i = df_i[sample_to_analyze]\n",
+    "            df_i.columns = [f\"{i}_{cell_id}\" for cell_id in df_i.columns]\n",
+    "            df = pd.concat([df,df_i],axis=1)\n",
+    "\n",
+    "            # collect cell type metadata\n",
+    "            ct_df_i = pd.read_csv(f\"{subrootdir}{file_prefix}_celltype.csv\", index_col=0)\n",
+    "            ct_df_i.columns = [\"Cell\",\"Cell_type\"]\n",
+    "            ct_df_i[\"Cell\"] = [f\"{i}_{cell_id}\" for cell_id in ct_df_i[\"Cell\"]]\n",
+    "            ct_df = pd.concat([ct_df,ct_df_i],axis=0)\n",
+    "\n",
+    "    # per published scDeepsort method, filter data for cell types >0.5% of data\n",
+    "    ct_counts = Counter(ct_df[\"Cell_type\"])\n",
+    "    total_count = sum(ct_counts.values())\n",
+    "    nonrare_cell_types = [cell_type for cell_type,count in ct_counts.items() if count>(total_count*0.005)]\n",
+    "    nonrare_cells = list(ct_df[ct_df[\"Cell_type\"].isin(nonrare_cell_types)][\"Cell\"])\n",
+    "    df = df[df.columns.intersection(nonrare_cells)]\n",
+    "\n",
+    "    # split into 80/20 train/test data (cells are columns, hence the transposes);\n",
+    "    # the seed is fixed so that re-running the notebook reproduces the same split\n",
+    "    train, test = train_test_split(df.T, test_size=0.2, random_state=0)\n",
+    "    train = train.T\n",
+    "    test = test.T\n",
+    "\n",
+    "    # save filtered train/test data\n",
+    "    train.to_csv(f\"{subrootdir}{dir_name}_filtered_data_train.csv\")\n",
+    "    test.to_csv(f\"{subrootdir}{dir_name}_filtered_data_test.csv\")\n",
+    "\n",
+    "    # split metadata into train/test data; .copy() so the helper column below\n",
+    "    # is added to an independent frame rather than to a slice of ct_df\n",
+    "    ct_df_train = ct_df[ct_df[\"Cell\"].isin(list(train.columns))].copy()\n",
+    "    ct_df_test = ct_df[ct_df[\"Cell\"].isin(list(test.columns))].copy()\n",
+    "\n",
+    "    # reorder the metadata rows to follow the column order of the data files\n",
+    "    train_order_dict = {cell_id: position for position, cell_id in enumerate(train.columns)}\n",
+    "    test_order_dict = {cell_id: position for position, cell_id in enumerate(test.columns)}\n",
+    "    ct_df_train[\"order\"] = [train_order_dict[cell_id] for cell_id in ct_df_train[\"Cell\"]]\n",
+    "    ct_df_test[\"order\"] = [test_order_dict[cell_id] for cell_id in ct_df_test[\"Cell\"]]\n",
+    "    ct_df_train = ct_df_train.sort_values(\"order\").drop(\"order\",axis=1)\n",
+    "    ct_df_test = ct_df_test.sort_values(\"order\").drop(\"order\",axis=1)\n",
+    "    assert list(ct_df_train[\"Cell\"]) == list(train.columns)\n",
+    "    assert list(ct_df_test[\"Cell\"]) == list(test.columns)\n",
+    "\n",
+    "    # every cell type must occur in both splits\n",
+    "    train_labels = set(ct_df_train[\"Cell_type\"])\n",
+    "    test_labels = set(ct_df_test[\"Cell_type\"])\n",
+    "    assert train_labels == test_labels\n",
+    "\n",
+    "    # save train/test cell type annotations\n",
+    "    ct_df_train.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")\n",
+    "    ct_df_test.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.6 64-bit ('3.8.6')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

benchmarking/randomForest_token_classifier_dosageTF_10k.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

benchmarking/scDeepsort_train_predict.ipynb ADDED Viewed

	@@ -0,0 +1,166 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "83d8d249-affe-45dd-915e-992b4b35b31a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import deepsort\n",
+    "from sklearn.metrics import accuracy_score, f1_score\n",
+    "from tqdm.notebook import tqdm\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "25de46ec-8a41-484d-8e14-d2b19768fc2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_metrics(labels, preds):\n",
+    "    \"\"\"Return sklearn's accuracy and macro-averaged F1 of preds against labels as a dict.\"\"\"\n",
+    "    return {\n",
+    "        'accuracy': accuracy_score(labels, preds),\n",
+    "        'macro_f1': f1_score(labels, preds, average='macro'),\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a4029b2b-afca-4300-82a2-082fec59f191",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['pancreas',\n",
+       " 'liver',\n",
+       " 'blood',\n",
+       " 'lung',\n",
+       " 'spleen',\n",
+       " 'placenta',\n",
+       " 'colorectum',\n",
+       " 'kidney',\n",
+       " 'brain']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "rootdir = \"/path/to/data/\"\n",
+    "\n",
+    "# tissues to benchmark: sub-directories of rootdir, skipping any whose name contains \"results\"\n",
+    "dir_list = []\n",
+    "for candidate in os.listdir(rootdir):\n",
+    "    if \"results\" in candidate:\n",
+    "        continue\n",
+    "    if os.path.isdir(os.path.join(rootdir, candidate)):\n",
+    "        dir_list.append(candidate)\n",
+    "dir_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ddcdc5cd-871e-4fd2-8457-18d3049fa76c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "output_dir = \"results_EDefault_filtered\"\n",
+    "n_epochs = \"Default\"  # scDeepsort default epochs = 300; only used to name the output files\n",
+    "\n",
+    "results_dict = dict()\n",
+    "for dir_name in tqdm(dir_list):\n",
+    "    print(f\"TRAINING: {dir_name}\")\n",
+    "    subrootdir = f\"{rootdir}{dir_name}/\"\n",
+    "    train_files = [(f\"{subrootdir}{dir_name}_filtered_data_train.csv\",f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")]\n",
+    "    test_file = f\"{subrootdir}{dir_name}_filtered_data_test.csv\"\n",
+    "    label_file = f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\"\n",
+    "\n",
+    "    # define the model\n",
+    "    model = deepsort.DeepSortClassifier(species='human',\n",
+    "                               tissue=dir_name,\n",
+    "                               gpu_id=0,\n",
+    "                               random_seed=1,\n",
+    "                               validation_fraction=0)  # use all training data (already held out 20% in test data file)\n",
+    "\n",
+    "    # fit the model\n",
+    "    model.fit(train_files, save_path=f\"{subrootdir}{output_dir}\")\n",
+    "\n",
+    "    # use the saved model to predict cell types in test data\n",
+    "    model.predict(input_file=test_file,\n",
+    "                   model_path=f\"{subrootdir}{output_dir}\",\n",
+    "                   save_path=f\"{subrootdir}{output_dir}\",\n",
+    "                   unsure_rate=0,\n",
+    "                   file_type='csv')\n",
+    "    labels_df = pd.read_csv(label_file)\n",
+    "    preds_df = pd.read_csv(f\"{subrootdir}{output_dir}/human_{dir_name}_{dir_name}_filtered_data_test.csv\")\n",
+    "\n",
+    "    # predictions must be listed in the same cell order as the labels\n",
+    "    label_cell_ids = labels_df[\"Cell\"]\n",
+    "    pred_cell_ids = preds_df[\"index\"]\n",
+    "    assert list(label_cell_ids) == list(pred_cell_ids)\n",
+    "    labels = list(labels_df[\"Cell_type\"])\n",
+    "\n",
+    "    # always score the cell_type column so that results is defined for every tissue\n",
+    "    results = compute_metrics(labels, list(preds_df[\"cell_type\"]))\n",
+    "\n",
+    "    # when a subtype was reported (first entry is not NaN), score the cell_subtype\n",
+    "    # column as well and keep whichever column gives the higher accuracy\n",
+    "    if not pd.isna(preds_df[\"cell_subtype\"][0]):\n",
+    "        subtype_results = compute_metrics(labels, list(preds_df[\"cell_subtype\"]))\n",
+    "        if subtype_results[\"accuracy\"] > results[\"accuracy\"]:\n",
+    "            results = subtype_results\n",
+    "\n",
+    "    print(f\"{dir_name}: {results}\")\n",
+    "    results_dict[dir_name] = results\n",
+    "    with open(f\"{subrootdir}deepsort_E{n_epochs}_filtered_pred_{dir_name}.pickle\", \"wb\") as output_file:\n",
+    "        pickle.dump(results, output_file)\n",
+    "\n",
+    "# save results\n",
+    "with open(f\"{rootdir}deepsort_E{n_epochs}_filtered_pred_dict.pickle\", \"wb\") as output_file:\n",
+    "    pickle.dump(results_dict, output_file)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.6 64-bit ('3.8.6')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}