{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Genotype Differenciation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import scipy.stats as stats\n", "import statsmodels.api as sm\n", "from statsmodels.formula.api import ols\n", "from statsmodels.regression.linear_model import RegressionResultsWrapper\n", "from statsmodels.stats.multicomp import pairwise_tukeyhsd\n", "\n", "from matplotlib.figure import Figure\n", "import seaborn as sns\n", "import panel as pn\n", "\n", "import com_const as cc\n", "import com_func as cf\n", "import com_image as ci" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "warnings.simplefilter(action=\"ignore\", category=UserWarning)\n", "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.set_option(\"display.max_colwidth\", 500)\n", "pd.set_option(\"display.max_columns\", 500)\n", "pd.set_option(\"display.width\", 1000)\n", "pd.set_option(\"display.max_rows\", 20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.set_style(\"whitegrid\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pn.extension(\"ipywidgets\", \"plotly\", design=\"material\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Constants" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stars = [-np.log(0.05), -np.log(0.01), -np.log(0.001), -np.log(0.0001)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_single_progression(\n", " ax,\n", " df,\n", " target,\n", " title: str,\n", " hue=\"gen\",\n", " style=\"gen\",\n", " show_legend: bool = False,\n", "):\n", " lp = sns.lineplot(\n", " df.sort_values(hue),\n", " x=\"dpi\",\n", " y=target,\n", " hue=hue,\n", " markers=True,\n", " style=style,\n", " dashes=False,\n", " palette=\"tab10\",\n", " markersize=12,\n", " ax=ax,\n", " )\n", " lp.set_yticklabels([\"\", \"3\", \"\", \"5\", \"\", \"7\", \"\", \"9\"])\n", " ax.set_title(title)\n", " if show_legend is True:\n", " sns.move_legend(ax, \"upper left\", bbox_to_anchor=(1, 1))\n", " else:\n", " ax.get_legend().set_visible(False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_model(\n", " df: pd.DataFrame, target: str, formula: str, dpi: int = None\n", ") -> RegressionResultsWrapper:\n", " df_ = df[df.dpi == dpi] if dpi is not None else df\n", " return ols(f\"{target} {formula}\", data=df_).fit()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def anova_table(aov, add_columns: bool = True):\n", " \"\"\"\n", " The function below was created specifically for the one-way ANOVA table\n", " results returned for Type II sum of squares\n", " \"\"\"\n", " if add_columns is True:\n", " aov[\"mean_sq\"] = aov[:][\"sum_sq\"] / aov[:][\"df\"]\n", "\n", " aov[\"eta_sq\"] = aov[:-1][\"sum_sq\"] / sum(aov[\"sum_sq\"])\n", "\n", " aov[\"omega_sq\"] = (\n", " aov[:-1][\"sum_sq\"] - (aov[:-1][\"df\"] * aov[\"mean_sq\"][-1])\n", " ) / (sum(aov[\"sum_sq\"]) + aov[\"mean_sq\"][-1])\n", "\n", " cols = [\"sum_sq\", \"df\", \"mean_sq\", \"F\", \"PR(>F)\", \"eta_sq\", \"omega_sq\"]\n", " aov = aov[cols]\n", " return aov" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_assumptions(models: list, titles: list, figsize=(12, 4)):\n", " fig = Figure(figsize=figsize)\n", " fig.suptitle(\"Probability plot of model residual's\", fontsize=\"x-large\")\n", " axii = fig.subplots(1, len(models))\n", " for ax, model, title in zip(axii, models, titles):\n", " _ = stats.probplot(model.resid, plot=ax, rvalue=True)\n", " ax.set_title(title)\n", "\n", " return fig" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def hghlight_rejection(s):\n", " df = pd.DataFrame(columns=s.columns, index=s.index)\n", " df.loc[s[\"reject_pred\"].ne(s[\"reject_obs\"]), [\"group1\", \"group2\"]] = (\n", " \"background: red\"\n", " )\n", " df.loc[s[\"reject_pred\"].eq(s[\"reject_obs\"]), [\"group1\", \"group2\"]] = (\n", " \"background: green\"\n", " )\n", " df.loc[s.reject_pred, [\"reject_pred\"]] = \"background: green\"\n", " df.loc[~s.reject_pred, [\"reject_pred\"]] = \"background: red\"\n", " df.loc[s.reject_obs, [\"reject_obs\"]] = \"background: green\"\n", " df.loc[~s.reject_obs, [\"reject_obs\"]] = \"background: red\"\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_tuckey_df(endog, groups, df_genotypes) -> pd.DataFrame:\n", " tukey = pairwise_tukeyhsd(endog=endog, groups=groups)\n", " df_tuc = pd.DataFrame(tukey._results_table)\n", " df_tuc.columns = [str(c) for c in df_tuc.iloc[0]]\n", " ret = (\n", " df_tuc.drop(df_tuc.index[0])\n", " .assign(group1=lambda s: s.group1.astype(str))\n", " .assign(group2=lambda s: s.group2.astype(str))\n", " .assign(reject=lambda s: s.reject.astype(str) == \"True\")\n", " )\n", " ret[\"p-adj\"] = tukey.pvalues\n", " if df_genotypes is None:\n", " return ret\n", " else:\n", " return (\n", " ret.merge(right=df_genotypes, how=\"left\", left_on=\"group1\", right_on=\"gen\")\n", " .drop([\"gen\"], axis=1)\n", " .rename(columns={\"rpvloci\": \"group1_rpvloci\"})\n", " .merge(right=df_genotypes, how=\"left\", left_on=\"group2\", right_on=\"gen\")\n", " .drop([\"gen\"], axis=1)\n", " .rename(columns={\"rpvloci\": \"group2_rpvloci\"})\n", " )\n", "\n", "\n", "def get_tuckey_compare(df, df_genotypes=None, groups: str = \"gen\"):\n", " merge_on = (\n", " [\"group1\", \"group2\"]\n", " if df_genotypes is None\n", " else [\"group1\", \"group2\", \"group1_rpvloci\", \"group2_rpvloci\"]\n", " )\n", " df_poiv = get_tuckey_df(df.p_oiv, df[groups], df_genotypes=df_genotypes)\n", " df_oiv = get_tuckey_df(df.oiv, df[groups], df_genotypes=df_genotypes)\n", " df = pd.merge(left=df_poiv, right=df_oiv, on=merge_on, suffixes=[\"_pred\", \"_obs\"])\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def df_tukey_cmp_plot(df, groups):\n", " df_tukey = (\n", " get_tuckey_compare(df=df, groups=groups, df_genotypes=None)\n", " .assign(pair_groups=lambda s: s.group1 + \"\\n\" + s.group2)\n", " .sort_values(\"p-adj_obs\")\n", " )\n", "\n", " df_tukey_reject = df_tukey[df_tukey.reject_obs & df_tukey.reject_pred]\n", " df_tukey_accept = df_tukey[~df_tukey.reject_obs & ~df_tukey.reject_pred]\n", " df_tukey_diverge = df_tukey[df_tukey.reject_obs != df_tukey.reject_pred]\n", "\n", " fig = Figure(figsize=(20, 6))\n", " ax_reject, ax_diverge, ax_accept = fig.subplots(\n", " 1,\n", " 3,\n", " gridspec_kw={\n", " \"width_ratios\": [\n", " len(df_tukey_reject),\n", " len(df_tukey_diverge),\n", " len(df_tukey_accept),\n", " ]\n", " },\n", " sharey=True,\n", " )\n", "\n", " for ax in [ax_reject, ax_accept, ax_diverge]:\n", " ax.set_yticks(ticks=stars, labels=[\"*\", \"**\", \"***\", \"****\"])\n", " ax.grid(False)\n", "\n", " ax_reject.set_title(\"Rejected\")\n", " ax_diverge.set_title(\"Conflict\")\n", " ax_accept.set_title(\"Accepted\")\n", "\n", " for ax, df in zip(\n", " [ax_reject, ax_accept, ax_diverge],\n", " [df_tukey_reject, df_tukey_accept, df_tukey_diverge],\n", " ):\n", " for star in stars:\n", " ax.axhline(y=star, linestyle=\"-\", color=\"black\", alpha=0.5)\n", " ax.bar(\n", " x=df[\"pair_groups\"],\n", " height=-np.log(df[\"p-adj_pred\"]),\n", " width=-0.4,\n", " align=\"edge\",\n", " color=\"green\",\n", " label=\"predictions\",\n", " )\n", " ax.bar(\n", " x=df[\"pair_groups\"],\n", " height=-np.log(df[\"p-adj_obs\"]),\n", " width=0.4,\n", " align=\"edge\",\n", " color=\"blue\",\n", " label=\"scorings\",\n", " )\n", " ax.margins(0.01)\n", "\n", " ax_accept.legend(loc=\"upper left\", bbox_to_anchor=[0, 1], ncols=1, fancybox=True)\n", " ax_reject.set_ylabel(\"-log(p value)\")\n", " ax_reject.tick_params(axis=\"y\", which=\"major\", labelsize=16)\n", "\n", " fig.subplots_adjust(wspace=0.05, hspace=0.05)\n", "\n", " return fig" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_patches(df, diff_only: bool = True):\n", " if diff_only is True:\n", " df = df[(df.oiv != df.p_oiv)]\n", " df = df.assign(diff=lambda s: s.oiv != s.p_oiv).sort_values(\n", " [\"diff\", \"oiv\", \"p_oiv\"]\n", " )\n", " return pn.GridBox(\n", " *[\n", " pn.Column(\n", " pn.pane.Markdown(f\"### {row.file_name}|{row.oiv}->p{row.p_oiv}\"),\n", " pn.pane.Image(\n", " object=ci.enhance_pil_image(\n", " image=ci.load_image(\n", " file_name=row.file_name,\n", " path_to_images=cc.path_to_leaf_patches,\n", " ),\n", " brightness=1.5,\n", " )\n", " ),\n", " )\n", " for _, row in df.iterrows()\n", " ],\n", " ncols=len(df),\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = cf.read_dataframe(\n", " path=cc.path_to_data.joinpath(\"genotype_differenciation_dataset.csv\")\n", ").assign(exp=lambda s: s.experiment + s.inoc.astype(str))\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_dpi_6 = df[df.dpi == 6]\n", "df_dpi_6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualizations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = Figure(figsize=(12, 4))\n", "ax_oiv, ax_p_oiv = fig.subplots(nrows=1, ncols=2)\n", "\n", "full_oiv = \"OIV 452-1\"\n", "df_oiv = df.copy()\n", "df_oiv[full_oiv] = df_oiv.oiv\n", "df_p_oiv = df.copy()\n", "df_p_oiv[full_oiv] = df_p_oiv.p_oiv\n", "\n", "var = \"gen\"\n", "\n", "plot_single_progression(\n", " ax=ax_oiv, df=df_oiv, target=full_oiv, title=\"Human scored OIV 452-1\"\n", ")\n", "\n", "plot_single_progression(\n", " ax=ax_p_oiv,\n", " df=df_p_oiv,\n", " target=full_oiv,\n", " title=\"Model predicted OIV 452-1\",\n", " show_legend=True,\n", ")\n", "\n", "fig" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = Figure(figsize=(16, 6))\n", "sns.histplot(\n", " df_dpi_6.sort_values(\"gen\"),\n", " x=\"gen\",\n", " hue=\"gen\",\n", " shrink=0.8,\n", " ax=fig.subplots(1, 1),\n", ")\n", "\n", "fig" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ANOVA" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rpv_formula = f\"~ C(gen) + C(exp) + C(exp):C(gen)\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(\n", " pd.concat(\n", " [\n", " sm.stats.anova_lm(\n", " get_model(df=df, target=\"oiv\", dpi=i, formula=rpv_formula)\n", " ).assign(dpi=i)\n", " for i in sorted(list(df.dpi.unique()))\n", " ]\n", " )\n", " .reset_index()\n", " .set_index(\"dpi\")\n", " .drop(\n", " [\"df\", \"sum_sq\", \"mean_sq\"],\n", " axis=1,\n", " )\n", " .query(\"index != 'Residual'\")\n", " .query(\"index != 'C(exp)'\")\n", " .rename(columns={\"index\": \"source of variation\"})\n", " .replace(\"C(gen)\", \"genotype (between)\")\n", " .replace(\"C(exp):C(gen)\", \"interaction genotype/experiment\")\n", " .reset_index()\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_dpi_6.groupby(\"gen\").agg(\n", " {\"oiv\": [\"mean\", \"std\"], \"p_oiv\": [\"mean\", \"std\"]}\n", ").reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pn.GridBox(\n", " pn.Column(\n", " pn.pane.Markdown(\"### Annotated\"),\n", " anova_table(\n", " sm.stats.anova_lm(\n", " get_model(df=df_dpi_6, target=\"oiv\", dpi=6, formula=rpv_formula),\n", " typ=2,\n", " )\n", " ),\n", " ),\n", " pn.Column(\n", " pn.pane.Markdown(\"### Predicted\"),\n", " anova_table(\n", " sm.stats.anova_lm(\n", " get_model(df=df_dpi_6, target=\"p_oiv\", dpi=6, formula=rpv_formula),\n", " typ=2,\n", " )\n", " ),\n", " ),\n", " ncols=2,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_assumptions(\n", " models=[\n", " get_model(df=df_dpi_6, target=\"oiv\", dpi=6, formula=rpv_formula),\n", " get_model(df=df_dpi_6, target=\"p_oiv\", dpi=6, formula=rpv_formula),\n", " ],\n", " titles=[\"Score OIV 452-1\", \"Predicted OIV 452-1\"],\n", " figsize=(10, 5),\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tukey HSD" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dft = get_tuckey_compare(df=df_dpi_6, groups=\"gen\", df_genotypes=None)\n", "dft.style.apply(hghlight_rejection, axis=None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_tukey_cmp_plot(df=df_dpi_6, groups=\"gen\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_cmp_means = (\n", " (df_dpi_6[df_dpi_6.gen.isin([\"1441s\", \"1466s\"])])\n", " .groupby(\"gen\")\n", " .agg({\"oiv\": [\"mean\", \"std\"], \"p_oiv\": [\"mean\", \"std\"]})\n", " .reset_index()\n", ")\n", "df_cmp_means[\"difference\"] = df_cmp_means.oiv[\"mean\"] - df_cmp_means.p_oiv[\"mean\"]\n", "df_cmp_means" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_patches(df_dpi_6[df_dpi_6.gen.isin([\"1441s\"])], diff_only=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_patches(df_dpi_6[df_dpi_6.gen.isin([\"1466s\"])], diff_only=True)" ] } ], "metadata": { "kernelspec": { "display_name": "env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 2 }