# Genotype Differentiation

## Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import warnings

import numpy as np
import pandas as pd

import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.regression.linear_model import RegressionResultsWrapper
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from matplotlib.figure import Figure
import seaborn as sns
import panel as pn

import com_const as cc
import com_func as cf
import com_image as ci

## Setup

In [None]:
# Silence user and future warnings for the rest of the notebook.
for _category in (UserWarning, FutureWarning):
    warnings.simplefilter(action="ignore", category=_category)

In [None]:
# Display limits applied to every data frame rendered in this notebook.
_display_options = {
    "display.max_colwidth": 500,
    "display.max_columns": 500,
    "display.width": 1000,
    "display.max_rows": 20,
}
for _option, _value in _display_options.items():
    pd.set_option(_option, _value)

In [None]:
# Select seaborn's "whitegrid" style for the plots drawn below.
sns.set_style("whitegrid")

In [None]:
# Load the "ipywidgets" and "plotly" Panel extensions with the "material" design.
pn.extension("ipywidgets", "plotly", design="material")

## Constants

In [None]:
# -log(p) for the significance levels 0.05, 0.01, 0.001 and 0.0001, in that
# order; the Tukey comparison plot uses them as y ticks labelled "*" to "****".
stars = [-np.log(p_value) for p_value in (0.05, 0.01, 0.001, 0.0001)]

## Functions

In [None]:
def plot_single_progression(
    ax,
    df,
    target,
    title: str,
    hue="gen",
    style="gen",
    show_legend: bool = False,
):
    """Draw a line plot of ``target`` against ``dpi`` on the given axes.

    Args:
        ax: Matplotlib axes that receives the plot.
        df: Data frame with a ``dpi`` column, the ``target`` column and the
            columns named by ``hue`` and ``style``.
        target: Name of the column plotted on the y axis.
        title: Title set on ``ax``.
        hue: Column that colours the lines; rows are sorted by it first.
        style: Column that selects the marker of each line.
        show_legend: When ``True`` the legend is moved to the upper left,
            anchored at (1, 1); otherwise the legend is hidden.
    """
    ordered = df.sort_values(hue)
    line_axes = sns.lineplot(
        data=ordered,
        x="dpi",
        y=target,
        hue=hue,
        style=style,
        markers=True,
        dashes=False,
        markersize=12,
        palette="tab10",
        ax=ax,
    )
    # Hard-coded labels: every second tick is labelled (3, 5, 7, 9).
    # NOTE(review): presumably relies on the axis having eight ticks - confirm.
    line_axes.set_yticklabels(["", "3", "", "5", "", "7", "", "9"])
    ax.set_title(title)

    if show_legend is not True:
        ax.get_legend().set_visible(False)
        return
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

In [None]:
def get_model(
    df: pd.DataFrame, target: str, formula: str, dpi: int = None
) -> RegressionResultsWrapper:
    """Fit an ordinary least squares model of ``target`` on ``formula``.

    Args:
        df: Data used to fit the model.
        target: Left-hand side of the model formula.
        formula: Right-hand side of the formula, including the leading ``~``.
        dpi: When given, only rows whose ``dpi`` column equals this value
            are used; ``None`` keeps all rows.

    Returns:
        The fitted model returned by ``ols(...).fit()``.
    """
    subset = df
    if dpi is not None:
        subset = df[df.dpi == dpi]
    full_formula = f"{target} {formula}"
    return ols(full_formula, data=subset).fit()

In [None]:
def anova_table(aov, add_columns: bool = True):
    """Return an ANOVA table extended with mean squares and effect sizes.

    Written for the one-way ANOVA table returned for Type II sums of
    squares: the last row is treated as the residual (error) row.

    Args:
        aov: ANOVA table with at least ``sum_sq`` and ``df`` columns. It is
            not modified; the result is built on a copy.
        add_columns: When true, ``mean_sq``, ``eta_sq`` and ``omega_sq`` are
            computed and added. When false, no column is computed.

    Returns:
        A new data frame with the columns ordered as ``sum_sq``, ``df``,
        ``mean_sq``, ``F``, ``PR(>F)``, ``eta_sq``, ``omega_sq``. With
        ``add_columns`` false, only the columns already present are kept.
        ``eta_sq`` and ``omega_sq`` are NaN on the residual row.
    """
    ordered = ["sum_sq", "df", "mean_sq", "F", "PR(>F)", "eta_sq", "omega_sq"]
    # Work on a copy so the caller's table is not changed in place.
    table = aov.copy()

    if not add_columns:
        return table[[name for name in ordered if name in table.columns]]

    table["mean_sq"] = table["sum_sq"] / table["df"]

    ss_total = table["sum_sq"].sum()
    # Positional access: the residual row is the last one, whatever its label.
    ms_error = table["mean_sq"].iloc[-1]
    effect_ss = table["sum_sq"].iloc[:-1]
    effect_df = table["df"].iloc[:-1]

    # Index alignment leaves the residual row as NaN for both effect sizes.
    table["eta_sq"] = effect_ss / ss_total
    table["omega_sq"] = (effect_ss - effect_df * ms_error) / (ss_total + ms_error)

    return table[ordered]

In [None]:
def plot_assumptions(models: list, titles: list, figsize=(12, 4)):
    """Draw one probability plot of the residuals per model, side by side.

    Args:
        models: Fitted models exposing a ``resid`` attribute.
        titles: Axes titles, paired with ``models`` by position.
        figsize: Size passed to the figure.

    Returns:
        The figure holding one axes per model. With no models, the figure
        only carries its main title.
    """
    fig = Figure(figsize=figsize)
    fig.suptitle("Probability plot of model residual's", fontsize="x-large")
    if not models:
        return fig

    # squeeze=False always yields a 2-D array of axes; without it a single
    # model gives back a bare Axes object that cannot be iterated.
    axes = fig.subplots(1, len(models), squeeze=False).ravel()
    for ax, model, title in zip(axes, models, titles):
        stats.probplot(model.resid, plot=ax, rvalue=True)
        ax.set_title(title)

    return fig

In [None]:
def hghlight_rejection(s):
    """Build the CSS style frame for a Tukey comparison table.

    Args:
        s: Data frame with ``group1``, ``group2`` and the boolean columns
            ``reject_pred`` and ``reject_obs``.

    Returns:
        A data frame shaped like ``s``. ``group1``/``group2`` are green where
        both reject flags agree and red where they differ. Each reject column
        is green where it is true and red where it is false. All other cells
        stay empty (NaN).
    """
    green = "background: green"
    red = "background: red"
    group_cols = ["group1", "group2"]

    styles = pd.DataFrame(columns=s.columns, index=s.index)

    disagree = s["reject_pred"].ne(s["reject_obs"])
    agree = s["reject_pred"].eq(s["reject_obs"])
    styles.loc[disagree, group_cols] = red
    styles.loc[agree, group_cols] = green

    for flag in ("reject_pred", "reject_obs"):
        rejected = s[flag]
        styles.loc[rejected, [flag]] = green
        styles.loc[~rejected, [flag]] = red

    return styles

In [None]:
def get_tuckey_df(endog, groups, df_genotypes) -> pd.DataFrame:
    """Run a pairwise Tukey HSD test and return its result table as a frame.

    Args:
        endog: Values to compare.
        groups: Group label of each value.
        df_genotypes: Optional frame with ``gen`` and ``rpvloci`` columns.
            When given, it is left-joined once per group column and adds
            ``group1_rpvloci`` and ``group2_rpvloci``.

    Returns:
        One row per compared pair. ``group1``/``group2`` are strings,
        ``reject`` is boolean and ``p-adj`` holds the test's p-values.
    """
    hsd = pairwise_tukeyhsd(endog=endog, groups=groups)

    # NOTE(review): reads a private statsmodels attribute - confirm it is
    # stable across the statsmodels versions in use.
    raw = pd.DataFrame(hsd._results_table)
    # The first row of the table carries the column headers.
    raw.columns = [str(cell) for cell in raw.iloc[0]]
    pairs = raw.drop(raw.index[0])
    pairs = pairs.assign(
        group1=pairs.group1.astype(str),
        group2=pairs.group2.astype(str),
        reject=pairs.reject.astype(str) == "True",
    )
    # Replace the table's p-values with the ones exposed by the result object.
    pairs["p-adj"] = hsd.pvalues

    if df_genotypes is None:
        return pairs

    for side, loci_column in (
        ("group1", "group1_rpvloci"),
        ("group2", "group2_rpvloci"),
    ):
        pairs = (
            pairs.merge(right=df_genotypes, how="left", left_on=side, right_on="gen")
            .drop(["gen"], axis=1)
            .rename(columns={"rpvloci": loci_column})
        )
    return pairs


def get_tuckey_compare(df, df_genotypes=None, groups: str = "gen"):
    """Join the Tukey HSD results of ``p_oiv`` and ``oiv`` pair by pair.

    Args:
        df: Data frame with ``p_oiv`` and ``oiv`` columns and the grouping
            column named by ``groups``.
        df_genotypes: Optional genotype frame forwarded to ``get_tuckey_df``.
        groups: Name of the grouping column.

    Returns:
        One row per group pair. Columns from the ``p_oiv`` test carry the
        ``_pred`` suffix and columns from the ``oiv`` test carry ``_obs``.
    """
    join_keys = ["group1", "group2"]
    if df_genotypes is not None:
        join_keys = join_keys + ["group1_rpvloci", "group2_rpvloci"]

    labels = df[groups]
    predicted = get_tuckey_df(df.p_oiv, labels, df_genotypes=df_genotypes)
    observed = get_tuckey_df(df.oiv, labels, df_genotypes=df_genotypes)
    return pd.merge(
        left=predicted,
        right=observed,
        on=join_keys,
        suffixes=["_pred", "_obs"],
    )

In [None]:
def df_tukey_cmp_plot(df, groups):
    """Plot -log(p) of the Tukey tests for predictions and scorings per pair.

    The group pairs are split over three panels that share the y axis:
    pairs rejected by both tests ("Rejected"), pairs where the two tests
    disagree ("Conflict") and pairs rejected by neither ("Accepted"). Panel
    widths follow the number of pairs in each.

    Args:
        df: Data frame passed to ``get_tuckey_compare``.
        groups: Name of the grouping column.

    Returns:
        The figure with the three bar charts.
    """
    pairs = get_tuckey_compare(df=df, groups=groups, df_genotypes=None)
    pairs = pairs.assign(pair_groups=pairs.group1 + "\n" + pairs.group2)
    pairs = pairs.sort_values("p-adj_obs")

    rejected = pairs[pairs.reject_obs & pairs.reject_pred]
    accepted = pairs[~pairs.reject_obs & ~pairs.reject_pred]
    conflicting = pairs[pairs.reject_obs != pairs.reject_pred]

    fig = Figure(figsize=(20, 6))
    panel_widths = [len(rejected), len(conflicting), len(accepted)]
    ax_reject, ax_diverge, ax_accept = fig.subplots(
        1,
        3,
        gridspec_kw={"width_ratios": panel_widths},
        sharey=True,
    )
    panels = [
        (ax_reject, rejected),
        (ax_accept, accepted),
        (ax_diverge, conflicting),
    ]

    # Label the y axis with significance stars instead of numbers.
    for panel_ax, _ in panels:
        panel_ax.set_yticks(ticks=stars, labels=["*", "**", "***", "****"])
        panel_ax.grid(False)

    ax_reject.set_title("Rejected")
    ax_diverge.set_title("Conflict")
    ax_accept.set_title("Accepted")

    # Negative width puts the prediction bar left of the tick and the
    # scoring bar right of it.
    bar_specs = (
        ("p-adj_pred", -0.4, "green", "predictions"),
        ("p-adj_obs", 0.4, "blue", "scorings"),
    )
    for panel_ax, subset in panels:
        for level in stars:
            panel_ax.axhline(y=level, linestyle="-", color="black", alpha=0.5)
        for column, bar_width, colour, legend_label in bar_specs:
            panel_ax.bar(
                x=subset["pair_groups"],
                height=-np.log(subset[column]),
                width=bar_width,
                align="edge",
                color=colour,
                label=legend_label,
            )
        panel_ax.margins(0.01)

    ax_accept.legend(loc="upper left", bbox_to_anchor=[0, 1], ncols=1, fancybox=True)
    ax_reject.set_ylabel("-log(p value)")
    ax_reject.tick_params(axis="y", which="major", labelsize=16)

    fig.subplots_adjust(wspace=0.05, hspace=0.05)

    return fig

In [None]:
def plot_patches(df, diff_only: bool = True):
    """Show the image of every row in a single-row grid.

    Args:
        df: Data frame with ``file_name``, ``oiv`` and ``p_oiv`` columns.
        diff_only: When ``True`` only rows where ``oiv`` differs from
            ``p_oiv`` are shown.

    Returns:
        A Panel grid with one column per row of the (filtered) frame. Each
        cell has a heading with the file name and both scores, above the
        brightened image.
    """
    if diff_only is True:
        df = df[(df.oiv != df.p_oiv)]
    ordered = df.assign(diff=df.oiv != df.p_oiv).sort_values(
        ["diff", "oiv", "p_oiv"]
    )

    def build_card(row):
        heading = pn.pane.Markdown(f"### {row.file_name}|{row.oiv}->p{row.p_oiv}")
        picture = ci.enhance_pil_image(
            image=ci.load_image(
                file_name=row.file_name,
                path_to_images=cc.path_to_leaf_patches,
            ),
            brightness=1.5,
        )
        return pn.Column(heading, pn.pane.Image(object=picture))

    cards = [build_card(row) for _, row in ordered.iterrows()]
    return pn.GridBox(*cards, ncols=len(ordered))

## Load Data

In [None]:
df = cf.read_dataframe(
    path=cc.path_to_data.joinpath("genotype_differenciation_dataset.csv")
)
# "exp" concatenates the experiment with the inoculation, cast to string.
df = df.assign(exp=df.experiment + df.inoc.astype(str))
df

In [None]:
# Keep only the rows recorded at dpi 6.
df_dpi_6 = df.loc[df["dpi"] == 6]
df_dpi_6

## Visualizations

In [None]:
full_oiv = "OIV 452-1"
var = "gen"

# Two copies of the data exposing either score under the same column name,
# so both panels share one y-axis label.
df_oiv = df.assign(**{full_oiv: df.oiv})
df_p_oiv = df.assign(**{full_oiv: df.p_oiv})

fig = Figure(figsize=(12, 4))
ax_oiv, ax_p_oiv = fig.subplots(nrows=1, ncols=2)

# Left panel: human scores, legend hidden.
plot_single_progression(
    ax=ax_oiv,
    df=df_oiv,
    target=full_oiv,
    title="Human scored OIV 452-1",
    show_legend=False,
)
# Right panel: model predictions, legend shown.
plot_single_progression(
    ax=ax_p_oiv,
    df=df_p_oiv,
    target=full_oiv,
    title="Model predicted OIV 452-1",
    show_legend=True,
)

fig

In [None]:
# Number of rows per genotype at dpi 6.
fig = Figure(figsize=(16, 6))
sns.histplot(
    data=df_dpi_6.sort_values("gen"),
    x="gen",
    hue="gen",
    shrink=0.8,
    ax=fig.subplots(nrows=1, ncols=1),
)

fig

## ANOVA

In [None]:
# Right-hand side of the model formula: genotype, experiment and their
# interaction, all treated as categorical terms. A plain string is enough;
# the formula has no placeholders to interpolate.
rpv_formula = "~ C(gen) + C(exp) + C(exp):C(gen)"

In [None]:
# One ANOVA of "oiv" per dpi value, stacked into a single table that keeps
# only the genotype and genotype/experiment interaction rows.
(
    pd.concat(
        [
            sm.stats.anova_lm(
                get_model(df=df, target="oiv", dpi=i, formula=rpv_formula)
            ).assign(dpi=i)
            for i in sorted(df.dpi.unique())
        ]
    )
    .reset_index()
    # After reset_index the source of variation sits in the "index" column.
    .loc[lambda t: ~t["index"].isin(["Residual", "C(exp)"])]
    .set_index("dpi")
    .drop(columns=["df", "sum_sq", "mean_sq"])
    .rename(columns={"index": "source of variation"})
    .replace(
        {
            "C(gen)": "genotype (between)",
            "C(exp):C(gen)": "interaction genotype/experiment",
        }
    )
    .reset_index()
)

In [None]:
# Mean and standard deviation of both scores for every genotype at dpi 6.
df_dpi_6.groupby("gen").agg(
    {column: ["mean", "std"] for column in ("oiv", "p_oiv")}
).reset_index()

In [None]:
# Type II ANOVA tables at dpi 6, shown side by side for both targets.
pn.GridBox(
    *[
        pn.Column(
            pn.pane.Markdown(heading),
            anova_table(
                sm.stats.anova_lm(
                    get_model(df=df_dpi_6, target=target, dpi=6, formula=rpv_formula),
                    typ=2,
                )
            ),
        )
        for heading, target in (
            ("### Annotated", "oiv"),
            ("### Predicted", "p_oiv"),
        )
    ],
    ncols=2,
)

In [None]:
# Probability plots of the residuals of both dpi 6 models.
plot_assumptions(
    models=[
        get_model(df=df_dpi_6, target=target, dpi=6, formula=rpv_formula)
        for target in ("oiv", "p_oiv")
    ],
    titles=["Score OIV 452-1", "Predicted OIV 452-1"],
    figsize=(10, 5),
)

## Tukey HSD

In [None]:
# Pairwise Tukey results for the dpi 6 data, grouped by genotype, without
# the optional genotype lookup table.
dft = get_tuckey_compare(df=df_dpi_6, groups="gen", df_genotypes=None)
# axis=None hands the whole frame to the styling function in one call.
dft.style.apply(hghlight_rejection, axis=None)

In [None]:
# Bar chart of the Tukey results for the dpi 6 data, grouped by genotype.
df_tukey_cmp_plot(df=df_dpi_6, groups="gen")

In [None]:
# Mean and standard deviation of both scores for genotypes 1441s and 1466s.
df_cmp_means = (
    df_dpi_6.loc[df_dpi_6["gen"].isin(["1441s", "1466s"])]
    .groupby("gen")
    .agg({column: ["mean", "std"] for column in ("oiv", "p_oiv")})
    .reset_index()
)
# Mean "oiv" minus mean "p_oiv" for each genotype.
df_cmp_means["difference"] = (
    df_cmp_means[("oiv", "mean")] - df_cmp_means[("p_oiv", "mean")]
)
df_cmp_means

In [None]:
# Patches of genotype 1441s at dpi 6 where "oiv" and "p_oiv" differ.
plot_patches(df_dpi_6[df_dpi_6.gen.isin(["1441s"])], diff_only=True)

In [None]:
# Patches of genotype 1466s at dpi 6 where "oiv" and "p_oiv" differ.
plot_patches(df_dpi_6[df_dpi_6.gen.isin(["1466s"])], diff_only=True)