# virus_explorer / app.py
# Hack90's picture
# Update app.py
# a17ec79 verified
# raw
# history blame
# 42.7 kB
from shiny import render
from shiny.express import input, ui
from datasets import load_dataset
import pandas as pd
from pathlib import Path
import matplotlib
import numpy as np
import gradio as gr
from shiny.express import input, output, render, ui
############################################################# 2D Line Plot ########################################################
### dvq stuff, obvs this will just be an import in the final version
from typing import Dict, Optional
from collections import namedtuple
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as mplstyle
from pathlib import Path
from shiny import render
from shiny.express import input, ui
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
import numpy as np
# Displacement vector applied for each nucleotide in the 2D line walk.
# A/T/G/C sit on the unit circle; 'N' (and anything unknown) does not move.
_HALF = 0.5
_SQRT3_OVER_2 = 0.8660254037844386
mapping_easy = {
    'A': np.array([_HALF, -_SQRT3_OVER_2]),
    'T': np.array([_HALF, _SQRT3_OVER_2]),
    'G': np.array([_SQRT3_OVER_2, -_HALF]),
    'C': np.array([_SQRT3_OVER_2, _HALF]),
    'N': np.array([0, 0]),
}

# A point (x, y) in the plane
Coord = namedtuple("Coord", ["x", "y"])

# CGR state: N nucleotides consumed so far plus the current (x, y) position
CGRCoords = namedtuple("CGRCoords", ["N", "x", "y"])

# Corner of the square [-1, 1]^2 assigned to each nucleotide
DEFAULT_COORDS = {
    "A": Coord(1, 1),
    "C": Coord(-1, 1),
    "G": Coord(-1, -1),
    "T": Coord(1, -1),
}
# Function to convert a DNA sequence to a list of coordinates
def _dna_to_coordinates(dna_sequence, mapping):
dna_sequence = dna_sequence.upper()
coordinates = np.array([mapping.get(nucleotide, mapping['N']) for nucleotide in dna_sequence])
return coordinates
# Function to create the cumulative sum of a list of coordinates
def _get_cumulative_coords(mapped_coords):
cumulative_coords = np.cumsum(mapped_coords, axis=0)
return cumulative_coords
# Draw the 2D walk of one or more DNA sequences on a single figure
def plot_2d_sequences(dna_sequences, mapping=mapping_easy, single_sequence=False):
    """Plot the cumulative 2D walk of each sequence on shared axes.

    Parameters
    ----------
    dna_sequences
        An iterable of DNA strings, or a single DNA string when
        ``single_sequence`` is true.
    mapping
        Nucleotide -> 2D vector lookup used for the walk.
    single_sequence
        Treat ``dna_sequences`` as one sequence rather than a collection.

    Returns
    -------
    The matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots()
    sequences = [dna_sequences] if single_sequence else dna_sequences
    for sequence in sequences:
        walk = _get_cumulative_coords(_dna_to_coordinates(sequence, mapping))
        ax.plot(*walk.T)
    return fig
# Draw groups of DNA sequences, one colour per group
def plot_2d_comparison(dna_sequences_grouped, labels, mapping=mapping_easy):
    """Plot the 2D walks of several groups of sequences, coloured by group.

    Parameters
    ----------
    dna_sequences_grouped
        Sized iterable of groups; each group is an iterable of DNA strings.
    labels
        Legend label for each group, indexed by group position.
    mapping
        Nucleotide -> 2D vector lookup used for the walk.

    Returns
    -------
    The matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots()
    palette = plt.cm.rainbow(np.linspace(0, 1, len(dna_sequences_grouped)))
    for group_idx, (group, group_color) in enumerate(zip(dna_sequences_grouped, palette)):
        for sequence in group:
            walk = _get_cumulative_coords(_dna_to_coordinates(sequence, mapping))
            ax.plot(*walk.T, color=group_color, label=labels[group_idx])
    # Every line carries its group's label; keep one legend entry per label
    handles, legend_labels = ax.get_legend_handles_labels()
    unique_entries = dict(zip(legend_labels, handles))
    ax.legend(unique_entries.values(), unique_entries.keys())
    return fig
############################################################# Virus Dataset ########################################################
#ds = load_dataset('Hack90/virus_tiny')
# Load the sequence table once at import time; the selector choices map each
# organism name to itself.
df = pd.read_parquet('virus_ds.parquet')
virus = {name: name for name in df['Organism_Name'].unique()}
############################################################# Filter and Select ########################################################
def filter_and_select(group):
    """Return the first three rows of ``group``, or ``None`` if it has fewer than three."""
    if len(group) < 3:
        return None
    return group.head(3)
############################################################# Wens Method ########################################################
import numpy as np
# Weight for each 4-character neighbour-match pattern built in
# _calculate_weighting_full; patterns not listed here get weight 0 there.
WEIGHTS = {
    '0100': 1/6,
    '0101': 2/6,
    '1100': 3/6,
    '0110': 3/6,
    '1101': 4/6,
    '1110': 5/6,
    '0111': 5/6,
    '1111': 6/6,
}
# Default sequence length used by the coordinate and weighting helpers
LOWEST_LENGTH = 5000
def _get_subsequences(sequence):
return {nuc: [i+1 for i, x in enumerate(sequence) if x == nuc] for nuc in 'ACTG'}
def _calculate_coordinates_fixed(subsequence, L=LOWEST_LENGTH):
    """Convert 1-based positions into ``(theta, rho)`` pairs.

    For each position ``K``, ``theta = 2*pi/(L-1) * (K-1)`` and
    ``rho = sqrt(theta)``.
    """
    def _angle(K):
        return (2 * np.pi / (L - 1)) * (K - 1)

    coordinates = []
    for K in subsequence:
        coordinates.append((_angle(K), np.sqrt(_angle(K))))
    return coordinates
def _calculate_weighting_full(sequence, WEIGHTS, L=LOWEST_LENGTH, E=0.0375):
    """Compute one weight per position from local neighbour matches.

    For each interior index ``i`` that has two following symbols, a
    4-character pattern is built: whether ``sequence[i-1]``, ``sequence[i+1]``
    and ``sequence[i+2]`` equal ``sequence[i]`` ('1'/'0'), with a fixed '1' in
    second place for the position itself. The pattern is looked up in
    ``WEIGHTS`` (0 when absent) and scaled by ``E`` when ``i > L``. The first
    index, the last index and the last interior index get weight 0.

    Returns the list of weights.
    """
    length = len(sequence)
    weightings = [0]
    for i in range(1, length - 1):
        weight = 0
        if i < length - 2:
            previous, centre, following, second_following = sequence[i - 1:i + 3]
            matches = (previous == centre, True, following == centre, second_following == centre)
            pattern = "".join('1' if matched else '0' for matched in matches)
            weight = WEIGHTS.get(pattern, 0)
            if i > L:
                weight = weight * E
        weightings.append(weight)
    weightings.append(0)
    return weightings
def _centre_of_mass(polar_coordinates, weightings):
    """Return the weighted sum used as the (un-normalised) moment.

    Each point contributes
    ``w * ((x - x*w)**2 + (y - y*w)**2)`` with ``w = weightings[i]``.
    """
    xs, ys = _calculate_standard_coordinates(polar_coordinates)
    total = 0
    # NOTE(review): weightings[i] is indexed by the point's rank within this
    # list, not by its position in the full sequence -- confirm this is intended.
    for i in range(len(xs)):
        w = weightings[i]
        total += w * ((xs[i] - (xs[i] * w)) ** 2 + (ys[i] - ys[i] * w) ** 2)
    return total
def _normalised_moment_of_inertia(polar_coordinates, weightings):
    """Return ``sqrt(moment / sum(weightings))`` for one set of points."""
    moment = _centre_of_mass(polar_coordinates, weightings)
    total_weight = sum(weightings)
    return np.sqrt(moment / total_weight)
def _calculate_standard_coordinates(polar_coordinates):
return [rho * np.cos(theta) for theta, rho in polar_coordinates], [rho * np.sin(theta) for theta, rho in polar_coordinates]
def _moments_of_inertia(polar_coordinates, weightings):
    """Return one normalised moment per entry of ``polar_coordinates``.

    ``polar_coordinates`` is a dict whose values are lists of
    ``(theta, rho)`` pairs; results follow the dict's iteration order.
    """
    moments = []
    for points in polar_coordinates.values():
        moments.append(_normalised_moment_of_inertia(points, weightings))
    return moments
def moment_of_inertia(sequence, WEIGHTS, L=5000, E=0.0375):
    """Return the per-nucleotide normalised moments for ``sequence``.

    Polar coordinates are scaled by ``len(sequence)``; ``L`` and ``E`` are
    only forwarded to the weighting step.
    """
    polar_coordinates = {}
    for nucleotide, positions in _get_subsequences(sequence).items():
        polar_coordinates[nucleotide] = _calculate_coordinates_fixed(positions, len(sequence))
    weightings = _calculate_weighting_full(sequence, WEIGHTS, L=L, E=E)
    return _moments_of_inertia(polar_coordinates, weightings)
def similarity_wen(sequence1, sequence2, WEIGHTS, L=5000, E=0.0375):
    """Return the Euclidean distance between two sequences' moment vectors.

    The ``L`` argument is not used as passed: it is replaced by the length of
    the shorter sequence before the moments are computed.
    """
    L = min(len(sequence1), len(sequence2))
    moments1 = moment_of_inertia(sequence1, WEIGHTS, L=L, E=E)
    moments2 = moment_of_inertia(sequence2, WEIGHTS, L=L, E=E)
    squared_distance = sum((a - b) ** 2 for a, b in zip(moments1, moments2))
    return np.sqrt(squared_distance)
def heatmap(data, row_labels, col_labels, ax=None,
            cbar_kw=None, cbarlabel="", **kwargs):
    """
    Draw a labelled heatmap with a colorbar.

    Parameters
    ----------
    data
        2D array of shape (M, N) to display.
    row_labels
        M labels for the rows.
    col_labels
        N labels for the columns.
    ax
        Axes to draw on; the current axes are used when omitted.
    cbar_kw
        Extra keyword arguments for the figure's ``colorbar`` call.
    cbarlabel
        Text shown alongside the colorbar.
    **kwargs
        Passed through to ``imshow``.

    Returns
    -------
    im, cbar
        The image returned by ``imshow`` and the colorbar attached to it.
    """
    ax = plt.gca() if ax is None else ax
    cbar_kw = {} if cbar_kw is None else cbar_kw

    # Image and its colorbar
    im = ax.imshow(data, **kwargs)
    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")

    n_rows, n_cols = data.shape[0], data.shape[1]

    # One labelled major tick per column / row
    ax.set_xticks(np.arange(n_cols), labels=col_labels)
    ax.set_yticks(np.arange(n_rows), labels=row_labels)

    # Column labels go along the top edge, rotated and anchored at their end
    ax.tick_params(top=True, bottom=False,
                   labeltop=True, labelbottom=False)
    plt.setp(ax.get_xticklabels(), rotation=-30, ha="right",
             rotation_mode="anchor")

    # Hide the frame and use minor ticks between cells to draw a white grid
    ax.spines[:].set_visible(False)
    ax.set_xticks(np.arange(n_cols + 1) - .5, minor=True)
    ax.set_yticks(np.arange(n_rows + 1) - .5, minor=True)
    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
    ax.tick_params(which="minor", bottom=False, left=False)

    return im, cbar
def annotate_heatmap(im, data=None, valfmt="{x:.2f}",
                     textcolors=("black", "white"),
                     threshold=None, **textkw):
    """
    Write the value of every heatmap cell on top of the image.

    Parameters
    ----------
    im
        The AxesImage to be labeled.
    data
        Values to print. A list is converted to a NumPy array; anything that
        is neither a list nor an array (including ``None``) is replaced by the
        image's own data.
    valfmt
        Either a ``str.format`` style string using ``x`` (e.g. "$ {x:.2f}") or
        a `matplotlib.ticker.Formatter`.
    textcolors
        Pair of colors: the first for values at or below the threshold, the
        second for values above it.
    threshold
        Value in data units separating the two text colors. When None, half of
        the normalised maximum of ``data`` is used.
    **textkw
        Forwarded to every ``text`` call; may override the default centering.

    Returns
    -------
    list
        The created ``Text`` objects, in row-major order.
    """
    if isinstance(data, list):
        # Lists pass the type check below but have no .max()/.shape and do
        # not support data[i, j], so convert them up front.
        data = np.asarray(data)
    elif not isinstance(data, np.ndarray):
        data = im.get_array()

    # Express the threshold in the image's normalised color range
    if threshold is not None:
        threshold = im.norm(threshold)
    else:
        threshold = im.norm(data.max())/2.

    # Centered text by default; caller-supplied options win
    kw = dict(horizontalalignment="center",
              verticalalignment="center")
    kw.update(textkw)

    # Accept a plain format string as well as a Formatter
    if isinstance(valfmt, str):
        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

    # One Text per cell, colored by which side of the threshold it falls on
    texts = []
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
            text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
            texts.append(text)

    return texts
def wens_method_heatmap(df, virus_species):
    """Plot a heatmap of pairwise ``similarity_wen`` values between species.

    Parameters
    ----------
    df
        DataFrame with 'Organism_Name' and 'Sequence' columns.
    virus_species
        Species names to compare; used for both axes.

    Off-diagonal cells compare the first sequence of each species. Diagonal
    cells compare a species' first sequence with its second; when a species
    has only one sequence it is compared with itself instead of raising
    IndexError.

    Returns
    -------
    The matplotlib figure holding the heatmap.
    """
    # Look up each species' sequences once rather than re-filtering per pair
    sequences_by_species = {
        name: df.loc[df['Organism_Name'] == name, 'Sequence'].values
        for name in virus_species
    }

    similarity_df = pd.DataFrame(index=virus_species, columns=virus_species)
    for virus1 in virus_species:
        sequence1 = sequences_by_species[virus1][0]
        for virus2 in virus_species:
            candidates = sequences_by_species[virus2]
            if virus1 == virus2 and len(candidates) > 1:
                sequence2 = candidates[1]
            else:
                sequence2 = candidates[0]
            similarity_df.loc[virus1, virus2] = similarity_wen(sequence1, sequence2, WEIGHTS)

    # The frame was created with object dtype; imshow needs numbers
    similarity_df = similarity_df.apply(pd.to_numeric)
    # Optional: Handle NaN values if your similarity computation might result in them
    # similarity_df.fillna(0, inplace=True)

    fig, ax = plt.subplots()
    im = ax.imshow(similarity_df, cmap="YlGn")
    ax.set_xticks(np.arange(len(virus_species)), labels=virus_species)
    ax.set_yticks(np.arange(len(virus_species)), labels=virus_species)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    cbar = ax.figure.colorbar(im, ax=ax)
    cbar.ax.set_ylabel("Similarity", rotation=-90, va="bottom")
    return fig
############################################################# ColorSquare ########################################################
import math
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
def _fill_spiral(matrix, seq_colors, k):
left, top, right, bottom = 0, 0, k-1, k-1
index = 0
while left <= right and top <= bottom:
for i in range(left, right + 1): # Top row
if index < len(seq_colors):
matrix[top][i] = seq_colors[index]
index += 1
top += 1
for i in range(top, bottom + 1): # Right column
if index < len(seq_colors):
matrix[i][right] = seq_colors[index]
index += 1
right -= 1
for i in range(right, left - 1, -1): # Bottom row
if index < len(seq_colors):
matrix[bottom][i] = seq_colors[index]
index += 1
bottom -= 1
for i in range(bottom, top - 1, -1): # Left column
if index < len(seq_colors):
matrix[i][left] = seq_colors[index]
index += 1
left += 1
def _generate_color_square(sequence,virus, save=False, count=0, label=None):
    """Draw one sequence as a spiral-filled colour square on a new figure.

    Each of a/t/c/g/n (case-insensitive) gets its own colour; any other
    character raises ``KeyError``. Cells not covered by the sequence use the
    colour for 'n'. With ``save`` true the figure is written to
    ``color_square_{virus}_{count}.png``.
    """
    color_index = {'a': 0, 't': 1, 'c': 2, 'g': 3, 'n': 4}
    seq_colors = [color_index[base] for base in sequence.lower()]

    # Smallest square that can hold the whole sequence
    side = math.ceil(math.sqrt(len(sequence)))
    grid = np.full((side, side), color_index['n'], dtype=int)
    _fill_spiral(grid, seq_colors, side)

    # Colours are listed in the same order as the indices above
    cmap = ListedColormap(['red', 'green', 'yellow', 'blue', 'white'])

    plt.figure(figsize=(5, 5))
    plt.imshow(grid, cmap=cmap, interpolation='nearest')
    if label:
        plt.title(label)
    plt.axis('off')
    if save:
        plt.savefig(f'color_square_{virus}_{count}.png', dpi=300, bbox_inches='tight')
    # plt.show()
def plot_color_square(df, virus_species):
    """Draw a grid of colour squares: one row per species, three per row.

    Parameters
    ----------
    df
        Sequences indexable by ``0 .. 3*len(virus_species) - 1``; entry
        ``row*3 + col`` is drawn in grid cell (row, col).
    virus_species
        Species names; ``virus_species[row]`` titles every panel of that row.

    Returns
    -------
    The matplotlib figure holding the grid.
    """
    ncols = 3
    nrows = len(virus_species)
    fig, axeses = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        squeeze=False,
    )
    color_index = {'a': 0, 't': 1, 'c': 2, 'g': 3, 'n': 4}
    for row in range(nrows):
        for col in range(ncols):
            axes = axeses[row, col]
            raw_sequence = df[row * ncols + col]
            virus = virus_species[row]
            # Keep only the characters that have a colour
            cleaned = ''.join(base for base in raw_sequence.lower() if base in 'atcgn')
            seq_colors = [color_index[base] for base in cleaned.lower()]
            # Smallest square that can hold the cleaned sequence
            side = math.ceil(math.sqrt(len(cleaned)))
            grid = np.full((side, side), color_index['n'], dtype=int)
            _fill_spiral(grid, seq_colors, side)
            cmap = ListedColormap(['red', 'green', 'yellow', 'blue', 'white'])
            axes.imshow(grid, cmap=cmap, interpolation='nearest')
            axes.set_title(virus)
    return fig
def generate_color_square(sequence,virus, multi=False, save=False, label=None):
    """Draw colour squares for one sequence or for a collection of them.

    Parameters
    ----------
    sequence
        A DNA string, or an iterable of DNA strings when ``multi`` is true.
    virus
        Name used in the output file name when saving.
    multi
        Treat ``sequence`` as a collection and draw one square per item.
    save
        Forwarded to ``_generate_color_square`` to write each figure to disk.
    label
        Title for the square; with ``multi`` it is indexed per sequence.
    """
    if multi:
        for i, seq in enumerate(sequence):
            _generate_color_square(seq, virus, save, i, label[i] if label else None)
    else:
        # Pass virus and save in their own positions; previously `save` was
        # sent in the `virus` slot, so the name was lost and nothing was saved.
        _generate_color_square(sequence, virus, save, label=label)
############################################################# FCGR ########################################################
from typing import Dict, Optional
from collections import namedtuple
# A point (x, y) in the plane
Coord = namedtuple("Coord", ["x", "y"])
# CGR state: N nucleotides consumed so far plus the current (x, y) position
CGRCoords = namedtuple("CGRCoords", ["N", "x", "y"])
# Corner of the square [-1, 1]^2 assigned to each nucleotide
DEFAULT_COORDS = {
    "A": Coord(1, 1),
    "C": Coord(-1, 1),
    "G": Coord(-1, -1),
    "T": Coord(1, -1),
}
class CGR:
    "Chaos Game Representation for DNA"

    def __init__(self, coords: Optional[Dict[chr,tuple]]=None):
        # Fall back to the module-level corner assignment when none is given
        self.nucleotide_coords = coords if coords is not None else DEFAULT_COORDS
        self.cgr_coords = CGRCoords(0, 0, 0)

    def nucleotide_by_coords(self,x,y):
        "Return the nucleotide whose corner is (x,y)"
        target = Coord(x, y)
        matches = [
            nucleotide
            for nucleotide, corner in self.nucleotide_coords.items()
            if corner == target
        ]
        return matches[0]

    def forward(self, nucleotide: str):
        "Move halfway from the current point towards the nucleotide's corner"
        corner = self.nucleotide_coords.get(nucleotide)
        state = self.cgr_coords
        x = (state.x + corner.x) / 2
        y = (state.y + corner.y) / 2
        self.cgr_coords = CGRCoords(state.N + 1, x, y)

    def backward(self,):
        "Undo the last step and return the nucleotide that produced it"
        # The quadrant of the current point identifies the last corner used
        corner_x, corner_y = self.coords_current_nucleotide()
        nucleotide = self.nucleotide_by_coords(corner_x, corner_y)
        state = self.cgr_coords
        # Inverse of the midpoint step taken in forward()
        previous_x = 2 * state.x - corner_x
        previous_y = 2 * state.y - corner_y
        self.cgr_coords = CGRCoords(state.N - 1, previous_x, previous_y)
        return nucleotide

    def coords_current_nucleotide(self,):
        "Return the (+/-1, +/-1) corner of the quadrant holding the current point"
        x = -1
        if self.cgr_coords.x > 0:
            x = 1
        y = -1
        if self.cgr_coords.y > 0:
            y = 1
        return x, y

    def encode(self, sequence: str):
        "From DNA sequence to CGR"
        # Always start from the origin with zero nucleotides consumed
        self.reset_coords()
        for nucleotide in sequence:
            self.forward(nucleotide)
        return self.cgr_coords

    def reset_coords(self,):
        "Reset the state to N=0 at the origin"
        self.cgr_coords = CGRCoords(0, 0, 0)

    def decode(self, N:int, x:int, y:int)->str:
        "From CGR to DNA sequence"
        self.cgr_coords = CGRCoords(N, x, y)
        # Nucleotides come out last-to-first while stepping backwards
        recovered = []
        while self.cgr_coords.N > 0:
            recovered.append(self.backward())
        return "".join(reversed(recovered))
from itertools import product
from collections import defaultdict
import numpy as np
class FCGR(CGR):
    """Frequency matrix CGR.

    Produces a (2**k x 2**k) matrix for a sequence, where
    - k is the k-mer length,
    - 2**k x 2**k = 4**k is the number of distinct k-mers,
    - each cell holds the count of the k-mer mapped to it.
    """

    def __init__(self, k: int,):
        super().__init__()
        self.k = k  # k-mer length
        self.kmers = ["".join(letters) for letters in product("ACGT", repeat=self.k)]
        self.kmer2pixel = self.kmer2pixel_position()

    def __call__(self, sequence: str):
        "Count the k-mers of ``sequence`` and return them laid out as the FCGR matrix"
        self.count_kmers(sequence)
        side = int(2**self.k)
        freq_matrix = np.zeros((side, side))
        for kmer, freq in self.freq_kmer.items():
            pos_x, pos_y = self.kmer2pixel[kmer]
            # Pixel positions are 1-based
            freq_matrix[int(pos_x) - 1, int(pos_y) - 1] = freq
        return freq_matrix

    def count_kmer(self, kmer):
        "Increment the count of ``kmer`` unless it contains an 'N'"
        if "N" in kmer:
            return
        self.freq_kmer[kmer] += 1

    def count_kmers(self, sequence: str):
        "Reset the counts and tally every length-k window of ``sequence``"
        self.freq_kmer = defaultdict(int)
        n_windows = len(sequence) - self.k + 1
        for start in range(n_windows):
            self.count_kmer(sequence[start:start + self.k])

    def kmer_probabilities(self, sequence: str):
        "Store each counted k-mer's share of the sequence's windows in ``self.probabilities``"
        self.probabilities = defaultdict(float)
        N = len(sequence)
        for kmer, count in self.freq_kmer.items():
            self.probabilities[kmer] = float(count) / (N - self.k + 1)

    def pixel_position(self, kmer: str):
        "Get pixel position in the FCGR matrix for a k-mer"
        encoded = self.encode(kmer)
        # [-1,1]^2 -> [0,1]^2 -> [0,2**k]^2, then round up to integer cells
        unit = np.array([(encoded.x + 1) / 2, (encoded.y + 1) / 2])
        unit *= 2**self.k
        cx, cy = np.ceil(unit)
        # Coordinates (cx,cy) to pixel (px,py): px = 2**k-cy+1, py = cx
        return 2**self.k - int(cy) + 1, int(cx)

    def kmer2pixel_position(self,):
        "Build the k-mer -> pixel position lookup for every k-mer"
        return {kmer: self.pixel_position(kmer) for kmer in self.kmers}
from tqdm import tqdm
from pathlib import Path
import numpy as np
class GenerateFCGR:
    """Convenience wrapper that cleans sequences and turns them into FCGR matrices."""

    def __init__(self, kmer: int = 5, ):
        self.kmer = kmer
        self.fcgr = FCGR(kmer)
        self.counter = 0  # number of sequences converted so far

    def __call__(self, list_fasta,):
        # NOTE(review): `from_fasta` is not defined on this class in the visible
        # code, so calling the instance raises AttributeError -- confirm whether
        # the method lives elsewhere or was dropped.
        for fasta in tqdm(list_fasta, desc="Generating FCGR"):
            self.from_fasta(fasta)

    def from_seq(self, seq: str):
        "Get FCGR from a sequence"
        seq = self.preprocessing(seq)
        chaos = self.fcgr(seq)
        self.counter += 1
        return chaos

    def reset_counter(self,):
        "Set the conversion counter back to zero"
        self.counter = 0

    @staticmethod
    def preprocessing(seq):
        "Upper-case ``seq`` and replace every character other than A/T/C/G with 'N'"
        # Single pass; the earlier version called str.replace on the whole
        # string once per offending character.
        return "".join(
            letter if letter in "ATCG" else "N"
            for letter in seq.upper()
        )
def plot_fcgr(df, virus_species):
    """Draw a grid of FCGR images: one row per species, three per row.

    Parameters
    ----------
    df
        Sequences indexable by ``0 .. 3*len(virus_species) - 1``; entry ``i``
        is drawn in grid cell ``(i // 3, i % 3)``.
    virus_species
        Species names; ``virus_species[row]`` titles every panel of that row.

    Returns
    -------
    The matplotlib figure holding the grid.
    """
    ncols = 3
    nrows = len(virus_species)
    fig, axeses = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        squeeze=False,
    )
    # Build the generator once: its constructor precomputes the
    # k-mer -> pixel table, which does not depend on the sequence.
    generator = GenerateFCGR()
    for i in range(0, ncols * nrows):
        row = i // ncols
        col = i % ncols
        axes = axeses[row, col]
        data = df[i].upper()
        chaos = generator.from_seq(seq=data)
        virus = virus_species[row]
        axes.imshow(chaos)
        axes.set_title(virus)
    return fig
############################################################# Persistent Homology ########################################################
import numpy as np
import persim
import ripser
import matplotlib.pyplot as plt
# One-hot 4D vector per lowercase nucleotide, in the order a, c, g, t
NUCLEOTIDE_MAPPING = {
    nucleotide: np.array([1 if axis == position else 0 for axis in range(4)])
    for position, nucleotide in enumerate('acgt')
}
def encode_nucleotide_to_vector(nucleotide):
    """Return the vector stored for ``nucleotide`` in NUCLEOTIDE_MAPPING, or None if absent."""
    if nucleotide in NUCLEOTIDE_MAPPING:
        return NUCLEOTIDE_MAPPING[nucleotide]
    return None
def chaos_4d_representation(dna_sequence):
    """Return the 4D chaos-game point cloud of ``dna_sequence``.

    The first recognised nucleotide contributes its own vector; every later
    recognised nucleotide contributes the midpoint between the previous point
    and its vector. Unrecognised symbols are skipped wherever they occur,
    including at the start of the sequence.

    Returns
    -------
    numpy.ndarray
        One row per recognised nucleotide; shape ``(0, 4)`` when there are none.
    """
    points = []
    for nucleotide in dna_sequence:
        vector = encode_nucleotide_to_vector(nucleotide)
        if vector is None:
            continue
        if points:
            points.append(0.5 * (points[-1] + vector))
        else:
            # First usable nucleotide seeds the walk
            points.append(vector)
    if not points:
        return np.empty((0, 4))
    return np.array(points)
def persistence_homology(dna_sequence, multi=False, plot=False, sample_rate=7):
    """Compute persistence diagrams (up to dimension 1) for DNA sequences.

    Parameters
    ----------
    dna_sequence
        A DNA string, or an iterable of DNA strings when ``multi`` is true.
    multi
        Process a collection of sequences instead of a single one.
    plot
        Also draw the dimension-1 diagram(s) with ``persim.plot_diagrams``.
    sample_rate
        Keep every ``sample_rate``-th point of the 4D representation.

    Returns
    -------
    The ``'dgms'`` entry of the ripser result for a single sequence, or a list
    of those entries (one per sequence) when ``multi`` is true.
    """
    if multi:
        # Keep the point clouds in a plain list: sequences of different lengths
        # give arrays of different shapes, which cannot be stacked by np.array.
        c4dr_points = [chaos_4d_representation(sequence) for sequence in dna_sequence]
        dgm_dna = [ripser.ripser(points[::sample_rate], maxdim=1)['dgms'] for points in c4dr_points]
        if plot:
            persim.plot_diagrams([dgm[1] for dgm in dgm_dna], labels=[f'sequence {i}' for i in range(len(dna_sequence))])
    else:
        c4dr_points = chaos_4d_representation(dna_sequence)
        dgm_dna = ripser.ripser(c4dr_points[::sample_rate], maxdim=1)['dgms']
        if plot:
            persim.plot_diagrams(dgm_dna[1])
    return dgm_dna
def plot_diagrams(
    diagrams,
    plot_only=None,
    title=None,
    xy_range=None,
    labels=None,
    colormap="default",
    size=20,
    ax_color=np.array([0.0, 0.0, 0.0]),
    diagonal=True,
    lifetime=False,
    legend=True,
    show=False,
    ax=None
):
    """A helper function to plot persistence diagrams.

    Parameters
    ----------
    diagrams: ndarray (n_pairs, 2) or list of diagrams
        A diagram or list of diagrams. If diagram is a list of diagrams,
        then plot all on the same plot using different colors.
    plot_only: list of numeric
        If specified, an array of only the diagrams that should be plotted.
    title: string, default is None
        If title is defined, add it as title of the plot.
    xy_range: list of numeric [xmin, xmax, ymin, ymax]
        User provided range of axes. This is useful for comparing
        multiple persistence diagrams.
    labels: string or list of strings
        Legend labels for each diagram.
        If none are specified, we use H_0, H_1, H_2,... by default.
    colormap: string, default is 'default'
        Name of a matplotlib style; it is applied with ``plt.style.use``.
    size: numeric, default is 20
        Pixel size of each point plotted.
    ax_color: any valid matplotlib color type.
        See https://matplotlib.org/api/colors_api.html for complete API.
    diagonal: bool, default is True
        Plot the diagonal x=y line.
    lifetime: bool, default is False. If True, diagonal is turned to False.
        Plot life time of each point instead of birth and death.
        Essentially, visualize (x, y-x).
    legend: bool, default is True
        If true, show the legend.
    show: bool, default is False
        Call plt.show() after plotting.
    ax: matplotlib Axes, default is None
        Axes to draw on. A new figure and axes are created when omitted.

    Returns
    -------
    fig, ax
        The figure and axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots()
    else:
        # An existing Axes cannot be unpacked into (fig, ax); take its figure
        fig = ax.figure
    plt.style.use(colormap)

    xlabel, ylabel = "Birth", "Death"

    if not isinstance(diagrams, list):
        # Must have diagrams as a list for processing downstream
        diagrams = [diagrams]

    if labels is None:
        # Provide default labels for diagrams
        labels = ["$H_{{{}}}$".format(i) for i, _ in enumerate(diagrams)]

    if plot_only:
        diagrams = [diagrams[i] for i in plot_only]
        labels = [labels[i] for i in plot_only]

    if not isinstance(labels, list):
        labels = [labels] * len(diagrams)

    # Work on float copies so the caller's arrays are never modified
    diagrams = [dgm.astype(np.float32, copy=True) for dgm in diagrams]

    # Find min and max of all visible diagrams
    concat_dgms = np.concatenate(diagrams).flatten()
    has_inf = np.any(np.isinf(concat_dgms))
    finite_dgms = concat_dgms[np.isfinite(concat_dgms)]

    if not xy_range:
        if finite_dgms.size:
            ax_min, ax_max = np.min(finite_dgms), np.max(finite_dgms)
        else:
            # No finite points at all (e.g. every diagram is empty):
            # np.min would raise, so fall back to a unit box.
            ax_min, ax_max = 0.0, 1.0
        x_r = ax_max - ax_min

        # Give the plot a buffer on all sides. The span is 0 when all finite
        # values coincide; use a fixed buffer then so the limits differ.
        buffer = 1 if x_r == 0 else x_r / 5

        x_down = ax_min - buffer / 2
        x_up = ax_max + buffer

        y_down, y_up = x_down, x_up
    else:
        x_down, x_up, y_down, y_up = xy_range

    yr = y_up - y_down

    if lifetime:
        # Don't plot landscape and diagonal at the same time.
        diagonal = False

        # Reset y axis so it doesn't go much below zero
        y_down = -yr * 0.05
        y_up = y_down + yr

        ylabel = "Lifetime"

        # Set diagrams to be (x, y-x)
        for dgm in diagrams:
            dgm[:, 1] -= dgm[:, 0]

        # Plot horizon line
        ax.plot([x_down, x_up], [0, 0], c=ax_color)

    # Plot diagonal
    if diagonal:
        ax.plot([x_down, x_up], [x_down, x_up], "--", c=ax_color)

    # Plot inf line
    if has_inf:
        # Put the inf line slightly below the top of the axes
        b_inf = y_down + yr * 0.95
        ax.plot([x_down, x_up], [b_inf, b_inf], "--", c="k", label=r"$\infty$")

        # Draw infinite values on that line
        for dgm in diagrams:
            dgm[np.isinf(dgm)] = b_inf

    # Plot each diagram's persistence pairs
    for dgm, label in zip(diagrams, labels):
        ax.scatter(dgm[:, 0], dgm[:, 1], size, label=label, edgecolor="none")

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    ax.set_xlim([x_down, x_up])
    ax.set_ylim([y_down, y_up])
    ax.set_aspect('equal', 'box')

    if title is not None:
        ax.set_title(title)

    if legend is True:
        ax.legend(loc="lower right")

    if show is True:
        plt.show()

    return fig, ax
def plot_persistence_homology(df, virus_species):
    """Plot the dimension-1 persistence diagrams of several sequences together.

    Parameters
    ----------
    df
        Iterable of DNA strings.
    virus_species
        Iterable of species names; entry ``i`` yields the label
        ``"{name}_{i}"``.

    Returns
    -------
    The matplotlib figure produced by ``plot_diagrams``.
    """
    # 4D point cloud per sequence (lower-cased to match the nucleotide mapping)
    point_clouds = [chaos_4d_representation(sequence.lower()) for sequence in df]
    # Keep every 15th point before computing persistence
    all_diagrams = [
        ripser.ripser(cloud[::15], maxdim=1)['dgms']
        for cloud in point_clouds
    ]
    h1_diagrams = [diagrams[1] for diagrams in all_diagrams]
    labels = [f'{virus_specie}_{i}' for i, virus_specie in enumerate(virus_species)]
    fig, ax = plot_diagrams(h1_diagrams, labels=labels)
    return fig
def compare_persistence_homology(dna_sequence1, dna_sequence2):
    """Return ``persim.sliced_wasserstein`` between the two sequences' dimension-1 diagrams."""
    first_diagrams = persistence_homology(dna_sequence1)
    second_diagrams = persistence_homology(dna_sequence2)
    return persim.sliced_wasserstein(first_diagrams[1], second_diagrams[1])
############################################################# UI #################################################################
# Shiny Express UI. NOTE(review): the nesting of the `with` blocks below is
# not recoverable from this copy of the file; the lines are left untouched.
ui.page_opts(fillable=True)
# Tabbed card container with id "tab"
with ui.navset_card_tab(id="tab"):
# First tab: whole-genome visualisations
with ui.nav_panel("Viral Macrostructure"):
ui.page_opts(fillable=True)
ui.panel_title("Do viruses have underlying structure?")
with ui.layout_columns():
with ui.card():
# Multi-select of organism names; choices come from the `virus` dict
ui.input_selectize(
"virus_selector",
"Select your viruses:",
virus,
multiple=True, selected=None
)
with ui.card():
# Single-select of the plotting method; plot_macro compares against these exact strings
ui.input_selectize(
"plot_type_macro",
"Select your method:",
["Chaos Game Representation", "2D Line", "ColorSquare", "Persistant Homology", "Wens Method"],
multiple=False, selected=None
)
############################################################# Plotting ########################################################
# NOTE(review): `here` is not referenced anywhere in the visible code
here = Path(__file__).parent
import matplotlib as mpl
# @output(suspend_when_hidden=True)
@render.plot()
def plot_macro():
    """Render the macrostructure plot for the selected viruses and method.

    Returns the matplotlib figure built by the chosen plotting function, or
    ``None`` when no virus is selected, when no selected organism has at
    least three sequences (for the three-per-species plots), or when the
    method is not recognised.
    """
    selected = input.virus_selector()
    if not selected:
        # Nothing to draw yet; the plotters cannot handle zero rows
        return None

    #ds = load_dataset('Hack90/virus_tiny')
    df = pd.read_parquet('virus_ds.parquet')
    df = df[df['Organism_Name'].isin(selected)]
    mpl.rcParams.update(mpl.rcParamsDefault)

    plot_type = input.plot_type_macro()

    if plot_type == "2D Line":
        # One list of sequences per organism
        grouped = df.groupby('Organism_Name')['Sequence'].apply(list)
        return plot_2d_comparison(grouped, grouped.index)

    if plot_type == "Wens Method":
        return wens_method_heatmap(df, df['Organism_Name'].unique())

    if plot_type in ("ColorSquare", "Chaos Game Representation", "Persistant Homology"):
        # These plots use exactly three sequences per organism; organisms
        # with fewer are dropped by filter_and_select.
        filtered_df = df.groupby('Organism_Name').apply(filter_and_select).reset_index(drop=True)
        if filtered_df.empty:
            return None
        sequences = filtered_df['Sequence']
        # Species names must come from the filtered frame so that the number
        # of grid rows matches the number of sequences actually available.
        species = filtered_df['Organism_Name'].unique()
        if plot_type == "ColorSquare":
            return plot_color_square(sequences, species)
        if plot_type == "Chaos Game Representation":
            return plot_fcgr(sequences, species)
        return plot_persistence_homology(sequences, filtered_df['Organism_Name'])

    return None
# ui.output_plot("plot_macro_output")
# with ui.nav_panel("Viral Model"):
# gr.load("models/Hack90/virus_pythia_31_1024").launch()
# Second tab: k-mer distribution. NOTE(review): original nesting of the
# `with` blocks is not recoverable from this copy; lines are left untouched.
with ui.nav_panel("Viral Microstructure"):
ui.page_opts(fillable=True)
ui.panel_title("Kmer Distribution")
with ui.layout_columns():
with ui.card():
# k-mer length; plot_micro draws nothing unless this is > 0
ui.input_slider("kmer", "kmer", 0, 10, 4)
# number of rows of the k-mer table to show
ui.input_slider("top_k", "top:", 0, 1000, 15)
# metric to plot; plot_micro compares against these exact strings
ui.input_selectize(
"plot_type",
"Select metric:",
["percentage", "count"],
multiple=False, selected=None
)
import matplotlib as mpl
# @output(suspend_when_hidden=True)
@render.plot()
def plot_micro():
    """Render a bar chart of the first ``top_k`` rows for the chosen k-mer length.

    Reads 'kmers.csv' (columns used: 'k', 'kmer', and 'count' or 'percent').
    Returns the figure, or ``None`` when k is 0 or the metric is unknown.
    """
    df = pd.read_csv('kmers.csv')
    k = input.kmer()
    top_k = input.top_k()
    mpl.rcParams.update(mpl.rcParamsDefault)

    metric = input.plot_type()
    if metric not in ("count", "percentage") or k <= 0:
        return None

    top = df[df['k'] == k].head(top_k)
    if metric == "count":
        heights, ylabel = top['count'], "Count"
    else:
        # 'percent' is scaled by 100 for display
        heights, ylabel = top['percent'] * 100, "Percentage"

    fig, ax = plt.subplots()
    ax.bar(top['kmer'], heights)
    ax.set_title(f"Most common {k}-mers")
    ax.set_xlabel("K-mer")
    ax.set_ylabel(ylabel)
    # Rotate the existing tick labels instead of re-assigning label text,
    # which is only meant to be done after fixing the tick positions.
    ax.tick_params(axis='x', labelrotation=90)
    return fig
#ui.output_plot("plot_micro_output")
# Third tab: training-loss curves. NOTE(review): original nesting is not
# recoverable from this copy; lines are left untouched.
with ui.nav_panel("Viral Model Training"):
ui.page_opts(fillable=True)
ui.panel_title("Does context size matter for a nucleotide model?")
def plot_loss_rates(df, type):
    """Plot every loss column of ``df`` resampled onto a common axis.

    Parameters
    ----------
    df
        DataFrame with a 'Step' column (dropped) and one loss column per
        curve. Column ``i`` is labelled with the ``i``-th entry of
        '32', '64', '128', '256', '512', '1024'.
    type
        Text inserted into the plot title.

    Returns
    -------
    The matplotlib figure holding the curves.
    """
    # Common 1000-point grid on [0, 1] so curves of different length align
    x = np.linspace(0, 1, 1000)
    labels = ['32', '64', '128', '256', '512', '1024']

    def _resample(column):
        y = column.dropna().astype('float', errors = 'ignore').dropna().values
        interpolate = interp1d(np.linspace(0, 1, len(y)), y)
        return interpolate(x)

    # drop the column step
    df = df.drop(columns=['Step'])
    loss_rates = [_resample(df[col]) for col in df.columns]

    fig, ax = plt.subplots()
    for i in range(len(loss_rates)):
        ax.plot(x, loss_rates[i], label=labels[i])
    ax.legend()
    ax.set_title(f'Loss rates for a {type} parameter model across context windows')
    ax.set_xlabel('Training steps')
    ax.set_ylabel('Loss rate')
    return fig
import matplotlib as mpl
@render.plot()
def plot_context_size_scaling():
    """Render the loss curves stored in '14m.csv'."""
    losses = pd.read_csv('14m.csv')
    mpl.rcParams.update(mpl.rcParamsDefault)
    return plot_loss_rates(losses, '14M')
# Fourth tab: loss curves filtered by parameter size, model and loss type.
# NOTE(review): original nesting is not recoverable from this copy; lines
# are left untouched.
with ui.nav_panel("Model loss analysis"):
ui.page_opts(fillable=True)
ui.panel_title("Neurips stuff")
with ui.card():
# Parameter sizes; plot_loss_rates_model converts each choice with int()
ui.input_selectize(
"param_type",
"Select Param Type:",
["14", "31", "70", "160"],
multiple=True,
selected=["14", "70"]
)
# Model families, matched against the 'model_type' column
ui.input_selectize(
"model_type",
"Select Model Type:",
["pythia", "denseformer"],
multiple=True,
selected=['pythia','denseformer']
)
# Loss types, matched against the 'loss_type' column
ui.input_selectize(
"loss_type",
"Select Loss Type:",
["compliment", "cross_entropy", "headless"],
multiple=True,
selected=["compliment", "cross_entropy", "headless"]
)
def plot_loss_rates_model(df, param_types, loss_types, model_types):
    """Plot one resampled loss curve per (param, loss, model) combination.

    Parameters
    ----------
    df
        DataFrame with 'param_type', 'loss_type', 'model_type' and 'loss'
        columns.
    param_types
        Parameter sizes to include; each is converted with ``int`` before
        being compared with the 'param_type' column.
    loss_types, model_types
        Values to match in the corresponding columns.

    Combinations with fewer than two loss values are skipped, since a curve
    cannot be interpolated over [0, 1] from a single point.

    Returns
    -------
    The matplotlib figure holding the curves.
    """
    # Common 1000-point grid on [0, 1] so curves of different length align
    x = np.linspace(0, 1, 1000)
    loss_rates = []
    labels = []
    for param_type in param_types:
        for loss_type in loss_types:
            for model_type in model_types:
                selected = (
                    (df['param_type'] == int(param_type))
                    & (df['loss_type'] == loss_type)
                    & (df['model_type'] == model_type)
                )
                y = df[selected]['loss'].values
                if len(y) < 2:
                    continue
                f = interp1d(np.linspace(0, 1, len(y)), y)
                loss_rates.append(f(x))
                labels.append(str(param_type) + '_' + loss_type + '_' + model_type)

    fig, ax = plt.subplots()
    for label, loss_rate in zip(labels, loss_rates):
        ax.plot(x, loss_rate, label=label)
    if labels:
        # Only build a legend when there is something to list
        ax.legend()
    ax.set_xlabel('Training steps')
    ax.set_ylabel('Loss rate')
    return fig
import matplotlib as mpl
@render.plot()
def plot_model_scaling():
    """Render the loss curves in 'results - denseformer.csv' for the selected filters."""
    results = pd.read_csv('results - denseformer.csv')
    mpl.rcParams.update(mpl.rcParamsDefault)
    return plot_loss_rates_model(
        results,
        input.param_type(),
        input.loss_type(),
        input.model_type(),
    )
# @output
# @render.plot
# def plot_training_loss():
# # if csv_file() is None:
# # return None
# df = pd.read_csv('results - denseformer.csv')
# filtered_df = df[
# (df["param_type"].isin(input.param_type()))
# & (df["model_type"].isin(input.model_type()))
# & (df["loss_type"].isin(input.loss_type()))
# ]
# if filtered_df.empty:
# return None
# # Define colors for sizes and shapes for loss types
# size_colors = {
# "14": "blue",
# "31": "green",
# "70": "orange",
# "160": "red"
# }
# loss_markers = {
# "compliment": "o",
# "cross_entropy": "^",
# "headless": "s"
# }
# # Create the plot
# fig, ax = plt.subplots(figsize=(10, 6))
# # Plot each combination of size and loss type
# for size in filtered_df["param_type"].unique():
# for loss_type in filtered_df["loss_type"].unique():
# data = filtered_df[(filtered_df["param_type"] == size) & (filtered_df["loss_type"] == loss_type)]
# ax.plot(data["epoch"], data["loss"], marker=loss_markers[loss_type], color=size_colors[size], label=f"{size} - {loss_type}")
# # Customize the plot
# ax.set_xlabel("Epoch")
# ax.set_ylabel("Loss")
# # ax.set_title("Training Loss by Size and Loss Type", fontsize=16)
# # Create a legend for sizes
# size_legend = ax.legend(title="Size", loc="upper right")
# ax.add_artist(size_legend)
# # Create a separate legend for loss types
# loss_legend_labels = ["Compliment", "Cross Entropy", "Headless"]
# loss_legend_handles = [plt.Line2D([0], [0], marker=loss_markers[loss_type], color='black', linestyle='None', markersize=8) for loss_type in loss_markers]
# loss_legend = ax.legend(loss_legend_handles, loss_legend_labels, title="Loss Type", loc="upper right")
# plt.tight_layout()
# return fig
# # Define colors for sizes and shapes for loss types
# size_colors = {
# "14": "blue",
# "31": "green",
# "70": "orange",
# "160": "red"
# }
# loss_markers = {
# "compliment": "o",
# "cross_entropy": "^",
# "headless": "s"
# }
# # Create a relplot using Seaborn
# g = sns.relplot(
# data=filtered_df,
# x="epoch",
# y="loss",
# hue="param_type",
# style="loss_type",
# palette=size_colors,
# markers=loss_markers,
# height=6,
# aspect=1.5
# )
# # Customize the plot
# g.set_xlabels("Epoch")
# g.set_ylabels("Loss")
# g.fig.suptitle("Training Loss by Size and Loss Type", fontsize=16)
# g.add_legend(title="Size")
# # Create a separate legend for loss types
# loss_legend = plt.legend(title="Loss Type", loc="upper right", labels=["Compliment", "Cross Entropy", "Headless"])
# plt.gca().add_artist(loss_legend)
# plt.tight_layout()
# return g.fig
# @render.image
# def image():
# img = None
# if input.plot_type() == "ColorSquare":
# img = {"src": f"color_square_{input.virus_selector()[0]}_0.png", "alt": "ColorSquare"}
# return img
# return img