import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import scipy.integrate
import scipy.stats as st
import numba
import cmdstanpy
import arviz as az
import colorcet
import bokeh
import bokeh_catplot
import bokeh.plotting
from bokeh.models import Range1d
import panel as pn
import holoviews as hv
import bebi103
hv.extension('bokeh')
import bokeh.io
bokeh.io.output_notebook()
bebi103.hv.set_defaults()
# recycled code from last term
def plotter_bokeh(df, width, bacterium):
'''plots frames and areas of bacteria'''
x, y = "frame", "areas (μm^2)"
color = df["colored"]
p = bokeh.plotting.figure(width=width, height=300,
x_axis_label=x, y_axis_label=y,
tooltips=[(x, "@{frame}"), (y, "@{areas (μm^2)}"), ("cycle", "@{colored}")],
title=f"{bacterium}"
)
p.circle(source=df.loc[color % 2 == 0], x=x, y=y, color="maroon")
p.circle(source=df.loc[color % 2 == 1], x=x, y=y, color="darksalmon")
p.xgrid.grid_line_color, p.ygrid.grid_line_color = None, None
p.outline_line_color = None
p.title.align, p.title.text_font_style = "center", "bold"
p.toolbar.autohide = True
return p
# to determine biphasic growth, i want to look at sinusoidal behavior of mins, maxs, and how that affects time
def growth_data(df):
'''takes in dataframe with colored cycles,
returns min and max areas, length of cycle, frame of reference'''
df_stacked = df.set_index("colored")
mins, maxs, times, frames = [], [], [], []
# anticipated error for times since popped off a few frames for time filtering purposes
for i in range(df["colored"].max()+1):
df_cycle = df.loc[df["colored"] == i]
mins.append(df_cycle["areas"].min())
maxs.append(df_cycle["areas"].max())
times.append(df_cycle["frame"].max() - df_cycle["frame"].min())
frames.append(df_cycle["frame"].min())
growths = [(maxs[j] - mins[j]) for j in range(0, len(mins))]
data = pd.DataFrame({"mins": mins, "maxs": maxs, "growths":growths,
"times": times, "frames": frames})
return data
df1 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_1.csv")
df2 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_2.csv")
data1, data2 = growth_data(df1), growth_data(df2)
df1['bacterium']=['bacterium 1']*len(df1)
df2['bacterium']=['bacterium 2']*len(df2)
df12 = pd.concat([df1, df2])
data1['bacterium']=['bacterium 1']*len(data1)
data2['bacterium']=['bacterium 2']*len(data2)
data12 = pd.concat([data1, data2])
Let's remind ourselves of what our growth cycles look like!
width_b1, width_b2 = int(df1.shape[0] / 3), int(df2.shape[0] / 4)
p1 = plotter_bokeh(df1, width_b1, "bacterium 1")
p2 = plotter_bokeh(df2, width_b2, "bacterium 2")
bokeh.io.show(p1)
bokeh.io.show(p2)
So we have something that looks like this:
We expect each growth event to have its own $a_o$ and k, the reason being that each event will start out at a different $a_o$, and perhaps a specific process that is collectively slowing down or accelerating division in that particular growth cycle, or something. We further expect these to be distributed around some location and scale parameter of $a_o$ and k belonging to the cell itself. I make the assumption that because these cells belong to the same species, the spread of these values will be some shared $\sigma_{ao}$ and $\sigma_k$.
So those inform our final μ's. For σ, we will use assume the same, that it is shared among all caulobacter cells, and will have its own center and spread. The choice of distributions are discussed next.
note 1: I only have one layer of hierarchy for the location parameters; the cost function of getting two layers to work vs. using one and being able to do more with the estimates after is more appealing. I am not suuper interested in generalizing my parameter estimates to caulobacter en masse; i'm more interested in using the spread of those parameter estimates to compare exponential vs. linear growth with two parent cells. (we also only have two cells which is quite a small sample size).
note 2: I considered scaling σ with μ for the exponential model, but decided that since we are in the near linear range of the exponential model, it perhaps isn't worth the effort...
From the plots above, we see that $a_0$ seems to behave sinusoidally. We know from last term (team 14 for hw5), that our times between growth rates remain fairly constant, so let's see what the ECDF of random draws from an evenly spaced distribution of a sine function, and think about what distributions we should look for...
x = np.linspace(0, 2*np.pi,1000)
y = np.sin(x)
ecdf = bokeh_catplot.ecdf(np.random.choice(y, 1000),
title='ECDF Random Draws from sin(x)',height=300,palette=['maroon'])
bokeh.io.show(ecdf)
We should perhaps not choose a normal distribution for $a_0$--the sinoisoid is going in the other direction!
But of course, the actual sinusoid of $a_0$ seems tilted, so just for fun, I'll try to apply a shear transformation to retrieve a new function. I will plot this function, and then sample from it.
As a reminder, a shear transformation matrix looks something like:
In functional form that means we are mapping $f(x, y) \to f((x+\lambda y), y)$.
This maps $\sin(x) \to \sin(x-\lambda \sin(x))$
We want it to tilt to the right, so we'll now plot λ = -0.5.
p = bokeh.plotting.figure(width=600, height=250, title="sin(x-0.75 sin(x))")
p.circle(x, np.sin(x), legend_label='unsheared',color='maroon')
p.circle(x, np.sin(x-0.75*np.sin(x)), color='indianred', legend_label='sheared')
bokeh.io.show(p)
Scrolling through bacterium 2's growth cycles, we see how this kind of capture the tilt in the negative space underneath the curves.
ecdf = bokeh_catplot.ecdf(np.random.choice(np.sin(x), 1000),
title='ECDF Random Draws from sin(x-0.75sin(x))',height=300,palette=['maroon'])
q = bokeh_catplot.ecdf(np.random.choice(np.sin(x-0.75*np.sin(x)), 1000), palette=['indianred'],p=ecdf)
bokeh.io.show(q)
Wobbly! But not very informative. This tells us when hunting for ECDFS, the center bulk of the mass should behave almost linearly. But we see that shearing (red) brings back that sigmoid inflection we're used to seeing with gaussian distributions...
Bounding $a_0$: it would be silly for the parent cell to disappear, or experience significant shrinkage, so we definitely want a lower bound at ~0.5 μm^2; it would also be weird if the bacterium retained more than 90% of its contents--90% of the maximum area < 3 μm^2 gives us something like 2.7 μm^2 for our upper bound.
Bounding $σ_{ao\mathrm{I}}$: I will tolerate a maximum value of 0.25, for the reasons given above (area considerations).
WANT k_mu ~ 0.005 ->[0.003,0.006], ... k_sigma ~ 0.001 -> [0,0.0035] ... Exponential(1000)
\begin{align} \mu_{k\mathrm{I}} &\sim \mathrm{Normal}(0.0116, 0.0032) \\[0.5em] \mu_{k\mathrm{II}} &\sim \mathrm{Normal}(0.0116, 0.0032) \\[0.5em] \sigma_{k} &\sim \mathrm{Exponential}(1000) \\[0.5em] \end{align}The 80-120s range of interest can be seen from the following ecdf:
p = bokeh_catplot.ecdf(data12,cats='bacterium',val='times',tools=['hover'],
title='Growth Times (s) ECDF',palette=['slateblue','purple'])
p.legend.location='top_left'
bokeh.io.show(p)
Some lingering segmentation errors, in b2, but I turned on hover, found the growth cycles, and plan on dropping these cycles . Can see the errors from EDA (single colored dots with 1-2 frames), completely justifies abandoning these cycles. Analysis for bacterium 1 is quite clean, will be untouched.
# dropping bad frames
bad_frames = [33, 34, 63, 77, 78, 91]
for frame in bad_frames:
df2 = df2[df2.colored!=frame]
Now that we have our model, let's sample from it and
The idea here is that our model is sampling terms of severely different order of magnitudes, so we will likely run into high 'curvatures' in higher dimensional spaces, so recentering all our parameters, we have the following:
Our stan file will be relatively simple. We will only use stan for sampling (so ignore functions block, and transformed parameters block), retrieving area values from the parameters will be done in numpy.
All that we really have to do is consistently draw from normal distributions (0, 1) and properly transform our parameters. Note that all our tilde terms should be unbounded! They are being drawn from normals that allow for negative values, bc these are the scaled terms that tell us whether our error is undershooting or overshooting.
functions {
real a_theor(real ao, real k) {
return ao + k*(100);
/*return ao * exp(k*100); // for now, look at end of growth cycle*/
/*return ao * exp(k); //returns a_o effectively */
}
}
parameters {
real<lower=0> mu_ao;
real<lower=0> sig_ao;
real<lower=0> sig_k;
real<lower=0> mu_sigma;
real<lower=0> sig_sigma;
real ao_tilde; // bounding tildes? -> probs not
real k_tilde;
real mu_k_tilde;
real sigma_tilde;
real a_tilde;
}
transformed parameters {
real ao;
real mu_k;
real k;
real sigma;
real mu;
real a;
ao = mu_ao + ao_tilde * sig_ao;
mu_k = 0.0116 + 0.0032 * mu_k_tilde;
k = mu_k + k_tilde * sig_k;
sigma = mu_sigma + sigma_tilde * sig_sigma;
mu = a_theor(ao, k);
a = mu + a_tilde * sigma;
}
model {
ao_tilde ~ normal(0,1);
mu_ao ~ lognormal(0.15,0.15);
sig_ao ~ exponential(20);
k_tilde ~ normal(0,1);
mu_k_tilde ~ normal(0,1);
sig_k ~ exponential(1000);
sigma_tilde ~ normal(0,1);
mu_sigma ~ exponential(100);
sig_sigma ~ exponential(500);
a_tilde ~ normal(0, 1);
}
bebi103.stan.clean_cmdstan()
sampler_nc = cmdstanpy.CmdStanModel(stan_file='testing_noncentering.stan')
with bebi103.stan.disable_logging():
samples_nc = sampler_nc.sample(chains=4,
cores=8,
sampling_iters=5000,
warmup_iters=10000,
thin=5,
adapt_delta=0.97)
samples_nc = az.from_cmdstanpy(posterior=samples_nc)
# samples_nc.to_netcdf('testing_noncentering.nc')
samples_nc = az.from_netcdf('testing_noncentering.nc')
bebi103.stan.check_all_diagnostics(samples_nc)
parcoord_sm = bebi103.viz.parcoord_plot(samples_nc,pars=["mu","ao","sig_ao"])
parcoord_bi = bebi103.viz.parcoord_plot(samples_nc,pars=["k","sig_k","sigma","sig_sigma"])
bokeh.io.show(bokeh.layouts.layout([[parcoord_sm, parcoord_bi]]))
Everything looks of order of magnitude!we can see the sigmas of each shrinking ~1 oom below th eactual parameter...
To study funnels in particular, we'll look at each parameter along with it's σ.
import bokeh.palettes
palette = bokeh.palettes.Category20b[4]
pars = ["ao","sig_ao","k","sig_k","sigma","sig_sigma","mu","sigma"]
ao, sig_ao, k, sig_k, sigma, sig_sig, mu, sigma= (
[samples_nc.posterior[f"{par}"].values.flatten() for par in pars] )
ps = []
for i, tup in enumerate([(ao, sig_ao),(k, sig_k),(sigma, sig_sig),(mu, sigma)]):
p = bokeh.plotting.figure(height=300, width=250,title=f"σ & {pars[2*i]}",
x_axis_label=pars[2*i],y_axis_label=pars[2*i+1])
p.circle(tup[0],tup[1],alpha=0.8, color=palette[0],size=1)
ps.append(p)
bokeh.io.show(bokeh.layouts.gridplot(ps, ncols=4))
We are getting some negative values for k; let's find out what fraction...
num_neg_k = np.sum(samples_nc.posterior.k.values.flatten()<0)
num_neg_mu = np.sum(samples_nc.posterior.mu.values.flatten()<0)
num_neg_ao = np.sum(samples_nc.posterior.ao.values.flatten()<0)
num_total_draws = len(samples_nc.posterior.chain)*len(samples_nc.posterior.draw)
print("fraction of negative k's", num_neg_k / num_total_draws)
print("fraction of negative mu's", num_neg_mu / num_total_draws)
print("fraction of negative ao's", num_neg_ao / num_total_draws)
I am willing to accept this as a tradeoff for letting k's sigma be able to reach 0.02 or a tad higher, but evrything else that is positive seems good!
Let's look at the trace plot for one chain...
pars=['k','sig_k','ao','sig_ao','sigma','sig_sigma','mu_ao','mu_k','mu_sigma',]
bokeh.io.show(bebi103.viz.trace_plot(samples_nc, pars=pars, plot_width=2000,plot_height=80,
palette=['purple','pink','red','black','white','white','white','white','white',]))
Things could be much worse! Our squiggles aren't as squiggly as they could be, but even with one level of hierarchy, we have like 9 parameters to keep happy, which is difficult. (To keep the size of this html file down, I did not include the trace plots from the centered distribution samples, but they went quite flat for a concerning amount of parameters.)
To check if our actual numbers and distributions are ok, let's now check to see what kind of areas during the growth cycle these parameters generate.
I will now manually do prior predictive checks by retrieving $a_o$ and k values from the stan sample, and use a linspace for frame/time.
(reason: wrote stan code with generated quantities and started getting NaN values in the parameters, but ihave zero NaN values the way things are now, so I'm switching my pcc to python!)
times = np.arange(100) # ~100 seconds, ~100 frames per growth event
ao_prior = samples_nc.posterior.ao.values.flatten()
k_prior = samples_nc.posterior.k.values.flatten()
sigma_prior = samples_nc.posterior.sigma.values.flatten()
def a_theor_lin(ao,k,t):
return ao+k*t
def a_theor_exp(ao,k,t):
return ao*np.exp(k*t)
a_lins = np.empty([len(k_prior),len(times)])
a_exps = np.empty([len(k_prior),len(times)])
for i in range(len(k_prior)):
a_lin=np.empty(len(times))
a_exp=np.empty(len(times))
for t in range(len(times)):
mu_lin = a_theor_lin(ao_prior[i], k_prior[i], t)
mu_exp = a_theor_exp(ao_prior[i], k_prior[i], t)
a_lin[t] = st.norm.rvs(loc=mu_lin, scale=abs(sigma_prior[i]), size=1)
a_exp[t] = st.norm.rvs(loc=mu_exp, scale=abs(sigma_prior[i]), size=1)
a_lins[i] = a_lin
a_exps[i] = a_exp
palette = bokeh.palettes.Category20b[4]
p = bokeh.plotting.figure(height=400,width=300,title='linear prior checks: 8 draws',
x_axis_label='time',y_axis_label='area (μm^2)')
q = bokeh.plotting.figure(height=400,width=300,title="exponential prior checks: 8 draws",
x_axis_label='time',y_axis_label='area (μm^2)')
for i in range(0,len(a_lins),150):
p.circle(times, a_lins[i],fill_alpha=0.1,alpha=0.5,color=palette[1])
q.circle(times, a_exps[i],fill_alpha=0.1,alpha=0.5,color=palette[1])
bokeh.io.show(bokeh.layouts.layout([[p,q]]))
These look completely reasonable!! Since we have ~100 discrete time frames, we can actually simulate the percentile plots as well (above are less informative density-wise).
a_linsT, a_expsT = a_lins.T, a_exps.T
def return_array_percentiles(interval):
low_lins, high_lins = np.empty(len(times)), np.empty(len(times))
low_exps, high_exps = np.empty(len(times)), np.empty(len(times))
for t in times:
low_lins[t], high_lins[t] = np.percentile(a_linsT[t], interval)
low_exps[t], high_exps[t] = np.percentile(a_expsT[t], interval)
return low_lins, high_lins, low_exps, high_exps
low_lin_99,high_lin_99,low_exp_99,high_exp_99 = return_array_percentiles([0.05, 99.5])
low_lin_95,high_lin_95,low_exp_95,high_exp_95 = return_array_percentiles([2.5, 97.5])
low_lin_50,high_lin_50,low_exp_50,high_exp_50 = return_array_percentiles([25, 75])
def draw_circles(xarr, yarr, color, plot):
plot.circle(xarr, yarr, color=color)
pass
arrs_lin = [low_lin_99,high_lin_99,low_lin_95,high_lin_95,low_lin_50,high_lin_50]
arrs_exp = [low_exp_99,high_exp_99,low_exp_95,high_exp_95,low_exp_50,high_exp_50]
palette = ['thistle']*2 + ['purple']*2 + ['indigo']*2
p = bokeh.plotting.figure(height=400, width=300,
title="lin prior pred %: 99th, 95th, 50th")
q = bokeh.plotting.figure(height=400, width=300,
title="expo prior pred %: 99th, 95th, 50th")
for i, arr in enumerate(arrs_lin):
draw_circles(times, arr, palette[i], p)
for i, arr in enumerate(arrs_exp):
draw_circles(times, arr, palette[i], q)
for i in range(len(times)):
p.line(times[i], (low_lin_99[i], high_lin_99[i]), line_color=palette[0], line_width=1.5)
p.line(times[i], (low_lin_95[i], high_lin_95[i]), line_color=palette[2], line_width=1.5)
p.line(times[i], (low_lin_50[i], high_lin_50[i]), line_color=palette[4], line_width=1.5)
q.line(times[i], (low_exp_99[i], high_exp_99[i]), line_color=palette[0], line_width=1.5)
q.line(times[i], (low_exp_95[i], high_exp_95[i]), line_color=palette[2], line_width=1.5)
q.line(times[i], (low_exp_50[i], high_exp_50[i]), line_color=palette[4], line_width=1.5)
bokeh.io.show(bokeh.layouts.layout([[p,q]]))
this is great, these are exactly the kind of percentiles i had in mind! 'k' is a little wild, but again any beta above an exponential(1000) caused divergences, and I don't want an extreme prior on something as nebulous as the prior on the sigma of 'k' to overwhelm anything... also, the center 50% of the percentiles capture what I had in mind.
Here's a broad picture of how the analysis will go: if growth is exponential, we will see a smaller 'k' from the exponential model, because the linear model has to kind of compensate for the final growth areas with a higher 'k'. On the other hand, if growth is linear, we will observe higher 'k's from the exponential model since the predominant values are in the ~linear region, and it will try to fit the initial flatter parts to a more linear function. (this can be seen with the desmos plots).
In stan file, we will model cycle by cycle, so let's define new column in dataframe called Δframe that gives us Δt (in seconds) since we start our clock.
lots of code was needed to get the Δframe column :/ ...
can't wait to see the one-liner solution
df1 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_1.csv")
df2 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_2.csv")
bad_frames = [33, 34, 63, 77, 78, 91]
for frame in bad_frames:
df2 = df2[df2.colored!=frame]
df1['frame'] = df1['frame']-4 # let's start at zero
df2['frame'] = df2['frame']-7 # let's start at zero
for df in [df1, df2]:
cutoff_frames = (df[df.colored.diff()==1]).values.T[0]
frames, delta_frames = df.frame.values, []
delta_frames.append((frames[0:np.where(frames==cutoff_frames[0])[0][0]]).tolist())
for i in range(len(cutoff_frames)-1):
start = np.where(frames==cutoff_frames[i])[0][0]
stop = np.where(frames==cutoff_frames[i+1])[0][0]
delta_frames.append((frames[start:stop] - frames[start]).tolist())
last = np.where(frames==cutoff_frames[-1])[0][0]
delta_frames.append((frames[last:]-frames[last]).tolist())
delta_frames = np.hstack(delta_frames)
df['Δframes'] = delta_frames
df1['cycles'] = df1['colored']
# fixing cycles so that they match the index for stan file!
_, counts = np.unique(df2.colored, return_counts=True)
cycles = np.arange(len(_))
fresh_cycles = []
for i in range(len(cycles)):
fresh_cycles.append([cycles[i]]*counts[i])
df2['cycles'] = np.hstack(fresh_cycles)
df12 = pd.concat([df1, df2])
data = dict(len_df1=len(df1), len_df2=len(df2),
nI_cycles=len(np.unique(df1.cycles)),
nII_cycles=len(np.unique(df2.cycles)),
timesI=df1.Δframes.values,
timesII=df2.Δframes.values,
areasI=df1['areas (μm^2)'].values,
areasII=df2['areas (μm^2)'].values,
cyclesI=df1.cycles.values,
cyclesII=df2.cycles.values,)
bebi103.stan.clean_cmdstan()
sampler_exp = cmdstanpy.CmdStanModel(stan_file="ccexp3.stan")
sampler_lin = cmdstanpy.CmdStanModel(stan_file="cclin3.stan")
with bebi103.stan.disable_logging():
samples_lin_ = sampler_lin.sample(data=data,
chains=4,
warmup_iters=2000,
adapt_delta=0.98,)
with bebi103.stan.disable_logging():
samples_exp_ = sampler_exp.sample(data=data,
chains=4,
cores=8,
warmup_iters=3000,
max_treedepth=12,
adapt_delta=0.97,)
samples_lin = az.from_cmdstanpy(posterior=samples_lin_)
samples_exp = az.from_cmdstanpy(posterior=samples_exp_)
# these were the samples used
samples_lin = az.from_netcdf('good_all_cycles_lin.nc')
samples_exp = az.from_netcdf('good_all_cycles_exp.nc')
I tried to be super careful with my diagnostics, since I didnt have time to do SBC (probably spent an extra day on priors instead of running sbc)
bebi103.stan.check_all_diagnostics
will treat the very very large mu-vector as a ton of parameters, which they are really just an intermediate to retrieve location parameters for the final a
values we pass in as our data... these two are not the actual parameters of interest, (a_o and k), so we will manually look at .sample_stats
to check for divergences and whatnot.
num_divergences_exp = np.sum(samples_exp.sample_stats.diverging.values.flatten())
num_divergences_lin = np.sum(samples_lin.sample_stats.diverging.values.flatten())
total_draws_exp = len(samples_exp.sample_stats.draw) * len(samples_exp.sample_stats.chain)
total_draws_lin = len(samples_lin.sample_stats.draw) * len(samples_lin.sample_stats.chain)
print('lin fraction divergences: ', num_divergences_lin / total_draws_lin)
print('expo fraction divergences: ', num_divergences_exp / total_draws_exp)
wOw -- our non-centering worked nicely!
# parameters of interest--backbone parameters driving the model
var_names=['mu_sigma','sig_sigma','sigma','sig_ao','sig_k','mu_aoI','mu_aoII','mu_kI','mu_kII']
Rhat
: want less than 1.1# linear
az.rhat(samples_lin, var_names=var_names)
# exponential
az.rhat(samples_exp, var_names=var_names)
these are soooo good... noncentering is GREAT
ess_tail
: want > 400# linear
az.summary(samples_lin, var_names=var_names)
# exponential
az.summary(samples_exp, var_names=var_names)
pretty good! Virtually all are above 400 :)
With this many parameters, looking for possible correlation and anti-correlation will be hard on parcoord plots, and only really makes sense when they're of similar orders of magnitude, so we'll group them like so. This will also help us check our initial bounds on our priors.
oom = order of magnitude
pars = ['sigma','sig_sigma','mu_sigma','sig_ao','kI[0]','kII[0]','sig_k','mu_kI','mu_kII']
pc_lin_s = bebi103.stan.viz.parcoord_plot(samples_lin,pars=pars,title='linear: oom ~10^-2')
pc_exp_s = bebi103.stan.viz.parcoord_plot(samples_exp,pars=pars,title='exponential: oom ~10^-2')
pars = ['mu_aoI','mu_aoII','aoI[0]','aoII[0]']
pc_lin_b = bebi103.stan.viz.parcoord_plot(samples_lin, pars=pars,title='linear: oom ~10^0')
pc_exp_b = bebi103.stan.viz.parcoord_plot(samples_exp, pars=pars,title='exponential: oom ~10^0')
bokeh.io.show(bokeh.layouts.layout([pc_lin_s, pc_lin_b, pc_exp_s, pc_exp_b]))
After staring at these for awhile, the OOMs look good; i also noticed that after repeated samples and more staring, small values of sig_sigma are where the orange strings come from -> this is too be expected! sig_sigma has an exponential that has most of its mass sitting closest to zero out of all the other parameters, but given the tiny fraction of divergences i think we can move on.
Let's look at trace plots to see if any of our chains get stuck.
pars=['kI[0]','sig_k','aoI[0]','sig_ao','sigma','sig_sigma','mu_aoI','mu_kI','mu_sigma',]
bokeh.io.show(bebi103.viz.trace_plot(samples_lin,pars=pars,plot_width=1000,plot_height=100,
palette=['purple','salmon','black','white','white','white','white','white','white',]))
bokeh.io.show(bebi103.viz.trace_plot(samples_exp, pars=pars, plot_width=1000,plot_height=110,
palette=['purple','salmon','black','white','white','white','white','white','white',]))
No flatlines! ekg's stayin alive
To compare plots with our priors, let's check those funnels out... each parameter with it's σ.
pars = ["aoI[0]","sig_ao","kI[0]","sig_k","sigma","sig_sigma","muI[50]","sigma"]
Let's limit our very initial analysis to one cycle and see how these vector parameters behaved.
palette = bokeh.palettes.Category20b[4]
def sig_par_plots(cycle=0):
pars = [f"aoI[{cycle}]","sig_ao",f"kI[{cycle}]","sig_k",
"sigma","sig_sigma",f"muI[{cycle}]","sigma"]
plots,title=[],['lin','exp']
for i, samples in enumerate([samples_lin, samples_exp]):
ao, k, mu = ([samples.posterior[f"{par}"][cycle].values.flatten()
for par in ["aoI","kI","muI"]] )
sig_ao, sig_k, sigma, sig_sigma, sigma = (
[samples.posterior[f"{par}"].values.flatten() for par in
["sig_ao", "sig_k", "sigma", "sig_sigma", "sigma"]])
ps = []
for j, tup in enumerate([(ao, sig_ao),(k, sig_k),(sigma, sig_sigma),(mu, sigma)]):
p = bokeh.plotting.figure(height=300, width=350,
title=f"{title[i]}: σ of {pars[2*j]}",
x_axis_label=pars[2*j],y_axis_label=pars[2*j+1])
p.circle(tup[0][:2000],tup[1][:2000],alpha=0.8, color=palette[0],size=1)
ps.append(p)
plots.append(ps)
for i in range(4):
plots[0][i].y_range = plots[1][i].y_range
#plots[0][i].x_range = plots[1][i].x_range
return bokeh.layouts.layout([[bokeh.layouts.gridplot(plots[0],ncols=1),
bokeh.layouts.gridplot(plots[1], ncols=1)]])
pn.interact(sig_par_plots,cycle=(0,data['nI_cycles']-1))
Our sampler seems to be having just like we need it to!
Although we are really only looking at one growth cycle at a time right now, we can already see that kI[0]
(bacterium I's first cycle) has a much tighter sigma_k
fit. All the sigma's for all these parameters are in general much smaller in comparison as well!
Additionally, the 'k' values (look @ x-axis second row), are smaller in the exponential model than in the linear model. From the analysis given in the cells above, this is indicative of the linear model overcompensating for the areas toward the end of growth.
So this is what I mean:
x = np.linspace(-2, 130, 1000)
ao_exp, k_exp=1, 0.01
def ak_comparison(hundredx_ao_linear=80.0, hundredx_k_linear=2.0):
ylin = hundredx_ao_linear/100 + hundredx_k_linear/100*x
yexp = ao_exp * np.exp(k_exp*x)
p = bokeh.plotting.figure(height=200, width=700,title="comparison of 'ao' and 'k' behavior")
p.circle(x, ylin, color='darksalmon',legend_label='linear')
p.circle(x, yexp, color='maroon',legend_label='exponential')
p.legend.location='bottom_right'
return p
pn.interact(ak_comparison,hundredx_ao_linear=(20.0, 130.0), hundredx_k_linear=(1.0, 4.0))
(The sliders are discretized and cna only move in +/- 0.10 on my machine, so I scaled the inputs to 100a_o and 100k for smoothness.)
We can see that if a linear model is trying to fit an exponential model, it will have a higher 'k' (here it is 0.02 as opposed to 0.01) and thus have a lower 'a' (here a snapshot of 0.8 as opposed to 1.00)
def sig_par_same_plot(cycle=0):
ao_lin,k_lin=([samples_lin.posterior[f"{par}"][cycle].values.flatten() for par in ["aoI","kI"]])
sig_ao_lin,sig_k_lin = ([samples_lin.posterior[f"{par}"].values.flatten() for par in ["sig_ao","sig_k"]])
ao_exp,k_exp=([samples_exp.posterior[f"{par}"][cycle].values.flatten() for par in ["aoI","kI"]])
sig_ao_exp,sig_k_exp = ([samples_exp.posterior[f"{par}"].values.flatten() for par in ["sig_ao", "sig_k",]])
ps = []
for j, tup in enumerate([(ao_lin,sig_ao_lin,ao_exp,sig_ao_exp),(k_lin,sig_k_lin,k_exp,sig_k_exp),]):
p = bokeh.plotting.figure(height=400, width=350,
title=f"{pars[2*j][:-3]}[{cycle}] and it's σ",x_axis_label=pars[2*j],y_axis_label=pars[2*j+1])
p.circle(tup[0][:2000],tup[1][:2000],alpha=0.8,color='darksalmon',size=0.8,legend_label='linear')
p.circle(tup[2][:2000],tup[3][:2000],alpha=0.8,color='maroon',size=0.8,legend_label='exponential')
p.legend.location='top_left'
ps.append(p)
return bokeh.layouts.gridplot(ps, ncols=2)
pn.interact(sig_par_same_plot, cycle=(0, data["nI_cycles"]-1))
COOOL This is very exciting, we now have a greater appreciation for which plots we might want to try next...(these are definitely not my final plots, let's start studying all growth cycles)
p.s. the model took quite a long time to get right, so I will plot so many things to see what works best!
We have SO many parameters. In a world where I have infinite time, a model comparison with LPDF's would hold my comparitive analysis to some numerical standard. To do so, stan code would need to be adding log probabilities, which given the model, seems like a lot of work. But the reason hierarchy is so powerful is we can look at the ~9 major parameters from both the linear and exponential model and see how they behave, and our comparison will thus be more interesting than two final numbers. There are so many combinations of visualizations, and I've included a couple.
n_chains = len(samples_lin.posterior.chain)
n_draws = len(samples_lin.posterior.draw)
nI_cycles, nII_cycles = data['nI_cycles'], data['nII_cycles']
var_names=['sigma','mu_sigma', 'sig_sigma',
'mu_aoI','mu_aoII','sig_ao',
'mu_kI','mu_kII','sig_k',]
df_lin = bebi103.stan.posterior_to_dataframe(samples_lin,var_names=var_names)
df_exp = bebi103.stan.posterior_to_dataframe(samples_exp,var_names=var_names)
def hist_ecdf_param(par):
p = bokeh_catplot.histogram(df_exp, val=f'mu_{par}I',palette=['maroon'],width=450,height=100,
fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3),
title=f'{par} bacterium I: exponential (maroon) vs. linear (pink)',)
q = bokeh_catplot.histogram(df_exp, val=f'mu_{par}II',palette=['maroon'],width=450,height=100,
fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3),
title=f'{par} bacterium II: exponential (maroon) vs. linear (pink)')
r = bokeh_catplot.histogram(df_lin, val=f'mu_{par}I',palette=['darksalmon'],p=p,
fill_kwargs=dict(fill_alpha=0.9, line_alpha=0.3))
s = bokeh_catplot.histogram(df_lin, val=f'mu_{par}II',palette=['darksalmon'],p=q,
fill_kwargs=dict(fill_alpha=0.9, line_alpha=0.3))
t = bokeh_catplot.ecdf(df_exp, val=f'mu_{par}I',palette=['maroon'], width=450, height=250)
u = bokeh_catplot.ecdf(df_lin, val=f'mu_{par}I',palette=['darksalmon'], p=t)
v = bokeh_catplot.ecdf(df_exp, val=f'mu_{par}II',palette=['maroon'], width=450, height=250)
w = bokeh_catplot.ecdf(df_lin, val=f'mu_{par}II',palette=['darksalmon'], p=v)
p.xaxis.visible,q.xaxis.visible,=False,False
p.toolbar.autohide,q.toolbar.autohide,t.toolbar.autohide,v.toolbar.autohide=True,True,True,True
bokeh.io.show(bokeh.layouts.layout([[p,q],[t,v]]))
pass
hist_ecdf_param('ao')
hist_ecdf_param('k')
The linear parameters' compensation is clear when we see where the distributions for μ_k and μ_ao fall for both bacterium. Furthermore, in both cases, we can see that the exponential model consistently gives us a smaller spread for its parameters, meaning that its parameters have to work less hard to account for our data.
But inspired from some starry plots from last term's 9.2 solution, let's now see how these parameters behave together.
p = bokeh.plotting.figure(width=500,height=500,title='95% bars: linear vs. exponential',x_axis_label='k',y_axis_label='ao')
n_cycles,ao,k = [nI_cycles, nII_cycles],['aoI', 'aoII'],['kI','kII']
for i in [0,1]: # two bacterium
lin_ao = samples_lin.posterior[ao[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
lin_k = samples_lin.posterior[k[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
exp_ao = samples_exp.posterior[ao[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
exp_k = samples_exp.posterior[k[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
for cycle in range(n_cycles[i]):
low_lin_ao, high_lin_ao = az.hpd(lin_ao[cycle])
low_lin_k, high_lin_k = az.hpd(lin_k[cycle])
low_exp_ao, high_exp_ao = az.hpd(exp_ao[cycle])
low_exp_k, high_exp_k = az.hpd(exp_k[cycle])
p.line(x=np.median(lin_k[cycle]),y=(low_lin_ao, high_lin_ao),line_color='salmon',
line_width=1.2,legend_label='linear')
p.line(x=np.median(exp_k[cycle]), y=(low_exp_ao, high_exp_ao),line_color='maroon',
line_width=1.2,legend_label='exponential')
p.line(x=(low_lin_k, high_lin_k),y=np.median(lin_ao[cycle]),line_color='salmon',line_width=1.2)
p.line(x=(low_exp_k, high_exp_k),y=np.median(exp_ao[cycle]),line_color='maroon',line_width=1.2)
p.legend.location, p.toolbar.autohide='bottom_right', True
bokeh.io.show(p)
Let's zooom in.
p.x_range=Range1d(0.004, 0.014)
p.y_range=Range1d(1.0,1.6)
bokeh.io.show(p)
YAY it is clear that the 't's from the exponential model are much smaller :)
An important parameter worth considering on its own is mu_sigma
, which is a hyperparameter for the centers of our σ's that dictate how the areas spread about their curves. The model with a smaller $μ_σ$ again will further inform us of which model is more fitting.
p = bokeh_catplot.histogram(df_lin, val="mu_sigma", palette=['darksalmon'],width=450,height=150,
fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3),title="location of scale μ_σ: linear vs. exponential")
q = bokeh_catplot.histogram(df_exp, val="mu_sigma", palette=['maroon'],p=p,
fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3))
s = bokeh_catplot.ecdf(df_lin, val='mu_sigma',palette=['darksalmon'],width=450, height=250)
t = bokeh_catplot.ecdf(df_exp, val='mu_sigma',palette=['maroon'],p=s)
p.xaxis.visible,p.toolbar.autohide,s.toolbar.autohide=False,True,True
bokeh.io.show(bokeh.layouts.layout([p,s]))
The hierarchy gives us mu_sigma, the location of our error; the linear is higher
inits
keeps track of the index of our dataframe when the cycles are new# let's add on to our very original plotitng function!
def regression_bokeh(df, width, bacterium,samples,param,n_draws):
'''plots frames and areas of bacteria'''
x, y = "frame", "areas (μm^2)"
color = df["colored"]
p = bokeh.plotting.figure(width=width, height=300,
x_axis_label=x, y_axis_label=y,
tooltips=[(x, "@{frame}"), (y, "@{areas (μm^2)}"), ("cycle", "@{colored}")],
title=f"{bacterium}",
)
inits = df[df['Δframes']==0].index.values
# predictive regressions from `generated quantities`
for i in range(len(inits)-1):
start = inits[i]
end = inits[i+1]
yyy = samples.posterior[f'{param}'].values.T.reshape(len(df), n_draws)[start:end].T
q = bebi103.viz.predictive_regression(yyy,df.frame.values[start:end],p=p,color='purple')
# populate last cycle
end = inits[-1]
yyy = samples.posterior[f'{param}'].values.T.reshape(len(df), n_draws)[end:].T
q = bebi103.viz.predictive_regression(yyy,df.frame.values[end:],p=p,color='purple')
# actual dataa
p.circle(source=df.loc[color % 2 == 0], x=x, y=y, color="maroon",size=2)
p.circle(source=df.loc[color % 2 == 1], x=x, y=y, color="darksalmon",size=2)
p.xgrid.grid_line_color,p.ygrid.grid_line_color = None, None
p.outline_line_color = None
p.title.align, p.title.text_font_style = "center", "bold"
return p
p = regression_bokeh(df1,350,'bacterium 1: linear predictives',samples_lin,'areasI_post',4000)
q = regression_bokeh(df2,900,'bacterium 2: linear predictives',samples_lin,'areasII_post',4000)
r = regression_bokeh(df1,350,'bacterium 1: exponential predictives',samples_exp,'areasI_post',4000)
s = regression_bokeh(df2,900,'bacterium 2: exponential predictives',samples_exp,'areasII_post',4000)
bokeh.io.show(bokeh.layouts.layout([[p,q]]))
bokeh.io.show(bokeh.layouts.layout([[r,s]]))
bebi103.stan.clean_cmdstan()