In [48]:
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import scipy.integrate
import scipy.stats as st
import numba
import cmdstanpy
import arviz as az

import colorcet
import bokeh 
import bokeh_catplot
import bokeh.plotting
from bokeh.models import Range1d
import panel as pn
import holoviews as hv
import bebi103
hv.extension('bokeh')
import bokeh.io
bokeh.io.output_notebook()
bebi103.hv.set_defaults()
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading BokehJS ...

Visualizing Our Data



In [49]:
# recycled code from last term
def plotter_bokeh(df, width, bacterium):
    '''plots frames and areas of bacteria'''
    x, y = "frame", "areas (μm^2)"
    color = df["colored"]
    
    p = bokeh.plotting.figure(width=width, height=300, 
        x_axis_label=x, y_axis_label=y,
        tooltips=[(x, "@{frame}"), (y, "@{areas (μm^2)}"), ("cycle", "@{colored}")],
        title=f"{bacterium}"
    )
    p.circle(source=df.loc[color % 2 == 0], x=x, y=y, color="maroon")
    p.circle(source=df.loc[color % 2 == 1], x=x, y=y, color="darksalmon")

    p.xgrid.grid_line_color, p.ygrid.grid_line_color = None, None
    p.outline_line_color = None
    p.title.align, p.title.text_font_style = "center", "bold"
    p.toolbar.autohide = True
    return p

# to determine biphasic growth, i want to look at sinusoidal behavior of mins, maxs, and how that affects time
def growth_data(df):
    '''takes in dataframe with colored cycles, 
    returns min and max areas, length of cycle, frame of reference'''
    df_stacked = df.set_index("colored")
    mins, maxs, times, frames = [], [], [], []
    # anticipated error for times since  popped off a few frames for time filtering purposes
    
    for i in range(df["colored"].max()+1):
        df_cycle = df.loc[df["colored"] == i]
        mins.append(df_cycle["areas"].min())
        maxs.append(df_cycle["areas"].max())
        times.append(df_cycle["frame"].max() - df_cycle["frame"].min())
        frames.append(df_cycle["frame"].min())
    growths = [(maxs[j] - mins[j]) for j in range(0, len(mins))]
    
    data = pd.DataFrame({"mins": mins, "maxs": maxs, "growths":growths, 
                         "times": times, "frames": frames})
    return data
In [50]:
df1 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_1.csv")
df2 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_2.csv")

data1, data2 = growth_data(df1), growth_data(df2)

df1['bacterium']=['bacterium 1']*len(df1)
df2['bacterium']=['bacterium 2']*len(df2)
df12 = pd.concat([df1, df2])

data1['bacterium']=['bacterium 1']*len(data1)
data2['bacterium']=['bacterium 2']*len(data2)
data12 = pd.concat([data1, data2])

Let's remind ourselves of what our growth cycles look like!

In [51]:
width_b1, width_b2 = int(df1.shape[0] / 3), int(df2.shape[0] / 4)
p1 = plotter_bokeh(df1, width_b1, "bacterium 1")
p2 = plotter_bokeh(df2, width_b2, "bacterium 2")
bokeh.io.show(p1)
bokeh.io.show(p2)

Modeling Individual Caulobacter Growth

HIERARCHICAL MODEL


We expect each growth event to have its own $a_o$ and k, the reason being that each event will start out at a different $a_o$, and perhaps a specific process that is collectively slowing down or accelerating division in that particular growth cycle, or something. We further expect these to be distributed around some location and scale parameter of $a_o$ and k belonging to the cell itself. I make the assumption that because these cells belong to the same species, the spread of these values will be some shared $\sigma_{ao}$ and $\sigma_k$.

So we have something that looks like this:

We expect each growth event to have its own $a_o$ and k, the reason being that each event will start out at a different $a_o$, and perhaps a specific process that is collectively slowing down or accelerating division in that particular growth cycle, or something. We further expect these to be distributed around some location and scale parameter of $a_o$ and k belonging to the cell itself. I make the assumption that because these cells belong to the same species, the spread of these values will be some shared $\sigma_{ao}$ and $\sigma_k$.

So those inform our final μ's. For σ, we will use assume the same, that it is shared among all caulobacter cells, and will have its own center and spread. The choice of distributions are discussed next.

  • note 1: I only have one layer of hierarchy for the location parameters; the cost function of getting two layers to work vs. using one and being able to do more with the estimates after is more appealing. I am not suuper interested in generalizing my parameter estimates to caulobacter en masse; i'm more interested in using the spread of those parameter estimates to compare exponential vs. linear growth with two parent cells. (we also only have two cells which is quite a small sample size).

  • note 2: I considered scaling σ with μ for the exponential model, but decided that since we are in the near linear range of the exponential model, it perhaps isn't worth the effort...

PRIOR DISTRIBUTION FOR $a_o$:


From the plots above, we see that $a_0$ seems to behave sinusoidally. We know from last term (team 14 for hw5), that our times between growth rates remain fairly constant, so let's see what the ECDF of random draws from an evenly spaced distribution of a sine function, and think about what distributions we should look for...

In [52]:
x = np.linspace(0, 2*np.pi,1000)
y = np.sin(x) 
ecdf = bokeh_catplot.ecdf(np.random.choice(y, 1000), 
            title='ECDF Random Draws from sin(x)',height=300,palette=['maroon'])
bokeh.io.show(ecdf)

We should perhaps not choose a normal distribution for $a_0$--the sinoisoid is going in the other direction!

But of course, the actual sinusoid of $a_0$ seems tilted, so just for fun, I'll try to apply a shear transformation to retrieve a new function. I will plot this function, and then sample from it.
As a reminder, a shear transformation matrix looks something like:

\begin{align} \left[\begin{array}{ccc} 1 & \lambda \\ 0 & 1 \end{array}\right] \end{align}

In functional form that means we are mapping $f(x, y) \to f((x+\lambda y), y)$.
This maps $\sin(x) \to \sin(x-\lambda \sin(x))$
We want it to tilt to the right, so we'll now plot λ = -0.5.

In [53]:
p = bokeh.plotting.figure(width=600, height=250, title="sin(x-0.75 sin(x))")
p.circle(x, np.sin(x), legend_label='unsheared',color='maroon')
p.circle(x, np.sin(x-0.75*np.sin(x)), color='indianred', legend_label='sheared')
bokeh.io.show(p)

Scrolling through bacterium 2's growth cycles, we see how this kind of capture the tilt in the negative space underneath the curves.

In [54]:
ecdf = bokeh_catplot.ecdf(np.random.choice(np.sin(x), 1000), 
            title='ECDF Random Draws from sin(x-0.75sin(x))',height=300,palette=['maroon'])
q = bokeh_catplot.ecdf(np.random.choice(np.sin(x-0.75*np.sin(x)), 1000), palette=['indianred'],p=ecdf)
bokeh.io.show(q)

Wobbly! But not very informative. This tells us when hunting for ECDFS, the center bulk of the mass should behave almost linearly. But we see that shearing (red) brings back that sigmoid inflection we're used to seeing with gaussian distributions...

  • Bounding $a_0$: it would be silly for the parent cell to disappear, or experience significant shrinkage, so we definitely want a lower bound at ~0.5 μm^2; it would also be weird if the bacterium retained more than 90% of its contents--90% of the maximum area < 3 μm^2 gives us something like 2.7 μm^2 for our upper bound.

  • Bounding $σ_{ao\mathrm{I}}$: I will tolerate a maximum value of 0.25, for the reasons given above (area considerations).

\begin{align} ao_{\mathrm{I}i} &\sim \mathrm{Normal}(\mu_{ao\mathrm{I}}, \sigma_{ao\mathrm{I}})\\[0.5em] ao_{\mathrm{II}i} &\sim \mathrm{Normal}(\mu_{ao\mathrm{II}}, \sigma_{ao\mathrm{II}})\\[1em] \mu_{ao\mathrm{I}} &\sim \mathrm{LogNormal}(0.15, 0.15) \\[0.5em] \mu_{ao\mathrm{II}} &\sim \mathrm{LogNormal}(0.15, 0.15) \\[0.5em] \sigma_{ao} &\sim \mathrm{Exponential}(20) \\[0.5em] \end{align}

PRIOR DISTRIBUTION FOR k:


  • Bounding k: This desmos graph was used to help choose priors. What's cool about this parameter is that because we can't differentiate between the linear and exponential form, we don't observe extreme growth in ~ 80-120 frames. So back to the desmos plot! The ECDF tells us that in 80-120 seconds, the 'k' term must allow for some growth (no flatlining behavior), so the lower bound for k will be 0.002 (When Δt = 120 seconds, we won't let it only grow 0.2 μm^2). The upper bound will be set at k = 0.01 (when Δt = 80 seconds, we won't let it grow bigger than 3 um^2).
\begin{align} k_{\mathrm{I} i} &\sim \mathrm{Normal}(\mu_{k\mathrm{I}}, \sigma_{k\mathrm{I}})\\[0.5em] k_{\mathrm{II} i} &\sim \mathrm{Normal}(\mu_{k\mathrm{II}}, \sigma_{k\mathrm{II}})\\[1em] \end{align}

WANT k_mu ~ 0.005 ->[0.003,0.006], ... k_sigma ~ 0.001 -> [0,0.0035] ... Exponential(1000)

\begin{align} \mu_{k\mathrm{I}} &\sim \mathrm{Normal}(0.0116, 0.0032) \\[0.5em] \mu_{k\mathrm{II}} &\sim \mathrm{Normal}(0.0116, 0.0032) \\[0.5em] \sigma_{k} &\sim \mathrm{Exponential}(1000) \\[0.5em] \end{align}

The 80-120s range of interest can be seen from the following ecdf:

In [55]:
p = bokeh_catplot.ecdf(data12,cats='bacterium',val='times',tools=['hover'],
        title='Growth Times (s) ECDF',palette=['slateblue','purple'])
p.legend.location='top_left'
bokeh.io.show(p)

Some lingering segmentation errors, in b2, but I turned on hover, found the growth cycles, and plan on dropping these cycles . Can see the errors from EDA (single colored dots with 1-2 frames), completely justifies abandoning these cycles. Analysis for bacterium 1 is quite clean, will be untouched.

  • 33, 34, 63, 77, 78, 91 << 91 is the last frame, cut short so definitely will be dropping
In [56]:
# dropping bad frames
bad_frames = [33, 34, 63, 77, 78, 91]
for frame in bad_frames:
    df2 = df2[df2.colored!=frame]

PRIOR DISTRIBUTION FOR σ:


  • Bounding $σ_I$ and $σ_{II}$: When zoooming in on the growth curves to each individual cycle, we see for bacterium I how each growth faithfully follows the curve and doesn't stray outside of 0.05 um^2. Having a sharp upper bound on σ will be nice bc it might mute all those segmentation errors that the initial code didn't catch for bacterium 2. Let's give em both an exponential that cuts off at 0.05.
\begin{align} \sigma_{\mathrm{I}i} &\sim \mathrm{Normal}(\mu_{\sigma}, \sigma_{\sigma})\\[0.5em] \sigma_{\mathrm{II}i} &\sim \mathrm{Normal}(\mu_{\sigma}, \sigma_{\sigma})\\[1em] \mu_{\sigma} &\sim \mathrm{Exponential}(100)\\[0.5em] \sigma_{\sigma} &\sim \mathrm{Exponential}(500)\\[0.5em] \end{align}

Prior Predictives: searching for potential funnel-like behavior



Now that we have our model, let's sample from it and

  • check if our priors can generate our data
  • check if we can actually sample informatively from these priors

The idea here is that our model is sampling terms of severely different order of magnitudes, so we will likely run into high 'curvatures' in higher dimensional spaces, so recentering all our parameters, we have the following:

Our stan file will be relatively simple. We will only use stan for sampling (so ignore functions block, and transformed parameters block), retrieving area values from the parameters will be done in numpy.

All that we really have to do is consistently draw from normal distributions (0, 1) and properly transform our parameters. Note that all our tilde terms should be unbounded! They are being drawn from normals that allow for negative values, bc these are the scaled terms that tell us whether our error is undershooting or overshooting.

functions {
    real a_theor(real ao, real k) {
        return ao + k*(100);
        /*return ao * exp(k*100); // for now, look at end of growth cycle*/
        /*return ao * exp(k); //returns a_o effectively */
    }
}

parameters {
    real<lower=0> mu_ao;
    real<lower=0> sig_ao;
    real<lower=0> sig_k;
    real<lower=0> mu_sigma;
    real<lower=0> sig_sigma;

    real ao_tilde;     // bounding tildes? -> probs not
    real k_tilde;
    real mu_k_tilde;
    real sigma_tilde;
    real a_tilde;
}

transformed parameters {
    real ao;
    real mu_k;
    real k;
    real sigma;
    real mu;
    real a;

    ao    = mu_ao    + ao_tilde * sig_ao;
    mu_k  = 0.0116   + 0.0032  * mu_k_tilde;
    k     = mu_k     + k_tilde * sig_k;
    sigma = mu_sigma + sigma_tilde * sig_sigma;
    mu    = a_theor(ao, k);
    a     = mu       + a_tilde * sigma;
}

model {
    ao_tilde     ~ normal(0,1);
    mu_ao        ~ lognormal(0.15,0.15);
    sig_ao       ~ exponential(20);

    k_tilde      ~ normal(0,1);
    mu_k_tilde   ~ normal(0,1);
    sig_k        ~ exponential(1000);

    sigma_tilde  ~ normal(0,1);
    mu_sigma     ~ exponential(100);
    sig_sigma    ~ exponential(500);

    a_tilde      ~ normal(0, 1);
}

SAMPLING


In [58]:
bebi103.stan.clean_cmdstan()
sampler_nc = cmdstanpy.CmdStanModel(stan_file='testing_noncentering.stan')
INFO:cmdstanpy:stan to c++ (/Users/rosita/bi103/bi103b/week10/testing_noncentering.hpp)
INFO:cmdstanpy:compiling c++
INFO:cmdstanpy:compiled model file: /Users/rosita/bi103/bi103b/week10/testing_noncentering
In [ ]:
with bebi103.stan.disable_logging():
    samples_nc = sampler_nc.sample(chains=4,
                                   cores=8,
                                   sampling_iters=5000,
                                   warmup_iters=10000, 
                                   thin=5,
                                   adapt_delta=0.97)
samples_nc = az.from_cmdstanpy(posterior=samples_nc)
In [59]:
# samples_nc.to_netcdf('testing_noncentering.nc')
samples_nc = az.from_netcdf('testing_noncentering.nc')
In [60]:
bebi103.stan.check_all_diagnostics(samples_nc)
Effective sample size looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
Out[60]:
0
In [61]:
parcoord_sm = bebi103.viz.parcoord_plot(samples_nc,pars=["mu","ao","sig_ao"])
parcoord_bi = bebi103.viz.parcoord_plot(samples_nc,pars=["k","sig_k","sigma","sig_sigma"])
bokeh.io.show(bokeh.layouts.layout([[parcoord_sm, parcoord_bi]]))

Everything looks of order of magnitude!we can see the sigmas of each shrinking ~1 oom below th eactual parameter...

To study funnels in particular, we'll look at each parameter along with it's σ.

In [62]:
import bokeh.palettes
palette = bokeh.palettes.Category20b[4]
pars = ["ao","sig_ao","k","sig_k","sigma","sig_sigma","mu","sigma"]
ao, sig_ao, k, sig_k, sigma, sig_sig, mu, sigma= (
    [samples_nc.posterior[f"{par}"].values.flatten() for par in pars] )

ps = []
for i, tup in enumerate([(ao, sig_ao),(k, sig_k),(sigma, sig_sig),(mu, sigma)]):
    p = bokeh.plotting.figure(height=300, width=250,title=f"σ & {pars[2*i]}",
                              x_axis_label=pars[2*i],y_axis_label=pars[2*i+1])
    p.circle(tup[0],tup[1],alpha=0.8, color=palette[0],size=1)
    ps.append(p)
bokeh.io.show(bokeh.layouts.gridplot(ps, ncols=4))

We are getting some negative values for k; let's find out what fraction...

In [63]:
num_neg_k = np.sum(samples_nc.posterior.k.values.flatten()<0) 
num_neg_mu = np.sum(samples_nc.posterior.mu.values.flatten()<0)
num_neg_ao = np.sum(samples_nc.posterior.ao.values.flatten()<0) 

num_total_draws = len(samples_nc.posterior.chain)*len(samples_nc.posterior.draw)
print("fraction of negative k's", num_neg_k / num_total_draws)
print("fraction of negative mu's", num_neg_mu / num_total_draws)
print("fraction of negative ao's", num_neg_ao / num_total_draws)
fraction of negative k's 0.002
fraction of negative mu's 0.0
fraction of negative ao's 0.0

I am willing to accept this as a tradeoff for letting k's sigma be able to reach 0.02 or a tad higher, but evrything else that is positive seems good!

Let's look at the trace plot for one chain...

In [64]:
pars=['k','sig_k','ao','sig_ao','sigma','sig_sigma','mu_ao','mu_k','mu_sigma',]
bokeh.io.show(bebi103.viz.trace_plot(samples_nc, pars=pars, plot_width=2000,plot_height=80,
    palette=['purple','pink','red','black','white','white','white','white','white',]))

Things could be much worse! Our squiggles aren't as squiggly as they could be, but even with one level of hierarchy, we have like 9 parameters to keep happy, which is difficult. (To keep the size of this html file down, I did not include the trace plots from the centered distribution samples, but they went quite flat for a concerning amount of parameters.)

To check if our actual numbers and distributions are ok, let's now check to see what kind of areas during the growth cycle these parameters generate.

PLOTTING


I will now manually do prior predictive checks by retrieving $a_o$ and k values from the stan sample, and use a linspace for frame/time.

(reason: wrote stan code with generated quantities and started getting NaN values in the parameters, but ihave zero NaN values the way things are now, so I'm switching my pcc to python!)

In [65]:
times = np.arange(100) # ~100 seconds, ~100 frames per growth event

ao_prior = samples_nc.posterior.ao.values.flatten()
k_prior = samples_nc.posterior.k.values.flatten()
sigma_prior = samples_nc.posterior.sigma.values.flatten()

def a_theor_lin(ao,k,t):
    return ao+k*t
def a_theor_exp(ao,k,t):
    return ao*np.exp(k*t)

a_lins = np.empty([len(k_prior),len(times)])
a_exps = np.empty([len(k_prior),len(times)])

for i in range(len(k_prior)):
    a_lin=np.empty(len(times))
    a_exp=np.empty(len(times))
    for t in range(len(times)):
        mu_lin = a_theor_lin(ao_prior[i], k_prior[i], t)
        mu_exp = a_theor_exp(ao_prior[i], k_prior[i], t)
        a_lin[t] = st.norm.rvs(loc=mu_lin, scale=abs(sigma_prior[i]), size=1)
        a_exp[t] = st.norm.rvs(loc=mu_exp, scale=abs(sigma_prior[i]), size=1)
    a_lins[i] = a_lin
    a_exps[i] = a_exp
In [66]:
palette = bokeh.palettes.Category20b[4]
p = bokeh.plotting.figure(height=400,width=300,title='linear prior checks: 8 draws',
                         x_axis_label='time',y_axis_label='area (μm^2)')
q = bokeh.plotting.figure(height=400,width=300,title="exponential prior checks: 8 draws",
                         x_axis_label='time',y_axis_label='area (μm^2)')
for i in range(0,len(a_lins),150):
    p.circle(times, a_lins[i],fill_alpha=0.1,alpha=0.5,color=palette[1])
    q.circle(times, a_exps[i],fill_alpha=0.1,alpha=0.5,color=palette[1])
bokeh.io.show(bokeh.layouts.layout([[p,q]]))

These look completely reasonable!! Since we have ~100 discrete time frames, we can actually simulate the percentile plots as well (above are less informative density-wise).

In [67]:
a_linsT, a_expsT = a_lins.T, a_exps.T

def return_array_percentiles(interval):
    low_lins, high_lins = np.empty(len(times)), np.empty(len(times))
    low_exps, high_exps = np.empty(len(times)), np.empty(len(times))
    for t in times:
        low_lins[t], high_lins[t] = np.percentile(a_linsT[t], interval)
        low_exps[t], high_exps[t] = np.percentile(a_expsT[t], interval)
    return low_lins, high_lins, low_exps, high_exps

low_lin_99,high_lin_99,low_exp_99,high_exp_99 = return_array_percentiles([0.05, 99.5])
low_lin_95,high_lin_95,low_exp_95,high_exp_95 = return_array_percentiles([2.5, 97.5])
low_lin_50,high_lin_50,low_exp_50,high_exp_50 = return_array_percentiles([25, 75])

def draw_circles(xarr, yarr, color, plot):
    plot.circle(xarr, yarr, color=color)
    pass

arrs_lin = [low_lin_99,high_lin_99,low_lin_95,high_lin_95,low_lin_50,high_lin_50]
arrs_exp = [low_exp_99,high_exp_99,low_exp_95,high_exp_95,low_exp_50,high_exp_50]
palette = ['thistle']*2 + ['purple']*2 + ['indigo']*2

p = bokeh.plotting.figure(height=400, width=300,
        title="lin prior pred %: 99th, 95th, 50th")
q = bokeh.plotting.figure(height=400, width=300, 
        title="expo prior pred %: 99th, 95th, 50th")

for i, arr in enumerate(arrs_lin):
    draw_circles(times, arr, palette[i], p)
for i, arr in enumerate(arrs_exp):
    draw_circles(times, arr, palette[i], q)
    
for i in range(len(times)): 
    p.line(times[i], (low_lin_99[i], high_lin_99[i]), line_color=palette[0], line_width=1.5)
    p.line(times[i], (low_lin_95[i], high_lin_95[i]), line_color=palette[2], line_width=1.5)
    p.line(times[i], (low_lin_50[i], high_lin_50[i]), line_color=palette[4], line_width=1.5)
    q.line(times[i], (low_exp_99[i], high_exp_99[i]), line_color=palette[0], line_width=1.5)
    q.line(times[i], (low_exp_95[i], high_exp_95[i]), line_color=palette[2], line_width=1.5)
    q.line(times[i], (low_exp_50[i], high_exp_50[i]), line_color=palette[4], line_width=1.5)

bokeh.io.show(bokeh.layouts.layout([[p,q]]))

this is great, these are exactly the kind of percentiles i had in mind! 'k' is a little wild, but again any beta above an exponential(1000) caused divergences, and I don't want an extreme prior on something as nebulous as the prior on the sigma of 'k' to overwhelm anything... also, the center 50% of the percentiles capture what I had in mind.

Here's a broad picture of how the analysis will go: if growth is exponential, we will see a smaller 'k' from the exponential model, because the linear model has to kind of compensate for the final growth areas with a higher 'k'. On the other hand, if growth is linear, we will observe higher 'k's from the exponential model since the predominant values are in the ~linear region, and it will try to fit the initial flatter parts to a more linear function. (this can be seen with the desmos plots).

Posterior Sampling: and so we begin...



In stan file, we will model cycle by cycle, so let's define new column in dataframe called Δframe that gives us Δt (in seconds) since we start our clock.

lots of code was needed to get the Δframe column :/ ...
can't wait to see the one-liner solution

In [68]:
df1 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_1.csv")
df2 = pd.read_csv("../../bi103a/CCTV/datasets_csv/bacterium_2.csv")

bad_frames = [33, 34, 63, 77, 78, 91]
for frame in bad_frames:
    df2 = df2[df2.colored!=frame]

df1['frame'] = df1['frame']-4 # let's start at zero
df2['frame'] = df2['frame']-7 # let's start at zero

for df in [df1, df2]:
    cutoff_frames = (df[df.colored.diff()==1]).values.T[0]
    frames, delta_frames = df.frame.values, []
    delta_frames.append((frames[0:np.where(frames==cutoff_frames[0])[0][0]]).tolist())
    for i in range(len(cutoff_frames)-1):
        start = np.where(frames==cutoff_frames[i])[0][0]
        stop = np.where(frames==cutoff_frames[i+1])[0][0]
        delta_frames.append((frames[start:stop] - frames[start]).tolist())
    last = np.where(frames==cutoff_frames[-1])[0][0]
    delta_frames.append((frames[last:]-frames[last]).tolist())
    delta_frames = np.hstack(delta_frames)
    df['Δframes'] = delta_frames

df1['cycles'] = df1['colored']
# fixing cycles so that they match the index for stan file! 
_, counts = np.unique(df2.colored, return_counts=True)
cycles = np.arange(len(_))
fresh_cycles = []
for i in range(len(cycles)):
    fresh_cycles.append([cycles[i]]*counts[i])
df2['cycles'] = np.hstack(fresh_cycles)
df12 = pd.concat([df1, df2])
In [69]:
data = dict(len_df1=len(df1), len_df2=len(df2), 
            nI_cycles=len(np.unique(df1.cycles)), 
            nII_cycles=len(np.unique(df2.cycles)),
            timesI=df1.Δframes.values,
            timesII=df2.Δframes.values,
            areasI=df1['areas (μm^2)'].values,
            areasII=df2['areas (μm^2)'].values,
            cyclesI=df1.cycles.values,
            cyclesII=df2.cycles.values,)
In [ ]:
bebi103.stan.clean_cmdstan()
sampler_exp = cmdstanpy.CmdStanModel(stan_file="ccexp3.stan")
sampler_lin = cmdstanpy.CmdStanModel(stan_file="cclin3.stan")
In [ ]:
with bebi103.stan.disable_logging():
    samples_lin_ = sampler_lin.sample(data=data, 
                chains=4,
                warmup_iters=2000, 
                adapt_delta=0.98,)
    
with bebi103.stan.disable_logging():
    samples_exp_ = sampler_exp.sample(data=data, 
                chains=4,
                cores=8,
                warmup_iters=3000, 
                max_treedepth=12,
                adapt_delta=0.97,)
In [ ]:
samples_lin = az.from_cmdstanpy(posterior=samples_lin_)
samples_exp = az.from_cmdstanpy(posterior=samples_exp_)
In [70]:
# these were the samples used
samples_lin = az.from_netcdf('good_all_cycles_lin.nc')
samples_exp = az.from_netcdf('good_all_cycles_exp.nc')

CHECKING DIAGNOSTICS


I tried to be super careful with my diagnostics, since I didnt have time to do SBC (probably spent an extra day on priors instead of running sbc) bebi103.stan.check_all_diagnostics will treat the very very large mu-vector as a ton of parameters, which they are really just an intermediate to retrieve location parameters for the final a values we pass in as our data... these two are not the actual parameters of interest, (a_o and k), so we will manually look at .sample_stats to check for divergences and whatnot.

  • Let's get fractions of divergences:
In [71]:
num_divergences_exp = np.sum(samples_exp.sample_stats.diverging.values.flatten())
num_divergences_lin = np.sum(samples_lin.sample_stats.diverging.values.flatten())
total_draws_exp = len(samples_exp.sample_stats.draw) * len(samples_exp.sample_stats.chain)
total_draws_lin = len(samples_lin.sample_stats.draw) * len(samples_lin.sample_stats.chain)
print('lin fraction divergences: ', num_divergences_lin / total_draws_lin)
print('expo fraction divergences: ', num_divergences_exp / total_draws_exp)
lin fraction divergences:  0.0
expo fraction divergences:  0.0

wOw -- our non-centering worked nicely!

In [72]:
# parameters of interest--backbone parameters driving the model
var_names=['mu_sigma','sig_sigma','sigma','sig_ao','sig_k','mu_aoI','mu_aoII','mu_kI','mu_kII']
  • Look at Rhat : want less than 1.1
In [73]:
# linear
az.rhat(samples_lin, var_names=var_names)
Out[73]:
<xarray.Dataset>
Dimensions:    ()
Data variables:
    mu_sigma   float64 1.008
    sig_sigma  float64 1.006
    sigma      float64 0.9997
    sig_ao     float64 1.024
    sig_k      float64 1.014
    mu_aoI     float64 1.024
    mu_aoII    float64 1.084
    mu_kI      float64 1.027
    mu_kII     float64 1.042
In [74]:
# exponential
az.rhat(samples_exp, var_names=var_names)
Out[74]:
<xarray.Dataset>
Dimensions:    ()
Data variables:
    mu_sigma   float64 1.004
    sig_sigma  float64 1.001
    sigma      float64 1.001
    sig_ao     float64 1.012
    sig_k      float64 1.007
    mu_aoI     float64 1.021
    mu_aoII    float64 1.058
    mu_kI      float64 1.004
    mu_kII     float64 1.031

these are soooo good... noncentering is GREAT

  • look at ess_tail : want > 400
In [75]:
# linear
az.summary(samples_lin, var_names=var_names)
Out[75]:
mean sd hpd_3% hpd_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
mu_sigma 0.033 0.004 0.025 0.039 0.000 0.000 375.0 375.0 720.0 489.0 1.01
sig_sigma 0.002 0.002 0.000 0.007 0.000 0.000 845.0 801.0 906.0 834.0 1.01
sigma 0.034 0.000 0.034 0.035 0.000 0.000 3716.0 3716.0 3721.0 2112.0 1.00
sig_ao 0.195 0.013 0.172 0.220 0.001 0.001 140.0 139.0 144.0 344.0 1.02
sig_k 0.001 0.000 0.001 0.001 0.000 0.000 175.0 173.0 179.0 499.0 1.01
mu_aoI 1.259 0.043 1.177 1.337 0.003 0.002 154.0 154.0 155.0 310.0 1.02
mu_aoII 1.199 0.021 1.158 1.238 0.003 0.002 45.0 45.0 45.0 93.0 1.08
mu_kI 0.010 0.000 0.010 0.011 0.000 0.000 159.0 159.0 159.0 256.0 1.03
mu_kII 0.010 0.000 0.010 0.010 0.000 0.000 107.0 106.0 108.0 215.0 1.04
In [76]:
# exponential
az.summary(samples_exp, var_names=var_names)
Out[76]:
mean sd hpd_3% hpd_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
mu_sigma 0.032 0.003 0.025 0.037 0.000 0.000 1285.0 1285.0 1943.0 1283.0 1.00
sig_sigma 0.002 0.002 0.000 0.006 0.000 0.000 2268.0 2066.0 2558.0 2033.0 1.00
sigma 0.033 0.000 0.032 0.033 0.000 0.000 3870.0 3866.0 3863.0 2911.0 1.00
sig_ao 0.145 0.010 0.128 0.164 0.001 0.000 337.0 337.0 335.0 665.0 1.01
sig_k 0.000 0.000 0.000 0.001 0.000 0.000 486.0 486.0 479.0 883.0 1.01
mu_aoI 1.302 0.033 1.243 1.368 0.002 0.001 286.0 286.0 286.0 611.0 1.02
mu_aoII 1.263 0.017 1.232 1.295 0.002 0.001 65.0 64.0 64.0 166.0 1.06
mu_kI 0.006 0.000 0.006 0.006 0.000 0.000 392.0 392.0 394.0 683.0 1.00
mu_kII 0.006 0.000 0.006 0.006 0.000 0.000 147.0 147.0 145.0 426.0 1.03

pretty good! Virtually all are above 400 :)

With this many parameters, looking for possible correlation and anti-correlation will be hard on parcoord plots, and only really makes sense when they're of similar orders of magnitude, so we'll group them like so. This will also help us check our initial bounds on our priors.
oom = order of magnitude

In [77]:
pars = ['sigma','sig_sigma','mu_sigma','sig_ao','kI[0]','kII[0]','sig_k','mu_kI','mu_kII']
pc_lin_s = bebi103.stan.viz.parcoord_plot(samples_lin,pars=pars,title='linear: oom ~10^-2')
pc_exp_s = bebi103.stan.viz.parcoord_plot(samples_exp,pars=pars,title='exponential: oom ~10^-2')

pars = ['mu_aoI','mu_aoII','aoI[0]','aoII[0]']
pc_lin_b = bebi103.stan.viz.parcoord_plot(samples_lin, pars=pars,title='linear: oom ~10^0')
pc_exp_b = bebi103.stan.viz.parcoord_plot(samples_exp, pars=pars,title='exponential: oom ~10^0')
bokeh.io.show(bokeh.layouts.layout([pc_lin_s, pc_lin_b, pc_exp_s, pc_exp_b]))

After staring at these for awhile, the OOMs look good; i also noticed that after repeated samples and more staring, small values of sig_sigma are where the orange strings come from -> this is too be expected! sig_sigma has an exponential that has most of its mass sitting closest to zero out of all the other parameters, but given the tiny fraction of divergences i think we can move on.

Let's look at trace plots to see if any of our chains get stuck.

  • linear
In [78]:
pars=['kI[0]','sig_k','aoI[0]','sig_ao','sigma','sig_sigma','mu_aoI','mu_kI','mu_sigma',]
bokeh.io.show(bebi103.viz.trace_plot(samples_lin,pars=pars,plot_width=1000,plot_height=100,
    palette=['purple','salmon','black','white','white','white','white','white','white',]))
  • exponential
In [79]:
bokeh.io.show(bebi103.viz.trace_plot(samples_exp, pars=pars, plot_width=1000,plot_height=110,
    palette=['purple','salmon','black','white','white','white','white','white','white',]))

No flatlines! ekg's stayin alive

  • mu_aoI looks suspiciously glued to the value 1.2, but when we think about it, this is the mu_ao belonging to the very first growth cycle, which if you scroll allll the way back up to our bacterium 1's growth cycles, a_o is indeed around that value... this value does not characterize all growth cycles, only one

To compare plots with our priors, let's check those funnels out... each parameter with it's σ.

In [80]:
pars = ["aoI[0]","sig_ao","kI[0]","sig_k","sigma","sig_sigma","muI[50]","sigma"]

Let's limit our very initial analysis to one cycle and see how these vector parameters behaved.

In [81]:
palette = bokeh.palettes.Category20b[4]
def sig_par_plots(cycle=0):
    pars = [f"aoI[{cycle}]","sig_ao",f"kI[{cycle}]","sig_k",
            "sigma","sig_sigma",f"muI[{cycle}]","sigma"]
    plots,title=[],['lin','exp']
    for i, samples in enumerate([samples_lin, samples_exp]):
        ao, k, mu = ([samples.posterior[f"{par}"][cycle].values.flatten() 
                  for par in ["aoI","kI","muI"]] )
        sig_ao, sig_k, sigma, sig_sigma, sigma = (
            [samples.posterior[f"{par}"].values.flatten() for par in 
                 ["sig_ao", "sig_k", "sigma", "sig_sigma", "sigma"]])
        ps = []
        for j, tup in enumerate([(ao, sig_ao),(k, sig_k),(sigma, sig_sigma),(mu, sigma)]):
            p = bokeh.plotting.figure(height=300, width=350,
                title=f"{title[i]}: σ of {pars[2*j]}",
                                  x_axis_label=pars[2*j],y_axis_label=pars[2*j+1])
            p.circle(tup[0][:2000],tup[1][:2000],alpha=0.8, color=palette[0],size=1)
            ps.append(p)
        plots.append(ps)
    for i in range(4):
        plots[0][i].y_range = plots[1][i].y_range
        #plots[0][i].x_range = plots[1][i].x_range
    return bokeh.layouts.layout([[bokeh.layouts.gridplot(plots[0],ncols=1),
                                     bokeh.layouts.gridplot(plots[1], ncols=1)]])
pn.interact(sig_par_plots,cycle=(0,data['nI_cycles']-1))
Out[81]:

Our sampler seems to be having just like we need it to!

Although we are really only looking at one growth cycle at a time right now, we can already see that kI[0] (bacterium I's first cycle) has a much tighter sigma_k fit. All the sigma's for all these parameters are in general much smaller in comparison as well! Additionally, the 'k' values (look @ x-axis second row), are smaller in the exponential model than in the linear model. From the analysis given in the cells above, this is indicative of the linear model overcompensating for the areas toward the end of growth.

So this is what I mean:

In [82]:
x = np.linspace(-2, 130, 1000)
ao_exp, k_exp=1, 0.01
def ak_comparison(hundredx_ao_linear=80.0, hundredx_k_linear=2.0):
    ylin = hundredx_ao_linear/100 + hundredx_k_linear/100*x
    yexp = ao_exp * np.exp(k_exp*x)
    p = bokeh.plotting.figure(height=200, width=700,title="comparison of 'ao' and 'k' behavior")
    p.circle(x, ylin, color='darksalmon',legend_label='linear')
    p.circle(x, yexp, color='maroon',legend_label='exponential')
    p.legend.location='bottom_right'
    return p
pn.interact(ak_comparison,hundredx_ao_linear=(20.0, 130.0), hundredx_k_linear=(1.0, 4.0))
Out[82]:

(The sliders are discretized and cna only move in +/- 0.10 on my machine, so I scaled the inputs to 100a_o and 100k for smoothness.)

We can see that if a linear model is trying to fit an exponential model, it will have a higher 'k' (here it is 0.02 as opposed to 0.01) and thus have a lower 'a' (here a snapshot of 0.8 as opposed to 1.00)

In [37]:
def sig_par_same_plot(cycle=0):
    ao_lin,k_lin=([samples_lin.posterior[f"{par}"][cycle].values.flatten() for par in ["aoI","kI"]])
    sig_ao_lin,sig_k_lin = ([samples_lin.posterior[f"{par}"].values.flatten() for par in ["sig_ao","sig_k"]])
    ao_exp,k_exp=([samples_exp.posterior[f"{par}"][cycle].values.flatten() for par in ["aoI","kI"]])
    sig_ao_exp,sig_k_exp = ([samples_exp.posterior[f"{par}"].values.flatten() for par in ["sig_ao", "sig_k",]])
    ps = []
    for j, tup in enumerate([(ao_lin,sig_ao_lin,ao_exp,sig_ao_exp),(k_lin,sig_k_lin,k_exp,sig_k_exp),]):
        p = bokeh.plotting.figure(height=400, width=350,
            title=f"{pars[2*j][:-3]}[{cycle}] and it's σ",x_axis_label=pars[2*j],y_axis_label=pars[2*j+1])
        p.circle(tup[0][:2000],tup[1][:2000],alpha=0.8,color='darksalmon',size=0.8,legend_label='linear')
        p.circle(tup[2][:2000],tup[3][:2000],alpha=0.8,color='maroon',size=0.8,legend_label='exponential')
        p.legend.location='top_left'
        ps.append(p)
    return bokeh.layouts.gridplot(ps, ncols=2)
pn.interact(sig_par_same_plot, cycle=(0, data["nI_cycles"]-1))
Out[37]:

COOOL This is very exciting, we now have a greater appreciation for which plots we might want to try next...(these are definitely not my final plots, let's start studying all growth cycles)

p.s. the model took quite a long time to get right, so I will plot so many things to see what works best!

you have potheads, potterheads, and plotterheads...



We have SO many parameters. In a world where I have infinite time, a model comparison with LPDF's would hold my comparitive analysis to some numerical standard. To do so, stan code would need to be adding log probabilities, which given the model, seems like a lot of work. But the reason hierarchy is so powerful is we can look at the ~9 major parameters from both the linear and exponential model and see how they behave, and our comparison will thus be more interesting than two final numbers. There are so many combinations of visualizations, and I've included a couple.

CENTERS FOR `a_o` & `k`


In [83]:
n_chains = len(samples_lin.posterior.chain)
n_draws = len(samples_lin.posterior.draw)
nI_cycles, nII_cycles = data['nI_cycles'], data['nII_cycles']

var_names=['sigma','mu_sigma', 'sig_sigma', 
           'mu_aoI','mu_aoII','sig_ao', 
           'mu_kI','mu_kII','sig_k',]
df_lin = bebi103.stan.posterior_to_dataframe(samples_lin,var_names=var_names)
df_exp = bebi103.stan.posterior_to_dataframe(samples_exp,var_names=var_names)
In [84]:
def hist_ecdf_param(par):
    p = bokeh_catplot.histogram(df_exp, val=f'mu_{par}I',palette=['maroon'],width=450,height=100,
                                fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3),
                                title=f'{par} bacterium I: exponential (maroon) vs. linear (pink)',)
    q = bokeh_catplot.histogram(df_exp, val=f'mu_{par}II',palette=['maroon'],width=450,height=100,
                                fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3),
                                title=f'{par} bacterium II: exponential (maroon) vs. linear (pink)')
    r = bokeh_catplot.histogram(df_lin, val=f'mu_{par}I',palette=['darksalmon'],p=p,
                                fill_kwargs=dict(fill_alpha=0.9, line_alpha=0.3))
    s = bokeh_catplot.histogram(df_lin, val=f'mu_{par}II',palette=['darksalmon'],p=q,
                                fill_kwargs=dict(fill_alpha=0.9, line_alpha=0.3))
    t = bokeh_catplot.ecdf(df_exp, val=f'mu_{par}I',palette=['maroon'], width=450, height=250)
    u = bokeh_catplot.ecdf(df_lin, val=f'mu_{par}I',palette=['darksalmon'], p=t)
    v = bokeh_catplot.ecdf(df_exp, val=f'mu_{par}II',palette=['maroon'], width=450, height=250)
    w = bokeh_catplot.ecdf(df_lin, val=f'mu_{par}II',palette=['darksalmon'], p=v)
    p.xaxis.visible,q.xaxis.visible,=False,False
    p.toolbar.autohide,q.toolbar.autohide,t.toolbar.autohide,v.toolbar.autohide=True,True,True,True
    bokeh.io.show(bokeh.layouts.layout([[p,q],[t,v]]))
    pass
In [85]:
hist_ecdf_param('ao')
hist_ecdf_param('k')

The linear parameters' compensation is clear when we see where the distributions for μ_k and μ_ao fall for both bacterium. Furthermore, in both cases, we can see that the exponential model consistently gives us a smaller spread for its parameters, meaning that its parameters have to work less hard to account for our data.

But inspired from some starry plots from last term's 9.2 solution, let's now see how these parameters behave together.

  • All growth cycles are pooled together.
  • Each bar is a density from 4000 draws from stan belonging to each parameter in a particular growth cycle.
  • The location of one parameter's bar along the other axis is @ the median of that other parameter, so each 't' is a cycle.
  • The x and y ranges are set equal for comparison purposes.
In [86]:
p = bokeh.plotting.figure(width=500,height=500,title='95% bars: linear vs. exponential',x_axis_label='k',y_axis_label='ao')

n_cycles,ao,k = [nI_cycles, nII_cycles],['aoI', 'aoII'],['kI','kII']
for i in [0,1]: # two bacterium
    lin_ao = samples_lin.posterior[ao[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
    lin_k = samples_lin.posterior[k[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
    exp_ao = samples_exp.posterior[ao[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
    exp_k = samples_exp.posterior[k[i]].values.T.reshape(n_cycles[i], n_chains*n_draws)
    for cycle in range(n_cycles[i]):
        low_lin_ao, high_lin_ao = az.hpd(lin_ao[cycle])
        low_lin_k, high_lin_k = az.hpd(lin_k[cycle])
        low_exp_ao, high_exp_ao = az.hpd(exp_ao[cycle])
        low_exp_k, high_exp_k = az.hpd(exp_k[cycle])
        p.line(x=np.median(lin_k[cycle]),y=(low_lin_ao, high_lin_ao),line_color='salmon',
               line_width=1.2,legend_label='linear')
        p.line(x=np.median(exp_k[cycle]), y=(low_exp_ao, high_exp_ao),line_color='maroon',
               line_width=1.2,legend_label='exponential')
        p.line(x=(low_lin_k, high_lin_k),y=np.median(lin_ao[cycle]),line_color='salmon',line_width=1.2)
        p.line(x=(low_exp_k, high_exp_k),y=np.median(exp_ao[cycle]),line_color='maroon',line_width=1.2)
p.legend.location, p.toolbar.autohide='bottom_right', True
bokeh.io.show(p)

Let's zooom in.

In [87]:
p.x_range=Range1d(0.004, 0.014)
p.y_range=Range1d(1.0,1.6)
bokeh.io.show(p)

YAY it is clear that the 't's from the exponential model are much smaller :)

LOCATION OF $\mu_{\sigma}$


An important parameter worth considering on its own is mu_sigma, which is a hyperparameter for the centers of our σ's that dictate how the areas spread about their curves. The model with a smaller $μ_σ$ again will further inform us of which model is more fitting.

In [88]:
p = bokeh_catplot.histogram(df_lin, val="mu_sigma", palette=['darksalmon'],width=450,height=150,
                           fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3),title="location of scale μ_σ: linear vs. exponential")
q = bokeh_catplot.histogram(df_exp, val="mu_sigma", palette=['maroon'],p=p,
                           fill_kwargs=dict(fill_alpha=0.8, line_alpha=0.3))
s = bokeh_catplot.ecdf(df_lin, val='mu_sigma',palette=['darksalmon'],width=450, height=250)
t = bokeh_catplot.ecdf(df_exp, val='mu_sigma',palette=['maroon'],p=s)
p.xaxis.visible,p.toolbar.autohide,s.toolbar.autohide=False,True,True
bokeh.io.show(bokeh.layouts.layout([p,s]))

The hierarchy gives us mu_sigma, the location of our error; the linear is higher

PREDICTIVE REGRESSIONS


  • life feels incomplete withought these final predictive regressions;zooming in, we can see the behavior characterized abovev by the parmaeters
  • i addded a loop so we don't get weird zig zags (in between frame values.,) index of inits keeps track of the index of our dataframe when the cycles are new
In [89]:
# let's add on to our very original plotitng function!
def regression_bokeh(df, width, bacterium,samples,param,n_draws):
    '''plots frames and areas of bacteria'''
    x, y = "frame", "areas (μm^2)"
    color = df["colored"]
    
    p = bokeh.plotting.figure(width=width, height=300, 
        x_axis_label=x, y_axis_label=y,
        tooltips=[(x, "@{frame}"), (y, "@{areas (μm^2)}"), ("cycle", "@{colored}")],
        title=f"{bacterium}",
    )
    inits = df[df['Δframes']==0].index.values
    # predictive regressions from `generated quantities`
    for i in range(len(inits)-1):
        start = inits[i]
        end = inits[i+1]
        yyy = samples.posterior[f'{param}'].values.T.reshape(len(df), n_draws)[start:end].T
        q = bebi103.viz.predictive_regression(yyy,df.frame.values[start:end],p=p,color='purple')
    # populate last cycle
    end = inits[-1]
    yyy = samples.posterior[f'{param}'].values.T.reshape(len(df), n_draws)[end:].T
    q = bebi103.viz.predictive_regression(yyy,df.frame.values[end:],p=p,color='purple')
    
    # actual dataa
    p.circle(source=df.loc[color % 2 == 0], x=x, y=y, color="maroon",size=2)
    p.circle(source=df.loc[color % 2 == 1], x=x, y=y, color="darksalmon",size=2)

    p.xgrid.grid_line_color,p.ygrid.grid_line_color = None, None
    p.outline_line_color = None
    p.title.align, p.title.text_font_style = "center", "bold"
    return p
In [90]:
p = regression_bokeh(df1,350,'bacterium 1: linear predictives',samples_lin,'areasI_post',4000)
q = regression_bokeh(df2,900,'bacterium 2: linear predictives',samples_lin,'areasII_post',4000)
r = regression_bokeh(df1,350,'bacterium 1: exponential predictives',samples_exp,'areasI_post',4000)
s = regression_bokeh(df2,900,'bacterium 2: exponential predictives',samples_exp,'areasII_post',4000)
bokeh.io.show(bokeh.layouts.layout([[p,q]]))
bokeh.io.show(bokeh.layouts.layout([[r,s]]))
In [91]:
bebi103.stan.clean_cmdstan()
In [ ]: