Introduction¶
Here we attempt to separate corn plants from NPC for the ACC Test 1 experiment using "dynamics" features created from <pipeline-01-dynamics.ipynb>.
Keep in mind that for each device, paddle, and electrode we transform the V1, V255 trajectory into a new coordinate system, which roughly corresponds to how much those two values move in tandom and how much they diverge. We will show below that there seems to be signal in the latter for separating plants from NPC.
Setup¶
import os
import h5py
import pandas as pd
import numpy as np
import progressbar
import importlib as il
from matplotlib import pyplot as plt
import plotnine as gg
from torch.utils.data import Dataset
import statsmodels.api as sm
from patsy import dmatrices
%matplotlib inline
il.import_module('rtphenos')
<module 'rtphenos' from '/home/jesse/rtphenos/src/rtphenos/__init__.py'>
from rtphenos.detects2 import transforms as transforms2
from rtphenos.detects1 import transforms as transforms1
Get list of data to process¶
rt_deps = pd.read_csv("rt_acc1_deps.csv")
rt_deps.head()
deployment_id | roottracker_serial | ram_serial | x | y | bench | crop | genotype | order | pot_on_bench | pot_on_row | rep | row | deployment_name | NPC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 54 | EG00043 | D76080D25B7EA126 | 1.0 | 11.0 | 2 | Soy | PI416937 | 11 | 2 | 11 | 1 | 1 | ACC Test 1 | False |
1 | 54 | EG00044 | D76080D25B3BD828 | 1.0 | 3.0 | 1 | Wheat | Comet | 3 | 3 | 3 | 1 | 1 | ACC Test 1 | False |
2 | 54 | EG00045 | D76080D25B7F7526 | 4.0 | 7.0 | 1 | Cotton | PI513384 | 115 | 7 | 7 | 1 | 4 | ACC Test 1 | False |
3 | 54 | EG00046 | D76080D25B3C9628 | 2.0 | 11.0 | 2 | Soy | PI416937 | 47 | 2 | 11 | 1 | 2 | ACC Test 1 | False |
4 | 54 | EG00047 | D76080D25B3A6C28 | 3.0 | 8.0 | 1 | Cotton | PI513384 | 80 | 8 | 8 | 1 | 3 | ACC Test 1 | False |
We want to limit our attention to just corn and NPC devices, since we believe that will have the clearest separation.
rt_deps_to_use = rt_deps.query("(crop == 'Corn' | crop == 'NPC')").reset_index(drop=True).reset_index(drop=False)
rt_deps_to_use['rt_serial'] = rt_deps_to_use['roottracker_serial']
Load data¶
We use the DataSet class from PyTorch to access the data we have created.
# This is an old approach that let us load all the data in one go.
# out_dict = dict(dyn=list(), Y=list(), mean=list(), var=list(), c1=list())
# crop_list = list()
# rt_list = list()
# for rec in progressbar.progressbar(rt_deps.query("(crop == 'Corn' | crop == 'NPC')").iloc[0:,:].to_dict(orient='records')):
# DEP_NAME = rec['deployment_name']
# RT_SERIAL = rec['roottracker_serial']
# dynamics_base = os.path.join("workspace", "dynamics", DEP_NAME)
# try:
# dynamics_file = os.path.join(dynamics_base, f"{RT_SERIAL}.h5")
# with h5py.File(dynamics_file, 'r') as h5r:
# for k in out_dict.keys():
# out_dict[k].append(np.array(h5r[k])) # h5 is giving us a pointer to the data
# crop_list.append(rec['crop'])
# rt_list.append(RT_SERIAL)
# except Exception as e:
# print(e)
class Dynamics(Dataset):
def __init__(self, rt_deps, base_dir, key):
self.iter_idx = 0
if isinstance(rt_deps, str):
self.rt_deps = pd.read_csv(deps_file)
elif isinstance(rt_deps, pd.DataFrame):
self.rt_deps = rt_deps.reset_index()
else:
raise ValueError("rt_deps must be a path to a csv file or a Dataframe")
self.base_dir = base_dir
self.key = key
def __len__(self):
return self.rt_deps.shape[0]
def get_path(self, idx):
rt_serial = self.rt_deps['roottracker_serial'][idx]
full_path = os.path.join(self.base_dir, self.rt_deps['deployment_name'][idx], f"{rt_serial}.h5")
if not os.path.exists(full_path):
raise Exception(f"Path {full_path} does not exist")
return full_path
def get_shape(self, idx):
out = None
with h5py.File(self.get_path(idx), 'r') as h5r:
out = h5r[self.key].shape # h5 is giving us a pointer to the data
return out
def __getitem__(self, idx):
out = None
with h5py.File(self.get_path(idx), 'r') as h5r:
out = np.array(h5r[self.key]) # h5 is giving us a pointer to the data
return out
def lookup(self, rt_serial):
out = self.rt_deps.query(f"roottracker_serial == '{rt_serial}'")
nout = len(out)
if nout == 1:
return out.index[0]
elif nout > 1:
raise Exception("Found multiple")
else:
raise Exception("Found none")
def __iter__(self):
self.iter_idx = 0
return self
def __next__(self):
if self.iter_idx < len(self):
out = self[self.iter_idx]
self.iter_idx += 1
return out
raise StopIteration
Construct the objects for accessing our dynamics data:
base_dir = "/usr/local/share/rtphenos-workspace"
gm_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="mean")
gv_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="var")
c1_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="c1")
dyn_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="dyn")
gm_array = np.array([o for o in gm_data])
gv_array = np.array([o for o in gv_data])
c1_array = np.array([o for o in c1_data])
Global statistics¶
We we transform the original V1, V255 data into common and orthogonal directions of movement, we use the principal components decomposition. When doing that we keep track of the mean of the whole 2D trajectory and the variances of the transformed data.
Below we analyze how those global statistics differ between corn and NPC.
# gm_array = np.array([o for o in out_dict['mean']])
# gv_array = np.array([o for o in out_dict['var']])
# c1_array = np.array([o for o in out_dict['c1']])
crop_array = rt_deps_to_use['crop']
rt_array = rt_deps_to_use['roottracker_serial']
gm_array.shape, gv_array.shape, c1_array.shape # crop_array.shape, rt_array.shape
((48, 12, 22, 2), (48, 12, 22, 2), (48, 12, 22))
mix = pd.MultiIndex.from_product([rt_array, np.arange(12), np.arange(22)],names=["rt_serial", "paddle","electrode"])
# crop_df = pd.DataFrame(data=crop_array, index=rt_array, columns=["crop"]).reset_index(drop=False, names='rt_serial')
crop_df = rt_deps_to_use[['rt_serial', 'crop']]
gm_df = pd.DataFrame(data=gm_array.reshape((-1,2)), index=mix, columns=["gm1", "gm2"])
gv_df = pd.DataFrame(data=gv_array.reshape((-1,2)), index=mix, columns=["gv1", "gv2"])
gv_df['gtv'] = gv_df['gv1'] + gv_df['gv2']
c1_df = pd.DataFrame(data=c1_array.flatten(), index=mix, columns=["c1"])
# gm_df.head()
Here is our single, global statistics data frame that we will use for plotting.
gstats = pd.concat([gm_df, gv_df, c1_df], axis=1).reset_index(drop=False).merge(crop_df, on='rt_serial')
gstats['lgv1'] = np.log(gstats['gv1'])
gstats['lgv2'] = np.log(gstats['gv2'])
/home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/pandas/core/arraylike.py:396: RuntimeWarning: divide by zero encountered in log /home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/pandas/core/arraylike.py:396: RuntimeWarning: divide by zero encountered in log
gstats.head()
rt_serial | paddle | electrode | gm1 | gm2 | gv1 | gv2 | gtv | c1 | crop | lgv1 | lgv2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | EG00054 | 0 | 0 | 556.663980 | 589.691259 | 8476.007809 | 18.282411 | 8494.290219 | 0.998606 | Corn | 9.044995 | 2.905939 |
1 | EG00054 | 0 | 1 | 646.822567 | 699.660508 | 5339.363699 | 11.973631 | 5351.337330 | 0.994539 | Corn | 8.582862 | 2.482707 |
2 | EG00054 | 0 | 2 | 617.742343 | 677.520769 | 9575.443376 | 20.846439 | 9596.289815 | 0.993526 | Corn | 9.166957 | 3.037183 |
3 | EG00054 | 0 | 3 | 628.711593 | 681.512833 | 13617.099897 | 17.044937 | 13634.144834 | 0.994724 | Corn | 9.519082 | 2.835853 |
4 | EG00054 | 0 | 4 | 572.286423 | 618.721389 | 10021.243756 | 15.339421 | 10036.583177 | 0.995525 | Corn | 9.212462 | 2.730426 |
And here is the data frame with the mean of those values by crop, paddle, and electrode.
gstats_ave = gstats.groupby(['crop', 'paddle', 'electrode']).agg(
{'gm1': 'mean', 'gm2': 'mean', 'gv1': 'mean', 'gv2': 'mean', 'gtv': 'mean', 'c1': 'mean'}
).reset_index(drop=False)
gstats_ave.head()
crop | paddle | electrode | gm1 | gm2 | gv1 | gv2 | gtv | c1 | |
---|---|---|---|---|---|---|---|---|---|
0 | Corn | 0 | 0 | 738.257346 | 788.790427 | 8380.142913 | 33.700301 | 8413.843214 | 0.997881 |
1 | Corn | 0 | 1 | 718.479744 | 773.193609 | 6229.639808 | 42.715793 | 6272.355601 | 0.997577 |
2 | Corn | 0 | 2 | 689.364606 | 754.961427 | 6370.560862 | 58.982296 | 6429.543158 | 0.996659 |
3 | Corn | 0 | 3 | 692.777519 | 751.060568 | 7587.832542 | 55.438604 | 7643.271146 | 0.997187 |
4 | Corn | 0 | 4 | 698.128753 | 760.413589 | 6867.828686 | 52.119477 | 6919.948164 | 0.996454 |
The corn plants show much more variation in the first component compared to NPC.
gg.ggplot(gstats_ave) + gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='gv1')) + gg.facet_wrap("~crop")
<Figure Size: (640 x 480)>
Let's look at the second component:
gg.ggplot(gstats_ave) + gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='gv2')) + gg.facet_wrap("~crop")
<Figure Size: (640 x 480)>
For some reason, paddle 2 and 3 for both corn and NPC has much higher variance in the second component. Let's look at things with those paddles removed. After doing that, we see that there is much more variance for corn compared to NPC as well in component 2.
gg.ggplot(gstats_ave.query("(paddle != 2) & (paddle !=3)")) + gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='gv2')) + gg.facet_wrap("~crop")
<Figure Size: (640 x 480)>
# gstats.query("paddle == 2").groupby(['rt_serial', 'crop'])['gv2'].describe()
Now, let's actually break out each variance by device and crop.
# More of the action is in the variation of the second component, since this looks more uniform
# gg.ggplot(gstats.query("(paddle != 2) & (paddle !=3) & (rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
# gg.ggplot(gstats.query("(rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
gg.ggplot(gstats) + \
gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv1')) + gg.facet_wrap("~ crop + rt_serial") + \
gg.theme(figure_size=(12, 8))
<Figure Size: (1200 x 800)>
# gg.ggplot(gstats.query("(paddle != 2) & (paddle !=3) & (rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
# gg.ggplot(gstats.query("(rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
gg.ggplot(gstats) + \
gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv2')) + gg.facet_wrap("~ crop + rt_serial") + \
gg.theme(figure_size=(12, 8))
<Figure Size: (1200 x 800)>
For the variance in both the first and second component, we see that EG00237 and EG00226 and driving the problems. If we remove those two devices and then plot again, we see that the variation in the second component seems to most clearly differentiate the two groups.
bad_roottrackers = ['EG00237', 'EG00226'] # Corn
# bad_roottrackers = ['EG00133', 'EG00105', 'EG00226'] # Soy
# bad_roottrackers = ['EG00226'] # Wheat
gg.ggplot(gstats.query(f"~ (rt_serial in {bad_roottrackers})")) + \
gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv1')) + gg.facet_wrap("~ crop + rt_serial") + \
gg.theme(figure_size=(12, 8))
<Figure Size: (1200 x 800)>
gg.ggplot(gstats.query(f"~ (rt_serial in {bad_roottrackers})")) + \
gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv2')) + gg.facet_wrap("~ crop + rt_serial") + \
gg.theme(figure_size=(12, 8)) + \
gg.ggtitle("Log variance of the second principal component by electrode, paddle, and treatment")
<Figure Size: (1200 x 800)>
# gg.ggsave(p, filename="log-variance-second-component.png")
Data Exploration¶
It seems to be the case that variation in the second component differentiates the two groups. We want to see if we can find a way to flag specific sections of the dynamics to differentiate the two groups.
First, let's filter our data to remove RootTrackers with limited data or those that we flagged as bad.
# We have to filter out the one item that does not have the correct shape
# keep = np.array([o.shape[0] == 8043 for o in out_dict['dyn']])
keep = np.array([dyn_data.get_shape(i)[0] == 8043 for i in range(48)]) & (~rt_deps_to_use['rt_serial'].isin(bad_roottrackers))
# keep
Now compile the trajectories. This will take a minute or so:
# dyn_array = np.array([o[:,paddle_keep,:,:] for o in out_dict['dyn'] if o.shape[0] == 8043])
dyn_array = np.array([o for o, k in zip(dyn_data, keep) if k])
dyn_array.shape
(46, 8043, 12, 22, 8)
# crop_df.loc[keep,'crop'][0:10];
Let's plot the dynamics to see what they look like. For the standard deviations, we also plot on the log scale. We have computed the momentum of each component and the angular momentum with respect to the center of the trajectory.
One thing that's worth noticing is that, if we use the standard deviation as a proxy for when there are jumps, the jumps in mom_2
coincides with jumps in mom_1
, but there are times where a jump in mom_1
does not seem to include a jump in mom_2
.
attributes = ["loc_1", "loc_2", "mean_mom_1", "mean_mom_2", "std_mom_1", "std_mom_2", "mean_angm", "std_angm"]
fig, axs = plt.subplots(4, 2)
fig.tight_layout()
for i in range(8):
c = int(i/4)
r = i % 4
axs[r, c].plot(dyn_array[0,:,0,4,i])
axs[r, c].set_title(f"Stat {i}: {attributes[i]}")
if r == 3:
axs[r,c].set_xlabel("time (5 minute increments)", size=8)
fig, axs = plt.subplots(3, 1)
fig.tight_layout()
for i, j in enumerate([4, 5, 7]):
axs[i].plot(np.log(dyn_array[0,:,0,4,j] + 1e-3))
axs[i].set_title(f"Stat {j}: log {attributes[j]}")
Let's get the very high and very low for each attribute:
extremes = np.percentile(dyn_array, (1, 99), axis=(0, 1, 2, 3))
extremes.shape
(2, 8)
extremes
array([[-2.06517631e+02, -1.75383705e+01, -4.35344901e+00, -2.68861454e-01, 0.00000000e+00, 0.00000000e+00, -3.81601732e+00, 0.00000000e+00], [ 1.91960599e+02, 1.41603271e+01, 1.45102238e+00, 1.62018039e-01, 5.01240962e+02, 4.03651708e+00, 1.91341991e+00, 3.46234711e+01]])
To summarize those attributes, we compute the mean amount of time that we are above or below those extremes for each device.
low_integrated = np.mean(dyn_array[:,:,:,2:,:] <= extremes[0,:].reshape((1, 1, 1, 1, 8)), axis=(1, 2, 3))
high_integrated = np.mean(dyn_array[:,:,:,2:,:] >= extremes[1,:].reshape((1, 1, 1, 1, 8)), axis=(1, 2, 3))
low_df = pd.concat([pd.DataFrame(low_integrated, columns=attributes), crop_df.loc[keep,:].reset_index(drop=True)], axis=1)
high_df = pd.concat([pd.DataFrame(high_integrated, columns=attributes), crop_df.loc[keep,:].reset_index(drop=True)], axis=1)
low_df.head()
loc_1 | loc_2 | mean_mom_1 | mean_mom_2 | std_mom_1 | std_mom_2 | mean_angm | std_angm | rt_serial | crop | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.016282 | 0.012950 | 0.015381 | 0.012310 | 0.004264 | 0.004264 | 0.011759 | 0.004318 | EG00054 | Corn |
1 | 0.001625 | 0.000000 | 0.003682 | 0.001174 | 0.010529 | 0.010529 | 0.004160 | 0.010650 | EG00055 | NPC |
2 | 0.025127 | 0.019365 | 0.012861 | 0.014298 | 0.010715 | 0.010715 | 0.012822 | 0.010806 | EG00056 | Corn |
3 | 0.008031 | 0.020985 | 0.009055 | 0.010772 | 0.007537 | 0.007537 | 0.010538 | 0.007614 | EG00059 | Corn |
4 | 0.008623 | 0.016321 | 0.010708 | 0.012938 | 0.017491 | 0.017491 | 0.013330 | 0.017611 | EG00060 | Corn |
Looking at mom_2
, it seems that extreme values in one direction do separate corn from NPC.
gg.ggplot(high_df) + gg.geom_histogram(gg.aes(x='mean_mom_2')) + gg.facet_wrap("~crop")
/home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/plotnine/stats/stat_bin.py:109: PlotnineWarning: 'stat_bin()' using 'bins = 9'. Pick better value with 'binwidth'.
<Figure Size: (640 x 480)>
gg.ggplot(low_df) + gg.geom_histogram(gg.aes(x='mean_mom_2')) + gg.facet_wrap("~crop")
/home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/plotnine/stats/stat_bin.py:109: PlotnineWarning: 'stat_bin()' using 'bins = 4'. Pick better value with 'binwidth'.
<Figure Size: (640 x 480)>
Now, let's try to get a sense of how well all of these attributes my separate the two groups by comparing their average values across devices. Again, mom_2
seems to be the most informative.
low_df.loc[:,low_df.columns != 'rt_serial'].groupby('crop').median()
loc_1 | loc_2 | mean_mom_1 | mean_mom_2 | std_mom_1 | std_mom_2 | mean_angm | std_angm | |
---|---|---|---|---|---|---|---|---|
crop | ||||||||
Corn | 0.010785 | 0.020192 | 0.010708 | 0.012682 | 0.010255 | 0.010255 | 0.010192 | 0.01036 |
NPC | 0.004164 | 0.000653 | 0.008826 | 0.005156 | 0.010529 | 0.010529 | 0.008363 | 0.01065 |
high_df.loc[:,low_df.columns != 'rt_serial'].groupby('crop').median()
loc_1 | loc_2 | mean_mom_1 | mean_mom_2 | std_mom_1 | std_mom_2 | mean_angm | std_angm | |
---|---|---|---|---|---|---|---|---|
crop | ||||||||
Corn | 0.015289 | 0.019067 | 0.008533 | 0.007389 | 0.010431 | 0.008228 | 0.008807 | 0.010377 |
NPC | 0.001130 | 0.001321 | 0.008012 | 0.004919 | 0.007515 | 0.003631 | 0.007293 | 0.007891 |
Identifying anomalous portions of trajectories¶
We know we want to focus on mom_2
. Here we will use that attribute (along with potentially other information) to try to flag portions of trajectories that are "anomalous" in some sense. We then try to consolidate anomalous trajectories to avoid things like double counting. At the end, we will then plot the integrated amount or count of anomalous time.
extremes[:,3], extremes[:,5], np.exp(-1.0)
(array([-0.26886145, 0.16201804]), array([0. , 4.03651708]), 0.36787944117144233)
# # Use all of extreme mom_2 --- if you do this the integration separates things, but the count does not
# traj_list = list()
# # for tup in [(3,0), (4,1), (5,1)]:
# for tup in [(3,0),]:
# for i, rec in enumerate(progressbar.progressbar(crop_df.loc[keep,:].to_dict(orient='records'))):
# hl = tup[1]
# thresh = extremes[hl, tup[0]]
# if hl == 1:
# temp_df = transforms1.detection_trajectories(dyn_array[i,:,:,:,tup[0]] > thresh)
# else:
# temp_df = transforms1.detection_trajectories(dyn_array[i,:,:,:,tup[0]] < thresh)
# temp_df['weight'] = thresh
# # Consolidating takes some time, either here or below - if we do it here we save mem
# # temp_df = transforms1.consolidate_trajectories(temp_df, grouping=['paddle', 'electrode'], cush=6)
# temp_df['attribute'] = attributes[tup[0]]
# temp_df['rt_serial'] = rec['rt_serial']
# temp_df['crop'] = rec['crop']
# traj_list.append(temp_df)
# Use extreme mom_2 except for when it jumps - there is one NPC that has lots of extreme mom_2 because of jumps.
# Though it may not make a difference here, later it will be key to also exclude when we are interpolating.
traj_list = list()
for i, rec in enumerate(progressbar.progressbar(crop_df.loc[keep,:].to_dict(orient='records'))):
thresh = extremes[0, 3]
temp_df = transforms1.detection_trajectories(
(dyn_array[i,:,:,:,3] < extremes[0,3]) &
(dyn_array[i,:,:,:,5] < np.exp(-1.0)) &
(dyn_array[i,:,:,:,5] > np.exp(-2.0)) # If you interpolate over a large period you have 0 std
)
temp_df['weight'] = thresh
temp_df['attribute'] = attributes[3]
temp_df['rt_serial'] = rec['rt_serial']
temp_df['crop'] = rec['crop']
traj_list.append(temp_df)
100% (46 of 46) |########################| Elapsed Time: 0:00:03 Time: 0:00:03
Now we try to consolidate the trajectories as much as possible.
# Create a single data frame with all trajectories.
traj_df_1 = pd.concat(traj_list)
traj_df_1['length'] = traj_df_1['end'] - traj_df_1['start']
traj_df_1.shape
(14671, 9)
# traj_df_1.head()
traj_df_1_int = traj_df_1.groupby(["crop", "rt_serial"]).agg({'length': 'sum'}).reset_index(drop=False)
traj_df_1_int['log_length'] = np.log(traj_df_1_int['length'])
# traj_df_1_int.groupby("crop").agg({'length': 'describe'})
# traj_df_1_int.sort_values(["length"])
# gg.ggplot(traj_df_1_int) + gg.geom_histogram(gg.aes('length', fill='crop'), bins=10)
# gg.ggplot(traj_df_1_int) + gg.geom_histogram(gg.aes('log_length', fill='crop'))
pd.crosstab(traj_df_1['crop'], traj_df_1['length'])
length | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 46 | 47 | 50 | 51 | 52 | 53 | 54 | 55 | 58 | 67 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
crop | |||||||||||||||||||||
Corn | 4078 | 1581 | 785 | 525 | 380 | 276 | 267 | 208 | 171 | 150 | ... | 3 | 1 | 2 | 2 | 1 | 1 | 2 | 2 | 1 | 1 |
NPC | 2270 | 824 | 391 | 259 | 162 | 131 | 124 | 93 | 88 | 69 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 54 columns
# traj_df_1['electrode'].value_counts()
# Consolidate trajectories that nearly overlap in time on the same device, paddle, and electrode.
# traj_df_2 = transforms1.flag_overlapping_trajectories(traj_df_1, grouping=['attribute', 'rt_serial', 'paddle', 'electrode'], cush=6)
traj_df_2 = transforms1.consolidate_trajectories(traj_df_1.query("electrode > 0"), grouping=['attribute', 'crop', 'rt_serial', 'paddle', 'electrode'], cush=6)
# traj_df_2 = traj_df_1
traj_df_2.shape
(8648, 9)
# traj_df_2.head()
# Consolidate trajectories that overlap in time and on adjascent electrodes on the same device, paddle.
# traj_df_3 = transforms1.consolidate_trajectories(traj_df_2.sort_values(['attribute', 'rt_serial', 'paddle', 'start', 'end']), grouping=['attribute', 'rt_serial', 'paddle'], cush=6)
# traj_df_3.shape
traj_df_3a = transforms1.flag_overlapping_trajectories(traj_df_2.sort_values(['attribute', 'crop', 'rt_serial', 'start', 'end']),
grouping=['attribute', 'crop', 'rt_serial', 'paddle'], cush=6)
# traj_df_3a.head(n=10)
traj_df_3 = traj_df_3a.groupby(['attribute', 'crop', 'rt_serial', 'paddle', 'group_num']).agg({
'start': 'min',
'end': 'max',
'electrode': 'min'
}).reset_index(drop=False)
traj_df_3.shape
(4582, 8)
# traj_df_3.head()
# Remove trajectories that overlap in time and on adjascent paddles (we do not use modular arithmatic here, so this is only approximate).
traj_df_4a = transforms1.flag_overlapping_trajectories(traj_df_3.sort_values(['attribute', 'crop', 'rt_serial', 'start', 'end']), grouping=['attribute', 'crop', 'rt_serial'], cush=6)
traj_df_4a['num_in_group'] = traj_df_4a.groupby(['group_num'])['group_num'].transform(lambda x: len(x))
traj_df_4a.shape
(4582, 10)
traj_df_4a.head()
attribute | crop | rt_serial | paddle | group_num | start | end | electrode | overlap_prev | num_in_group | |
---|---|---|---|---|---|---|---|---|---|---|
0 | mean_mom_2 | Corn | EG00054 | 10 | 1 | 36 | 37 | 20 | False | 1 |
1 | mean_mom_2 | Corn | EG00054 | 10 | 2 | 383 | 396 | 20 | False | 1 |
2 | mean_mom_2 | Corn | EG00054 | 10 | 3 | 534 | 550 | 8 | False | 3 |
3 | mean_mom_2 | Corn | EG00054 | 3 | 3 | 535 | 549 | 12 | True | 3 |
4 | mean_mom_2 | Corn | EG00054 | 2 | 3 | 537 | 538 | 10 | True | 3 |
There are a lot of trajectories that occur on just 1 or 2 paddles. If you just keep the number of trajectories that have only one active paddle, then you filter 2/3 of your detections! Whether we use 1 or 1 and 2, we still get reasonable separation of NPC and Corn below. We could also do this after the fact by modeling.
traj_df_4a['num_in_group'].value_counts()
num_in_group 1 1706 2 814 3 465 4 352 5 305 6 252 7 189 8 168 10 100 9 90 11 88 12 24 16 16 13 13 Name: count, dtype: int64
In fact, we seem to get roughly 1/4 to 1/5 as many detections across paddles on NPC compared to Corn for relatively small numbers of paddles, but that increases as the number of coincident paddles goes up.
pd.crosstab(traj_df_4a['num_in_group'], traj_df_4a['crop'])
crop | Corn | NPC |
---|---|---|
num_in_group | ||
1 | 1386 | 320 |
2 | 674 | 140 |
3 | 366 | 99 |
4 | 244 | 108 |
5 | 210 | 95 |
6 | 162 | 90 |
7 | 147 | 42 |
8 | 128 | 40 |
9 | 54 | 36 |
10 | 60 | 40 |
11 | 55 | 33 |
12 | 24 | 0 |
13 | 0 | 13 |
16 | 0 | 16 |
# We remove 2/3 of trajectories by removing things at occur at the same time!!!
traj_df_4 = traj_df_4a.query("num_in_group <= 1")
traj_df_4.shape
(1706, 10)
# traj_df_4.head()
traj_df_5 = traj_df_4.copy()
# traj_df_5 = traj_df_4.merge(crop_df, on='rt_serial')
traj_df_5.shape
(1706, 10)
traj_df_5['length'] = traj_df_5['end'] - traj_df_5['start']
traj_df_5['day']= np.floor(traj_df_5['start'] / (12 * 24)).astype(int)
traj_df_5.head()
attribute | crop | rt_serial | paddle | group_num | start | end | electrode | overlap_prev | num_in_group | length | day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mean_mom_2 | Corn | EG00054 | 10 | 1 | 36 | 37 | 20 | False | 1 | 1 | 0 |
1 | mean_mom_2 | Corn | EG00054 | 10 | 2 | 383 | 396 | 20 | False | 1 | 13 | 1 |
7 | mean_mom_2 | Corn | EG00054 | 10 | 5 | 557 | 562 | 20 | False | 1 | 5 | 1 |
8 | mean_mom_2 | Corn | EG00054 | 11 | 6 | 647 | 668 | 8 | False | 1 | 21 | 2 |
14 | mean_mom_2 | Corn | EG00054 | 0 | 9 | 973 | 993 | 7 | False | 1 | 20 | 3 |
Let's look at the distribution of trajectory lengths by group. Corn seems to exceed NPC usually.
gg.ggplot(traj_df_5) + gg.geom_histogram(gg.aes(x='length', fill='crop'), bins=20) + gg.facet_wrap("~attribute")
<Figure Size: (640 x 480)>
To create a single measure for each RootTracker we'll try two different things: i) we will integrate the total amount of anomalous trajectory time, and ii) we will count the number of what we have collapsed into distinct trajectories.
Filtering out short trajectories, even just trajectories less than 30 minutes can be important here, especially in the case of looking at extreme mom_2
and nothing else.
# We integrate here, which can be critical to getting a difference if you do not filter out short-length anomalies.
traj_df_6 = traj_df_5.query("length > 3").groupby(['attribute', 'crop', 'rt_serial']).agg({'length': 'sum', 'start': 'size'}).reset_index(drop=False)
traj_df_6['count'] = traj_df_6.pop('start')
traj_df_6.shape
(45, 5)
# traj_df_6.head()
Looking at differences in cumulative time:
gg.ggplot(traj_df_6) + gg.geom_histogram(gg.aes(x='length', fill='crop'), bins=20) + gg.facet_wrap("~attribute") + \
gg.ggtitle("Distribution of cumulative anomalous trajectory time by crop")
<Figure Size: (640 x 480)>
Looking at differences in counts:
gg.ggplot(traj_df_6) + gg.geom_histogram(gg.aes(x='count', fill='crop'), bins=20) + gg.facet_wrap("~attribute") + \
gg.ggtitle("Distribution of number of anomalous trajectories by crop")
<Figure Size: (640 x 480)>
Aside: One could interpret the results above as saying something about our false positive rate. It seems there are somewhere between 0-15 false detections for the NPCs. Say this is over 30 days, which is 0-0.5 per day. So let's say we expect p=0.25 false detections per day. If each day is an iid Bernoulli random variable, then this sums to a binomial random variable with mean 30p and variance 30p(1-p).
Of course, this assumes that there are the same number of false positives for the corn plants, which is not necessarily the case as the presence of roots might actually stabalize the soil and prevent soil shifting without cause. Even if it is the case, our observations should include those perturbations, so comparing differences of treatments should still be valid.
Now let's look at how the detections vary over time. First, we look on a device-by-device basis and then we group by crop. It appears that there are two or three periods of increased activity for corn.
It looks like there are time like t=~800 when there is consistent NPC activity, which could also be indicative of watering events we haven't already filtered out. One could incorporate this into a second-stage model.
gg.ggplot(traj_df_5.query("length > 3")) + gg.geom_point(gg.aes('start', 'rt_serial', fill='crop')) + \
gg.ggtitle("Detection times by device")
<Figure Size: (640 x 480)>
traj_df_day = traj_df_5.groupby(['crop', 'rt_serial', 'day']).agg({'length': 'sum', 'start': 'size', 'electrode': 'mean'}).reset_index(drop=False)
traj_df_day['day'] = pd.Categorical(traj_df_day['day'], ordered=True)
# traj_df_day.head()
gg.ggplot(traj_df_day) + gg.geom_boxplot(gg.aes('day', 'start')) + gg.facet_wrap("~ crop") + \
gg.ggtitle("Distribution of number of detections by day")
<Figure Size: (640 x 480)>
You can also see that where we register the detections steadily gets deeper for corn. This also gives us another further option for filtering out spurious detections --- if we assume that the depth of detections will be changing smoothly and within a certain band.
gg.ggplot(traj_df_day) + gg.geom_boxplot(gg.aes('day', 'electrode')) + gg.facet_wrap("~ crop") + \
gg.ggtitle("Distribution of number of detections by day")
<Figure Size: (640 x 480)>
Global metrics for separation¶
In order to screen trials, it seems that employing variance in the second component may be useful. Here we quantify how one might do that.
gg.ggplot(gstats_ave.query("(paddle != 2) & (paddle !=3)")) + gg.geom_point(gg.aes(x='gv2', y='crop')) + gg.facet_wrap("~electrode")
<Figure Size: (640 x 480)>
temp_y, temp_X = dmatrices('crop ~ np.log(gv2)', data=gstats_ave, return_type='dataframe')
temp_y
crop[Corn] | crop[NPC] | |
---|---|---|
0 | 1.0 | 0.0 |
1 | 1.0 | 0.0 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 1.0 | 0.0 |
... | ... | ... |
523 | 0.0 | 1.0 |
524 | 0.0 | 1.0 |
525 | 0.0 | 1.0 |
526 | 0.0 | 1.0 |
527 | 0.0 | 1.0 |
528 rows × 2 columns
mod = sm.GLM(temp_y, temp_X, family=sm.families.Binomial())
results = mod.fit()
print(results.summary())
Generalized Linear Model Regression Results ======================================================================================= Dep. Variable: ['crop[Corn]', 'crop[NPC]'] No. Observations: 528 Model: GLM Df Residuals: 526 Model Family: Binomial Df Model: 1 Link Function: Logit Scale: 1.0000 Method: IRLS Log-Likelihood: -234.15 Date: Wed, 01 Nov 2023 Deviance: 468.29 Time: 15:19:04 Pearson chi2: 739. No. Iterations: 5 Pseudo R-squ. (CS): 0.3931 Covariance Type: nonrobust =============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------- Intercept -5.0223 0.428 -11.739 0.000 -5.861 -4.184 np.log(gv2) 1.5071 0.121 12.415 0.000 1.269 1.745 ===============================================================================
Analysis of specific trajectories ...¶
Now let's look at specific trajectories. The hope is that we might see a consistent pattern in the NPCs that we do not see for the corn plants.
Maybe we see a tendency for the NPC to have a cursive, reverse J shape, which quickly drops and then moves right.
Since we are watching movement down, one other thing we could consider is the starting level. I we start at a high level relative to the whole time series, it shouldn't be surprising that we go down.
Y_data = Dynamics(rt_deps_to_use, "/usr/local/share/rtphenos-workspace/dynamics", "Y")
dyn_data.get_shape(0), Y_data.get_shape(0)
((8043, 12, 22, 8), (12, 22, 8065, 2))
traj_df_plot = traj_df_4.query("end > (start + 12) & (electrode > 0)").merge(rt_deps_to_use[['rt_serial', 'index']], on='rt_serial')
traj_df_plot.shape # , traj_df_plot.shape[0] / len(traj_df_plot['rt_serial'].unique())
(418, 11)
traj_df_plot.head()
attribute | crop | rt_serial | paddle | group_num | start | end | electrode | overlap_prev | num_in_group | index | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | mean_mom_2 | Corn | EG00054 | 10 | 2 | 383 | 396 | 20 | False | 1 | 0 |
1 | mean_mom_2 | Corn | EG00054 | 11 | 6 | 647 | 668 | 8 | False | 1 | 0 |
2 | mean_mom_2 | Corn | EG00054 | 0 | 9 | 973 | 993 | 7 | False | 1 | 0 |
3 | mean_mom_2 | Corn | EG00054 | 11 | 12 | 1221 | 1235 | 10 | False | 1 | 0 |
4 | mean_mom_2 | Corn | EG00054 | 5 | 16 | 1530 | 1544 | 6 | False | 1 | 0 |
traj_df_plot['crop'].value_counts()
crop Corn 368 NPC 50 Name: count, dtype: int64
def plot_some(Y_data, traj_df, nr, nc, half_width=11, extra_start=0, extra_end=0, figsize=(12,8), diff=False, plot_against_time=False):
fig, axs = plt.subplots(nr, nc, squeeze=False, figsize=figsize)
fig.tight_layout()
num = nr * nc
cache = dict()
for i, rec in enumerate(traj_df.sample(num).to_dict(orient='records')):
idx = rec['index']
elec = rec['electrode']
pad = rec['paddle']
start = rec['start'] + half_width - extra_start
end = rec['end'] + half_width + extra_end
if not idx in cache.keys():
cache[idx] = Y_data[idx]
temp_data = cache[idx][pad, elec, start:end, :]
if diff:
temp_data = np.diff(temp_data, axis=0)
temp_diff = np.mean(np.diff(temp_data, axis=0), axis=0)
r = i % nr
c = int(i / nr)
if plot_against_time:
axs[r, c].plot(range(start, end), temp_data[:,0])
axs[r, c].plot(range(start, end), temp_data[:,1])
axs[r, c].scatter([start + extra_start, end - extra_end - 1], [temp_data[extra_start,0], temp_data[(-extra_end+1),0]])
axs[r, c].scatter([start + extra_start, end - extra_end - 1], [temp_data[extra_start,1], temp_data[(-extra_end+1),1]])
else:
axs[r, c].plot(temp_data[:,0], temp_data[:,1])
axs[r, c].scatter(temp_data[:,0], temp_data[:,1])
axs[r, c].scatter(temp_data[extra_start,0], temp_data[extra_start,1])
axs[r, c].scatter(temp_data[-(extra_end+1),0], temp_data[-(extra_end+1),1])
axs[r, c].set_title(f"E{elec} from {start}-{end} ({len(temp_data)}) for {rec['crop']}")
# axs[r, c].set_title(f"{temp_diff}")
return fig, axs
The orange dot represents the start of a flagged trajectory and a green dot represents the end. We pad the trajectories to see what is happening before and after.
fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'Corn'"), 5, 4, 11, extra_start=50, extra_end=50);
fig.show()
fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'NPC'"), 5, 4, 11, extra_start=50, extra_end=50);
fig.show()
# fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'Corn'"), 5, 4, 11, extra_start=0, extra_end=0, diff=True);
# fig.show()
# fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'NPC'"), 5, 4, 11, extra_start=50, extra_end=50, diff=True);
# fig.show()
Here we look at the components of Y over time.
fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'Corn'"), 5, 4, 11, extra_start=50, extra_end=50, plot_against_time=True)
fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'NPC'"), 5, 4, 11, extra_start=50, extra_end=50, plot_against_time=True)
# Can we describe dynamics of 2d time series?
# Can we classify via differences in the flagged trajectories above?
# Can we mimic consolidation by neural net to produce a differentiable approximation?
Conclusion¶
Herein we have shown that you can separate corn plants from no plants by monitoring movements in the second principal component.