import os
import h5py
import pandas as pd
import numpy as np
import progressbar
import importlib as il
from matplotlib import pyplot as plt
import plotnine as gg
from torch.utils.data import Dataset
import statsmodels.api as sm
from patsy import dmatrices

%matplotlib inline

il.import_module('rtphenos')

<module 'rtphenos' from '/home/jesse/rtphenos/src/rtphenos/__init__.py'>

from rtphenos.detects2 import transforms as transforms2
from rtphenos.detects1 import transforms as transforms1

rt_deps = pd.read_csv("rt_acc1_deps.csv")
rt_deps.head()

rt_deps_to_use = rt_deps.query("(crop == 'Corn' | crop == 'NPC')").reset_index(drop=True).reset_index(drop=False)
rt_deps_to_use['rt_serial'] = rt_deps_to_use['roottracker_serial']

# This is an old approach that let us load all the data in one go.
# out_dict = dict(dyn=list(), Y=list(), mean=list(), var=list(), c1=list())
# crop_list = list()
# rt_list = list()
# for rec in progressbar.progressbar(rt_deps.query("(crop == 'Corn' | crop == 'NPC')").iloc[0:,:].to_dict(orient='records')):
#     DEP_NAME = rec['deployment_name']
#     RT_SERIAL = rec['roottracker_serial']
#     dynamics_base = os.path.join("workspace", "dynamics", DEP_NAME)
#     try:
#         dynamics_file = os.path.join(dynamics_base, f"{RT_SERIAL}.h5")
#         with h5py.File(dynamics_file, 'r') as h5r:
#             for k in out_dict.keys():
#                 out_dict[k].append(np.array(h5r[k])) # h5 is giving us a pointer to the data
#             crop_list.append(rec['crop'])
#             rt_list.append(RT_SERIAL)
#     except Exception as e:
#         print(e)

class Dynamics(Dataset):

    def __init__(self, rt_deps, base_dir, key):
        self.iter_idx = 0
        if isinstance(rt_deps, str):
            self.rt_deps = pd.read_csv(deps_file)
        elif isinstance(rt_deps, pd.DataFrame):
            self.rt_deps = rt_deps.reset_index()
        else:
            raise ValueError("rt_deps must be a path to a csv file or a Dataframe")
        self.base_dir = base_dir
        self.key = key

    def __len__(self):
        return self.rt_deps.shape[0]

    def get_path(self, idx):
        rt_serial = self.rt_deps['roottracker_serial'][idx]
        full_path = os.path.join(self.base_dir, self.rt_deps['deployment_name'][idx], f"{rt_serial}.h5")
        if not os.path.exists(full_path):
            raise Exception(f"Path {full_path} does not exist")
        return full_path        

    def get_shape(self, idx):
        out = None
        with h5py.File(self.get_path(idx), 'r') as h5r:
            out = h5r[self.key].shape # h5 is giving us a pointer to the data
        return out

    def __getitem__(self, idx):
        out = None
        with h5py.File(self.get_path(idx), 'r') as h5r:
            out = np.array(h5r[self.key]) # h5 is giving us a pointer to the data
        return out

    def lookup(self, rt_serial):
        out = self.rt_deps.query(f"roottracker_serial == '{rt_serial}'")
        nout = len(out)
        if nout == 1:
            return out.index[0]
        elif nout > 1:
            raise Exception("Found multiple")
        else:
            raise Exception("Found none")

    def __iter__(self):
        self.iter_idx = 0
        return self

    def __next__(self):
        if self.iter_idx < len(self):
            out = self[self.iter_idx]
            self.iter_idx += 1
            return out
        raise StopIteration

base_dir = "/usr/local/share/rtphenos-workspace"

gm_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="mean")
gv_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="var")
c1_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="c1")
dyn_data = Dynamics(rt_deps_to_use, os.path.join(base_dir, "dynamics"), key="dyn")

gm_array = np.array([o for o in gm_data])
gv_array = np.array([o for o in gv_data])
c1_array = np.array([o for o in c1_data])

# gm_array = np.array([o for o in out_dict['mean']])
# gv_array = np.array([o for o in out_dict['var']])
# c1_array = np.array([o for o in out_dict['c1']])
crop_array = rt_deps_to_use['crop']
rt_array = rt_deps_to_use['roottracker_serial']
gm_array.shape, gv_array.shape, c1_array.shape # crop_array.shape, rt_array.shape

((48, 12, 22, 2), (48, 12, 22, 2), (48, 12, 22))

mix = pd.MultiIndex.from_product([rt_array, np.arange(12), np.arange(22)],names=["rt_serial", "paddle","electrode"])

# crop_df = pd.DataFrame(data=crop_array, index=rt_array, columns=["crop"]).reset_index(drop=False, names='rt_serial')
crop_df = rt_deps_to_use[['rt_serial', 'crop']]
gm_df = pd.DataFrame(data=gm_array.reshape((-1,2)), index=mix, columns=["gm1", "gm2"])
gv_df = pd.DataFrame(data=gv_array.reshape((-1,2)), index=mix, columns=["gv1", "gv2"])
gv_df['gtv'] = gv_df['gv1'] + gv_df['gv2']
c1_df = pd.DataFrame(data=c1_array.flatten(), index=mix, columns=["c1"])

# gm_df.head()

gstats = pd.concat([gm_df, gv_df, c1_df], axis=1).reset_index(drop=False).merge(crop_df, on='rt_serial')
gstats['lgv1'] = np.log(gstats['gv1'])
gstats['lgv2'] = np.log(gstats['gv2'])

/home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/pandas/core/arraylike.py:396: RuntimeWarning: divide by zero encountered in log
/home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/pandas/core/arraylike.py:396: RuntimeWarning: divide by zero encountered in log

gstats.head()

gstats_ave = gstats.groupby(['crop', 'paddle', 'electrode']).agg(
    {'gm1': 'mean', 'gm2': 'mean', 'gv1': 'mean', 'gv2': 'mean', 'gtv': 'mean', 'c1': 'mean'}
).reset_index(drop=False)

gstats_ave.head()

gg.ggplot(gstats_ave) + gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='gv1')) + gg.facet_wrap("~crop")

<Figure Size: (640 x 480)>

gg.ggplot(gstats_ave) + gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='gv2')) + gg.facet_wrap("~crop")

<Figure Size: (640 x 480)>

gg.ggplot(gstats_ave.query("(paddle != 2) & (paddle !=3)")) + gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='gv2')) + gg.facet_wrap("~crop")

<Figure Size: (640 x 480)>

# gstats.query("paddle == 2").groupby(['rt_serial', 'crop'])['gv2'].describe()

# More of the action is in the variation of the second component, since this looks more uniform
# gg.ggplot(gstats.query("(paddle != 2) & (paddle !=3) & (rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
# gg.ggplot(gstats.query("(rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
gg.ggplot(gstats) + \
  gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv1')) + gg.facet_wrap("~ crop + rt_serial") + \
  gg.theme(figure_size=(12, 8))

<Figure Size: (1200 x 800)>

# gg.ggplot(gstats.query("(paddle != 2) & (paddle !=3) & (rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
# gg.ggplot(gstats.query("(rt_serial != 'EG00237') & (rt_serial != 'EG00226')")) + \
gg.ggplot(gstats) + \
  gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv2')) + gg.facet_wrap("~ crop + rt_serial") + \
  gg.theme(figure_size=(12, 8))

<Figure Size: (1200 x 800)>

bad_roottrackers = ['EG00237', 'EG00226'] # Corn
# bad_roottrackers = ['EG00133', 'EG00105', 'EG00226'] # Soy
# bad_roottrackers = ['EG00226'] # Wheat

gg.ggplot(gstats.query(f"~ (rt_serial in {bad_roottrackers})")) + \
  gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv1')) + gg.facet_wrap("~ crop + rt_serial") + \
  gg.theme(figure_size=(12, 8))

<Figure Size: (1200 x 800)>

gg.ggplot(gstats.query(f"~ (rt_serial in {bad_roottrackers})")) + \
  gg.geom_tile(gg.aes(x='paddle', y='electrode', fill='lgv2')) + gg.facet_wrap("~ crop + rt_serial") + \
  gg.theme(figure_size=(12, 8)) + \
  gg.ggtitle("Log variance of the second principal component by electrode, paddle, and treatment")

<Figure Size: (1200 x 800)>

# gg.ggsave(p, filename="log-variance-second-component.png")

# We have to filter out the one item that does not have the correct shape
# keep = np.array([o.shape[0] == 8043 for o in out_dict['dyn']])
keep = np.array([dyn_data.get_shape(i)[0] == 8043 for i in range(48)]) & (~rt_deps_to_use['rt_serial'].isin(bad_roottrackers))
# keep

# dyn_array = np.array([o[:,paddle_keep,:,:] for o in out_dict['dyn'] if o.shape[0] == 8043])
dyn_array = np.array([o for o, k in zip(dyn_data, keep) if k])

dyn_array.shape

(46, 8043, 12, 22, 8)

# crop_df.loc[keep,'crop'][0:10];

attributes = ["loc_1", "loc_2", "mean_mom_1", "mean_mom_2", "std_mom_1", "std_mom_2", "mean_angm", "std_angm"]

fig, axs = plt.subplots(4, 2)
fig.tight_layout()
for i in range(8):
    c = int(i/4)
    r = i % 4
    axs[r, c].plot(dyn_array[0,:,0,4,i])
    axs[r, c].set_title(f"Stat {i}: {attributes[i]}")
    if r == 3:
        axs[r,c].set_xlabel("time (5 minute increments)", size=8)

fig, axs = plt.subplots(3, 1)
fig.tight_layout()
for i, j in enumerate([4, 5, 7]):
    axs[i].plot(np.log(dyn_array[0,:,0,4,j] + 1e-3))
    axs[i].set_title(f"Stat {j}: log {attributes[j]}")

extremes = np.percentile(dyn_array, (1, 99), axis=(0, 1, 2, 3))
extremes.shape

(2, 8)

extremes

array([[-2.06517631e+02, -1.75383705e+01, -4.35344901e+00,
        -2.68861454e-01,  0.00000000e+00,  0.00000000e+00,
        -3.81601732e+00,  0.00000000e+00],
       [ 1.91960599e+02,  1.41603271e+01,  1.45102238e+00,
         1.62018039e-01,  5.01240962e+02,  4.03651708e+00,
         1.91341991e+00,  3.46234711e+01]])

low_integrated = np.mean(dyn_array[:,:,:,2:,:] <= extremes[0,:].reshape((1, 1, 1, 1, 8)), axis=(1, 2, 3))

high_integrated = np.mean(dyn_array[:,:,:,2:,:] >= extremes[1,:].reshape((1, 1, 1, 1, 8)), axis=(1, 2, 3))

low_df = pd.concat([pd.DataFrame(low_integrated, columns=attributes), crop_df.loc[keep,:].reset_index(drop=True)], axis=1)
high_df = pd.concat([pd.DataFrame(high_integrated, columns=attributes), crop_df.loc[keep,:].reset_index(drop=True)], axis=1)
low_df.head()

gg.ggplot(high_df) + gg.geom_histogram(gg.aes(x='mean_mom_2')) + gg.facet_wrap("~crop")

/home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/plotnine/stats/stat_bin.py:109: PlotnineWarning: 'stat_bin()' using 'bins = 9'. Pick better value with 'binwidth'.

<Figure Size: (640 x 480)>

gg.ggplot(low_df) + gg.geom_histogram(gg.aes(x='mean_mom_2')) + gg.facet_wrap("~crop")

/home/jesse/rtphenos/venv-rtphenos2/lib/python3.10/site-packages/plotnine/stats/stat_bin.py:109: PlotnineWarning: 'stat_bin()' using 'bins = 4'. Pick better value with 'binwidth'.

<Figure Size: (640 x 480)>

low_df.loc[:,low_df.columns != 'rt_serial'].groupby('crop').median()

high_df.loc[:,low_df.columns != 'rt_serial'].groupby('crop').median()

extremes[:,3], extremes[:,5], np.exp(-1.0)

(array([-0.26886145,  0.16201804]),
 array([0.        , 4.03651708]),
 0.36787944117144233)

# # Use all of extreme mom_2 --- if you do this the integration separates things, but the count does not
# traj_list = list()
# # for tup in [(3,0), (4,1), (5,1)]:
# for tup in [(3,0),]:
#     for i, rec in enumerate(progressbar.progressbar(crop_df.loc[keep,:].to_dict(orient='records'))):
#         hl = tup[1]
#         thresh = extremes[hl, tup[0]]
#         if hl == 1:
#             temp_df = transforms1.detection_trajectories(dyn_array[i,:,:,:,tup[0]] > thresh)
#         else:
#             temp_df = transforms1.detection_trajectories(dyn_array[i,:,:,:,tup[0]] < thresh)
#         temp_df['weight'] = thresh
#         # Consolidating takes some time, either here or below - if we do it here we save mem
#         # temp_df = transforms1.consolidate_trajectories(temp_df, grouping=['paddle', 'electrode'], cush=6)
#         temp_df['attribute'] = attributes[tup[0]]
#         temp_df['rt_serial'] = rec['rt_serial']
#         temp_df['crop'] = rec['crop']
#         traj_list.append(temp_df)

# Use extreme mom_2 except for when it jumps - there is one NPC that has lots of extreme mom_2 because of jumps.
# Though it may not make a difference here, later it will be key to also exclude when we are interpolating.
traj_list = list()
for i, rec in enumerate(progressbar.progressbar(crop_df.loc[keep,:].to_dict(orient='records'))):
    thresh = extremes[0, 3]
    temp_df = transforms1.detection_trajectories(
        (dyn_array[i,:,:,:,3] < extremes[0,3]) & 
        (dyn_array[i,:,:,:,5] < np.exp(-1.0)) &
        (dyn_array[i,:,:,:,5] > np.exp(-2.0)) # If you interpolate over a large period you have 0 std
    )
    temp_df['weight'] = thresh
    temp_df['attribute'] = attributes[3]
    temp_df['rt_serial'] = rec['rt_serial']
    temp_df['crop'] = rec['crop']
    traj_list.append(temp_df)

100% (46 of 46) |########################| Elapsed Time: 0:00:03 Time:  0:00:03

# Create a single data frame with all trajectories.
traj_df_1 = pd.concat(traj_list)
traj_df_1['length'] = traj_df_1['end'] - traj_df_1['start']
traj_df_1.shape

(14671, 9)

# traj_df_1.head()

traj_df_1_int = traj_df_1.groupby(["crop", "rt_serial"]).agg({'length': 'sum'}).reset_index(drop=False)
traj_df_1_int['log_length'] = np.log(traj_df_1_int['length'])

# traj_df_1_int.groupby("crop").agg({'length': 'describe'})

# traj_df_1_int.sort_values(["length"])

# gg.ggplot(traj_df_1_int) + gg.geom_histogram(gg.aes('length', fill='crop'), bins=10)
# gg.ggplot(traj_df_1_int) + gg.geom_histogram(gg.aes('log_length', fill='crop'))

pd.crosstab(traj_df_1['crop'], traj_df_1['length'])

# traj_df_1['electrode'].value_counts()

# Consolidate trajectories that nearly overlap in time on the same device, paddle, and electrode.
# traj_df_2 = transforms1.flag_overlapping_trajectories(traj_df_1, grouping=['attribute', 'rt_serial', 'paddle', 'electrode'], cush=6)
traj_df_2 = transforms1.consolidate_trajectories(traj_df_1.query("electrode > 0"), grouping=['attribute', 'crop', 'rt_serial', 'paddle', 'electrode'], cush=6)
# traj_df_2 = traj_df_1
traj_df_2.shape

(8648, 9)

# traj_df_2.head()

# Consolidate trajectories that overlap in time and on adjascent electrodes on the same device, paddle.
# traj_df_3 = transforms1.consolidate_trajectories(traj_df_2.sort_values(['attribute', 'rt_serial', 'paddle', 'start', 'end']), grouping=['attribute', 'rt_serial', 'paddle'], cush=6)
# traj_df_3.shape

traj_df_3a = transforms1.flag_overlapping_trajectories(traj_df_2.sort_values(['attribute', 'crop', 'rt_serial', 'start', 'end']), 
                                                       grouping=['attribute', 'crop', 'rt_serial', 'paddle'], cush=6)
# traj_df_3a.head(n=10)

traj_df_3 = traj_df_3a.groupby(['attribute', 'crop', 'rt_serial', 'paddle', 'group_num']).agg({
    'start': 'min',
    'end': 'max',
    'electrode': 'min'
    }).reset_index(drop=False)
traj_df_3.shape

(4582, 8)

# traj_df_3.head()

# Remove trajectories that overlap in time and on adjascent paddles (we do not use modular arithmatic here, so this is only approximate).
traj_df_4a = transforms1.flag_overlapping_trajectories(traj_df_3.sort_values(['attribute', 'crop', 'rt_serial', 'start', 'end']), grouping=['attribute', 'crop', 'rt_serial'], cush=6)
traj_df_4a['num_in_group'] = traj_df_4a.groupby(['group_num'])['group_num'].transform(lambda x: len(x))
traj_df_4a.shape

(4582, 10)

traj_df_4a.head()

traj_df_4a['num_in_group'].value_counts()

num_in_group
1     1706
2      814
3      465
4      352
5      305
6      252
7      189
8      168
10     100
9       90
11      88
12      24
16      16
13      13
Name: count, dtype: int64

pd.crosstab(traj_df_4a['num_in_group'], traj_df_4a['crop'])

# We remove 2/3 of trajectories by removing things at occur at the same time!!!
traj_df_4 = traj_df_4a.query("num_in_group <= 1")
traj_df_4.shape

(1706, 10)

# traj_df_4.head()

traj_df_5 = traj_df_4.copy()
# traj_df_5 = traj_df_4.merge(crop_df, on='rt_serial')
traj_df_5.shape

(1706, 10)

traj_df_5['length'] = traj_df_5['end'] - traj_df_5['start']
traj_df_5['day']= np.floor(traj_df_5['start'] / (12 * 24)).astype(int)
traj_df_5.head()

gg.ggplot(traj_df_5) + gg.geom_histogram(gg.aes(x='length', fill='crop'), bins=20) + gg.facet_wrap("~attribute")

<Figure Size: (640 x 480)>

# We integrate here, which can be critical to getting a difference if you do not filter out short-length anomalies.
traj_df_6 = traj_df_5.query("length > 3").groupby(['attribute', 'crop', 'rt_serial']).agg({'length': 'sum', 'start': 'size'}).reset_index(drop=False)
traj_df_6['count'] = traj_df_6.pop('start')
traj_df_6.shape

(45, 5)

# traj_df_6.head()

gg.ggplot(traj_df_6) + gg.geom_histogram(gg.aes(x='length', fill='crop'), bins=20) + gg.facet_wrap("~attribute") + \
  gg.ggtitle("Distribution of cumulative anomalous trajectory time by crop")

<Figure Size: (640 x 480)>

gg.ggplot(traj_df_6) + gg.geom_histogram(gg.aes(x='count', fill='crop'), bins=20) + gg.facet_wrap("~attribute") + \
  gg.ggtitle("Distribution of number of anomalous trajectories by crop")

<Figure Size: (640 x 480)>

gg.ggplot(traj_df_5.query("length > 3")) + gg.geom_point(gg.aes('start', 'rt_serial', fill='crop')) + \
  gg.ggtitle("Detection times by device")

<Figure Size: (640 x 480)>

traj_df_day = traj_df_5.groupby(['crop', 'rt_serial', 'day']).agg({'length': 'sum', 'start': 'size', 'electrode': 'mean'}).reset_index(drop=False)
traj_df_day['day'] = pd.Categorical(traj_df_day['day'], ordered=True)
# traj_df_day.head()

gg.ggplot(traj_df_day) + gg.geom_boxplot(gg.aes('day', 'start')) + gg.facet_wrap("~ crop") + \
  gg.ggtitle("Distribution of number of detections by day")

<Figure Size: (640 x 480)>

gg.ggplot(traj_df_day) + gg.geom_boxplot(gg.aes('day', 'electrode')) + gg.facet_wrap("~ crop") + \
  gg.ggtitle("Distribution of number of detections by day")

<Figure Size: (640 x 480)>

gg.ggplot(gstats_ave.query("(paddle != 2) & (paddle !=3)")) + gg.geom_point(gg.aes(x='gv2', y='crop')) + gg.facet_wrap("~electrode")

<Figure Size: (640 x 480)>

temp_y, temp_X = dmatrices('crop ~ np.log(gv2)', data=gstats_ave, return_type='dataframe')

temp_y

mod = sm.GLM(temp_y, temp_X, family=sm.families.Binomial())

results = mod.fit()

print(results.summary())

                      Generalized Linear Model Regression Results                      
=======================================================================================
Dep. Variable:     ['crop[Corn]', 'crop[NPC]']   No. Observations:                  528
Model:                                     GLM   Df Residuals:                      526
Model Family:                         Binomial   Df Model:                            1
Link Function:                           Logit   Scale:                          1.0000
Method:                                   IRLS   Log-Likelihood:                -234.15
Date:                         Wed, 01 Nov 2023   Deviance:                       468.29
Time:                                 15:19:04   Pearson chi2:                     739.
No. Iterations:                              5   Pseudo R-squ. (CS):             0.3931
Covariance Type:                     nonrobust                                         
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -5.0223      0.428    -11.739      0.000      -5.861      -4.184
np.log(gv2)     1.5071      0.121     12.415      0.000       1.269       1.745
===============================================================================

Y_data = Dynamics(rt_deps_to_use, "/usr/local/share/rtphenos-workspace/dynamics", "Y")

dyn_data.get_shape(0), Y_data.get_shape(0)

((8043, 12, 22, 8), (12, 22, 8065, 2))

traj_df_plot = traj_df_4.query("end > (start + 12) & (electrode > 0)").merge(rt_deps_to_use[['rt_serial', 'index']], on='rt_serial')
traj_df_plot.shape # , traj_df_plot.shape[0] / len(traj_df_plot['rt_serial'].unique())

(418, 11)

traj_df_plot.head()

traj_df_plot['crop'].value_counts()

crop
Corn    368
NPC      50
Name: count, dtype: int64

def plot_some(Y_data, traj_df, nr, nc, half_width=11, extra_start=0, extra_end=0, figsize=(12,8), diff=False, plot_against_time=False):
    fig, axs = plt.subplots(nr, nc, squeeze=False, figsize=figsize)
    fig.tight_layout()
    num = nr * nc
    cache = dict()
    for i, rec in enumerate(traj_df.sample(num).to_dict(orient='records')):
        idx = rec['index']
        elec = rec['electrode']
        pad = rec['paddle']
        start = rec['start'] + half_width - extra_start
        end = rec['end'] + half_width + extra_end
        if not idx in cache.keys():
            cache[idx] = Y_data[idx]
        temp_data = cache[idx][pad, elec, start:end, :]
        if diff:
            temp_data = np.diff(temp_data, axis=0)
        temp_diff = np.mean(np.diff(temp_data, axis=0), axis=0)
        r = i % nr
        c = int(i / nr)
        if plot_against_time:
            axs[r, c].plot(range(start, end), temp_data[:,0])
            axs[r, c].plot(range(start, end), temp_data[:,1])
            axs[r, c].scatter([start + extra_start, end - extra_end - 1], [temp_data[extra_start,0], temp_data[(-extra_end+1),0]])
            axs[r, c].scatter([start + extra_start, end - extra_end - 1], [temp_data[extra_start,1], temp_data[(-extra_end+1),1]])
        else:
            axs[r, c].plot(temp_data[:,0], temp_data[:,1])
            axs[r, c].scatter(temp_data[:,0], temp_data[:,1])
            axs[r, c].scatter(temp_data[extra_start,0], temp_data[extra_start,1])
            axs[r, c].scatter(temp_data[-(extra_end+1),0], temp_data[-(extra_end+1),1])
        axs[r, c].set_title(f"E{elec} from {start}-{end} ({len(temp_data)}) for {rec['crop']}")
        # axs[r, c].set_title(f"{temp_diff}")
    return fig, axs

fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'Corn'"), 5, 4, 11, extra_start=50, extra_end=50);
fig.show()

fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'NPC'"), 5, 4, 11, extra_start=50, extra_end=50);
fig.show()

# fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'Corn'"), 5, 4, 11, extra_start=0, extra_end=0, diff=True);
# fig.show()

# fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'NPC'"), 5, 4, 11, extra_start=50, extra_end=50, diff=True);
# fig.show()

fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'Corn'"), 5, 4, 11, extra_start=50, extra_end=50, plot_against_time=True)

fig, axs = plot_some(Y_data, traj_df_plot.query("crop == 'NPC'"), 5, 4, 11, extra_start=50, extra_end=50, plot_against_time=True)

# Can we describe dynamics of 2d time series?
# Can we classify via differences in the flagged trajectories above?
# Can we mimic consolidation by neural net to produce a differentiable approximation?

	deployment_id	roottracker_serial	ram_serial	x	y	bench	crop	genotype	order	pot_on_bench	pot_on_row	rep	row	deployment_name	NPC
0	54	EG00043	D76080D25B7EA126	1.0	11.0	2	Soy	PI416937	11	2	11	1	1	ACC Test 1	False
1	54	EG00044	D76080D25B3BD828	1.0	3.0	1	Wheat	Comet	3	3	3	1	1	ACC Test 1	False
2	54	EG00045	D76080D25B7F7526	4.0	7.0	1	Cotton	PI513384	115	7	7	1	4	ACC Test 1	False
3	54	EG00046	D76080D25B3C9628	2.0	11.0	2	Soy	PI416937	47	2	11	1	2	ACC Test 1	False
4	54	EG00047	D76080D25B3A6C28	3.0	8.0	1	Cotton	PI513384	80	8	8	1	3	ACC Test 1	False

	rt_serial	electrode	gm1	gm2	gv1	gv2	gtv	c1	crop	lgv1	lgv2
0	EG00054	0	556.663980	589.691259	8476.007809	18.282411	8494.290219	0.998606	Corn	9.044995	2.905939
1	EG00054	1	646.822567	699.660508	5339.363699	11.973631	5351.337330	0.994539	Corn	8.582862	2.482707
2	EG00054	2	617.742343	677.520769	9575.443376	20.846439	9596.289815	0.993526	Corn	9.166957	3.037183
3	EG00054	3	628.711593	681.512833	13617.099897	17.044937	13634.144834	0.994724	Corn	9.519082	2.835853
4	EG00054	4	572.286423	618.721389	10021.243756	15.339421	10036.583177	0.995525	Corn	9.212462	2.730426

	crop	electrode	gm1	gm2	gv1	gv2	gtv	c1
0	Corn	0	738.257346	788.790427	8380.142913	33.700301	8413.843214	0.997881
1	Corn	1	718.479744	773.193609	6229.639808	42.715793	6272.355601	0.997577
2	Corn	2	689.364606	754.961427	6370.560862	58.982296	6429.543158	0.996659
3	Corn	3	692.777519	751.060568	7587.832542	55.438604	7643.271146	0.997187
4	Corn	4	698.128753	760.413589	6867.828686	52.119477	6919.948164	0.996454

	loc_1	loc_2	mean_mom_1	mean_mom_2	std_mom_1	std_mom_2	mean_angm	std_angm	rt_serial	crop
0	0.016282	0.012950	0.015381	0.012310	0.004264	0.004264	0.011759	0.004318	EG00054	Corn
1	0.001625	0.000000	0.003682	0.001174	0.010529	0.010529	0.004160	0.010650	EG00055	NPC
2	0.025127	0.019365	0.012861	0.014298	0.010715	0.010715	0.012822	0.010806	EG00056	Corn
3	0.008031	0.020985	0.009055	0.010772	0.007537	0.007537	0.010538	0.007614	EG00059	Corn
4	0.008623	0.016321	0.010708	0.012938	0.017491	0.017491	0.013330	0.017611	EG00060	Corn

	loc_1	loc_2	mean_mom_1	mean_mom_2	std_mom_1	std_mom_2	mean_angm	std_angm
crop
Corn	0.010785	0.020192	0.010708	0.012682	0.010255	0.010255	0.010192	0.01036
NPC	0.004164	0.000653	0.008826	0.005156	0.010529	0.010529	0.008363	0.01065

Introduction¶

Setup¶

Get list of data to process¶

Load data¶

Global statistics¶

Data Exploration¶

Identifying anomalous portions of trajectories¶

Global metrics for separation¶

Analysis of specific trajectories ...¶

Conclusion¶

	loc_1	loc_2	mean_mom_1	mean_mom_2	std_mom_1	std_mom_2	mean_angm	std_angm
crop
Corn	0.015289	0.019067	0.008533	0.007389	0.010431	0.008228	0.008807	0.010377
NPC	0.001130	0.001321	0.008012	0.004919	0.007515	0.003631	0.007293	0.007891

length	1	2	3	4	5	6	7	8	9	10	...	46	47	50	51	52	53	54	55	58	67
crop
Corn	4078	1581	785	525	380	276	267	208	171	150	...	3	1	2	2	1	1	2	2	1	1
NPC	2270	824	391	259	162	131	124	93	88	69	...	0	0	0	1	0	0	0	0	0	0

crop	Corn	NPC
num_in_group
1	1386	320
2	674	140
3	366	99
4	244	108
5	210	95
6	162	90
7	147	42
8	128	40
9	54	36
10	60	40
11	55	33
12	24	0
13	0	13
16	0	16

	crop[Corn]	crop[NPC]
0	1.0	0.0
1	1.0	0.0
2	1.0	0.0
3	1.0	0.0
4	1.0	0.0
...	...	...
523	0.0	1.0
524	0.0	1.0
525	0.0	1.0
526	0.0	1.0
527	0.0	1.0

	crop[Corn]	crop[NPC]
0	1.0	0.0
1	1.0	0.0
2	1.0	0.0
3	1.0	0.0
4	1.0	0.0
...	...	...
523	0.0	1.0
524	0.0	1.0
525	0.0	1.0
526	0.0	1.0
527	0.0	1.0

	crop[Corn]	crop[NPC]
0	1.0	0.0
1	1.0	0.0
2	1.0	0.0
3	1.0	0.0
4	1.0	0.0
...	...	...
523	0.0	1.0
524	0.0	1.0
525	0.0	1.0
526	0.0	1.0
527	0.0	1.0