This notebook shows results for principal components regression (PCR) of the NGAqua data.

In [1]:
import xarray as xr
import numpy as np
import pandas as pd

from toolz import *
from toolz.curried import get

from lib.util import mat_to_xarray
from xnoah.data_matrix import unstack_cat

from tqdm import tqdm
from sklearn.externals import joblib
from lib.models import get_pcr_mod, mse, weighted_r2_score
from lib.util import dict_to_xr
from lib.hvops import ndoverlay_to_contours
import holoviews as hv
hv.extension('matplotlib')

import matplotlib.pyplot as plt
from matplotlib.colors import SymLogNorm
%matplotlib inline
In [2]:
data = joblib.load("../data/ml/ngaqua/data.pkl")
ntrain = 10000

_, weight_out = data['w']

x_train, y_train = data['train']
x_test, y_test = data['test']

# training indices (random sample for speed)
train_inds = np.random.choice(x_train.shape[0], ntrain, replace=False)


def score_model(mod, x, y):
    pred = mod.predict(x)
    return weighted_r2_score(y, pred, weight_out)
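
The scoring helper weighted_r2_score is imported from lib.models and is not shown in this notebook. As a rough guide, here is a minimal sketch of a weighted R² score, assuming weight_out weights the output features (e.g. by the mass of each vertical level); the real implementation may differ in its details:

import numpy as np

def weighted_r2_sketch(y_true, y_pred, w):
    # Weighted R^2 = 1 - SS_res / SS_tot, with every output feature
    # weighted by w (e.g. a mass weighting of the vertical levels).
    ss_res = np.sum(w * (y_true - y_pred) ** 2)
    ss_tot = np.sum(w * (y_true - y_true.mean(axis=0)) ** 2)
    return 1 - ss_res / ss_tot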

Make the PCR model

In [3]:
pcr = get_pcr_mod(data)
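
get_pcr_mod is defined in lib.models and is not reproduced here. For orientation, this is a minimal sketch of the idea behind a PCR model in scikit-learn terms; it is purely illustrative, since the real object additionally exposes n_components via set_params, explained_variance_ratio_, and the fitted linear model as .mod:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

def pcr_sketch(n_components=10):
    # Hypothetical PCR pipeline: standardize the inputs, project onto
    # the leading principal components, and regress the outputs on
    # the resulting scores.
    return make_pipeline(
        StandardScaler(),
        PCA(n_components=n_components),
        LinearRegression(),
    )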
In [4]:
cv_data = {}


for n in tqdm([1, 2, 5, 10, 20, 30, 40, 50, 60, 68]):
    # set the number of components to keep
    pcr.set_params(n_components=n)
    pcr.fit(x_train[train_inds], y_train[train_inds])
    cross_val_mse = mse(pcr.predict(x_test), y_test, dims=['samples'])

    cv_data[n] = {'test_score': score_model(pcr, x_test, y_test),
                  'train_score': score_model(pcr, x_train, y_train),
                  'mse_profile': cross_val_mse}
100%|██████████| 10/10 [00:54<00:00,  5.43s/it]
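
The mse helper is also imported from lib.models. A plausible xarray-style sketch of it, assuming it averages the squared error over the listed dimensions so that one value per stacked output feature remains:

def mse_sketch(pred, truth, dims):
    # Hypothetical stand-in for lib.models.mse: mean squared error
    # reduced over `dims` (here 'samples'), leaving a profile over
    # the stacked 'features' dimension.
    return ((truth - pred) ** 2).mean(dim=dims)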
In [5]:
%%opts NdOverlay[legend_position='top_left']
df = pd.DataFrame({'test': valmap(get('test_score'), cv_data),
                   'train': valmap(get('train_score'), cv_data)}).reset_index()
hv.Table(pd.melt(df, id_vars='index')).to.curve("index", "value").overlay()\
.redim.range(value=(0,.4))\
.redim.label(index="n", value="R2 score")
Out[5]:

PCR performs poorly for small numbers of retained components. How much of the input variance do the leading components explain?

In [6]:
hv.Curve(pcr.explained_variance_ratio_.cumsum(), kdims=['n'], vdims=['Cumulative fraction of explained variance'])\
.redim.range(n=(0,20))
Out[6]:

The first few modes explain much of the variance, so, as with the MCA-based regression, the main problem is likely nonlinearity in the relationship between the modes rather than the importance of extremely low-variance modes.

Now, what do the vertical structures of the errors look like? To answer this, let's first collect all the MSE data into one data array.

In [7]:
cross_val_mse = valmap(get('mse_profile'), cv_data)

mse_data = xr.concat(cross_val_mse.values(), dim=pd.Index(cross_val_mse.keys(), name='n')).unstack("features")
mse_data
Out[7]:
<xarray.DataArray 'Q1c' (n: 10, variable: 2, z: 34)>
array([[[  1.895459e+00,   1.672636e+00, ...,   1.866873e+00,   1.812099e+00],
        [  8.866934e+00,   1.808025e+01, ...,   0.000000e+00,   0.000000e+00]],

       [[  1.844926e+00,   1.641113e+00, ...,   1.871709e+00,   1.812150e+00],
        [  8.867331e+00,   1.812689e+01, ...,   0.000000e+00,   0.000000e+00]],

       ..., 
       [[  1.125056e+00,   1.139625e+00, ...,   1.710842e+00,   1.695240e+00],
        [  4.380645e+00,   1.008255e+01, ...,   0.000000e+00,   0.000000e+00]],

       [[  2.260130e+02,   6.483677e+02, ...,   7.558959e+01,   3.123170e+00],
        [  4.856414e+02,   2.257794e+03, ...,   0.000000e+00,   0.000000e+00]]])
Coordinates:
  * n         (n) int64 1 2 5 10 20 30 40 50 60 68
  * variable  (variable) object 'Q1c' 'Q2'
  * z         (z) float64 37.0 112.0 194.0 288.0 395.0 520.0 667.0 843.0 ...

Here I overlay the MSE profiles for all values of n in one plot.

In [8]:
%%opts Curve[invert_axes=True] NdOverlay[show_legend=False]
m = hv.Dataset(mse_data.sel(n=slice(0,40))).to.curve("z").overlay("n").layout("variable").redim.label(Q1c="MSE")
m
Out[8]:
In [9]:
%%opts Contours[invert_axes=True, colorbar=True](cmap='Blues')


curves = hv.Dataset(mse_data.sel(n=slice(0,40))).to.curve("z")
lay = curves.overlay('n').layout('variable')
lay.map(ndoverlay_to_contours, hv.NdOverlay)
Out[9]:

It seems that the errors are larger for Q1c than for Q2. Interestingly, including more components decreases the error in the troposphere, but does not decrease the error for Q1c in the stratosphere. This indicates that the stratospheric errors are inherently unpredictable.

Visualizing the response

In [10]:
def plot_coefs(nc):
    """Plot the linear response to the components when nc components are used in total"""
    pcr = get_pcr_mod(data)
    pcr.set_params(n_components=nc)

    pcr.fit(x_train[train_inds], y_train[train_inds])

    coefs = pcr.mod.coef_
    coefs = unstack_cat(mat_to_xarray(coefs, {0: y_train.features}), "features")

    opts = 'Curve[invert_axes=True] {+framewise} NdOverlay[legend_position="top_left"]'

    plotme = coefs.to_array().to_dataset(name="K/day")
    return hv.Dataset(plotme).to.curve("z").overlay("variable").opts(opts).redim.label(dim_1="mode")

Here is the response of Q1c and Q2 to a one-standard-deviation change in each of the components.
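
(This reading assumes the PCA scores are standardized before the linear step: if the fitted model is $y \approx \beta_0 + \sum_k \beta_k \, c_k/\sigma_k$, then the $k$-th column $\beta_k$ of mod.coef_ is exactly the output perturbation produced by a one-standard-deviation increase in component $k$.)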

In [11]:
plot_coefs(40)
Out[11]: