Source code for bmorph.core.bmorph

"""
bmorph: modify a time series by removing elements of persistent differences
        (aka bias correction)

Persistent differences are inferred by comparing a 'ref' sample with a
'training' sample. These differences are then used to correct a 'raw' sample
that is presumed to have the same persistent differences as the 'training'
sample. The resulting 'bmorph' sample should then be consistent with the
'ref' sample.
"""

import numpy as np
import pandas as pd
import scipy.stats
from sklearn.neighbors import KernelDensity

# Don't fail silently on divide by zero, but raise an error instead
np.seterr(divide='raise')


def kde2D(x, y, xbins=200, ybins=10, **kwargs):
    """Estimate a 2 dimensional pdf via kernel density estimation"""
    xx, yy = np.mgrid[x.min():x.max():(xbins * 1j),
                      y.min():y.max():(ybins * 1j)]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T
    kde = KernelDensity(**kwargs)
    kde.fit(xy_train)
    # Need to exponentiate to get true probabilities
    z = np.exp(kde.score_samples(xy_sample))
    return xx[:, 0], yy[0, :], np.reshape(z, yy.shape)

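# Illustrative usage sketch (not part of the original module): estimate a smooth
# 2-D pdf for synthetic flow and temperature samples. The names `flows` and
# `temps` and the parameter values are assumptions made for this example only;
# extra keyword arguments are passed through to sklearn's KernelDensity.
#
#   >>> rng = np.random.default_rng(0)
#   >>> flows = rng.lognormal(mean=3.0, sigma=0.5, size=1000)
#   >>> temps = rng.normal(loc=10.0, scale=5.0, size=1000)
#   >>> xc, yc, pdf = kde2D(flows, temps, xbins=50, ybins=5, bandwidth=3.0)
#   >>> pdf.shape   # (xbins, ybins)
#   (50, 5)
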
def hist2D(x, y, xbins, ybins, **kwargs):
    """Create a 2 dimensional pdf via numpy histogram2d"""
    H, xedg, yedg = np.histogram2d(x=x, y=y, bins=[xbins, ybins],
                                   density=True, **kwargs)
    xcen = (xedg[:-1] + xedg[1:]) / 2
    ycen = (yedg[:-1] + yedg[1:]) / 2
    return xcen, ycen, H

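# Illustrative usage sketch (not part of the original module): the histogram
# estimator is a faster, coarser alternative to kde2D and returns bin centers
# rather than edges. `flows` and `temps` are the assumed synthetic arrays from
# the kde2D example above.
#
#   >>> xc, yc, pdf = hist2D(flows, temps, xbins=50, ybins=5)
#   >>> xc.shape, yc.shape, pdf.shape
#   ((50,), (5,), (50, 5))
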
def marginalize_cdf(y_raw, z_raw, vals):
    """Find the marginalized cdf by computing cumsum(P(x|y=val)) for each val"""
    y_raw = np.array(y_raw)
    z_raw = np.array(z_raw)
    z_raw[z_raw == 0] = 1e-15
    vals = np.array(vals)
    locs = np.argmin(np.abs(vals[:, np.newaxis] - y_raw), axis=1)
    z = np.cumsum(z_raw[:, locs], axis=0)
    z /= z[-1, :]
    return z

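# Illustrative usage sketch (not part of the original module): given the bin
# centers and pdf from hist2D (or kde2D), build a lookup table of conditional
# CDFs with one column per time step. The inputs continue the hist2D example
# above and are assumptions for illustration only.
#
#   >>> xc, yc, pdf = hist2D(flows, temps, xbins=50, ybins=5)
#   >>> cond_cdf = marginalize_cdf(yc, pdf, temps)
#   >>> cond_cdf.shape   # (xbins, number of time steps)
#   (50, 1000)
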
def cqm(raw_x: pd.Series, train_x: pd.Series, ref_x: pd.Series,
        raw_y: pd.Series, train_y: pd.Series, ref_y: pd.Series=None,
        method='hist', xbins=200, ybins=10, bw=3, rtol=1e-7, atol=0,
        nsmooth=5, train_cdf_min=1e-6) -> pd.Series:
    r"""Conditional Quantile Mapping

    Multidimensional conditional equidistant CDF matching function:

        \tilde{x_{mp}} = x_{mp} + F^{-1}_{oc}(F_{mp}(x_{mp}|y_{mp})|y_{oc})
                                - F^{-1}_{mc}(F_{mp}(x_{mp}|y_{mp})|y_{mc})
    """
    if method == 'kde':
        x_raw, y_raw, z_raw = kde2D(raw_x, raw_y, xbins, ybins,
                                    bandwidth=bw, rtol=rtol, atol=atol)
        x_train, y_train, z_train = kde2D(train_x, train_y, xbins, ybins,
                                          bandwidth=bw, rtol=rtol, atol=atol)
        x_ref, y_ref, z_ref = kde2D(ref_x, ref_y, xbins, ybins,
                                    bandwidth=bw, rtol=rtol, atol=atol)
    elif method == 'hist':
        x_raw, y_raw, z_raw = hist2D(raw_x, raw_y, xbins, ybins)
        x_train, y_train, z_train = hist2D(train_x, train_y, xbins, ybins)
        x_ref, y_ref, z_ref = hist2D(ref_x, ref_y, xbins, ybins)
    else:
        raise Exception("Current methods for cqm only include 'hist' to use hist2D "
                        "and 'kde' to use kde2D, please select one.")

    # Lookup tables of actual cdfs with dimensions (xbins, time)
    raw_cdfs = marginalize_cdf(y_raw, z_raw, raw_y)
    # Lookup tables of actual cdfs with dimensions (time, xbins)
    # These are transposed to do the inverse lookup
    # Note we always marginalize with respect to the raw values
    train_cdf = marginalize_cdf(y_train, z_train, raw_y).T
    ref_cdf = marginalize_cdf(y_ref, z_ref, raw_y).T

    # Quantiles of the raw flows
    nx_raw = np.arange(len(raw_x))
    u_t = raw_cdfs[np.argmin(np.abs(raw_x.values[:, np.newaxis] - x_raw), axis=1),
                   nx_raw]
    u_t = u_t[:, np.newaxis]

    # Look up the associated flow values in the CDFs for the reference and
    # training periods
    mapped_ref = x_ref[np.argmin(np.abs(u_t - ref_cdf[nx_raw, :]), axis=1)]
    mapped_train = x_train[np.argmin(np.abs(u_t - train_cdf[nx_raw, :]), axis=1)]
    mapped_train[mapped_train < train_cdf_min] = train_cdf_min
    multipliers = pd.Series(mapped_ref / mapped_train, index=raw_x.index,
                            name='multiplier')
    if method == 'hist':
        # Do some smoothing just to reduce effects of histogram bin edges
        multipliers = multipliers.rolling(nsmooth, win_type='triang',
                                          min_periods=1).mean()
    return multipliers

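# Illustrative usage sketch (not part of the original module): map raw flows to
# multipliers conditioned on a second variable (e.g. temperature). All of the
# synthetic series below are assumptions for demonstration only.
#
#   >>> n = 2000
#   >>> rng = np.random.default_rng(1)
#   >>> raw_x = pd.Series(rng.lognormal(3.0, 0.5, n))
#   >>> train_x = pd.Series(rng.lognormal(3.0, 0.5, n))
#   >>> ref_x = pd.Series(rng.lognormal(3.2, 0.5, n))
#   >>> raw_y = pd.Series(rng.normal(10.0, 5.0, n))
#   >>> train_y = pd.Series(rng.normal(10.0, 5.0, n))
#   >>> ref_y = pd.Series(rng.normal(11.0, 5.0, n))
#   >>> mults = cqm(raw_x, train_x, ref_x, raw_y, train_y, ref_y,
#   ...             method='hist', xbins=50, ybins=5)
#   >>> corrected = mults * raw_x
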
def edcdfm(raw_x, raw_cdf, train_cdf, ref_cdf, train_cdf_min=1e-6):
    '''Calculate multipliers using an adapted version of the EDCDFm technique

    This routine implements part of the PresRat bias correction method from
    Pierce et al. (2015; http://dx.doi.org/10.1175/JHM-D-14-0236.1), which is
    itself an extension of the Equidistant quantile matching (EDCDFm)
    technique of Li et al. (2010; http://dx.doi.org/10.1029/94JD00483).

    The part that is implemented here is the amended form of EDCDFm that
    determines multiplicative changes in the quantiles of a CDF. In
    particular, if the value `raw_x` falls at quantile `u_t` (in `raw_cdf`),
    then the bias-corrected value is the value in `ref_cdf` at `u_t`
    (`ref_x`) multiplied by the model-predicted change at `u_t` evaluated as
    a ratio (i.e., model future (or `raw_x`) / model historical (or
    `train_x`)). Thus, the bias-corrected value is `raw_x` multiplied by
    `ref_x`/`train_x`. Here we only return the multiplier `ref_x`/`train_x`.

    This method preserves the model-predicted median (not mean) change
    evaluated multiplicatively. Additional corrections are required to
    preserve the mean change. Inclusion of these additional corrections
    constitutes the PresRat method.

    Parameters
    ----------
    raw_x : pandas.Series
        Series of raw values that will be used to determine the quantile
        `u_t`
    raw_cdf : pandas.Series
        Series of raw values that represents the CDF that is used to
        determine the non-parametric quantile of `raw_x`
    train_cdf : pandas.Series
        Series of training values that represents the CDF based on the same
        process as `raw_cdf`, but overlapping in time with `ref_cdf`
    ref_cdf : pandas.Series
        Series of ref values that represents the ref CDF and that overlaps
        in time with `train_cdf`
    train_cdf_min : float, optional
        Minimum value allowed for values drawn from `train_cdf`, which
        prevents very large multipliers

    Returns
    -------
    multiplier : pandas.Series
        Multipliers for `raw_x`. The pandas.Series has the same index as
        `raw_x`
    '''
    # Type checking (note that the checking is more strict here than it
    # probably needs to be)
    assert isinstance(raw_x, pd.Series)
    assert isinstance(raw_cdf, pd.Series)
    assert isinstance(train_cdf, pd.Series)
    assert isinstance(ref_cdf, pd.Series)

    # Given raw_x and raw_cdf determine the quantiles u_t
    # This method is slightly more efficient than using
    # scipy.percentileofscore, especially on large arrays
    cdf_idx = np.argsort(raw_cdf)
    cdf_sort = raw_cdf[cdf_idx]
    cdf_rank = 100 * scipy.stats.rankdata(cdf_sort) / len(cdf_idx)
    offset = 100 / len(cdf_idx)
    u_t = [cdf_rank[np.searchsorted(cdf_sort, x, side='left')] - offset
           for x in raw_x]

    # Given u_t and train_cdf determine train_x
    train_x = np.percentile(train_cdf, u_t)
    train_x[train_x < train_cdf_min] = train_cdf_min

    # Given u_t and ref_cdf determine ref_x
    ref_x = np.percentile(ref_cdf, u_t)

    # Calculate multiplier
    multiplier = ref_x / train_x
    return pd.Series(multiplier, index=raw_x.index, name='multiplier')

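# Illustrative usage sketch (not part of the original module): the CDFs are
# passed as plain pandas.Series of values. The synthetic series below are
# assumptions for demonstration only; here `raw_x` is simply drawn from
# `raw_cdf` so its values stay within that CDF's range.
#
#   >>> rng = np.random.default_rng(2)
#   >>> raw_cdf = pd.Series(rng.lognormal(3.0, 0.5, 1000))
#   >>> train_cdf = pd.Series(rng.lognormal(3.0, 0.5, 1000))
#   >>> ref_cdf = pd.Series(rng.lognormal(3.2, 0.5, 1000))
#   >>> raw_x = raw_cdf[:100]
#   >>> mults = edcdfm(raw_x, raw_cdf, train_cdf, ref_cdf)
#   >>> corrected = mults * raw_x
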
def bmorph(raw_ts, train_ts, ref_ts,
           raw_apply_window, raw_train_window, ref_train_window,
           raw_cdf_window,
           raw_y=None, ref_y=None, train_y=None,
           nsmooth=12, bw=3, xbins=200, ybins=10, rtol=1e-7, atol=0,
           method='hist', smooth_multipliers=True, train_cdf_min=1e-6):
    '''Morph `raw_ts` based on differences between `ref_ts` and `train_ts`

    bmorph is an adaptation of the PresRat bias correction procedure from
    Pierce et al. (2015; http://dx.doi.org/10.1175/JHM-D-14-0236.1), which is
    itself an extension of the Equidistant quantile matching (EDCDFm)
    technique of Li et al. (2010; http://dx.doi.org/10.1029/94JD00483). The
    method as implemented here uses a multiplicative change in the quantiles
    of a CDF, followed by a simple correction to preserve changes in the
    long-term mean. The method differs from PresRat in that it is not applied
    for fixed periods (but uses a moving window) to prevent discontinuities
    in the corrected time series and it does not apply a frequency-based
    correction. The method also allows changes to be made through an adapted
    version of the EDCDFm technique or through the multidimensional
    conditional equidistant CDF matching (`cqm`) function if a second time
    series variable is passed.

    Parameters
    ----------
    raw_ts : pandas.Series
        Raw time series that will be bmorphed
    train_ts : pandas.Series
        Training time series. This time series is generated by the same
        process as `raw_ts` but overlaps with `ref_ts`. It is used to
        calculate `train_cdf`
    ref_ts : pandas.Series
        Target time series. This is the time series with ref values that
        overlaps with `train_ts` and is used to calculate `ref_cdf`
    raw_apply_window : slice
        Slice of `raw_ts` that will be bmorphed
    raw_train_window : slice
        Slice used to subset `train_ts` when the mapping between `train_ts`
        and `ref_ts` is created
    ref_train_window : slice
        Slice used to subset `ref_ts` when the mapping between `train_ts`
        and `ref_ts` is created
    raw_cdf_window : slice
        Slice used to determine the CDF for `raw_ts`
    raw_y : pandas.Series, optional
        Raw time series of the second variable for cqm
    ref_y : pandas.Series, optional
        Target time series of the second variable for cqm
    train_y : pandas.Series, optional
        Training time series of the second variable for cqm
    nsmooth : int
        Number of elements that will be smoothed when determining CDFs
    bw : int
        Bandwidth for KernelDensity
    xbins : int
        Bins for the flow time series
    ybins : int
        Bins for the second time series
    method : str
        Method used by `cqm` to estimate the conditional CDFs, either 'hist'
        or 'kde'
    smooth_multipliers : bool
        If True, smooth the multipliers returned by `cqm` with a rolling mean
    train_cdf_min : float, optional
        Minimum value allowed for the train cdf. Defaults to 1e-6 to help
        handle data spikes in the corrections that are caused by multipliers
        becoming too large when the ratio between reference and training
        flows is large.

    Returns
    -------
    bmorph_ts : pandas.Series
        Time series of length `raw_apply_window` with bmorphed values
    bmorph_multipliers : pandas.Series
        Multipliers that were applied to `raw_ts` to obtain `bmorph_ts`
    '''
    # Type checking
    assert isinstance(raw_ts, pd.Series)
    assert isinstance(ref_ts, pd.Series)
    assert isinstance(train_ts, pd.Series)
    assert isinstance(raw_apply_window, slice)
    assert isinstance(raw_train_window, slice)
    assert isinstance(ref_train_window, slice)
    assert isinstance(raw_cdf_window, slice)

    # Smooth the raw Series
    raw_smoothed_ts = raw_ts.rolling(window=nsmooth, min_periods=1,
                                     center=True).mean()

    # Create the CDFs that are used for morphing the raw_ts. The mapping is
    # based on the training window
    ref_cdf = ref_ts[ref_train_window].rolling(window=nsmooth, min_periods=1,
                                               center=True).mean()
    train_cdf = train_ts[raw_train_window].rolling(window=nsmooth,
                                                   min_periods=1,
                                                   center=True).mean()
    raw_cdf = raw_smoothed_ts[raw_cdf_window]

    # Check whether to use edcdfm or cqm based on whether a second variable
    # was passed for the raw and train series (ref can be set as train later)
    if (raw_y is None) or (ref_y is None) or (train_y is None):
        # Calculate the bmorph multipliers based on the smoothed time series
        # and CDFs
        bmorph_multipliers = edcdfm(raw_smoothed_ts[raw_apply_window],
                                    raw_cdf, train_cdf, ref_cdf,
                                    train_cdf_min)
        # Apply the bmorph multipliers to the raw time series
        bmorph_ts = bmorph_multipliers * raw_ts[raw_apply_window]
    else:
        # Continue type checking for the additional series
        assert isinstance(raw_y, pd.Series)
        assert isinstance(ref_y, pd.Series)
        assert isinstance(train_y, pd.Series)

        # Smooth the y series as well
        raw_smoothed_y = raw_y.rolling(
            window=nsmooth, min_periods=1, center=True).mean()
        ref_smoothed_y = ref_y[ref_train_window].rolling(
            window=nsmooth, min_periods=1, center=True).mean()
        train_smoothed_y = train_y[raw_train_window].rolling(
            window=nsmooth, min_periods=1, center=True).mean()

        bmorph_multipliers = cqm(raw_cdf, train_cdf, ref_cdf,
                                 raw_smoothed_y[raw_cdf_window],
                                 train_smoothed_y, ref_smoothed_y,
                                 bw=bw, xbins=xbins, ybins=ybins,
                                 rtol=rtol, atol=atol, method=method,
                                 train_cdf_min=train_cdf_min)
        if smooth_multipliers:
            bmorph_multipliers = bmorph_multipliers.rolling(
                window=nsmooth, min_periods=1, center=True).mean()
        bmorph_ts = bmorph_multipliers[raw_apply_window] * raw_ts[raw_apply_window]
    return bmorph_ts, bmorph_multipliers

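# Illustrative usage sketch (not part of the original module): the window
# arguments are slices over the time index. The synthetic daily series and the
# window choices below are assumptions for demonstration only; without the
# optional y series the univariate edcdfm path is used.
#
#   >>> idx = pd.date_range('1981-01-01', '2010-12-31', freq='D')
#   >>> rng = np.random.default_rng(3)
#   >>> raw_ts = pd.Series(rng.lognormal(3.0, 0.5, len(idx)), index=idx)
#   >>> train_ts = pd.Series(rng.lognormal(3.0, 0.5, len(idx)), index=idx)
#   >>> ref_ts = pd.Series(rng.lognormal(3.2, 0.5, len(idx)), index=idx)
#   >>> apply_window = slice('2001-01-01', '2010-12-31')
#   >>> train_window = slice('1981-01-01', '2000-12-31')
#   >>> bc_ts, mults = bmorph(raw_ts, train_ts, ref_ts,
#   ...                       raw_apply_window=apply_window,
#   ...                       raw_train_window=train_window,
#   ...                       ref_train_window=train_window,
#   ...                       raw_cdf_window=apply_window)
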
def bmorph_correct(raw_ts, bmorph_ts, correction_window,
                   ref_mean, train_mean, nsmooth):
    '''Correct bmorphed values to preserve the ratio of change

    Apply a correction to bmorphed values to preserve the mean change over a
    `correction_window`. This is similar to the correction applied in the
    original PresRat algorithm (Pierce et al. 2015;
    http://dx.doi.org/10.1175/JHM-D-14-0236.1), except that we use a rolling
    mean to determine the correction to avoid discontinuities on the
    boundaries.

    Parameters
    ----------
    raw_ts : pandas.Series
        Series of raw values that have not been bmorphed
    bmorph_ts : pandas.Series
        Bmorphed version of `raw_ts`
    correction_window : slice
        Slice of `raw_ts` and `bmorph_ts` over which the correction is
        applied
    ref_mean : float
        Mean of the target time series (`ref_ts`) for the base period
    train_mean : float
        Mean of the training time series (`train_ts`) for the base period
    nsmooth : int
        Number of elements that will be smoothed in `raw_ts` and
        `bmorph_ts`. The nsmooth value in this case is typically much larger
        than the one used for the bmorph function itself, for example,
        365 days.

    Returns
    -------
    bmorph_corrected_ts : pandas.Series
        Corrected series of length `correction_window`
    correction_ts : pandas.Series
        Correction factors that were applied to `bmorph_ts`
    '''
    # Type checking
    assert isinstance(raw_ts, pd.Series)
    assert isinstance(bmorph_ts, pd.Series)
    assert isinstance(correction_window, slice)

    # Smooth the raw and bmorphed time series
    raw_ts_smoothed = raw_ts.rolling(window=nsmooth, min_periods=1,
                                     center=True).mean()
    bmorph_ts_smoothed = bmorph_ts.rolling(window=nsmooth, min_periods=1,
                                           center=True).mean()

    # Calculate the correction factors
    correction_ts = raw_ts_smoothed[correction_window] / \
        bmorph_ts_smoothed[correction_window] * \
        ref_mean / train_mean

    # Apply the correction to the bmorph time series
    bmorph_corrected_ts = correction_ts * bmorph_ts[correction_window]

    return bmorph_corrected_ts, correction_ts

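# Illustrative usage sketch (not part of the original module), continuing the
# bmorph example above: preserve the mean change over the applied window. The
# window, the base-period means, and nsmooth=365 are assumptions for
# demonstration only.
#
#   >>> ref_mean = ref_ts[train_window].mean()
#   >>> train_mean = train_ts[train_window].mean()
#   >>> corrected_ts, corr = bmorph_correct(raw_ts, bc_ts, apply_window,
#   ...                                     ref_mean, train_mean, nsmooth=365)
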