Source code for bmorph.core.bmorph

"""
bmorph: modify a time series by removing elements of persistent differences
        (aka bias correction)

Persistent differences are inferred by comparing a 'ref' sample with a
'training' sample. These differences are then used to correct a 'raw' sample
that is presumed to have the same persistent differences as the 'training'
sample. The resulting 'bmorph' sample should then be consistent with the
'ref' sample.
"""

import numpy as np
import pandas as pd
import scipy.stats
from sklearn.neighbors import KernelDensity

# Don't fail silently on divide by zero, but raise an error instead
np.seterr(divide='raise')


def kde2D(x, y, xbins=200, ybins=10, **kwargs):
    """Estimate a 2 dimensional pdf via kernel density estimation"""
    xx, yy = np.mgrid[x.min():x.max():(xbins * 1j),
                      y.min():y.max():(ybins * 1j)]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T
    kde = KernelDensity(**kwargs)
    kde.fit(xy_train)
    # Need to exponentiate to get true probabilities
    z = np.exp(kde.score_samples(xy_sample))
    return xx[:, 0], yy[0, :], np.reshape(z, yy.shape)

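# Illustrative usage sketch (not part of the original module): estimate a smooth
# 2-D pdf for synthetic flow and temperature samples. The names `flows` and
# `temps` and the parameter values are assumptions made for this example only;
# extra keyword arguments are passed through to sklearn's KernelDensity.
#
#   >>> rng = np.random.default_rng(0)
#   >>> flows = rng.lognormal(mean=3.0, sigma=0.5, size=1000)
#   >>> temps = rng.normal(loc=10.0, scale=5.0, size=1000)
#   >>> xc, yc, pdf = kde2D(flows, temps, xbins=50, ybins=5, bandwidth=3.0)
#   >>> pdf.shape   # (xbins, ybins)
#   (50, 5)
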
def hist2D(x, y, xbins, ybins, **kwargs):
    """Create a 2 dimensional pdf via numpy histogram2d"""
    H, xedg, yedg = np.histogram2d(x=x, y=y, bins=[xbins, ybins],
                                   density=True, **kwargs)
    xcen = (xedg[:-1] + xedg[1:]) / 2
    ycen = (yedg[:-1] + yedg[1:]) / 2
    return xcen, ycen, H

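# Illustrative usage sketch (not part of the original module): the histogram
# estimator is a faster, coarser alternative to kde2D and returns bin centers
# rather than edges. `flows` and `temps` are the assumed synthetic arrays from
# the kde2D example above.
#
#   >>> xc, yc, pdf = hist2D(flows, temps, xbins=50, ybins=5)
#   >>> xc.shape, yc.shape, pdf.shape
#   ((50,), (5,), (50, 5))
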
def marginalize_cdf(y_raw, z_raw, vals):
    """Find the marginalized cdf by computing cumsum(P(x|y=val)) for each val"""
    y_raw = np.array(y_raw)
    z_raw = np.array(z_raw)
    z_raw[z_raw == 0] = 1e-15
    vals = np.array(vals)
    locs = np.argmin(np.abs(vals[:, np.newaxis] - y_raw), axis=1)
    z = np.cumsum(z_raw[:, locs], axis=0)
    z /= z[-1, :]
    return z

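# Illustrative usage sketch (not part of the original module): given the bin
# centers and pdf from hist2D (or kde2D), build a lookup table of conditional
# CDFs with one column per time step. The inputs continue the hist2D example
# above and are assumptions for illustration only.
#
#   >>> xc, yc, pdf = hist2D(flows, temps, xbins=50, ybins=5)
#   >>> cond_cdf = marginalize_cdf(yc, pdf, temps)
#   >>> cond_cdf.shape   # (xbins, number of time steps)
#   (50, 1000)
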
def cqm(raw_x: pd.Series, train_x: pd.Series, ref_x: pd.Series,
        raw_y: pd.Series, train_y: pd.Series, ref_y: pd.Series=None,
        method='hist', xbins=200, ybins=10, bw=3, rtol=1e-7, atol=0,
        nsmooth=5, train_cdf_min=1e-6) -> pd.Series:
    r"""Conditional Quantile Mapping

    Multidimensional conditional equidistant CDF matching function:

        \tilde{x_{mp}} = x_{mp} + F^{-1}_{oc}(F_{mp}(x_{mp}|y_{mp})|y_{oc})
                                - F^{-1}_{mc}(F_{mp}(x_{mp}|y_{mp})|y_{mc})
    """
    if method == 'kde':
        x_raw, y_raw, z_raw = kde2D(raw_x, raw_y, xbins, ybins,
                                    bandwidth=bw, rtol=rtol, atol=atol)
        x_train, y_train, z_train = kde2D(train_x, train_y, xbins, ybins,
                                          bandwidth=bw, rtol=rtol, atol=atol)
        x_ref, y_ref, z_ref = kde2D(ref_x, ref_y, xbins, ybins,
                                    bandwidth=bw, rtol=rtol, atol=atol)
    elif method == 'hist':
        x_raw, y_raw, z_raw = hist2D(raw_x, raw_y, xbins, ybins)
        x_train, y_train, z_train = hist2D(train_x, train_y, xbins, ybins)
        x_ref, y_ref, z_ref = hist2D(ref_x, ref_y, xbins, ybins)
    else:
        raise Exception("Current methods for cqm only include 'hist' to use hist2D "
                        "and 'kde' to use kde2D, please select one.")

    # Lookup tables of actual cdfs with dimensions (xbins, time)
    raw_cdfs = marginalize_cdf(y_raw, z_raw, raw_y)
    # Lookup tables of actual cdfs with dimensions (time, xbins)
    # These are transposed to do the inverse lookup
    # Note we always marginalize with respect to the raw values
    train_cdf = marginalize_cdf(y_train, z_train, raw_y).T
    ref_cdf = marginalize_cdf(y_ref, z_ref, raw_y).T

    # Quantiles of the raw flows
    nx_raw = np.arange(len(raw_x))
    u_t = raw_cdfs[np.argmin(np.abs(raw_x.values[:, np.newaxis] - x_raw), axis=1),
                   nx_raw]
    u_t = u_t[:, np.newaxis]

    # Look up the associated flow values in the CDFs for the reference and
    # training periods
    mapped_ref = x_ref[np.argmin(np.abs(u_t - ref_cdf[nx_raw, :]), axis=1)]
    mapped_train = x_train[np.argmin(np.abs(u_t - train_cdf[nx_raw, :]), axis=1)]
    mapped_train[mapped_train < train_cdf_min] = train_cdf_min
    multipliers = pd.Series(mapped_ref / mapped_train, index=raw_x.index,
                            name='multiplier')
    if method == 'hist':
        # Do some smoothing just to reduce effects of histogram bin edges
        multipliers = multipliers.rolling(nsmooth, win_type='triang',
                                          min_periods=1).mean()
    return multipliers

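# Illustrative usage sketch (not part of the original module): map raw flows to
# multipliers conditioned on a second variable (e.g. temperature). All of the
# synthetic series below are assumptions for demonstration only.
#
#   >>> n = 2000
#   >>> rng = np.random.default_rng(1)
#   >>> raw_x = pd.Series(rng.lognormal(3.0, 0.5, n))
#   >>> train_x = pd.Series(rng.lognormal(3.0, 0.5, n))
#   >>> ref_x = pd.Series(rng.lognormal(3.2, 0.5, n))
#   >>> raw_y = pd.Series(rng.normal(10.0, 5.0, n))
#   >>> train_y = pd.Series(rng.normal(10.0, 5.0, n))
#   >>> ref_y = pd.Series(rng.normal(11.0, 5.0, n))
#   >>> mults = cqm(raw_x, train_x, ref_x, raw_y, train_y, ref_y,
#   ...             method='hist', xbins=50, ybins=5)
#   >>> corrected = mults * raw_x
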
def edcdfm(raw_x, raw_cdf, train_cdf, ref_cdf, train_cdf_min=1e-6):
    '''Calculate multipliers using an adapted version of the EDCDFm technique

    This routine implements part of the PresRat bias correction method from
    Pierce et al. (2015; http://dx.doi.org/10.1175/JHM-D-14-0236.1), which is
    itself an extension of the Equidistant quantile matching (EDCDFm)
    technique of Li et al. (2010; http://dx.doi.org/10.1029/94JD00483).

    The part that is implemented here is the amended form of EDCDFm that
    determines multiplicative changes in the quantiles of a CDF. In
    particular, if the value `raw_x` falls at quantile `u_t` (in `raw_cdf`),
    then the bias-corrected value is the value in `ref_cdf` at `u_t`
    (`ref_x`) multiplied by the model-predicted change at `u_t` evaluated as
    a ratio (i.e., model future (or `raw_x`) / model historical (or
    `train_x`)). Thus, the bias-corrected value is `raw_x` multiplied by
    `ref_x`/`train_x`. Here we only return the multiplier `ref_x`/`train_x`.

    This method preserves the model-predicted median (not mean) change
    evaluated multiplicatively. Additional corrections are required to
    preserve the mean change. Inclusion of these additional corrections
    constitutes the PresRat method.

    Parameters
    ----------
    raw_x : pandas.Series
        Series of raw values that will be used to determine the quantile
        `u_t`
    raw_cdf : pandas.Series
        Series of raw values that represents the CDF that is used to
        determine the non-parametric quantile of `raw_x`
    train_cdf : pandas.Series
        Series of training values that represents the CDF based on the same
        process as `raw_cdf`, but overlapping in time with `ref_cdf`
    ref_cdf : pandas.Series
        Series of ref values that represents the ref CDF and that overlaps
        in time with `train_cdf`
    train_cdf_min : float, optional
        Minimum value allowed for values drawn from `train_cdf`, which
        prevents very large multipliers

    Returns
    -------
    multiplier : pandas.Series
        Multipliers for `raw_x`. The pandas.Series has the same index as
        `raw_x`
    '''
    # Type checking (note that the checking is more strict here than it
    # probably needs to be)
    assert isinstance(raw_x, pd.Series)
    assert isinstance(raw_cdf, pd.Series)
    assert isinstance(train_cdf, pd.Series)
    assert isinstance(ref_cdf, pd.Series)

    # Given raw_x and raw_cdf determine the quantiles u_t
    # This method is slightly more efficient than using
    # scipy.percentileofscore, especially on large arrays
    cdf_idx = np.argsort(raw_cdf)
    cdf_sort = raw_cdf[cdf_idx]
    cdf_rank = 100 * scipy.stats.rankdata(cdf_sort) / len(cdf_idx)
    offset = 100 / len(cdf_idx)
    u_t = [cdf_rank[np.searchsorted(cdf_sort, x, side='left')] - offset
           for x in raw_x]

    # Given u_t and train_cdf determine train_x
    train_x = np.percentile(train_cdf, u_t)
    train_x[train_x < train_cdf_min] = train_cdf_min

    # Given u_t and ref_cdf determine ref_x
    ref_x = np.percentile(ref_cdf, u_t)

    # Calculate multiplier
    multiplier = ref_x / train_x
    return pd.Series(multiplier, index=raw_x.index, name='multiplier')

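# Illustrative usage sketch (not part of the original module): the CDFs are
# passed as plain pandas.Series of values. The synthetic series below are
# assumptions for demonstration only; here `raw_x` is simply drawn from
# `raw_cdf` so its values stay within that CDF's range.
#
#   >>> rng = np.random.default_rng(2)
#   >>> raw_cdf = pd.Series(rng.lognormal(3.0, 0.5, 1000))
#   >>> train_cdf = pd.Series(rng.lognormal(3.0, 0.5, 1000))
#   >>> ref_cdf = pd.Series(rng.lognormal(3.2, 0.5, 1000))
#   >>> raw_x = raw_cdf[:100]
#   >>> mults = edcdfm(raw_x, raw_cdf, train_cdf, ref_cdf)
#   >>> corrected = mults * raw_x
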
def bmorph(raw_ts, train_ts, ref_ts,
           raw_apply_window, raw_train_window, ref_train_window,
           raw_cdf_window,
           raw_y=None, ref_y=None, train_y=None,
           nsmooth=12, bw=3, xbins=200, ybins=10, rtol=1e-7, atol=0,
           method='hist', smooth_multipliers=True, train_cdf_min=1e-6):
    '''Morph `raw_ts` based on differences between `ref_ts` and `train_ts`

    bmorph is an adaptation of the PresRat bias correction procedure from
    Pierce et al. (2015; http://dx.doi.org/10.1175/JHM-D-14-0236.1), which is
    itself an extension of the Equidistant quantile matching (EDCDFm)
    technique of Li et al. (2010; http://dx.doi.org/10.1029/94JD00483). The
    method as implemented here uses a multiplicative change in the quantiles
    of a CDF, followed by a simple correction to preserve changes in the
    long-term mean. The method differs from PresRat in that it is not applied
    for fixed periods (but uses a moving window) to prevent discontinuities
    in the corrected time series and it does not apply a frequency-based
    correction. The method also allows changes to be made through an adapted
    version of the EDCDFm technique or through the multidimensional
    conditional equidistant CDF matching (`cqm`) function if a second time
    series variable is passed.

    Parameters
    ----------
    raw_ts : pandas.Series
        Raw time series that will be bmorphed
    train_ts : pandas.Series
        Training time series. This time series is generated by the same
        process as `raw_ts` but overlaps with `ref_ts`. It is used to
        calculate `train_cdf`
    ref_ts : pandas.Series
        Target time series. This is the time series with ref values that
        overlaps with `train_ts` and is used to calculate `ref_cdf`
    raw_apply_window : slice
        Slice of `raw_ts` that will be bmorphed
    raw_train_window : slice
        Slice used to subset `train_ts` when the mapping between `train_ts`
        and `ref_ts` is created
    ref_train_window : slice
        Slice used to subset `ref_ts` when the mapping between `train_ts`
        and `ref_ts` is created
    raw_cdf_window : slice
        Slice used to determine the CDF for `raw_ts`
    raw_y : pandas.Series, optional
        Raw time series of the second variable for cqm
    ref_y : pandas.Series, optional
        Target time series of the second variable for cqm
    train_y : pandas.Series, optional
        Training time series of the second variable for cqm
    nsmooth : int
        Number of elements that will be smoothed when determining CDFs
    bw : int
        Bandwidth for KernelDensity
    xbins : int
        Bins for the flow time series
    ybins : int
        Bins for the second time series
    method : str
        Method used by `cqm` to estimate the conditional CDFs, either 'hist'
        or 'kde'
    smooth_multipliers : bool
        If True, smooth the multipliers returned by `cqm` with a rolling mean
    train_cdf_min : float, optional
        Minimum value allowed for the train cdf. Defaults to 1e-6 to help
        handle data spikes in the corrections that are caused by multipliers
        becoming too large when the ratio between reference and training
        flows is large.

    Returns
    -------
    bmorph_ts : pandas.Series
        Time series of length `raw_apply_window` with bmorphed values
    bmorph_multipliers : pandas.Series
        Multipliers that were applied to `raw_ts` to obtain `bmorph_ts`
    '''
    # Type checking
    assert isinstance(raw_ts, pd.Series)
    assert isinstance(ref_ts, pd.Series)
    assert isinstance(train_ts, pd.Series)
    assert isinstance(raw_apply_window, slice)
    assert isinstance(raw_train_window, slice)
    assert isinstance(ref_train_window, slice)
    assert isinstance(raw_cdf_window, slice)

    # Smooth the raw Series
    raw_smoothed_ts = raw_ts.rolling(window=nsmooth, min_periods=1,
                                     center=True).mean()

    # Create the CDFs that are used for morphing the raw_ts. The mapping is
    # based on the training window
    ref_cdf = ref_ts[ref_train_window].rolling(window=nsmooth, min_periods=1,
                                               center=True).mean()
    train_cdf = train_ts[raw_train_window].rolling(window=nsmooth,
                                                   min_periods=1,
                                                   center=True).mean()
    raw_cdf = raw_smoothed_ts[raw_cdf_window]

    # Check whether to use edcdfm or cqm based on whether a second variable
    # was passed for the raw and train series (ref can be set as train later)
    if (raw_y is None) or (ref_y is None) or (train_y is None):
        # Calculate the bmorph multipliers based on the smoothed time series
        # and CDFs
        bmorph_multipliers = edcdfm(raw_smoothed_ts[raw_apply_window],
                                    raw_cdf, train_cdf, ref_cdf,
                                    train_cdf_min)
        # Apply the bmorph multipliers to the raw time series
        bmorph_ts = bmorph_multipliers * raw_ts[raw_apply_window]
    else:
        # Continue type checking for the additional series
        assert isinstance(raw_y, pd.Series)
        assert isinstance(ref_y, pd.Series)
        assert isinstance(train_y, pd.Series)

        # Smooth the y series as well
        raw_smoothed_y = raw_y.rolling(
            window=nsmooth, min_periods=1, center=True).mean()
        ref_smoothed_y = ref_y[ref_train_window].rolling(
            window=nsmooth, min_periods=1, center=True).mean()
        train_smoothed_y = train_y[raw_train_window].rolling(
            window=nsmooth, min_periods=1, center=True).mean()

        bmorph_multipliers = cqm(raw_cdf, train_cdf, ref_cdf,
                                 raw_smoothed_y[raw_cdf_window],
                                 train_smoothed_y, ref_smoothed_y,
                                 bw=bw, xbins=xbins, ybins=ybins,
                                 rtol=rtol, atol=atol, method=method,
                                 train_cdf_min=train_cdf_min)
        if smooth_multipliers:
            bmorph_multipliers = bmorph_multipliers.rolling(
                window=nsmooth, min_periods=1, center=True).mean()
        bmorph_ts = bmorph_multipliers[raw_apply_window] * raw_ts[raw_apply_window]
    return bmorph_ts, bmorph_multipliers

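# Illustrative usage sketch (not part of the original module): the window
# arguments are slices over the time index. The synthetic daily series and the
# window choices below are assumptions for demonstration only; without the
# optional y series the univariate edcdfm path is used.
#
#   >>> idx = pd.date_range('1981-01-01', '2010-12-31', freq='D')
#   >>> rng = np.random.default_rng(3)
#   >>> raw_ts = pd.Series(rng.lognormal(3.0, 0.5, len(idx)), index=idx)
#   >>> train_ts = pd.Series(rng.lognormal(3.0, 0.5, len(idx)), index=idx)
#   >>> ref_ts = pd.Series(rng.lognormal(3.2, 0.5, len(idx)), index=idx)
#   >>> apply_window = slice('2001-01-01', '2010-12-31')
#   >>> train_window = slice('1981-01-01', '2000-12-31')
#   >>> bc_ts, mults = bmorph(raw_ts, train_ts, ref_ts,
#   ...                       raw_apply_window=apply_window,
#   ...                       raw_train_window=train_window,
#   ...                       ref_train_window=train_window,
#   ...                       raw_cdf_window=apply_window)
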
def bmorph_correct(raw_ts, bmorph_ts, correction_window,
                   ref_mean, train_mean, nsmooth):
    '''Correct bmorphed values to preserve the ratio of change

    Apply a correction to bmorphed values to preserve the mean change over a
    `correction_window`. This is similar to the correction applied in the
    original PresRat algorithm (Pierce et al. 2015;
    http://dx.doi.org/10.1175/JHM-D-14-0236.1), except that we use a rolling
    mean to determine the correction to avoid discontinuities on the
    boundaries.

    Parameters
    ----------
    raw_ts : pandas.Series
        Series of raw values that have not been bmorphed
    bmorph_ts : pandas.Series
        Bmorphed version of `raw_ts`
    correction_window : slice
        Slice of `raw_ts` and `bmorph_ts` over which the correction is
        applied
    ref_mean : float
        Mean of the target time series (`ref_ts`) for the base period
    train_mean : float
        Mean of the training time series (`train_ts`) for the base period
    nsmooth : int
        Number of elements that will be smoothed in `raw_ts` and
        `bmorph_ts`. The nsmooth value in this case is typically much larger
        than the one used for the bmorph function itself, for example,
        365 days.

    Returns
    -------
    bmorph_corrected_ts : pandas.Series
        Corrected series of length `correction_window`
    correction_ts : pandas.Series
        Correction factors that were applied to `bmorph_ts`
    '''
    # Type checking
    assert isinstance(raw_ts, pd.Series)
    assert isinstance(bmorph_ts, pd.Series)
    assert isinstance(correction_window, slice)

    # Smooth the raw and bmorphed time series
    raw_ts_smoothed = raw_ts.rolling(window=nsmooth, min_periods=1,
                                     center=True).mean()
    bmorph_ts_smoothed = bmorph_ts.rolling(window=nsmooth, min_periods=1,
                                           center=True).mean()

    # Calculate the correction factors
    correction_ts = raw_ts_smoothed[correction_window] / \
        bmorph_ts_smoothed[correction_window] * \
        ref_mean / train_mean

    # Apply the correction to the bmorph time series
    bmorph_corrected_ts = correction_ts * bmorph_ts[correction_window]

    return bmorph_corrected_ts, correction_ts

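# Illustrative usage sketch (not part of the original module), continuing the
# bmorph example above: preserve the mean change over the applied window. The
# window, the base-period means, and nsmooth=365 are assumptions for
# demonstration only.
#
#   >>> ref_mean = ref_ts[train_window].mean()
#   >>> train_mean = train_ts[train_window].mean()
#   >>> corrected_ts, corr = bmorph_correct(raw_ts, bc_ts, apply_window,
#   ...                                     ref_mean, train_mean, nsmooth=365)
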