Source code for bmorph.evaluation.plotting

import numpy as np
import xarray as xr
import pandas as pd
from typing import List
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import probscale

import networkx as nx
import graphviz as gv
import pygraphviz as pgv
import probscale
from networkx.drawing.nx_agraph import graphviz_layout

from bmorph.evaluation.constants import colors99p99
from bmorph.evaluation import descriptive_statistics as dst

from statsmodels.distributions.empirical_distribution import ECDF

#*****************************************************************************************
# Plotting Helper Functions:
#      custom_legend
#      calc_water_year
#      find_index_water_year
#      determine_row_col
#      log10_1p
#      scatter_series_axes
#*****************************************************************************************

[docs]def custom_legend(names:List, colors=colors99p99):
    """Creates a list of patches to be passed in as `handles` for the plt.legends function.

    Parameters
    ----------
    names : list
        Legend names.
    colors : list
        A list of the colors corresponding to `names`.

    Returns
    -------
    handles
        Handle parameter for matplotlib.legend.
    """
    legend_elements = list()
    for i,name in enumerate(names):
        legend_elements.append(mpl.patches.Patch(facecolor = colors[i], label = name))
    return legend_elements

[docs]def calc_water_year(df: pd.DataFrame):
    """Calculates the water year.

    Parameters
    ----------
    df : pandas.DataFrame
        Flow timeseries with a DataTimeIndex.

    Returns
    -------
    pandas.DataFrame.index
        A pandas.DataFrame index grouped by water year.

    """
    return df.index.year + (df.index.month >= 10).astype(int)

[docs]def find_index_water_year(data: pd.DataFrame) -> np.int:
    """ Finds the index of the first hydrologic year.

    Parameters
    ----------
    data : pd.DataFrame
        Flow timeseries with a DateTime index.

    Returns
    -------
    int
        Index of the first hydrologic year.
    """
    water_year = pd.Timestamp(0)
    i = 0
    while water_year == pd.Timestamp(0):
        date = data.index[i]
        if date.month == 10 and date.day == 1:
            water_year = date
        else:
            i = i + 1
    return i

[docs]def determine_row_col(n:int, pref_rows = True):
    """Determines rows and columns for rectangular subplots

    Calculates a rectangular subplot layout that contains at least n subplots,
    some may need to be turned off in plotting. If a square configuration is
    possible, then a square configuration will be proposed. This helps automate
    the process of plotting a variable number of subplots.

    Parameters
    ----------
    n : int
        Total number of plots.
    pref_rows : boolean
        If True, and only a rectangular arrangment is possible, then put the
        longer dimension in n_rows. If False, then it is placed in the n_columns.

    Returns
    -------
    n_rows : int
        Number of rows for matplotlib.subplot.
    n_columns : int
        Number of columns for matplotlib.subplot.
    """
    if n < 0:
        raise Exception("Please enter a positive n")

    # use square root to test because we want a square
    # arrangment
    sqrt_n = np.sqrt(n)
    int_sqrt_n = int(sqrt_n)
    if sqrt_n == float(int_sqrt_n):
        return int_sqrt_n, int_sqrt_n
    elif int_sqrt_n*(int_sqrt_n+1)>=n:
        # see if a rectangular orientation would work
        # eg. sqrt(12) = 3.464, int(3.464) = 3
        # 3*(3+1) = 3*4 = 12
        if pref_rows:
            return int_sqrt_n+1, int_sqrt_n
        else:
            return int_sqrt_n, int_sqrt_n+1
    else:
        # since sqrt(n)*sqrt(n) = n,
        # (sqrt(n)+1)*sqrt(n) < n,
        # then (sqrt(n)+1)^2 > n
        return int_sqrt_n+1, int_sqrt_n+1

[docs]def log10_1p(x: np.ndarray):
    """Return the log10 of one plus the input array, element-wise.

    Parameters
    ----------
    x : numpy.ndarray
        An array of values greater than -1. If values are less than or
        equal to -1, then a domain error will occur in computing the
        logarithm.

    Returns
    --------
    y : numpy.ndarray
        Array of the values having the log10(element+1) computer.
    """
    y = np.nan*x
    for i, element in enumerate(x):
        y[i] = np.log10(element + 1)
    return y

[docs]def scatter_series_axes(data_x, data_y, label: str, color: str, alpha: float,
                        ax = None) -> plt.axes:
    """Creates a scatter axis for plotting.

    Parameters
    ----------
    data_x : array-like
        Data for the x series.
    data_y : array-like
        Data for the y series.
    label : str
        Name for the axes.
    color : str
        Color for the markers.
    alpha : float
        Transparency for the markers.

    Returns
    -------
    matplotlib.axes
    """
    if ax is None:
        fig, ax = plt.subplots()
    ax.scatter(data_x, data_y, label = label, color = color, alpha = alpha)
    return ax

#*****************************************************************************************
# General Bias Correction Summary Statistics:
#      pbias_sites
#      diff_maxflow_sites
#      pbias_plotter
#      diff_maxflow_plotter
#      site_diff_scatter
#      stat_corrections_scatter2D
#      anomaly_scatter2D
#      rmseFraePlot
#*****************************************************************************************

[docs]def pbias_sites(observed: pd.DataFrame, predicted: pd.DataFrame):
    """Calculates percent bias on a hydrologic year and site-by-site basis.

    Parameters
    ----------
    observed : pandas.DataFrame
        Dataframe containing all observations.
    predicted : pandas.DataFrame
        Dataframe containing all predictions.

    Returns
    -------
    pandas.DataFrame
        Dataframe contain the percent bias computed.
    """
    #finds the start of the first hydraulic year
    i = find_index_water_year(observed)
    water_year_start = observed.index[i]
    #counts the number of hydraylic years
    water_years = 0
    while i < len(observed):
        date = observed.index[i]
        if date.month == 9 and date.day == 30:
            water_years = water_years + 1
        i = i + 1

    pbias_site_df = pd.DataFrame(columns = observed.columns,
                                 index = pd.Series(range(0, water_years)))
    pbias_current_year = pd.DataFrame(columns = observed.columns)

    for i in range(0, water_years):
        #establish end of hydraulic year
        water_year_end = water_year_start + pd.Timedelta(364.25, unit = 'd')

        #need to truncate datetimes since time indicies do not align
        O = observed.loc[water_year_start : water_year_end]
        P = predicted.loc[water_year_start : water_year_end]
        O.index = O.index.floor('d')
        P.index = P.index.floor('d')

        pbias_current_year = dst.pbias(O, P)

        #puts the computations for the hydraulic year into our bigger dataframe
        pbias_site_df.iloc[i] = pbias_current_year.iloc[0].T

        #set up next hydraulic year
        water_year_start = water_year_start + pd.Timedelta(365.25, unit = 'd')

    return pbias_site_df

[docs]def diff_maxflow_sites(observed: pd.DataFrame, predicted: pd.DataFrame):
    """Calculates difference in maximum flows on a hydrologic year and site-by-site basis.

    Parameters
    ----------
    observed : pandas.DataFrame
        Dataframe containing all observations.
    predicted : pandas.DataFrame
        Dataframe containing all predictions.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the difference in maximum flows.
    """
    #finds the start of the first hydraulic year
    i = find_index_water_year(observed)
    water_year_start = observed.index[i]

    #counts the number of hydraylic years
    water_years = 0
    while i < len(observed):
        date = observed.index[i]
        if date.month == 9 and date.day == 30:
            water_years = water_years + 1
        i = i + 1

    diff_maxflow_sites_df = pd.DataFrame(columns = observed.columns,
                                         index = pd.Series(range(0,water_years)))
    diff_maxflow_current_year = pd.DataFrame(columns = observed.columns)

    for i in range(0,water_years):
        #establish end of hydraulic year
        water_year_end = water_year_start + pd.Timedelta(364.25, unit = 'd')

        #need to truncate datetimes since time indicies do not align
        O = observed.loc[water_year_start : water_year_end]
        P = predicted.loc[water_year_start : water_year_end]
        O.index = O.index.floor('d')
        P.index = P.index.floor('d')

        diff_maxflow_current_year = P.max().to_frame().T - O.max().to_frame().T

        #puts the computations for the hydraulic year into our bigger dataframe
        diff_maxflow_sites_df.iloc[i] = diff_maxflow_current_year.iloc[0].T

        #set up next hydraulic year
        water_year_start = water_year_start + pd.Timedelta(365.25, unit = 'd')


    return diff_maxflow_sites_df

[docs]def pbias_plotter(observed: pd.DataFrame, names: list, colors: list, *models: pd.DataFrame):
    """Plots box plots of numerous models grouped by site.

    Parameters
    ----------
    observed : pandas.Dataframe
        Dataframe containing observations.
    names  : list
        List of the model names.
    colors: list
        List of colors to be plotted.
    *models : List[pandas.DataFrame]
        Any number of pandas.DataFrame objects to be evaluated.

    Returns
    -------
    matplotlib.figure, matplotlib.axes
    """
    num_models = len(models)
    sites = observed.columns
    pbias_models = list()

    #runs each model through pbias_sites
    for model in models:
        pbias_models.append(pbias_sites(observed, model))

    fig = plt.figure()
    ax = plt.axes(xlabel = 'Sites', ylabel = 'Percent Bias')
    position = 1
    for site in sites:
        df = pd.DataFrame(index = pbias_models[0].index)

        #fill out the dataframe for a single site with each model's percent bias
        i = 0
        for model in pbias_models:
            entry = f"{site}:{names[i]}"
            df[entry] = model[site]
            i = i + 1

        boxplots = plt.boxplot(df.T, positions = np.arange(position, position + num_models),
                               patch_artist=True)

        for patch, color in zip(boxplots['boxes'], colors):
            patch.set_facecolor(color)

        position = position + num_models + 1

    tick_location = list()
    start_tick = int(np.ceil(len(models) / 2))
    tick_spacing = len(models) + 1
    for j in range(0, len(sites)):
        tick_location.append(start_tick + j * tick_spacing)

    ax.set(xticks = tick_location, xticklabels = sites)
    plt.xticks(rotation = 90)

    ax.legend(handles=custom_legend(names, colors), loc='upper left')
    return fig, ax

[docs]def diff_maxflow_plotter(observed: pd.DataFrame, names: list, colors: list, *models: pd.DataFrame):
    """Plots box plots of numerous models grouped by site.

    Parameters
    ----------
    observed : pandas.Dataframe
        a dataframe containing observations
    names : list
        List of the model names.
    colors: list
        List of colors to be plotted.
    *models : List[pandas.DataFrame]
        Any number of pandas.DataFrame objects to be evaluated.

    Returns
    -------
    matplotlib.figure, matplotlib.axes
    """
    num_models = len(models)
    sites = observed.columns
    diff_maxflow_models = list()

    #runs each model through pbSites
    for model in models:
        diff_maxflow_models.append(diff_maxflow_sites(observed, model))

    fig = plt.figure()
    ax = plt.axes(xlabel = 'Sites', ylabel = 'Difference in Max Flow')
    position = 1
    for site in sites:
        df = pd.DataFrame(index = diff_maxflow_models[0].index)

        #fill out the dataframe for a single site with each model's percent bias
        for i, model in enumerate(diff_maxflow_models):
            entry = f"{site}:{names[i]}"
            df[entry] = model[site]

        bp = plt.boxplot(df.T, positions = np.arange(position, position + num_models),
                         patch_artist = True)

        for patch, color in zip(bp['boxes'], colors):
            patch.set_facecolor(color)

        position = position + num_models + 1
        #follow plot style: https://stackoverflow.com/questions/16592222/matplotlib-group-boxplots

    tick_location = list()
    start_tick = int(np.ceil(len(models) / 2))
    tick_spacing = len(models) + 1
    for j in range(0, len(sites)):
        tick_location.append(start_tick + j * tick_spacing)

    ax.set(xticks = tick_location, xticklabels = sites)
    plt.xticks(rotation = 45)
    plt.title('Yearly Difference in Max Flow due to Correction')

    ax.legend(handles = custom_legend(names, colors), loc = 'upper left')

    return fig, ax

[docs]def site_diff_scatter(predictions: dict, raw_key: str, model_keys: list,
                      compare: dict, compare_key: str, site: str, colors = colors99p99):
    """Creates a scatter plot of Raw-BC versus some measure.

    Parameters
    ----------

    predictions : dict
        Expects {'Prediction Names' : Prediction pandas.DataFrame}.
        'Prediction Names' will be printed in the legend.
    raw_key : str
        The key for the predictions dictionary that directs to the
        raw data that each model will be subtracting.
    model_keys : list
        A list of dictoionary keys pertaining to the correction models
        that are wanting to be plotted.
    compare : dict
        Expecting {'Measure name' : measure pandas.DataFrame}.
        These are what is being plotted against on the horizontal-axis.
    compare_key : str
        The dictionary key for the measure desired in the compare dictionary.
        'compare_key' will be printed on the horizontal axis.
    site : str
        A single site designiation to be examined in the plot. This will
        be listed as the title of the plot.
    colors : List[str], optional
        Colors as strings to be plotted from.

    Returns
    -------
    matplotlib.figure, matplotlib.axes

    """
    #retreiving DataFrames and establishing data to be plotted
    raw = predictions[raw_key]
    raw = raw.loc[:, site]

    Y = list()
    for model_key in model_keys:
        predict = predictions[model_key]
        Y.append(raw - predict.loc[:, site])

    X = compare[compare_key]
    X = X.loc[:, site]
    fig,ax = plt.subplots()
    for i,y in enumerate(Y):
        scatter_series_axes(X, y, model_keys[i], color = colors[i], alpha = 0.05, ax = ax)
    plt.xlabel(compare_key)
    plt.ylabel('Raw-BC')
    plt.title(site)
    plt.axhline(0)
    plt.legend(handles = custom_legend(model_keys, colors))

    return fig, ax

[docs]def stat_corrections_scatter2D(computations: dict, baseline_key: str, cor_keys: list, uncor_key: str,
                               sites = [], multi = True, colors = colors99p99):
    """Creates a scatter plot of the flow before/after corrections relative to observations.

    Parameters
    ----------
    computations : dict
        Expecting {"Correction Name": correction pandas.DataFrame}.
    baseline_key : str
        Contains the dictionary key for the `computations` dictionary
        that accesses what baseline the corrections should be
        compared to. This is typically observations.
    cor_keys : list
        Dictionary keys accessing the correction DataFrames in `computations`.
        These will be printed in the legend.
    uncor_key : str
        The dictionary key that accesses the uncorrected data in `computations`.
    sites : list
        Site(s) to be compared in the plot, can have a size of 1.
        If `multi` is set to False and this is not changed to a
        single site, then the first value in the list will be
        chosen.
    multi : boolean, optional
        Determines whether the plot uses data from multiple sites or a
        single site, defaults as True.
    colors : List[str], optional
        Colors as strings to be plotted from.
        Plotting colors are different for each correction
        DataFrame, but same across sites for a singular
        correction. An error will be thrown if there are
        more `cor_keys` then `colors`.

    Returns
    -------
    matplotlib.figure, matplotlib.axes

    """
    #retreiving DataFrames and establishing data to be plotted
    datum = computations[baseline_key]
    X = datum - computations[uncor_key]

    #we need to make the values replacable by the sites,
    #hence making the max the min and the min the max
    xmax = X.min().min()
    xmin = X.max().max()

    ymax = 0
    ymin = 100

    #thrown in are Smax and Smin to determine what the max
    #and min values overall are so that the axi may be
    #appropriately scaled
    fig,ax = plt.subplots()
    for i, cor_key in enumerate(cor_keys):
        Y = datum - computations[cor_key]


        if multi == True:
            for site in sites:
                    x = X.loc[:, site]
                    y = Y.loc[:, site]

                    xmax_site = x.max()
                    xmin_site = x.min()
                    ymax_site = y.max()
                    ymin_site = y.min()
                    if xmax_site > xmax:
                        xmax = xmax_site
                    if xmin_site < xmin:
                        xmin = xmin_site
                    if ymax_site > ymax:
                        ymax = ymax_site
                    if ymin_site < ymin:
                        ymin = ymin_site

                    scatter_series_axes(x, y, site, colors[i], 0.05, ax)
        else: #meaning site should be set to a singular value
            #double check that this was actually changed, otherwise picks first value
            site = sites
            if isinstance(sites, list):
                site = sites[0]

            x = X.loc[:, site]
            y = Y.loc[:, site]
            xmax_site = x.max()
            xmin_site = x.min()
            ymax_site = y.max()
            ymin_site = y.min()
            if xmax_site > xmax:
                xmax = xmax_site
            if xmin_site < xmin:
                xmin = xmin_site
            if ymax_site > ymax:
                ymax = ymax_site
            if ymin_site < ymin:
                ymin = ymin_site

            scatter_series_axes(x, y, site, colors[i], 0.05, ax)


    #Sets up labels based on whether one or more sites were plotted
    plt.xlabel(f'{baseline_key}-Uncorrected')
    plt.ylabel(f'{baseline_key}-Corrected')

    if multi==True:
        plt.title("Statistical Corrections")
    else:
        plt.title(f'Statistical Corrections: {sites}')

    minlin = xmin
    maxlin = xmax

    if ymin < minlin:
        minlin = ymin
    if ymax > maxlin:
        maxlin = ymax

    minlin = minlin * 0.9
    maxlin = maxlin * 1.1

    plt.plot([minlin, maxlin], [minlin, maxlin])
    plt.legend(handles = custom_legend(cor_keys, colors))

    return fig, ax

[docs]def anomaly_scatter2D(computations: dict, baseline_key: str, vert_key: str, horz_key: str,
                      sites = [], multi = True, colors = colors99p99, show_legend = True):
    """Plots two correction models against each other after Raw is subracted from each.

    Parameters
    ----------
    computations : dict
        Expecting {"Correction Name": correction pandas.DataFrame}.
    baseline_key : str
        Dictionary key for the `computations` dictionary
        that accesses what baseline the corrections should be
        compared to. This is typically observations.
    vert_key : str
        Dictionary key for the `computations` dictionary that accesses the
        model to be plotted on the vertical axis.
    horz_key : str
        Dictionary key for the `computations` dictionary that accesses the
        model to be plotted on the horizontal axis.
    sites : list
        Site(s) to be compared in the plot, can have a size of 1.
        If `multi` is set to False and this is not changed to a
        single site, then the first value in the list will be
        chosen.
    multi : boolean, optional
        Whether the plot uses data from multiple sites or a single site.
    colors : list, optional
        Colors as strings to be plotted from. Plotting colors are different
        for each correction DataFrame, but same across sites for a singular
        correction. An error will be thrown if there are  more `cor_keys` then
        colors.
    show_legend : boolean, optional
        Whether or not to display the legend, defaults as True.
    """
    #retreiving DataFrames and establishing data to be plotted
    datum = computations[baseline_key]
    X = datum - computations[horz_key]
    Y = datum - computations[vert_key]

    i = 0
    fig, ax = plt.subplots()

    if multi == True:
        for site in sites:
            x = X.loc[:, site]
            y = Y.loc[:, site]
            scatter_series_axes(x, y, site, colors[i], 0.05, ax)
            i = i + 1

            if i >= len(colors):
                #recycles colors if all exhausted
                i = 0
    else: #meaning site should be set to a singular value
        #double check that this was actually changed, otherwise picks first value
        site = sites
        if type(sites) == list:
            site = sites[0]

        X = X.loc[:, site]
        Y = Y.loc[:, site]
        scatter_series_axes(X, Y, site, colors[i], 0.05, ax)

    plt.xlabel(f'{baseline_key}-{horz_key}')
    plt.ylabel(f'{baseline_key}-{vert_key}')
    plt.title("Statistical Correction Anomolies")
    plt.axhline(0)
    plt.axvline(0)
    if show_legend:
        plt.legend(handles = custom_legend(sites, colors))

[docs]def rmseFracPlot(data_dict: dict, obs_key: str, sim_keys: list,
                sites = [], multi = True, colors = colors99p99):
    """Root mean square values calculated by including descending values one-by-one.

    Parameters
    ----------
    data_dict : dict
        Expecting {"Data Name": data pandas.DataFrame}.
    obs_key : str
        Dictionary key for the `computations` dictionary that accesses the
        observations to be used as true in calculating root mean squares.
    sim_keys : list
        Dictionary keys accessing the simulated DataFrames in `computations`,
        used in predictions in calculating root mean squares.
    sites : list
        Site(s) to be compared in the plot, can have a size of 1.
        If `multi` is set to False and this is not changed to a
        single site, then the first value in the list will be
        chosen.
    multi : boolean, optional
        Whether the plot uses data from multiple sites or a single site.
    colors : list, optional
        Colors as strings to be plotted from. Plotting colors are different
        for each correction DataFrame, but same across sites for a singular
        correction. An error will be thrown if there are  more `sim_keys` then
        colors.
    """
    #retrieving data and flooring time stamps
    observations = data_dict[obs_key]
    observations.index = observations.index.floor('d')

    color_num = 0
    for sim_key in sim_keys:
        predictions = data_dict[sim_key]
        predictions.index = predictions.index.floor('d')

        N = len(predictions.index)
        rmse_tot = dst.rmse(observations, predictions)
        rmse_n = pd.DataFrame(index = np.arange(0, N))

        errors = predictions - observations
        square_errors = errors.pow(2)
        if multi == True:

            #constructs a dataframe where each column is independently sorted
            for site in sites:
                square_errors_site = square_errors.loc[:, site].sort_values(ascending = False)
                rmse_n[site]=square_errors_site.values


            #performs cumulative mean calcualtion
            for site in sites:
                vals = rmse_n[site].values
                mat = np.vstack([vals,] * N).T
                nantri = np.tri(N)
                nantri[nantri == 0] = np.nan
                mat_mean = np.nanmean(mat * nantri, axis = 1)
                mat_rmse = np.power(mat_mean, 0.5)
                rmse_n[site] = np.divide(mat_rmse, rmse_tot[site].values)
        else:
            site = sites
            if type(sites) == list:
                site = sites[0]

            square_errors_site = square_errors.loc[:, site].sort_values(ascending = False)
            rmse_n[site] = square_errors_site.values
            vals = rmse_n[site].values
            mat = np.vstack([vals,] * N).T
            nantri = np.tri(N)
            nantri[nantri == 0] = np.nan
            mat_mean = np.nanmean(mat * nantri, axis=1)
            mat_rmse = np.power(mat_mean, 0.5)
            rmse_n[site] = np.divide(mat_rmse, rmse_tot[site].values)

        rmse_n.index = rmse_n.index / N
        plt.plot(rmse_n, color = colors[color_num], alpha = 0.5)
        color_num = color_num + 1

    plt.xlabel('n/N')
    plt.xscale('log')
    plt.ylabel('RMSE_Cumulative/RMSE_total')
    plt.yscale('log')

    if multi == True:
        plt.title('RMSE Distribution in Descending Sort')
    else:
        plt.title(f'RMSE Distribution in Descending Sort: {site}')
    plt.axhline(1)
    plt.legend(handles = custom_legend(sim_keys , colors))

#*****************************************************************************************
# SimpleRiverNetwork Plots and Related NetworkX Functions:
#      find_upstream
#      find_all_upstream
#      create_adj_mat
#      create_nxgraph
#      organize_nxgraph
#      color_code_nxgraph
#      draw_dataset
#*****************************************************************************************

[docs]def find_upstream(topo: xr.Dataset, segID: int, return_segs: list = []):
    """Finds what river segment is directly upstream from the xarray.Dataset.

    Parameters
    ----------
    topo : xarray.Dataset
        Contains river network topography. Expecting each river segment's
        immeditate downstream river segment is designated by 'Tosegment'/
    segID : int
        Current river segment identification number.
    return_segs : list
        River segment identification numbers upstream from `segID`.
        This defaults as an empty list to be filled by the method.
    """
    upsegs = np.argwhere((topo['Tosegment'] == segID).values).flatten()
    upsegIDs = topo['seg_id'][upsegs].values
    return_segs += list(upsegs)

[docs]def find_all_upstream(topo: xr.Dataset, segID: int, return_segs: list = []) -> np.ndarray:
    """Finds all upstream river segments for a given river segment from the xarray.Dataset.

    Parameters
    ----------
    topo : xarray.Dataset
    segID : int
    return_segs : list

    Returns
    -------
    numpy.ndarray
    """
    upsegs = np.argwhere((topo['Tosegment'] == segID).values).flatten()
    upsegIDs = topo['seg_id'][upsegs].values
    return_segs += list(upsegs)
    for upsegID in upsegIDs:
        find_all_upstream(topo, upsegID, return_segs = return_segs)
    return np.unique(return_segs).flatten()

[docs]def create_adj_mat(topo: xr.Dataset) -> np.ndarray:
    """Forms the adjacency matrix for the graph of the topography.

    Note that this is independent of whatever the segments are called, it
    is a purely a map of the relative object locations.
    Parameters
    ----------
    topo : xarray.Dataset
        Describes the topograph of the river network.

    Returns
    --------
    numpy.ndarray
        An adjacency matrix describing the river network.
    """
    #creates the empty adjacency matrix
    N = topo.dims['seg']
    adj_mat = np.zeros(shape=(N, N), dtype = int)

    #builds adjacency matrix based on what segements are upstream
    i = 0
    for ID in topo['seg_id'].values:
        adj = list()
        find_upstream(topo, ID, adj)
        for dex in adj:
            adj_mat[i, dex] += 1
        i += 1
    return adj_mat

[docs]def create_nxgraph(adj_mat: np.ndarray) -> nx.Graph:
    """Creates a NetworkX Graph object given an adjacency matrix.

    Parameters
    ----------
    adj_mat : numpy.ndarray
        Adjacency matrix describing the river network.

    Returns
    -------
    networkx.graph
        NetworkX Graph of respective nodes.
    """
    topog = nx.from_numpy_matrix(adj_mat)
    return topog

[docs]def organize_nxgraph(topo: nx.Graph):
    """Orders the node positions into a hierarchical structure.

    Based on the "dot" layout and given topography.
    Parameters
    ----------
    topo : xarray.Dataset
        Contains river segment identifications and relationships.

    Returns
    -------
    networkx.positions
    """
    pos = nx.drawing.nx_agraph.graphviz_layout(topo, prog = 'dot')
    return pos

[docs]def color_code_nxgraph(graph: nx.graph, measure: pd.Series,
                       cmap=mpl.cm.get_cmap('coolwarm_r'),
                       vmin=None, vmax=None)-> dict:
    """Creates a dictionary mapping of nodes to color values.

    Parameters
    ----------
    graph : networkx.graph
        Graph to be color coded
    measure : pandas.Series
        Contains river segment ID's as the index and desired measures as
        values.
    cmap : matplotlib.colors.LinearSegmentedColormap, optional
        Colormap to be used for coloring the SimpleRiverNewtork
        plot. This defaults as 'coolwarm_r'.
    vmin: float, optional
        Minimum value for coloring
    vmax: float, optional
        Maximum value for coloring

    Returns
    -------
    dict
        Dictionary of {i:color} where i is the index of the river
        segment.

    """
    if np.where(measure < 0)[0].size == 0:
        # meaning we have only positive values and do not need to establish
        # zero at the center of the color bar
        segs = measure.index
        minimum = 0 #set to zero to preserve the coloring of the scale
        maximum = measure.max()

        color_vals = (measure.values) / (maximum)
        color_bar = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin = minimum, vmax = maximum))

        color_dict =  {f'{seg}' : mpl.colors.to_hex(cmap(i)) for i, seg in zip(color_vals, segs)}
        return color_dict, color_bar
    else:
        segs = measure.index
        #determine colorbar range
        if vmin is None and vmax is None:
            extreme = np.max(np.abs([np.min(measure), np.max(measure)]))
            vmin = -extreme
            vmax = extreme
        elif vmin is None:
            vmin = measure.min()
        elif vmax is None:
            vmax = measure.max()

        norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
        color_bar = plt.cm.ScalarMappable(cmap=cmap, norm=norm)

        color_dict =  {f'{seg}': mpl.colors.to_hex(cmap(norm(i)))
                       for i, seg in zip(measure.values, segs)}
        return color_dict, color_bar

[docs]def draw_dataset(topo: xr.Dataset, color_measure: pd.Series, cmap = mpl.cm.get_cmap('coolwarm_r')):
    """"Plots the river network through networkx.

    Draws a networkx graph from a topological xrarray.Dataset and color codes
    it based on a pandas.Series.

    Parameters
    ----------
    topo : xarray.Dataset
        Contains river segment identifications and relationships.
    color_measure : pandas.Series
        Indicies are concurrent with the number of segs in topo.
        Typically this contains statistical information about the flows
        that will be color coded by least to greatest value.
    cmap : matplotlib.colors.LinearSegmentedColormap, optional
        Colormap to be used for coloring the SimpleRiverNewtork
        plot. This defaults as 'coolwarm_r'.
    """
    topo_adj_mat = create_adj_mat(topo)
    topo_graph = create_nxgraph(topo_adj_mat)
    topo_positions = organize_nxgraph(topo_graph)
    topo_color_dict, topo_color_cbar = color_code_nxgraph(topo_graph, color_measure, cmap)
    topo_nodecolors = [topo_color_dict[f'{node}'] for node in topo_graph.nodes()]
    nx.draw_networkx(topo_graph, topo_positions, node_size = 200, font_size = 8, font_weight = 'bold',
                     node_shape = 's', linewidths = 2, font_color = 'w', node_color = topo_nodecolors)
    plt.colorbar(topo_color_cbar)

#*****************************************************************************************
# BMORPH Summary Statistics:
#      plot_reduced_flows
#      plot_spearman_rank_difference
#      correction_scatter
#      pbias_diff_hist
#      plot_residual_overlay
#      norm_change_annual_flow
#      pbias_compare_hist
#      compare_PDF
#      compare_CDF
#      spearman_diff_boxplots_annual
#      kl_divergence_annual_compare
#      spearman_diff_boxplots_annual_compare
#      compare_CDF_all
#      compare_mean_grouped_CPD
#*****************************************************************************************

[docs]def plot_reduced_flows(flow_dataset: xr.Dataset, plot_sites: list,
                        reduce_func=np.mean, interval = "day",
                        statistic_label='Mean',
                        units_label = r'$(m^3/s)$',
                        title_label=f'Annual Mean Flows',
                        raw_var = 'IRFroutedRunoff', raw_name = 'Mizuroute Raw',
                        ref_var = 'upstream_ref_flow', ref_name = 'upstream_ref_flow',
                        bc_vars = list(), bc_names = list(),
                        fontsize_title = 24, fontsize_legend = 20, fontsize_subplot = 20,
                        fontsize_tick = 20, fontcolor = 'black',
                        figsize_width = 20, figsize_height = 12,
                        plot_colors = ['grey', 'black', 'blue', 'red'],
                        return_reduced_flows = False):
    """Creates a series of subplots plotting statistical day of year flows per gauge site.

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        Contatains raw, reference, and bias corrected flows.
    plot_sites : list
        Sites to be plotted.
    reduce_func : function, optional
        A function to apply to flows grouped by `interval`, defaults as np.mean.
    interval : str, optional
        What time interval annual `reduce_func` should be computed on. Currently supported
        is `day` for dayofyear (default), `week` for weekofyear, and `month` for monthly.
    statistic_label : str, optional
        Label for the statistic representing the `reduce_func`, defaults as
        'Mean' to fit `reduce_func` as np.mean.
    units_label : str, optional
        Label for the units of flow, defaults as r`$(m^3/s)$`.
    title_label : str
        Lable for the figure title representing the reduce_func, defaults as
        f'Annual Mean Flows' to fit `reduce_func` as np.mean.
    raw_var : str, optional
        The string to access the raw flows in `flow_dataset`, defaults as
        'IRFroutedRunoff'.
    raw_name : str, optional
        Label for the raw flows in the legend, defaults as 'Mizuroute Raw'.
    ref_var : str, optional
        The string to access the reference flows in `flow_dataset`, defaults
        as 'upstream_ref_flow'.
    ref_name : str, optional
        Label for the reference flows in the legend, defaults as 'upstream_ref_flow'.
    bc_vars : list
        The strings to access the bias corrected flows in `flow_dataset`.
    bc_names : list
        Labels for the bias corrected flows in the legend, expected in the same
        order as `bc_vars`.
    plot_colors : list, optional
        Colors to be plotted for `raw_var`, `ref_var`, `bc_vars` respectively.
        Defaults as ['grey', 'black', 'blue', 'red'].
    return_reduced_flows : boolean, optional
        If True, returns the reduced flows as calculated for plotting, defaults
        as False. This is typically used for debugging purposes.
    fontsize_title : int, optional
        Font size of the plot title, defaults as 80.
    fontsize_legend : int, optional
        Font size of the plot legend, defaults as 68.
    fontsize_subplot : int, optional
        Font size for the subplots, defaults as 60.
    fontsize_tick : int, optional
        Font size of the ticks, defaults 45.
    fontcolor : str, optional
        Color of the font, defaults as 'black'
    figsize_width : int, optional
        Width of the figure, defaults as 70.
    figusize_height : int, optional
        Height of the figure, defaults as 30.

    Returns
    -------
    xarray.Dataset or (matplotlib.figure, matplotlib.axes)
        If `return_reduced_flows` is False, matplotlib.figure and matplotlib.axes,
        otherwise the reduced_flows are returned as the xarray.Dataset.
    """

    if len(bc_vars) == 0:
        raise Exception("Please enter a non-zero number strings in bc_vars to be used")
    if len(bc_vars) != len(bc_names):
        raise Exception("Please have the same number of entries in bc_names as bc_names")
    if len(plot_colors) < 2 + len(bc_vars):
        raise Exception(f"Please enter at least {2 + len(bc_vars)} colors in plot_colors")

    interval = interval.lower()
    interval_name = "Day of Year"
    if interval == 'day':
        interval_name = "Day"
        raw_flow = flow_dataset[raw_var].groupby(
            flow_dataset['time'].dt.dayofyear).reduce(reduce_func)
        reference_flow = flow_dataset[ref_var].groupby(
            flow_dataset['time'].dt.dayofyear).reduce(reduce_func)
        bc_flows = list()
        for bc_var in bc_vars:
            bc_flows.append(flow_dataset[bc_var].groupby(flow_dataset['time'].dt.dayofyear).reduce(reduce_func))
        time = raw_flow['dayofyear'].values
    elif interval == 'week':
        interval_name = "Week of Year"
        raw_flow = flow_dataset[raw_var].groupby(
            flow_dataset['time'].dt.isocalendar().week).reduce(reduce_func)
        reference_flow = flow_dataset[ref_var].groupby(
            flow_dataset['time'].dt.isocalendar().week).reduce(reduce_func)
        bc_flows = list()
        for bc_var in bc_vars:
            bc_flows.append(flow_dataset[bc_var].groupby(flow_dataset['time'].dt.isocalendar().week).reduce(reduce_func))
        time = raw_flow['week'].values
    elif interval == 'month':
        interval_name = "Month"
        raw_flow = flow_dataset[raw_var].groupby(
            flow_dataset['time'].dt.month).reduce(reduce_func)
        reference_flow = flow_dataset[ref_var].groupby(
            flow_dataset['time'].dt.month).reduce(reduce_func)
        bc_flows = list()
        for bc_var in bc_vars:
            bc_flows.append(flow_dataset[bc_var].groupby(flow_dataset['time'].dt.month).reduce(reduce_func))
        time = raw_flow['month'].values
    else:
        raise Exception("Please use 'day', 'week', or 'month' for interval.")


    outlet_names = flow_dataset['seg'].values

    raw_flow_df = pd.DataFrame(data = raw_flow.values,
                                   index=time, columns = outlet_names)
    reference_flow_df = pd.DataFrame(data = reference_flow.values,
                                         index=time, columns = outlet_names)
    bc_flow_dfs = list()
    for bc_flow in bc_flows:
        bc_flow_dfs.append(pd.DataFrame(data = bc_flow.values, index = time, columns = outlet_names))

    plot_names = [raw_name, ref_name]
    plot_names.extend(bc_names)

    mpl.rcParams['figure.figsize'] = (figsize_width, figsize_height)
    n_rows, n_cols = determine_row_col(len(plot_sites))
    fig, axs = plt.subplots(n_rows, n_cols)

    for site, ax in zip(plot_sites, axs.ravel()):
        ax.plot(raw_flow_df[site], color = plot_colors[0], alpha = 0.8, lw = 4)
        ax.plot(reference_flow_df[site], color = plot_colors[1], alpha = 0.8, lw = 4)

        for i, bc_flow_df in enumerate(bc_flow_dfs):
            ax.plot(bc_flow_df[site], color = plot_colors[2+i], lw = 4, alpha = 0.8)

        ax.set_title(site, fontsize = fontsize_subplot, color = fontcolor)
        plt.setp(ax.spines.values(), color = fontcolor)
        ax.tick_params(axis = 'both', colors = fontcolor, labelsize = fontsize_tick)

    if len(plot_sites) < n_rows * n_cols:
        for ax_index in np.arange(len(plot_sites), n_rows * n_cols):
            axs.ravel().tolist()[ax_index].axis('off')

    fig.text(0.4, -0.02, interval_name, fontsize = fontsize_title, ha = 'center')
    fig.text(-0.02, 0.5, f"{statistic_label} {interval_name} Flow {units_label}", fontsize = fontsize_title,
             va = 'center', rotation = 'vertical')
    plt.subplots_adjust(wspace = 0.1, hspace = 0.3, left = 0.05, right = 0.8, top = 0.95)

    fig.legend(plot_names, fontsize = fontsize_legend, loc = 'center right');

    if return_reduced_flows:
        reduced_flows = xr.Dataset(coords={'site': plot_sites, 'time': time})
        reduced_flows[raw_var] = xr.DataArray(data = raw_flow_df.values, dims = ('time', 'site') ).transpose()
        reduced_flows[ref_var] = xr.DataArray(data = reference_flow_df.values, dims = ('time', 'site') ).transpose()
        for bc_var, bc_flow_df in zip(bc_vars, bc_flow_dfs):
            reduced_flows[bc_var] = xr.DataArray(data = bc_flow_df.values, dims = ('time', 'site') ).transpose()
        return reduced_flows

    return fig, ax


[docs]def plot_spearman_rank_difference(flow_dataset:xr.Dataset, gauge_sites: list,
                                  start_year: str, end_year: str,
                                  relative_locations_triu: pd.DataFrame, basin_map_png,
                                  cmap = mpl.cm.get_cmap('coolwarm_r'),
                                  blank_plot_color = 'w', fontcolor = 'black',
                                  fontsize_title=60, fontsize_tick = 30, fontsize_label = 45):
    """Creates a site-to-site rank correlation difference comparison plot with a map of the basin.

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        Contains raw as 'raw_flow' and bias corrected as 'bias_corrected_total_flow'
        flow values, times of the flow values, and the names of the sites where those
        flow values exist
    gauge_sites : list
        Gauge sites to be plotted.
    start_year : str
        String formatted as 'yyyy-mm-dd' to start rank correlation window.
    end_year : str
        String formatted as 'yyyy-mm-dd' to end rank correlation window.
    relative_locations_triu : pandas.DataFrame
        Denotes which sites are connected with a '1' and has the lower
        triangle set to '0'.
    basin_map_png : png file
        The basin map with site values marked.
    cmap : matplotlib.colors.LinearSegmentedColormap, optional
        Colormap to be used for coloring the SimpleRiverNewtork
        plot. This defaults as 'coolwarm_r'.
    blank_plot_color : str, optional
        Color to set the lower extremes in `cmap` should be to keep extreme values from
        skewing the color map and hiding values. This is defaulted as 'w' for white. It
        should appear that this color matches the background plot color to make it appear
        as if no value is plotted here.
    font_color : str, optional
        Color of the font, defaulted as 'black'.
    fontsize_title : int, optional
        Font size of the title, defaults as 60.
    fontsize_tick : int, optional
        Font size of the ticks, defaults as 30.
    fontsize_label : int, optional
        Font size of the labels, defaults as 45.
    """

    mpl.rcParams['figure.figsize'] = (40, 20)
    cmap.set_under(color = blank_plot_color)

    fig, ax = plt.subplots()

    if int(end_year[:4]) - int(start_year[:4]) == 1:
        fig.suptitle(f'WY{start_year[:4]}: 'r'$r_{s}(Q_{raw}) - r_{s}(Q_{bc})$',
                     fontsize = fontsize_title, color = fontcolor, x = 0.6, y = 0.95)
    elif int(end_year[:4])-int(start_year[:4]) > 1:
        end_WY = int(end_year[:4])-1
        fig.suptitle(f'WY{start_year[:4]} to WY{end_WY}: 'r'$r_{s}(Q_{raw}) - r_{s}(Q_{bc})$',
                     fontsize = fontsize_title, color = fontcolor, x = 0.6, y = 0.95)
    else:
        raise Exception('Please check end_year is later than start_year')

    time_span = flow_dataset['time'].values
    outlet_names = flow_dataset['outlet'].values

    raw_flow = pd.DataFrame(data = np.transpose(flow_dataset['raw_flow'].values),
                            index = time_span, columns = outlet_names)
    ref_flow = pd.DataFrame(data = flow_dataset['reference_flow'].values,
                            index = time_span, columns = outlet_names)
    bc_flow = pd.DataFrame(data=np.transpose(flow_dataset['bias_corrected_total_flow'].values),
                           index = time_span, columns = outlet_names)

    raw_flow_spearman = filter_rank_corr(raw_flow.loc[start_year : end_year].corr(method = 'spearman'),
                                         rel_loc = relative_locations_triu)
    bc_flow_spearman = filter_rank_corr(bc_flow.loc[start_year : end_year].corr(method = 'spearman'),
                                        rel_loc = relative_locations_triu)
    raw_minus_bc_spearman = raw_flow_spearman - bc_flow_spearman

    vmin = np.min(raw_minus_bc_spearman.min())
    vmax = np.max(raw_minus_bc_spearman.max())
    vextreme = vmax
    if np.abs(vmin) > vextreme:
        vextreme = np.abs(vmin)

    # -10 is used to ensure that the squares not to be plotted are marked as such by
    # the cmap's under values
    vunder = np.abs(vmin) * -10

    im = ax.imshow(raw_minus_bc_spearman.fillna(vunder), vmin = -vextreme, vmax = vextreme, cmap = cmap)
    plt.setp(ax.spines.values(), color = fontcolor)
    ax.tick_params(axis='both', colors = fontcolor)
    tick_tot = len(gauge_sites)
    ax.set_xticks(np.arange(0, tick_tot, 1.0))
    ax.set_yticks(np.arange(0, tick_tot, 1.0))
    ax.set_ylim(tick_tot - 0.5, -0.5)
    ax.set_xticklabels(gauge_sites, rotation = 'vertical', fontsize = fontsize_label)
    ax.set_yticklabels(gauge_sites, fontsize = fontsize_tick);

    cb = fig.colorbar(im, pad = 0.01)
    cb.ax.yaxis.set_tick_params(color = fontcolor, labelcolor = fontcolor, labelsize = fontsize_tick)
    cb.outline.set_edgecolor(None)

    fig.text(0.6, -0.02, 'Site Abbreviation', fontsize = fontsize_label, ha = 'center')
    fig.text(0.32, 0.4, 'Site Abbreviation', fontsize = fontsize_label, va = 'center', rotation = 'vertical')

    newax = fig.add_axes([0.295, 0.4, 0.49, 0.49], anchor = 'NE', zorder = 2)
    newax.imshow(basin_map_png)
    newax.axis('off')
    newax.tick_params(axis = 'both')
    newax.set_xticks([])
    newax.set_yticks([])

    plt.tight_layout
    plt.show()

[docs]def correction_scatter(site_dict:dict, raw_flow:pd.DataFrame,
                       ref_flow:pd.DataFrame, bc_flow:pd.DataFrame,
                       colors:list, title='Flow Residuals',
                       fontsize_title=80, fontsize_legend=68,
                       fontsize_subplot=60, fontsize_tick=45, fontcolor='black',
                       pos_cone_guide=False, neg_cone_guide=False):
    """Difference from reference flows before and after correction.

    Plots differences between the raw and reference flows on the horizontal
    and differences between the bias corrected and refrerence on the vertical.
    This compares corrections needed before and after the bias correction method
    is applied.

    Parameters
    ----------
    site_dict : dict
        Expects {subgroup name: list of segments in subgroup} how sites are to be
        seperated.
    raw_flow : pandas.DataFrame
        Contains flows before correction.
    ref_flow : pandas.DataFrame
        Contains the reference flows to compare `raw_flow` and `bc_flow`.
    bc_flow : pandas.DataFrame
        Contains flows after correction.
    colors : list
        Colors to be plotted for each site in `site_dict`.
    title : str, optional
        Title label for the plot, defaults as 'Flow Residuals'.
    fontsize_title : int, optional
        Fontsize of the title, defaults as 80.
    fontsize_legend : int, optional
        Fontsize of the legend, defaults as 68.
    fontsize_subplot : int, optional
        Fontsize of the subplots, defaults as 60.
    fontsize_tick : int, optional
        Fontsize of the ticks, defaults as 45.
    fontcolor : str, optional
        Color of the font, defaults as 'black'.
    pos_cone_guide : boolean, optional
        If True, plots a postive 1:1 line through the origin for reference.
    neg_cone_guide : boolean, optional
        If True, plots a negative 1:1 line through the origin for reference.
    """
    num_plots = len(site_dict.keys())
    n_rows, n_cols = determine_row_col(num_plots)

    mpl.rcParams['figure.figsize']=(60,40)

    fig,axs = plt.subplots(nrows=n_rows, ncols=n_cols)
    plt.suptitle(title, fontsize= fontsize_title, color=fontcolor, y=1.05)

    ax_list = axs.ravel().tolist()

    before_bc = ref_flow-raw_flow
    after_bc = ref_flow-bc_flow

    for i, site_group_key in enumerate(site_dict.keys()):
        site_group = site_dict[site_group_key]
        group_before_bc = before_bc.loc[:, site_group]
        group_after_bc = after_bc.loc[:, site_group]
        scatter_series_axes(group_before_bc, group_after_bc, label=site_group_key,
                                     alpha=0.05, color=colors[i], ax=ax_list[i])
        ax_list[i].set_title(site_group_key, fontsize=fontsize_subplot)
        plt.setp(ax_list[i].spines.values(), color=fontcolor)
        ax_list[i].tick_params(axis='both', colors=fontcolor, labelsize=fontsize_tick)

    # add horizontal axis at 0 line and hide plots that are not in use
    for i, ax in enumerate(axs.ravel()):
        if i < num_plots:
            bottom, top = ax.get_ylim()
            left, right = ax.get_xlim()
            ref_line_max = np.max([bottom, top, left, right])
            ref_line_min = np.min([bottom, top, left, right])
            ref_line_ext = ref_line_max
            if np.abs(ref_line_min) > ref_line_ext:
                ref_line_ext = np.abs(ref_line_min)
            ax.plot([-ref_line_ext, ref_line_ext], [0,0], color='k', linestyle='--')

            if pos_cone_guide and neg_cone_guide:
                ax.plot([-ref_line_ext, ref_line_ext], [-ref_line_ext, ref_line_max], color='k', linestyle='--')
                ax.plot([-ref_line_ext, ref_line_ext], [ref_line_ext, -ref_line_max], color='k', linestyle='--')
            elif pos_cone_guide:
                ax.plot([0, ref_line_ext], [0, ref_line_ext], color='k', linestyle='--')
                ax.plot([0, ref_line_ext], [0, -ref_line_ext], color='k', linestyle='--')
            elif neg_cone_guide:
                ax.plot([0, -ref_line_ext], [0, ref_line_ext], color='k', linestyle='--')
                ax.plot([0, -ref_line_ext], [0, -ref_line_ext], color='k', linestyle='--')

        else:
            ax.axis('off')

    fig.text(0.5, -0.04, r'$Q_{ref} - Q_{raw} \quad (m^3/s)$', ha='center', va = 'bottom',
             fontsize=fontsize_title, color=fontcolor);
    fig.text(-0.02, 0.5, r'$Q_{ref} - Q_{bc} \quad (m^3/s)$', va='center', rotation = 'vertical',
             fontsize=fontsize_title, color=fontcolor);

    plt.tight_layout()
    plt.show

[docs]def compare_correction_scatter(flow_dataset: xr.Dataset, plot_sites:list,
                       raw_var= "raw", raw_name = "Mizuroute Raw",
                       ref_var="ref", ref_name = "Reference",
                       bc_vars = list(), bc_names = list(),
                       plot_colors = ['blue', 'purple','orange','red'],
                       title='Absolute Error in Flow'r'$(m^3/s)$',
                       fontsize_title=80, fontsize_legend=68, alpha = 0.05,
                       fontsize_subplot=60, fontsize_tick=45, fontcolor='black',
                       pos_cone_guide=False, neg_cone_guide=False, symmetry=True):
    """Difference from reference flows before and after correction.

    Plots differences between the raw and reference flows on the horizontal
    and differences between the bias corrected and refrerence on the vertical.
    This compares corrections needed before and after the bias correction method
    is applied.

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        contains raw, reference, and bias corrected flows.
    plot_sites : list
        Sites to be plotted, expected as the `seg` coordinate in `flow_dataset`.
    raw_var : str, optional
        The string to access the raw flows in `flow_dataset`, defaults as `raw`.
    raw_name : str, optional
        Label for the raw flows in the legend, defaults as 'Mizuroute Raw'.
    ref_var : str, optional
        The string to access the reference flows in `ref`, defaults
        as 'upstream_ref_flow'.
    ref_name : str, optional
        Label for the reference flows in the legend, defaults as 'Reference'.
    bc_vars : list
        The strings to access the bias corrected flows in `flow_dataset`.
    bc_names : list
        Labels for the bias corrected flows in the legend, expected in the same
        order as `bc_vars`.
    plot_colors : list, optional
        Colors to be plotted for each site in `plot sites`.
        Defaults as ['blue', 'purple', 'orange, 'red'].
    fontsize_title : int, optional
        Fontsize of the title, defaults as 80.
    fontsize_legend : int, optional
        Fontsize of the legend, defaults as 68.
    fontsize_subplot : int, optional
        Fontsize of the subplots, defaults as 60.
    fontsize_tick : int, optional
        Fontsize of the ticks, defaults as 45.
    fontcolor : str, optional
        Color of the font, defaults as 'black'.
    pos_cone_guide : boolean, optional
        If True, plots a postive 1:1 line through the origin for reference.
    neg_cone_guide : boolean, optional
        If True, plots a negative 1:1 line through the origin for reference.
    symmetry : boolean, optional
        If True, the plot axis are symmetrical about the origin (default).
        If False, plotting limits will minimize empty space while not losing any data.
    """
    if len(bc_vars) == 0:
        raise Exception("Please enter a non-zero number strings in bc_vars to be used")
    if len(bc_vars) != len(bc_names):
        raise Exception("Please have the same number of entries in bc_names as bc_names")
    if len(plot_colors) < len(bc_vars):
        raise Exception(f"Please enter at least {len(bc_vars)} colors in plot_colors")

    num_plots = len(plot_sites)
    n_rows, n_cols = determine_row_col(num_plots)

    mpl.rcParams['figure.figsize']=(60,40)

    fig,axs = plt.subplots(nrows=n_rows, ncols=n_cols)
    plt.suptitle(title, fontsize= fontsize_title, color=fontcolor, y=1.05)

    ax_list = axs.ravel().tolist()

    plot_flows = flow_dataset.sel(seg=plot_sites).copy()
    plot_flows["before_bc"] = plot_flows[ref_var]-plot_flows[raw_var]

    for bc_var in bc_vars:
        plot_flows[f"after_{bc_var}"] = plot_flows[ref_var]-plot_flows[bc_var]

    #we need to figure out which plots have the most spread to make certain we hide the fewest

    spread = list()
    for bc_var in bc_vars:
        spread.append(plot_flows[f"after_{bc_var}"].values.max()-plot_flows[f"after_{bc_var}"].values.min())

    sorted_spread = np.flip(np.sort(spread.copy()))
    spread_ranks = [spread.index(val) for val in sorted_spread]


    for i, site in enumerate(plot_sites):
        before_bc = plot_flows["before_bc"].sel(seg=site).values
        for j in spread_ranks:
            after_bc = plot_flows[f"after_{bc_vars[j]}"].sel(seg=site).values
            scatter_series_axes(before_bc, after_bc, label=bc_names[j],
                                         alpha=alpha, color=plot_colors[j], ax=ax_list[i])
        ax_list[i].set_title(site, fontsize=fontsize_subplot)
        plt.setp(ax_list[i].spines.values(), color=fontcolor)
        ax_list[i].tick_params(axis='both', colors=fontcolor, labelsize=fontsize_tick)

    # add horizontal axis at 0 line and hide plots that are not in use
    for i, ax in enumerate(axs.ravel()):
        if i == len(axs.ravel())-1:
            ax.axis('off')
            ax.legend(handles=custom_legend(names = bc_names, colors=plot_colors), fontsize = fontsize_legend, loc='center')
        if i < num_plots:
            bottom, top = ax.get_ylim()
            left, right = ax.get_xlim()
            ref_line_max = np.max([bottom, top, left, right])
            ref_line_min = np.min([bottom, top, left, right])
            ref_line_ext = ref_line_max
            if np.abs(ref_line_min) > ref_line_ext:
                ref_line_ext = np.abs(ref_line_min)
            ax.plot([-ref_line_ext, ref_line_ext], [0,0], color='k', linestyle='--')

            if pos_cone_guide and neg_cone_guide:
                ax.plot([-ref_line_ext, ref_line_ext], [-ref_line_ext, ref_line_max], color='k', linestyle='--')
                ax.plot([-ref_line_ext, ref_line_ext], [ref_line_ext, -ref_line_max], color='k', linestyle='--')
            elif pos_cone_guide:
                ax.plot([0, ref_line_ext], [0, ref_line_ext], color='k', linestyle='--')
                ax.plot([0, ref_line_ext], [0, -ref_line_ext], color='k', linestyle='--')
            elif neg_cone_guide:
                ax.plot([0, -ref_line_ext], [0, ref_line_ext], color='k', linestyle='--')
                ax.plot([0, -ref_line_ext], [0, -ref_line_ext], color='k', linestyle='--')
            if not symmetry:
                ax.set_xlim(left=left,right=right)
                ax.set_ylim(top=top,bottom=bottom)

        else:
            ax.axis('off')

    fig.text(0.5, -0.04, r'$Q_{ref} - Q_{raw} \quad (m^3/s)$', ha='center', va = 'bottom',
             fontsize=fontsize_title, color=fontcolor);
    fig.text(-0.02, 0.5, r'$Q_{ref} - Q_{bc} \quad (m^3/s)$', va='center', rotation = 'vertical',
             fontsize=fontsize_title, color=fontcolor);
    plt.tight_layout()
    plt.show

[docs]def pbias_diff_hist(sites:list, colors:list, raw_flow:pd.DataFrame, ref_flow:pd.DataFrame,
                      bc_flow: pd.DataFrame, grouper=pd.Grouper(freq='M'), total_bins=None,
                      title_freq='Monthly', fontsize_title=90, fontsize_subplot_title=60,
                    fontsize_tick=40, fontsize_labels=84):
    """Histograms of differences in percent bias before/after bias correction.

    Creates a number of histogram subplots by each given site that
    plot the difference in percent bias before and after bias correction.

    Parameters
    ----------
    sites : list
        Sites that are the columns of the flow DataFrames, `raw_flow`,
        `ref_flow`, and `bc_flow`.
    colors : list
        Colors to plot the sites with, (do not have to be different),
        that are used in the same order as the list of sites.
    raw_flow : pandas.DataFrame
        Contains flows before bias correction.
    ref_flow : pandas.DataFrame
        Contains reference flows for comparison as true values.
    bc_flow : pandas.DataFrame
        Contains flows after bias correction.
    grouper : pandas.TimeGrouper, optional
        How flows should be grouped for bias correction. This defaults as
        monthly.
    total_bins : int, optional
        Number of bins to use in the histogram plots. If none specified,
        defaults to the floored square root of the number of pbias difference
        values.
    title_freq : str, optional
        An adjective description of the frequency with which the flows are
        grouped, should align with `grouper`, although there is no check to
        verify this. This defaults as 'Monthly'.
    fontsize_title : int, optional
        Font size of the title. Defaults as 90.
    fontsize_subplot_title : int, optional
        Font size of the subplots. Defaults as 60.
    fontsize_tick : int, optional
        Font size of the ticks. Defaults as 40.
    fontsize_labels : int, optional
        Font size of the labels. Defaults as 84.
    """

    if len(sites) != len(colors):
        raise Exception('Please enter the same number of colors as sites')

    mpl.rcParams['figure.figsize']=(60, 40)

    n_rows,n_cols = determine_row_col(len(sites))

    bc_m_pbias = pbias_by_index(
        observe=ref_flow.groupby(grouper).sum(),
        predict=bc_flow.groupby(grouper).sum())
    raw_m_pbias = pbias_by_index(
        observe=ref_flow.groupby(grouper).sum(),
        predict=raw_flow.groupby(grouper).sum())

    bc_pbias_impact = bc_m_pbias-raw_m_pbias

    if type(total_bins) is type(None):
        total_bins=int(np.sqrt(len(bc_pbias_impact.index)))

    fig, axs = plt.subplots(n_rows, n_cols)
    axs_list = axs.ravel().tolist()

    plt.suptitle(f"Change in Percent Bias from Bias Correction on a {title_freq} Basis",
                 fontsize=fontsize_title, y=1.05)
    i=0
    for site in sites:
        ax = axs_list[i]
        site_bc_pbias_impact=bc_pbias_impact[site]
        impact_extreme = np.max(np.abs(site_bc_pbias_impact))
        bin_step = (2*impact_extreme/total_bins)
        bin_range = np.arange(-impact_extreme, impact_extreme+bin_step, bin_step)

        site_bc_pbias_impact.plot.hist(ax=ax, bins=bin_range, color=colors[i])
        ax.vlines(0, 0, ax.get_ylim()[1], color='k', linestyle='--', lw=4)
        ax.set_xlim(left=-150, right=150)

        ax.set_title(site,fontsize=fontsize_subplot_title)
        ax.set_ylabel("")
        ax.tick_params(axis='both', labelsize=fontsize_tick)
        i+=1

    while i < len(axs_list):
        axs_list[i].axis('off')
        i+=1

    fig.text(0.5, -0.04, r'$(PBias_{BC})-(PBias_{raw}) \quad (\%)$',
             ha='center', va = 'bottom', fontsize=fontsize_labels);
    fig.text(-0.02, 0.5, 'Frequencey',
             va='center', rotation = 'vertical', fontsize=fontsize_labels);
    plt.tight_layout()

[docs]def plot_residual_overlay(flows: pd.DataFrame, upstream_sites: list, downstream_site: str,
                          start_year: int, end_year: int, ax=None, fontsize_title=40,
                          fontsize_labels=60, fontsize_tick= 30, linecolor='k', alpha=0.3):
    """Plots flow upstream/downstream residuals overlayed across one year span.

    Plots residuals from each hydrologic year on top of each
    other with a refence line at zero flow. Residuals are calculated as
    downstream flows - sum(upstream flows).

    Parameters
    ----------
    flows : pandas.DataFrame
        All flows to be used in plotting.
    upstream_sites : list
        Strings of the site names stored in `flows` to aggregate.
    downstream_sites : str
        Name of the downstream site stored in flows that will
        have the `upstream_sites` subtracted from it.
    start_year : int
        The starting year to plot.
    end_year : int
        The year to conclude on.
    ax : matplotlib.axes, optional
        Axes to plot on. If none is specified, a new one is created.
    fontsize_title : int, optional
        Font size of the title.
    fontsize_labels : int, optional
        Font size of the labels.
    fontsize_tick : int, optional
        Font size of the ticks.
    linecolor : str, optional
        Color of the lines plotted. Defaults as 'k' for black.
    alpha : float, optional
        Transparency of the lines plotted. Defaults as 0.3 to help
        see how residuals line up across many years.

    Returns
    -------
    matplotlib.axes
    """

    mpl.rcParams['figure.figsize']=(20,20)

    if type(ax) is type(None):
        fig,ax = plt.subplots()

    start_date='-10-01'
    end_date='-09-30'

    year = start_year
    upstream_site_string = ""

    residual_flow = pd.DataFrame(index=flows.index, columns=['Residuals'])

    residual_flow['Residuals'] = flows[downstream_site].values
    for upstream_site in upstream_sites:
        residual_flow['Residuals'] -= flows[upstream_site].values
        upstream_site_string += f'{upstream_site}, '

    upstream_site_string = upstream_site_string[:len(upstream_site_string)-2]

    while year < end_year:
        values = residual_flow[f'{str(year)}{start_date}':f'{str(year+1)}{end_date}'].values
        doy = np.arange(1, len(values)+1, 1)
        ax.plot(doy, values, color=linecolor, alpha=alpha)
        year += 1

    ax.plot([1,366],[0,0], color='k', linestyle='--', lw=4)
    ax.set_xlim(left=1, right=366)
    ax.set_ylabel(r"$Q_{downstream}-\sum{Q_{upstream}}$"" "r"$(m^3/s)$", fontsize=fontsize_labels);
    ax.set_xlabel("Day of Year", fontsize=fontsize_labels);
    ax.set_title(f"Upstream: {upstream_site_string} | Downstream: {downstream_site}",
                 fontsize=fontsize_title, y=1.04);
    ax.tick_params(axis='both', labelsize=fontsize_tick)

    return ax


[docs]def norm_change_annual_flow(sites: list, before_bc: pd.DataFrame, after_bc: pd.DataFrame, colors = list,
                            fontsize_title=60, fontsize_labels=40, fontsize_tick= 30):
    """Normalized change in annual flow volume.

    Plots a series of subplots containing bar charts that depict
    the differnece in normalized annual flow volume due to bias correction.

    Parameters
    ----------
    sites : list
        String names of all the sites to be plotted, matching sites contained in
        the DataFrames `before_bc` and `after_bc`.
    before_bc : pandas.DataFrame
        Contains flows, (not aggregated), before bias correction is applied.
    after_bc : pandas.DataFrame
        Contains flows, (not aggregated), after bias correction is applied.
    colors : list
        Ccolors to be used for each site's subplot in the same order
        as sites, (does not have to be unique).
    fontsize_title : int, optional
        Font size of the title. Defaults as 60.
    fontsize_labels : int, optional
        Font size of the labels. Defaults as 40.
    fontsize_tick : int, optional
        Font size of the ticks. Defaults as 30.
    """

    mpl.rcParams['figure.figsize']=(30,20)

    WY_grouper = calc_water_year(before_bc)
    after_bc_annual = after_bc.groupby(WY_grouper).sum()
    before_bc_annual = before_bc.groupby(WY_grouper).sum()

    after_bc_annual, before_bc_annual = dst.normalize_pair(data=after_bc_annual, norming_data=before_bc_annual)

    diff_annual = after_bc_annual - before_bc_annual

    max_mag = np.abs(np.max(diff_annual.max()))
    min_mag = np.abs(np.min(diff_annual.min()))
    extreme_mag = np.max([max_mag, min_mag])

    n_rows, n_cols = determine_row_col(len(sites))

    fig, axs = plt.subplots(n_rows, n_cols)

    plt.suptitle("Change in Annual Flow Volume from Bias Correction",
                     fontsize=fontsize_title, y=1.05)

    axs_list = axs.ravel().tolist()

    i=0
    for site in sites:
        ax = axs_list[i]
        ax.bar(diff_annual.index, diff_annual[site].values, color=color_list[i])
        ax.set_title(site, fontsize=fontsize_tick)
        ax.set_ylim(top = extreme_mag, bottom=-extreme_mag)
        ax.tick_params(axis='both', labelsize=fontsize_tick)
        i += 1

    while i < len(axs_list):
        axs_list[i].axis('off')
        i += 1

    fig.text(0.5, -0.04, 'Hydrologic Year',
                 ha='center', va = 'bottom', fontsize=fontsize_labels);
    fig.text(-0.04, 0.5, "Normalized Change in Annual Flow Volume",
                 va='center', rotation = 'vertical', fontsize=fontsize_labels);

    plt.tight_layout()

[docs]def pbias_compare_hist(sites: list, raw_flow: pd.DataFrame, ref_flow: pd.DataFrame,
                      bc_flow: pd.DataFrame, grouper=pd.Grouper(freq='Y'), total_bins=None,
                      title_freq='Yearly', fontsize_title=90, fontsize_subplot_title=60,
                    fontsize_tick=40,fontsize_labels=84, x_extreme=150):
    """Histograms comparing percent bias before/after bias correction.

    Creates a number of histogram subplots by each given sites that
    plot percent bias both before and after bias correction.

    Parameters
    ----------
    sites : list
        Sites corresponding to the columns of the flow DataFrames, `raw_flow`,
        `ref_flow`, and `bc_flow`, to be plotted.
    raw_flow : pandas.DataFrame
        Flows before bias correction.
    ref_flow : pandas.DataFrame
        Reference flows for comparison as true values.
    bc_flow : pandas.DataFrame
        Flows after bias correction.
    grouper : pandas.TimeGrouper
        How flows should be grouped for bias correction, defaults as yearly.
    total_bins : int, optional
        Number of bins to use in the histogram plots. if none specified,
        defaults to the floored square root of the number of pbias difference
        values.
    title_freq : str, optional
        An adjective description of the frequency with which the flows are
        grouped corresponding to `grouper`. Defaults as 'Yearly'.
    fontsize_title : int, optional
        Font size of the title, defaulting as 90.
    fontsize_subplot_title : int, optional
        Font size of the subplot title, defaulting as 60.
    fontsize_ticler : int, optional
        Font size of the ticks, defaulting as 40.
    fontsize_labels : int, optional
        Font size of the labels, defaulting as 84.
    x_extreme : float, optional
        Greatest magnitude on the horizontal axis to specify the range,
        defaulting as 150, which results in a range of (-150, 150). This is
        useful if desiring to zoom in closer to the origin and exclued
        outlying percent biases.
    """

    mpl.rcParams['figure.figsize']=(60,40)

    n_rows,n_cols = determine_row_col(len(sites))

    bc_m_pbias = dst.pbias_by_index(
        observe=ref_flow.groupby(grouper).sum(),
        predict=bc_flow.groupby(grouper).sum())
    raw_m_pbias = dst.pbias_by_index(
        observe=ref_flow.groupby(grouper).sum(),
        predict=raw_flow.groupby(grouper).sum())
    if type(total_bins)==type(None):
        total_bins=int(np.sqrt(len(bc_m_pbias.index)))

    fig, axs = plt.subplots(n_rows,n_cols)
    axs_list = axs.ravel().tolist()

    plt.suptitle(f"Change in Percent Bias from Bias Correction on a {title_freq} Basis",
                 fontsize=fontsize_title,y=1.05)
    i=0
    for site in sites:
        ax = axs_list[i]
        before_pbias=raw_m_pbias[site]
        after_pbias=bc_m_pbias[site]
        before_extreme = np.max(np.abs(before_pbias))
        after_extreme = np.max(np.abs(after_pbias))
        extreme=np.max([before_extreme,after_extreme])
        bin_step = (2*extreme/total_bins)
        bin_range = np.arange(-extreme,extreme+bin_step,bin_step)

        before_pbias.plot.hist(ax=ax,bins=bin_range,color='red',edgecolor='black',alpha=0.5)
        after_pbias.plot.hist(ax=ax,bins=bin_range,color='blue',edgecolor='black',alpha=0.5)
        ax.vlines(0,0,ax.get_ylim()[1], color='k',linestyle='--',lw=4)
        x_extreme = np.abs(x_extreme)
        ax.set_xlim(left=-x_extreme,right=x_extreme)

        ax.set_title(site,fontsize=fontsize_subplot_title)
        ax.set_ylabel("")
        ax.tick_params(axis='both',labelsize=fontsize_tick)
        i+=1

    while i < len(axs_list):
        axs_list[i].axis('off')
        i+=1

    fig.text(0.5, -0.04, r'$PBias_{monthly} \quad (\%)$',
             ha='center', va = 'bottom', fontsize=fontsize_labels);
    fig.text(-0.02, 0.5, 'Frequencey',
             va='center', rotation = 'vertical', fontsize=fontsize_labels);
    plt.legend(handles=custom_legend(['Before BC','After BC', 'Overlap'],['red','blue','purple']),
               loc='lower right',fontsize=fontsize_labels)
    plt.tight_layout()

[docs]def compare_PDF(flow_dataset:xr.Dataset, gauge_sites = list,
                raw_var='raw_flow', ref_var='reference_flow', bc_var='bias_corrected_total_flow',
                raw_name='Mizuroute Raw', ref_name='NRNI Reference', bc_name='BMORPH BC',
                fontsize_title=40, fontsize_labels=30, fontsize_tick= 20):
    """Compare probability distribution functions.

    Plots the PDF's of the raw, reference, and bias corrected flows for each gauge site.

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        Contatains raw, reference, and bias corrected flows.
    gauges_sites : list
        Gauge sites to be plotted as used in the `flow_dataset`.
    raw_var : str, optional
        The string to access the raw flows in `flow_dataset` for flows before
        bias correction. Defaults as 'raw_flow'.
    ref_var : str, optional
        The string to access the reference flows in `flow_dataset` for true flows.
        Defaults as 'reference_flow'.
    bc_var : str, optional
        The string to access the bias corrected flows in flow_dataset for flows
        after bias correction. Defaults as 'bias_corrected_total_flow'.
    raw_name : str, optional
        Label for the raw flows before bias correction. Defaults as 'Mizuroute Raw'.
    ref_name : str, optional
        Label for the reference flows. Defaults as 'NRNI Reference'.
    bc_name : str, optional
        Label for the bias corrected flows after bias correction. Defaults as
        'BMORPH BC'.
    fontsize_title : int, optional
        Fontsize of the title. Defaults as 40.
    fontsize_labels : int, optional
        Fontsize of the lables. Defaults as 30.
    fontsize_tick : int, optional
        Fontsize of the ticks. Defaults as 20.
    """

    n_rows, n_cols = determine_row_col(len(gauge_sites))

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 20), sharex=False, sharey=False)
    axes = axes.flatten()

    fig.suptitle("Probability Distribution Functions", y=1.01,x=0.4, fontsize=fontsize_title)

    for i, site in enumerate(gauge_sites):
        cmp = flow_dataset.sel(outlet=site)
        ax=axes[i]
        sns.kdeplot(np.log10(cmp[raw_var].values), ax=ax, color='grey', legend=False, label=raw_name)
        sns.kdeplot(np.log10(cmp[ref_var].values), ax=ax, color='black', legend=False, label=ref_name)
        sns.kdeplot(np.log10(cmp[bc_var].values), ax=ax, color='red', legend=False, label=bc_name)
        ax.set_title(site, fontsize=fontsize_labels)
        ax.tick_params(axis='both', labelsize=fontsize_tick)
        # relabel axis in standard base 10
        labels = ax.get_xticks()
        ax.set_xticklabels(labels=["$" + str(10**j) + "$" for j in labels], rotation=30)

    axes[-1].axis('off')
    axes[i].legend(bbox_to_anchor=(1.1, 0.8), fontsize=fontsize_tick)

    fig.text(0.4, 0.04, r'Q [$m^3/s$]', ha='center', fontsize=fontsize_labels)
    fig.text(-0.04, 0.5, r'Density', va='center', rotation='vertical', fontsize=fontsize_labels)

    plt.subplots_adjust(wspace=0.3, hspace= 0.45, left = 0.05, right = 0.8, top = 0.95)

[docs]def compare_CDF(flow_dataset: xr.Dataset, plot_sites: list,
                      raw_var: str, raw_name: str,
                      ref_var: str, ref_name: str,
                      bc_vars: list, bc_names: list,
                      plot_colors: list, logit_scale = True,
                      logarithm_base = '10', units = r'Q [$m^3/s$]',
                      markers = ['o', 'x', '*', '*'],
                      figsize = (20,20), sharex = False, sharey = True,
                      fontsize_title = 40, fontsize_labels = 30, fontsize_tick = 20,
                      markersize = 1, alpha = 0.3):
    """Compare probability distribution functions on a logit scale.

    Plots the CDF's of the raw, reference, and bias corrected flows.

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        Contatains raw, reference, and bias corrected flows.
    gauges_sites : list
        Gauge sites to be plotted contained in `flow_dataset`.
    raw_var : str
        Accesses the raw flows in `flow_dataset` for flows before bias correction.
    raw_name : str
        Label for the raw flows in the legend, corresponding to `raw_var`.
    ref_var : str
        Accesses the reference flows in `flow_dataset` for true flows.
    ref_name : str
        Label for the reference flows in the legend, corresponding to `ref_var`.
    bc_vars : list
        Accesses the bias corrected flows in `flow_dataset`, where each element
        accesses its own bias corrected flows. Can be a size of 1.
    bc_names : list
        Label for the bias corrected flows in the legend for each entry in `bc_var`,
        assumed to be in the same order. Can be a size of 1.
    logit_scale : boolean, optional
        Whether to plot the vertical scale on a logit axis (True) or not (False).
        Defaults as True.
    logarithm_base : str, optional
        The logarthimic base to use for the horizontal scale. Only the following
        are currently supported:
            '10' to use a log10 horizontal scale (default)
            'e' to use a natural log horizontal scale
    units : str, optional
        The horizontal axis's label for units, defaults as r'Q [$m^3/s$]'.
    plot_colors : list, optional
        Colors to be plotted for the flows corresponding to `raw_var`,
        `ref_var`, and `bc_var`, defaulting as ['grey', 'black', 'blue', 'red'],
        assuming there are two entries in `bc_var`.
    markers : list, optional
        Markers to be plotted for the flows corresponding to `raw_var`,
        `ref_var`, and `bc_var`, defaulting as ['o', 'x', '*', '*'],
        assuming there are two entries in `bc_var`.
    figsize : tuple, optional
        Figure size following matplotlib notation, defaults as (20, 20).
    sharex : boolean or str, optional
        Whether horizontal axis should be shared amongst subplots,
        defaulting as False.
    sharey : boolean or str, optional
        Whether vertical axis should be shared amongst subplots,
        defaulting as True.
    fontsize_title : int, optional
        Font size of the title, defaults as 40.
    fontsize_labels : int, optional
        Font size of the labels, defaults as 30.
    fontsize_tick : int, optional
        Font size of the ticks, defaults as 20.
    markersize : float, optional
        Size of the markers plotted, defaults as 1.
    alpha : float, optional
        Transparancy of the markers plotted, defaults as 0.3 to help
        see where markers clump together.

    Returns
    -------
    matplotlib.figure, matplotlib.axes
    """

    if len(bc_vars) == 0:
        raise Exception("Please enter a non-zero number strings in bc_vars to be used")
    if len(bc_vars) != len(bc_names):
        raise Exception("Please have the same number of entries in bc_names as bc_names")
    if len(plot_colors) < len(bc_vars):
        raise Exception(f"Please enter at least {len(bc_vars)} colors in plot_colors")

    if logarithm_base == '10':
        log_func = log10_1p
    elif logarithm_base == 'e':
        log_func = np.log1p
    else:
        raise Exception("Please enter logarithm_base as '10' or 'e'")

    n_rows, n_cols = determine_row_col(len(plot_sites))

    fig, axes = plt.subplots(n_rows, n_cols, figsize = figsize, sharex = sharex, sharey = sharey)
    axes = axes.flatten()

    fig.suptitle("Cumulative Distribution Functions", y = 1.01, x = 0.4, fontsize = fontsize_title)

    for i, site in enumerate(plot_sites):
        ax = axes[i]
        cmp = flow_dataset.sel(seg = site)

        raw = ECDF(log_func(cmp[raw_var].values))
        ref = ECDF(log_func(cmp[ref_var].values))

        cors = list()
        for bc_var in bc_vars:
            cors.append(ECDF(log_func(cmp[bc_var].values)))

        linewidth = markersize / 2

        ax.plot(raw.x, raw.y, color = plot_colors[0], label = raw_name, lw = linewidth,
                linestyle = '--', marker = 'o', markersize = markersize, alpha = alpha)
        ax.plot(ref.x, ref.y, color = plot_colors[1], label = ref_name, lw = linewidth,
                linestyle = '--', marker = 'x', markersize = markersize, alpha = alpha)

        for j, cor in enumerate(cors):
            ax.plot(cor.x, cor.y, color = plot_colors[2+j], label = bc_names[j], lw = linewidth,
                    linestyle = '--', marker = '*', markersize = markersize, alpha = alpha)

        ax.set_title(site, fontsize = fontsize_labels)
        ax.tick_params(axis = 'both', labelsize = fontsize_tick)

        # relabel axis to account for log_func
        xlabels = ax.get_xticks()
        if logarithm_base == '10':
            ax.set_xticklabels(labels = ["$" + "{:0.0f}".format(10**k-1) + "$" for k in xlabels],
                               rotation = 30)
        elif logarithm_base == 'e':
            ax.set_xticklabels(labels = ["$" + "{:0.2f}".format(exp(k)-1) + "$" for k in xlabels],
                               rotation=30)

        if logit_scale:
            ax.set_yscale('logit')
            ax.minorticks_off()
            ax.yaxis.set_major_formatter(mpl.ticker.LogitFormatter())
            ylabels = ax.get_yticks()
            new_ylabels = list()
            for k, ylabel in enumerate(ylabels):
                if k % 3 == 0:
                    new_ylabels.append(ylabel)

            ax.set_yticks(ticks = new_ylabels)

    axes[-1].axis('off')
    axes[i].legend(bbox_to_anchor = (1.1, 0.8), fontsize = fontsize_tick)

    fig.text(0.4, 0.04, units, ha = 'center', fontsize = fontsize_labels)
    fig.text(-0.04, 0.5, r'Non-exceedence probability', va = 'center',
             rotation = 'vertical', fontsize = fontsize_labels)
    plt.subplots_adjust(hspace = 0.35, left = 0.05, right = 0.8, top = 0.95)

    return fig, axes

[docs]def spearman_diff_boxplots_annual(raw_flows: pd.DataFrame, bc_flows: pd.DataFrame, site_pairings,
                                  fontsize_title=40, fontsize_tick=30, fontsize_labels=40,
                                  subtitle = None, median_plot_color = 'red'):

    """Annual difference in spearman rank as boxplots.

    Creates box plots for each stide pairing determing the difference in spearman
    rank for each year between the raw and bias corrected data.

    Parameters
    ----------
    raw_flows : pandas.DataFrame
        Raw flows before bias correction with sites in the columns and time in the index.
    bc_flows : pandas.DataFrame
        Bias corrected flows with sites in the columns and time in the index.
    site_pairings : List[List[str]]
        List of list of string site pairs e.g. [['downstream_name','upstream_name'],...].
        This is used to organize which sites should be paired together for computing the
        spearman rank difference.
    fontsize_title : int, optional
        Font size of the title, defaults as 40.
    fontsize_tick : int, optional
        Font size of the ticks, defaults as 30.
    fontsize_labels : int, optional
        Font size of the labels, defaults as 40.
    subtitle : str, optional
        Subtitle to include after "Annual Chnage in Speraman Rank: ". If no subtitle
        is is specified, none is included and only the title is plotted.
    median_plot_color : str
        Color to plot the boxplot's median as, defaults as 'red'.
    """

    if np.where(raw_flows.index != bc_flows.index)[0].size != 0:
        raise Exception('Please ensure raw_flows and bc_flows have the same index')

    WY_grouper = calc_water_year(raw_flows)

    annual_spearman_difference = pd.DataFrame(index=np.arange(WY_grouper[0], WY_grouper[-1],1),
                                             columns=[str(pairing) for pairing in site_pairings])

    for WY in annual_spearman_difference.index:
        raw_flow_WY = raw_flows[f"{WY}-10-01":f"{WY+1}-09-30"]
        bc_flow_WY = bc_flows[f"{WY}-10-01":f"{WY+1}-09-30"]

        for site_pairing in site_pairings:
            downstream = site_pairing[0]
            upstream = site_pairing[1]

            downstream_raw = raw_flow_WY[downstream]
            upstream_raw = raw_flow_WY[upstream]
            downstream_bc = bc_flow_WY[downstream]
            upstream_bc = bc_flow_WY[upstream]

            raw_spearman, raw_pvalue = scipy.stats.spearmanr(a=downstream_raw.values,b=upstream_raw.values)
            bc_spearman, bc_pvalue = scipy.stats.spearmanr(a=downstream_bc.values,b=upstream_bc.values)
            annual_spearman_difference.loc[WY][str(site_pairing)] = raw_spearman-bc_spearman

    fig, ax = plt.subplots(figsize=(20,20))

    if isinstance(subtitle, type(None)):
        plt.suptitle('Annual Change in Spearman Rank', fontsize=fontsize_title, y=1.05)
    else:
        assert isinstance(subtitle, str)
        plt.suptitle(f'Annual Change in Spearman Rank: {subtitle}', fontsize = fontsize_title, y = 1.05)

    max_vert = np.max(annual_spearman_difference.max().values)*1.1
    min_vert = np.min(annual_spearman_difference.min().values)*1.1
    ax.axhline(y=0, color='black', linestyle='--')
    ax.boxplot([annual_spearman_difference[site_pairing].values for
                site_pairing in annual_spearman_difference.columns],
               labels=[f'{site_pairing[0]},\n{site_pairing[1]}' for site_pairing in site_pairings],
               medianprops={'color':median_plot_color, 'lw':4}, boxprops={'lw':4}, whiskerprops={'lw':4},
               capprops={'lw':4})

    ax.set_ylim(top=max_vert, bottom=min_vert)
    ax.tick_params(axis='both', labelsize=fontsize_tick)

    fig.text(0.5, -0.04, "Gauge Site Pairs: Downstream, Upstream",
                 ha='center', va = 'bottom', fontsize=fontsize_labels);

    fig.text(-0.04, 0.5, r'$r_s(Q_{raw}^{up}, Q_{raw}^{down}) - r_s(Q_{bc}^{up}, Q_{bc}^{down})$',
             va='center', rotation = 'vertical', fontsize=fontsize_labels);

    plt.tight_layout()

[docs]def kl_divergence_annual_compare(flow_dataset: xr.Dataset, sites: list,
                                 raw_var: str, raw_name: str,
                                 ref_var: str, ref_name: str,
                                 bc_vars: list, bc_names: list,
                                 plot_colors: list, title = "Annual KL Diveregence Before/After Bias Correction",
                                 fontsize_title = 40, fontsize_tick = 30, fontsize_labels = 40,
                                 fontsize_legend = 30,
                                 showfliers = False, sharex = True, sharey = 'row', TINY_VAL = 1e-6,
                                 figsize = (30,20), show_y_grid = True):
    """Kullback-Liebler Divergence compared before and after bias correction as boxplots.

    Plots the KL divergence for each year per site as KL(P_{ref} || P_{raw}) and
    KL( P_{ref} || P_{bc}).

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        Contains raw (uncorrected), reference (true), and bias corrected flows.
    sites : list
        Contains all the sites to be plotted as included in `flow_dataset`, (note that if
        the number of sites to be plotted is square or rectangular, the last site will not
        be plotted to save room for the legend).
    raw_var : str
        Accesses the raw flows in `flow_dataset`.
    raw_name : str
        Label for the raw flows in the legend and horizontal labels corresponding to `raw_var`.
    ref_var : str
        Accesses the reference flows in `flow_dataset`.
    ref_name : str
        Label the reference flows in the legend and horizontal labels corresponding to `ref_var`.
    bc_vars : list
        String(s) to access the bias corrected flows in `flow_dataset`.
    bc_names : list
        Label(s) for the bias corrected flows in the legend and horizontal labels, corresponding
        to each element in `bc_vars`.
    plot_colors : list
        Colors to be plotted for the raw and the bias corrected flows, respectively.
    title : str, optional
        Title to be plotted, defaults as "Annual KL Diveregence Before/After Bias Correction".
    fontsize_title : int, optional
        Font size of the title, defaults as 40.
    fontsize_tick : int, optional
        Font size of the ticks, defaults as 30.
    fontsize_labels : int, optional
        Font size of the labels, defaults as 40.
    fontsize_legend : int, optional
        Font size of the legend text, defaults as 30.
    showfliers : boolean, optional
        Whether to include fliers in the boxplots, defaults as False.
    sharex : boolean or str, optional
        Whether the horizontal axis is shared, defaults as True.
    sharey : boolean or str, optional
        Whether the vertical axis is shared, defaults as 'row' to share the vertical axis
        in the same row.
    TINY_VAL : float, optional
        Used to ensure there are no zero values in the data because zero values cause unsual
        behavior in calculating the KL Divergence. Defaults as 1E-6.
    figsize : tuple, optional
        Figure size following maptlotlib connventions, defaults as (30,20).
    show_y_grid : boolean, optional
        Whether to plot y grid lines, defaults as True.

    Returns
    -------
    maptlotlib.figure, matplotlib.axes
    """

    raw_flows = flow_dataset[raw_var].to_pandas()
    ref_flows = flow_dataset[ref_var].to_pandas()
    bc_flows = list()
    for bc_var in bc_vars:
        bc_flows.append(flow_dataset[bc_var].to_pandas())

    WY_grouper = calc_water_year(raw_flows)
    WY_array = np.arange(WY_grouper[0], WY_grouper[-1], 1)

    n_rows, n_cols = determine_row_col(len(sites))
    fig, axs = plt.subplots(n_rows, n_cols, figsize = figsize, sharex = sharex, sharey = sharey)
    axs_list = axs.ravel().tolist()

    kldiv_refraw_annual = pd.DataFrame(index = WY_array, columns = sites)
    kldiv_refbc_annuals = list()
    for bc_var in bc_vars:
        kldiv_refbc_annuals.append(pd.DataFrame(index = WY_array, columns = sites))

    plt.suptitle(title, fontsize = fontsize_title, y = 1.05)

    for WY in WY_array:
        raw_flow_WY = raw_flows[f"{WY}-10-01":f"{WY+1}-09-30"]
        ref_flow_WY = ref_flows[f"{WY}-10-01":f"{WY+1}-09-30"]
        bc_flow_WYs = list()
        for bc_flow in bc_flows:
            bc_flow_WYs.append(bc_flow[f"{WY}-10-01":f"{WY+1}-09-30"])
        total_bins = int(np.sqrt(len(raw_flow_WY.index)))

        for site in sites:
            ref_WY_site_vals = ref_flow_WY[site].values
            raw_WY_site_vals = raw_flow_WY[site].values
            bc_WY_site_vals = list()
            for bc_flow_WY in bc_flow_WYs:
                bc_WY_site_vals.append(bc_flow_WY[site].values)

            ref_WY_site_pdf, ref_WY_site_edges = np.histogram(ref_WY_site_vals, bins = total_bins,
                                                              density = True)
            raw_WY_site_pdf = np.histogram(raw_WY_site_vals, bins = ref_WY_site_edges, density = True)[0]
            bc_WY_site_pdfs = list()
            for bc_WY_site_val in bc_WY_site_vals:
                bc_WY_site_pdf = np.histogram(bc_WY_site_val, bins = ref_WY_site_edges, density = True)[0]
                bc_WY_site_pdf[bc_WY_site_pdf == 0] = TINY_VAL
                bc_WY_site_pdfs.append(bc_WY_site_pdf)

            ref_WY_site_pdf[ref_WY_site_pdf == 0] = TINY_VAL
            raw_WY_site_pdf[raw_WY_site_pdf == 0] = TINY_VAL

            kldiv_refraw_annual.loc[WY][site] = scipy.stats.entropy(pk = raw_WY_site_pdf, qk = ref_WY_site_pdf)
            for i, (kldiv_refbc_annual, bc_WY_site_pdf) in enumerate(zip(kldiv_refbc_annuals, bc_WY_site_pdfs)):
                kldiv_refbc_annual.loc[WY][site] = scipy.stats.entropy(pk = bc_WY_site_pdf, qk = ref_WY_site_pdf)
                kldiv_refbc_annuals[i] = kldiv_refbc_annual

    plot_labels = [raw_name]
    plot_labels.extend(bc_names)

    for i, site in enumerate(sites):
        ax=axs_list[i]
        plot_vals = [kldiv_refraw_annual[site].values]
        plot_vals.extend([kldiv_refbc_annual[site].values for kldiv_refbc_annual in kldiv_refbc_annuals])
        box_dict = ax.boxplot(plot_vals, patch_artist = True, showfliers = showfliers, widths = 0.8, notch = True)
        for item in ['boxes', 'fliers', 'medians', 'means']:
            for sub_item, color in zip(box_dict[item], plot_colors):
                plt.setp(sub_item, color = color)
        ax.set_title(site, fontsize = fontsize_labels)
        ax.set_xticks(np.arange(1, len(plot_labels)+1))
        ax.set_xticklabels(plot_labels, fontsize = fontsize_tick, rotation = 45)
        ax.tick_params(axis = 'both', labelsize = fontsize_tick)
        if show_y_grid:
            ax.grid(which = 'major', axis = 'y', alpha = 0.5)

    # gets rid of any spare axes
    i += 1
    while i < len(axs_list):
        axs_list[i].axis('off')
        i += 1
    # ensures last axes is off to make room for the legend
    axs_list[-1].axis('off')

    fig.text(-0.04, 0.5, "Annual KL Divergence",
             va = 'center', rotation = 'vertical', fontsize = fontsize_labels);

    fig.text(0.5, -0.04, r'$KL(P_{' + f'{ref_name}' + r'} || P_{scenario})$',
             va = 'bottom', ha = 'center', fontsize = fontsize_labels);

    axs_list[-1].legend(handles=custom_legend(names=plot_labels, colors = plot_colors),
                        fontsize = fontsize_legend, loc = 'center')

    plt.tight_layout()

    return fig, axs

[docs]def spearman_diff_boxplots_annual_compare(flow_dataset: xr.Dataset, site_pairings,
                                  raw_var: str, bc_vars: list, bc_names: list,
                                  plot_colors: list, showfliers =  True,
                                  fontsize_title=40, fontsize_tick=25, fontsize_labels=30,
                                  figsize = (20,20), sharey = 'row'):

    """Annual difference in spearman rank as boxplots.

    Creates box plots for each site pairing determining the difference in spearman
    rank for each year between the raw and the bias corrected data.

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        Contains raw (uncorrected), reference (true), and bias corrected flows.
    site_pairings : List[List[str]]
        List of list of string site pairs e.g. [['downstream_name','upstream_name'],...].
        This is used to organize which sites should be paired together for computing the
        spearman rank difference.
    raw_var : str
        Accesses the raw (uncorrected) flows in `flow_dataset`.
    bc_vars : list
        Strings to access the bias corrected flows in `flow_dataset`.
    bc_names : list
        Labels for the bias corrected flows from `flow_dataset`, corresponding to
        each element in `bc_vars` in the same order.
    plot_colors : list
        Colors that are in the same order as the `bc_vars` and `bc_names`
        to be used in plotting.
    showfliers : boolean, optional
        Whether to show fliers on the boxplots, defaults as True.
    fontsize_title : int, optional
        Font size of the title, defaults as 40.
    fontsize_tick : int, optional
        Font size of the ticks, defaults as 25.
    fontsize_lables : int, optional
        Font size of the labels, defaults as 30.
    figsize : tuple
        Figure size following matplotlib connventions, defaults as (20, 20).
    sharey : boolean or str, optional
        Whether or how the vertical axis are to be shared, defaults as 'row' to
        have vertical axis in the same row shared.

    Returns
    -------
    matplotlib.figure, matplotlib.axes
    """

    if len(bc_vars) == 0:
        raise Exception("Please enter a non-zero number strings in bc_vars to be used")
    if len(bc_vars) != len(bc_names):
        raise Exception("Please have the same number of entries in bc_names as bc_names")
    if len(plot_colors) < len(bc_vars):
        raise Exception(f"Please enter at least {len(bc_vars)} colors in plot_colors")

    raw_flows = flow_dataset[raw_var].to_pandas()
    bc_flows = list()
    for bc_var in bc_vars:
        bc_flows.append(flow_dataset[bc_var].to_pandas())

    WY_grouper = calc_water_year(raw_flows)
    WY_index = np.arange(WY_grouper[0], WY_grouper[-1], 1)

    annual_spearman_differences = list()
    for bc_var in bc_vars:
        annual_spearman_differences.append(pd.DataFrame(index=WY_index,
                                                        columns=[str(pairing) for pairing in site_pairings]))

    for WY in WY_index:
        raw_flow_WY = raw_flows[f"{WY}-10-01":f"{WY+1}-09-30"]
        bc_flow_WY_list= list()
        for bc_flow in bc_flows:
            bc_flow_WY_list.append(bc_flow[f"{WY}-10-01":f"{WY+1}-09-30"])

        for site_pairing in site_pairings:
            downstream = site_pairing[0]
            upstream = site_pairing[1]

            downstream_raw = raw_flow_WY[downstream]
            upstream_raw = raw_flow_WY[upstream]
            downstream_bcs = list()
            upstream_bcs = list()
            for bc_flow_WY in bc_flow_WY_list:
                downstream_bcs.append(bc_flow_WY[downstream])
                upstream_bcs.append(bc_flow_WY[upstream])

            raw_spearman, raw_pvalue = scipy.stats.spearmanr(a=downstream_raw.values, b=upstream_raw.values)
            for i, (downstream_bc, upstream_bc, annual_spearman_difference) in enumerate(
                zip(downstream_bcs, upstream_bcs, annual_spearman_differences)):
                bc_spearman, bc_pvalue = scipy.stats.spearmanr(a=downstream_bc.values, b=upstream_bc.values)
                annual_spearman_difference.loc[WY][str(site_pairing)] = raw_spearman-bc_spearman
                annual_spearman_differences[i] = annual_spearman_difference

    n_rows, n_cols = determine_row_col(len(site_pairings))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, sharex = True, sharey = sharey)
    axs_list = axs.ravel().tolist()

    #rewrites wrt subplots for site_pairings

    fig.suptitle('Annual Change in Spearman Rank', fontsize=fontsize_title, y=1.05)

    max_vert = np.max([df.values for df in annual_spearman_differences])*1.1
    min_vert = np.min([df.values for df in annual_spearman_differences])
    min_vert  = np.min([min_vert*1.1, min_vert*0.9])

    for i, site_pairing in enumerate(site_pairings):
        ax = axs_list[i]

        ax.axhline(y=0, color='black', linestyle='--')
        ax.set_title(f"{site_pairing[0]}, {site_pairing[1]}", fontsize = fontsize_labels)
        box_dict = ax.boxplot([annual_spearman_difference[str(site_pairing)].values for
                               annual_spearman_difference in annual_spearman_differences],
                   patch_artist = True, showfliers = showfliers, widths = 0.8, notch = True)

        ax.set_ylim(top=max_vert, bottom=min_vert)
        ax.set_xticklabels(bc_names, fontsize = fontsize_tick)
        ax.tick_params(axis='both', labelsize=fontsize_tick)
        for item in ['boxes', 'fliers', 'medians', 'means']:
                for sub_item, color in zip(box_dict[item], plot_colors):
                    plt.setp(sub_item, color = color)

    fig.text(0.5, -0.04, "Bias Correction Scenario",
                 ha='center', va = 'bottom', fontsize=fontsize_labels);
    fig.text(-0.04, 0.5, r'$r_s(Q_{raw}^{up}, Q_{raw}^{down}) - r_s(Q_{bc}^{up}, Q_{bc}^{down})$',
             va='center', rotation = 'vertical', fontsize=fontsize_labels);

    plt.tight_layout()

    return fig, axs

[docs]def compare_CDF_all(flow_dataset:xr.Dataset, plot_sites: list,
                      raw_var: str, raw_name: str,
                      ref_var: str, ref_name: str,
                      bc_vars: list, bc_names: list,
                      plot_colors: list, logit_scale = True,
                      logarithm_base = '10', units = r'Q [$m^3/s$]',
                      figsize = (20,20),
                      fontsize_title = 40, fontsize_labels = 40, fontsize_tick = 40,
                      markersize = 1, alpha = 0.3):
    """Compare probability distribution functions as a summary statistic.

    Plots the CDF's of the raw, reference, and bias corrected flows with data
    from all sites in plot_sites combined for a summary statistic.

    Parameters
    ----------
    flow_dataset : xarray.Dataset
        Contains raw (uncorrected), reference (true), and bias corrected flows.
    plot_sites : list
        Gauge sites to be plotted.
    raw_var : str
        Accesses the raw (uncorrected) flows in `flow_dataset`.
    raw_name : str
       Label for the raw flows in the legend, corresponding to `raw_var`.
    ref_var : str
        Accesses the reference (true) flows in `flow_dataset`.
    ref_name : str
        Label for the reference flows in the legend, corresponding to `ref_var`.
    bc_vars : list
        Accesses the bias corrected flows in `flow_dataset`. Can be a size of 1.
    bc_names : list
        Label for the bias corrected flows in the legend, corresponding to `bc_var`.
        Can be a size of 1.
    plot_colors : list, optional
        Colors to be plotted for the flows corresponding to `raw_var`,
        `ref_var`, and `bc_var`, defaulting as ['grey', 'black', 'blue', 'red'],
        assuming there are two entries in `bc_var`.
    logit_scale : True, optional
        Whether to plot the vertical scale on a logit axis (True) or not (False).
        Defaults as True.
    logarithm_base : str, optional
        The logarthimic base to use for the horizontal scale. Only the following
        are currently supported:
            '10' to use a log10 horizontal scale (default)
            'e' to use a natural log horizontal scale
    units : str, optional
        The horizontal axis's label for units, defaults as r'Q [$m^3/s$]'.
    figsize : tuple, optional
        Figure size following matplotlib connventions, defaults as (20, 20).
    fontsize_title : int, optional
        Font size of the title, defaults as 40.
    fontsize_labels : int, optional
        Font size of the labels, defaults as 40.
    fontsize_tick : int, optional
        Font size of the ticks, defaults as 40.
    markersize : float, optional
        Size of the markers plotted, defaults as 1. Linewidth is half of this value.
    alpha : float, optional
        Transparancy of the lines and markers, defaults as 0.3.

    Returns
    -------
    matplotlib.figure, matplotlib.axes
    """

    if len(bc_vars) == 0:
        raise Exception("Please enter a non-zero number strings in bc_vars to be used")
    if len(bc_vars) != len(bc_names):
        raise Exception("Please have the same number of entries in bc_names as bc_names")
    if len(plot_colors) < len(bc_vars):
        raise Exception(f"Please enter at least {len(bc_vars)} colors in plot_colors")

    if logarithm_base == '10':
        log_func = log10_1p
    elif logarithm_base == 'e':
        log_func = np.log1p
    else:
        raise Exception("Please enter logarithm_base as '10' or 'e'")

    fig, ax = plt.subplots(figsize = figsize)

    fig.suptitle("Cumulative Distribution Function", y = 1.01, x = 0.4, fontsize = fontsize_title)

    cmp = flow_dataset.sel(seg = plot_sites)

    raw = ECDF(log_func(cmp[raw_var].values.flatten()))
    ref = ECDF(log_func(cmp[ref_var].values.flatten()))

    cors = list()
    for bc_var in bc_vars:
        cors.append(ECDF(log_func(cmp[bc_var].values.flatten())))

    linewidth = markersize / 2

    ax.plot(raw.x, raw.y, color = plot_colors[0], label = raw_name, lw = linewidth,
            linestyle = '--', marker = 'o', markersize = markersize, alpha = alpha)
    ax.plot(ref.x, ref.y, color = plot_colors[1], label = ref_name, lw = linewidth,
            linestyle = '--', marker = 'x', markersize = markersize, alpha = alpha)

    for j, cor in enumerate(cors):
        ax.plot(cor.x, cor.y, color = plot_colors[2+j], label = bc_names[j], lw = linewidth,
                linestyle = '--', marker = '*', markersize = markersize, alpha = alpha)

    ax.tick_params(axis = 'both', labelsize = fontsize_tick)

    # relabel axis to account for log_func
    xlabels = ax.get_xticks()
    if logarithm_base == '10':
        ax.set_xticklabels(labels = ["$" + "{:0.0f}".format(10**k-1) + "$" for k in xlabels])
    elif logarithm_base == 'e':
        ax.set_xticklabels(labels = ["$" + "{:0.2f}".format(exp(k)-1) + "$" for k in xlabels])

    if logit_scale:
        ax.set_yscale('logit')
        ax.minorticks_off()
        ax.yaxis.set_major_formatter(mpl.ticker.LogitFormatter())
        ylabels = ax.get_yticks()
        new_ylabels = list()
        for k, ylabel in enumerate(ylabels):
            if k % 3 == 0:
                new_ylabels.append(ylabel)

        ax.set_yticks(ticks = new_ylabels)

    plot_labels = [raw_name, ref_name]
    plot_labels.extend(bc_names)
    ax.legend(handles = custom_legend(plot_labels, plot_colors), fontsize = fontsize_tick)

    fig.text(0.4, 0.04, units, ha = 'center', fontsize = fontsize_labels)
    fig.text(-0.04, 0.5, r'Non-exceedence probability', va = 'center',
             rotation = 'vertical', fontsize = fontsize_labels)
    plt.subplots_adjust(hspace = 0.35, left = 0.05, right = 0.8, top = 0.95)

    return fig, axes

[docs]def compare_mean_grouped_CPD(flow_dataset: xr.Dataset, plot_sites: list, grouper_func,
                             raw_var: str, raw_name: str,
                             ref_var: str, ref_name: str,
                             bc_vars: list, bc_names: list,
                             plot_colors: list, subset_month = None,
                             units = r'Mean Annual Flow [$m^3/s$]',
                             figsize = (20,20), sharex = False, sharey = False,
                             pp_kws = dict(postype='cunnane'), fontsize_title = 80,
                             fontsize_legend = 68, fontsize_subplot = 60,
                             fontsize_tick = 45, fontsize_labels = 80,
                             linestyles = ['-','-','-'], markers = ['.','.','.'],
                             markersize = 30, alpha = 1, legend_bbox_to_anchor = (1, 1),
                             fig = None, axes = None, start_ax_index = 0, tot_plots = None
                      ):
    """
    Cumulative Probability Distributions
        plots the CPD's of the raw, reference, and bias corrected flows on a probability axis
    ----
    flow_dataset : xarray.Dataset
        Contatains raw, reference, and bias corrected flows.
    plot_sites : list
        A list of sites to be plotted.
    grouper_func
        Function to group a pandas.DataFrame index by to calculate the
        mean of the grouped values.
    raw_var : str
        The string to access the raw flows in flow_dataset.
    raw_name : str
        Label for the raw flows in the legend.
    ref_var : str
        The string to access the reference flows in flow_dataset.
    ref_name : str
        Label for the reference flows in the legend.
    bc_vars : list
        List of strings to access the bias corrected flows in flow_dataset.
    bc_names : list
        List of labels for the bias corrected flows in the legend.
    plot_colors : list
        Contains the colors to be plotted for `raw_var`, `ref_var`, and
        `bc_vars`, respectively.
    subset_month: int, optional
        The integer date of a month to subset out for plotting,
        (ex: if you want to subset out January, enter 1). Defaults as None
        to avoid subsetting and use all the data in the year.
    units : str, optional
        Vertical axis's label for units, defaults as r'Mean Annual Flow [$m^3/s$]'.
    pp_kws : dict, optional
        Plotting position computation as specified by
        https://matplotlib.org/mpl-probscale/tutorial/closer_look_at_plot_pos.html.
        Defaults as dict(postype='cunnane') for cunnane plotting positions.
    fontsize_title : int, optional
        Font size for the plot title, defaults as 80.
    fontsize_legend : int, optional
        Font size for the plot legend, defaults as 68.
    fontsize_subplot : int, optional
        Font size for the plot subplot text, default as 60.
    fontsize_tick : int, optional
        Font size for the plot ticks, defaults as 45.
    fontsize_labels : int, optional
        Font size for the horizontal and vertical axis labels, defaults as 80.
    linestyles : list, optional
        Linestyles for ploting `raw_var`, `ref_var`, and `bc_vars`, respectively.
        Defaults as ['-','-','-'], expecting one of each.
    markers : list, optional
        Markers for ploting `raw_var`, `ref_var`, and `bc_vars`, respectively.
        Defaults as ['.','.','.'], expecting one of each.
    markersize : int, optional
        Size of the markers for plotting, defaults as 30.
    alpha : float, optional
        Alpha transparency value for plotting, where 1 is opaque and 0 is transparent.
    legend_bbox_to_anchor : tuple, optional
        Box that is used to position the legend to the final axes. Defaults as (1,1).
        Modify this is the legend does not plot where you desire it to be.
    fig : matplotlib.figure, optional
        matplotlib figure object to plot on, defaulting as None and creating a new object
        unless otherwise specified.
    axes : matplotlib.axes, optional
        Array-like of matplotlib axes objet to plot multiple plots on, defaulting as None and creating
        a new object unless otherwise specified.
    start_ax_index : int, optional
        If the plots should not be plotted starting at the first ax in axes, specifiy the index
        that plotting should begin on. Defaults as None, assuming plotting should begin from
        the first ax.
    tot_plots : int, optional
        If more plotting is to be done than with the total data to be provided, describe how many
        total plots there should be. Defalts as None, assuming plotting should begin form the
        first ax.
    """

    if len(bc_vars) == 0:
        raise Exception("Please enter a non-zero number strings in bc_vars to be used")
    if len(bc_vars) != len(bc_names):
        raise Exception("Please have the same number of entries in bc_names as bc_names")
    if len(plot_colors) < 2 + len(bc_vars):
        raise Exception(f"Please enter at least {2+len(bc_vars)} colors in plot_colors")
    if len(linestyles) < 2 + len(bc_vars):
        raise Exception(f"Please enter at least {2+len(bc_vars)} styles in linestyles")
    if len(markers) < 2 + len(bc_vars):
        raise Exception(f"Please enter at least {2+len(bc_vars)} markers in markers")

    if not tot_plots:
        tot_plots = len(plot_sites)
    n_rows, n_cols = determine_row_col(tot_plots)
    if fig is None or axes is None:
        fig, axes = plt.subplots(n_rows, n_cols, figsize = figsize,
                                 sharex = sharex, sharey = sharey)
    axes = axes.flatten()

    time = flow_dataset['time'].values
    raw_flow_df = pd.DataFrame(data = flow_dataset[raw_var].sel(seg=plot_sites).values,
                               index=time, columns = plot_sites)
    ref_flow_df = pd.DataFrame(data = flow_dataset[ref_var].sel(seg=plot_sites).values,
                               index=time, columns = plot_sites)
    bc_flow_dfs = list()
    for bc_var in bc_vars:
        bc_flow_df = pd.DataFrame(data = flow_dataset[bc_var].sel(seg=plot_sites).values,
                                  index = time, columns = plot_sites)
        bc_flow_dfs.append(bc_flow_df)

    if not isinstance(subset_month, type(None)):
        raw_flow_df = raw_flow_df[raw_flow_df.index.month == subset_month]
        ref_flow_df = ref_flow_df[ref_flow_df.index.month == subset_month]
        for i, bc_flow_df in enumerate(bc_flow_dfs):
            bc_flow_dfs[i] = bc_flow_df[bc_flow_df.index.month == subset_month]

    WY_grouper = grouper_func(raw_flow_df)
    raw_flow_annual = raw_flow_df.groupby(WY_grouper).mean()
    ref_flow_annual = ref_flow_df.groupby(WY_grouper).mean()
    bc_flow_annuals = list()
    for bc_flow_df in bc_flow_dfs:
        bc_flow_annual = bc_flow_df.groupby(WY_grouper).mean()
        bc_flow_annuals.append(bc_flow_annual)

    for i, site in enumerate(plot_sites):
        ax = axes[i+start_ax_index]

        raw = raw_flow_annual[site].values
        ref = ref_flow_annual[site].values

        y_max_raw = scipy.stats.scoreatpercentile(raw, 99)
        y_max_ref = scipy.stats.scoreatpercentile(ref, 99)
        y_min_raw = scipy.stats.scoreatpercentile(raw, 1)
        y_min_ref = scipy.stats.scoreatpercentile(ref, 1)
        y_max = np.max([y_max_raw, y_max_ref])*1.1
        y_min = np.min([y_min_raw, y_min_ref])*0.9

        cors = list()
        for bc_flow_annual in bc_flow_annuals:
            bc = bc_flow_annual[site].values
            cors.append(bc)
            y_max_bc = scipy.stats.scoreatpercentile(bc, 99)
            y_min_bc = scipy.stats.scoreatpercentile(bc, 1)
            if y_max_bc > y_max:
                y_max = y_max_bc
            if y_min_bc < y_min:
                y_min = y_min_bc

        common_opts = dict(
            plottype='prob',
            probax='x'
        )

        probscale.probplot(raw, ax=ax, pp_kws=pp_kws,
                           scatter_kws=dict(linestyle=linestyles[0], marker=markers[0], markersize = markersize, alpha=alpha,
                                            color = plot_colors[0], label=raw_name), **common_opts)

        probscale.probplot(ref, ax=ax, pp_kws=pp_kws,
                           scatter_kws=dict(linestyle=linestyles[1], marker=markers[1], markersize = markersize*1.5, alpha=alpha,
                                            color = plot_colors[1], label=ref_name), **common_opts)
        for j, cor in enumerate(cors):
            probscale.probplot(cor, ax=ax, pp_kws=pp_kws,
                               scatter_kws=dict(linestyle=linestyles[2+j], marker=markers[2+j], markersize = markersize, alpha=alpha,
                                                color = plot_colors[2+j], label=bc_names[j]), **common_opts)

        ax.set_title(site, fontsize = fontsize_subplot)
        ax.tick_params(axis = 'both', labelsize = fontsize_tick)
        ax.set_ylim([y_min, y_max])
        ax.set_xlim(left=1, right=99)
        ax.set_xticks([1, 10, 20, 50, 80, 90, 95, 99])
        plt.setp(ax.get_xticklabels(), Rotation=45)

    axes[-1].axis('off')
    plot_names = [raw_name, ref_name]
    plot_names.extend(bc_names)
    axes[i].legend(handles=custom_legend(names = plot_names, colors=plot_colors),
                   bbox_to_anchor = legend_bbox_to_anchor, fontsize = fontsize_legend)

    fig.text(0.4, 0.04, 'Cumulative Percentile', ha = 'center', fontsize = fontsize_labels)
    fig.text(-0.01, 0.5, units, va = 'center',
             rotation = 'vertical', fontsize = fontsize_labels)

    plt.subplots_adjust(hspace = 0.35, left = 0.05, right = 0.8, top = 0.95)

    return fig, axes