Source code for score_analysis.utils

"""
This module contains various utility functions used elsewhere in the library.
"""

from typing import List, Optional, Union

import numpy as np
import scipy.stats


[docs]def binomial_ci(count: np.ndarray, nobs: np.ndarray, alpha: float = 0.05):
    """
    Confidence interval for binomial proportions using the normal approximation.

    Args:
        count: Number of successes of shape (X,).
        nobs: Number of trials of shape (X,).
        alpha: Significance level. In range (0, 1)

    Returns:
        np.ndarray of shape (X, 2). Lower and upper limits of confidence interval with
        coverage 1-alpha.
    """
    nans = np.full_like(count, np.nan, dtype=float)
    p = np.divide(count, nobs, out=nans, where=nobs != 0)

    nans = np.full_like(count, np.nan, dtype=float)
    std = np.divide(p * (1 - p), nobs, out=nans, where=nobs != 0)
    std = np.sqrt(std)

    dist = scipy.stats.norm.isf(alpha / 2.0) * std
    ci = np.stack([p - dist, p + dist], axis=-1)
    return ci


[docs]def bootstrap_ci(
    theta: np.ndarray,
    theta_hat: Optional[Union[float, np.ndarray]] = None,
    alpha: Optional[Union[float, np.ndarray]] = 0.05,
    *,
    method: str = "quantile",
) -> np.ndarray:
    """
    Calculates the bootstrap confidence interval with approximate coverage 1-alpha for
    the empirical sample theta. We assume that we have computed N bootstrap estimates,
    theta, of the quantity of interest, theta_hat.

    This function then constructs a confidence interval based on the bootstrap
    estimates, using the bias corrected and accelerated quantile method.

    Args:
        theta: Array of shape (N, Y), where N is the number of samples.
        theta_hat: Array of shape (Y,) with the empirical estimate of the metric. This
            is only needed for the methods "bc" and "bca".
        alpha: Significance level. In range (0, 1). Vector-valued alpha only supported
            with "quantile" method.
        method: Method to compute the CI from the bootstrap samples.
            Possible values are

            * "quantile" uses the alpha/2 and 1-alpha/2 quantiles of the
              empirical metric distribution.
            * "bc" applies bias correction to correct for the bias of the median
              of the empirical distribution
            * "bca" applies bias correction and acceleration to correct for
              non-constant standard error.

            See Ch. 11 of Computer Age Statistical Inference by Efron and Hastie
            for details.

    Returns:
        Returns an array of shape (Y, 2) with lower and upper bounds of the CI. For
        vector-valued alpha of shape (Z,) the return values has shape (Y, Z, 2).
    """
    alpha = np.asarray(alpha)  # (Z,)
    alpha_shape = alpha.shape
    alpha = np.reshape(alpha, -1)  # (Z',)

    alpha_lower = alpha / 2.0
    alpha_upper = 1 - alpha / 2.0

    if method == "quantile":
        alpha_joint = np.stack([alpha_lower, alpha_upper], axis=0)  # (2, Z')
        ci = np.nanquantile(theta, q=alpha_joint, axis=0)  # (2, Z', Y)
        ci = np.moveaxis(ci, source=[0, 1], destination=[-1, -2])  # (Y, Z', 2)
        ci = np.reshape(ci, theta.shape[1:] + alpha_shape + (2,))  # (Y, Z, 2)
    elif method in {"bc", "bca"}:
        if theta_hat is None:
            raise ValueError(f"Must provide theta_hat when using method {method}.")
        theta_hat = np.asarray(theta_hat)
        theta_hat = theta_hat[np.newaxis]  # (1, Y)

        # Flatten the metric shape to a vector
        nb_samples = theta.shape[0]
        metric_shape = theta.shape[1:]
        theta = np.reshape(theta, (nb_samples, -1))
        theta_hat = np.reshape(theta_hat, (1, -1))
        metric_size = theta.shape[-1]

        # Proportion of samples less than theta_hat
        nb_not_nan = np.sum(~np.isnan(theta), axis=0)
        p0 = np.sum(theta <= theta_hat, axis=0) / nb_not_nan  # (Y,)
        # Inverse function of standard normal cdf. If p0=0.5, then z0=0 and this
        # method reduces to the quantile method.
        z0 = scipy.stats.norm.ppf(p0)

        z_alpha_lower = scipy.stats.norm.ppf(alpha_lower)
        z_alpha_upper = scipy.stats.norm.ppf(alpha_upper)

        if method == "bc":
            # See (11.33) in Efron, Hastie
            z_lower = 2 * z0 + z_alpha_lower
            z_upper = 2 * z0 + z_alpha_upper
        else:  # bootstrap_method == "bca"
            # Estimate acceleration from data. See (11.40) in Efron, Hastie.
            a_num = np.nansum((theta - theta_hat) ** 3, axis=0)
            a_den = 6 * np.nansum((theta - theta_hat) ** 2, axis=0) ** 1.5
            # If a=0, the method reduces to the non-accelerated bias correction
            a = np.divide(a_num, a_den, out=np.zeros_like(a_num), where=a_den != 0)
            # See (11.39) in Efron, Hastie
            fin = np.isfinite(z0)
            z_lower = np.copy(z0)
            s_lower = z0[fin] + z_alpha_lower
            z_lower[fin] = z0[fin] + s_lower / (1 - a[fin] * s_lower)
            z_upper = np.copy(z0)
            s_upper = z0[fin] + z_alpha_upper
            z_upper[fin] = z0[fin] + s_upper / (1 - a[fin] * s_upper)

        alpha_hat_lower = scipy.stats.norm.cdf(z_lower)
        alpha_hat_upper = scipy.stats.norm.cdf(z_upper)

        ci = np.empty((metric_size, 2))
        for j in range(metric_size):
            ci[j] = np.nanquantile(
                theta[:, j], q=[alpha_hat_lower[j], alpha_hat_upper[j]], axis=0
            )
        ci = np.reshape(ci, (*metric_shape, 2))
    else:
        raise ValueError(f"Unknown value for bootstrap_method: {method}")

    return ci


[docs]def invert_pl_function(x: np.ndarray, y: np.ndarray, t: np.ndarray) -> List[np.ndarray]:
    r"""
    Inverts piecewise linear function.

    The piecewise linear function  :math:`f(x_i) = y_i` is defined the by the sequence
    of points :math:`(x_i, y_i)`. We assume that :math:`x` is an increasing
    vector, i.e., :math:`x_i \leq x_{i+1}`; the vector :math:`x` does not have to be
    strictly increasing, but if :math:`x_i = x_{i+1}`, then we assume that also
    :math:`y_i = y_{i+1}`.

    For a given target value :math:`t_j`, the function finds all values
    :math:`s_{j, k}`, such that :math:`f(s_{j, k}) = t_j`. Because the number
    of solutions can vary for different :math:`t_j`, the return type for :math:`s`
    is a list of the same length as :math:`t` with each entry an array.

    If the equation :math:`f(s) = t_j` has no solution, we return the closest point
    :math:`s_j`, i.e.,

    .. math::

        \|f(s_j) - t_j\| = \min_z \|f(z) - t_j\|\,.
    We always return only one solution in this case.

    Args:
         x: Increasing vector of points defining the function.
         y: Vector of same length as x defining the function values.
         t: Vector of points at which to invert the function.

    Returns:
        A list :math:`s` of the same length as :math:`t` of arrays such that for all
        :math:`j`, the array :math:`s_j` is strictly increasing and contains all
        solutions of the equation :math:`f(s) = t_j`.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    t = np.asarray(t)
    t_scalar = t.ndim == 0  # Is input a scalar?

    x = x[:, np.newaxis]  # (N, 1)
    y = y[:, np.newaxis]  # (N, 1)
    # We turn scalar t into a 1-dim array + extra dimension for broadcasting
    t = t[np.newaxis, np.newaxis] if t_scalar else t[np.newaxis, :]  # (1, T)

    # Find out where we cross the threshold
    crossing_up = (y[:-1] <= t) & (y[1:] > t)
    crossing_down = (y[:-1] >= t) & (y[1:] < t)
    crossing = crossing_up | crossing_down
    # Get indices of crossings; Note that np.nonzero tests the array in row-major
    # order, meaning that after transposing for each t index, the s indices will
    # be already in increasing order.
    t_indices, s_indices = np.nonzero(crossing.T)

    # Find closest points to given t values in case we don't find exact solutions.
    min_ind = np.argmin(np.abs(y - t), axis=0)  # (T,)
    s_min = x[min_ind]

    t = t[0]  # We are done with broadcasting
    x = x[:, 0]
    y = y[:, 0]

    s = [[] for _ in t]
    # Find all solutions given the crossing indices
    for t_ind, j in zip(t_indices, s_indices):
        # Apply linear interpolation between x[j] and x[j+1]
        la = (t[t_ind] - y[j]) / (y[j + 1] - y[j])
        z = (1 - la) * x[j] + la * x[j + 1]
        s[t_ind].append(z)

    # Deal with cases where we don't have solutions
    for j in range(len(s)):
        if len(s[j]) == 0:
            s[j].append(s_min[j])

    # Convert to list of arrays
    s = [np.asarray(z) for z in s]

    if t_scalar:  # Reduce back to scalar
        s = s[0]

    return s