Source code for score_analysis.utils

"""
This module contains various utility functions used elsewhere in the library.
"""

from typing import List, Optional, Union

import numpy as np
import scipy.stats


[docs]def binomial_ci(count: np.ndarray, nobs: np.ndarray, alpha: float = 0.05): """ Confidence interval for binomial proportions using the normal approximation. Args: count: Number of successes of shape (X,). nobs: Number of trials of shape (X,). alpha: Significance level. In range (0, 1) Returns: np.ndarray of shape (X, 2). Lower and upper limits of confidence interval with coverage 1-alpha. """ nans = np.full_like(count, np.nan, dtype=float) p = np.divide(count, nobs, out=nans, where=nobs != 0) nans = np.full_like(count, np.nan, dtype=float) std = np.divide(p * (1 - p), nobs, out=nans, where=nobs != 0) std = np.sqrt(std) dist = scipy.stats.norm.isf(alpha / 2.0) * std ci = np.stack([p - dist, p + dist], axis=-1) return ci
[docs]def bootstrap_ci( theta: np.ndarray, theta_hat: Optional[Union[float, np.ndarray]] = None, alpha: Optional[Union[float, np.ndarray]] = 0.05, *, method: str = "quantile", ) -> np.ndarray: """ Calculates the bootstrap confidence interval with approximate coverage 1-alpha for the empirical sample theta. We assume that we have computed N bootstrap estimates, theta, of the quantity of interest, theta_hat. This function then constructs a confidence interval based on the bootstrap estimates, using the bias corrected and accelerated quantile method. Args: theta: Array of shape (N, Y), where N is the number of samples. theta_hat: Array of shape (Y,) with the empirical estimate of the metric. This is only needed for the methods "bc" and "bca". alpha: Significance level. In range (0, 1). Vector-valued alpha only supported with "quantile" method. method: Method to compute the CI from the bootstrap samples. Possible values are * "quantile" uses the alpha/2 and 1-alpha/2 quantiles of the empirical metric distribution. * "bc" applies bias correction to correct for the bias of the median of the empirical distribution * "bca" applies bias correction and acceleration to correct for non-constant standard error. See Ch. 11 of Computer Age Statistical Inference by Efron and Hastie for details. Returns: Returns an array of shape (Y, 2) with lower and upper bounds of the CI. For vector-valued alpha of shape (Z,) the return values has shape (Y, Z, 2). """ alpha = np.asarray(alpha) # (Z,) alpha_shape = alpha.shape alpha = np.reshape(alpha, -1) # (Z',) alpha_lower = alpha / 2.0 alpha_upper = 1 - alpha / 2.0 if method == "quantile": alpha_joint = np.stack([alpha_lower, alpha_upper], axis=0) # (2, Z') ci = np.nanquantile(theta, q=alpha_joint, axis=0) # (2, Z', Y) ci = np.moveaxis(ci, source=[0, 1], destination=[-1, -2]) # (Y, Z', 2) ci = np.reshape(ci, theta.shape[1:] + alpha_shape + (2,)) # (Y, Z, 2) elif method in {"bc", "bca"}: if theta_hat is None: raise ValueError(f"Must provide theta_hat when using method {method}.") theta_hat = np.asarray(theta_hat) theta_hat = theta_hat[np.newaxis] # (1, Y) # Flatten the metric shape to a vector nb_samples = theta.shape[0] metric_shape = theta.shape[1:] theta = np.reshape(theta, (nb_samples, -1)) theta_hat = np.reshape(theta_hat, (1, -1)) metric_size = theta.shape[-1] # Proportion of samples less than theta_hat nb_not_nan = np.sum(~np.isnan(theta), axis=0) p0 = np.sum(theta <= theta_hat, axis=0) / nb_not_nan # (Y,) # Inverse function of standard normal cdf. If p0=0.5, then z0=0 and this # method reduces to the quantile method. z0 = scipy.stats.norm.ppf(p0) z_alpha_lower = scipy.stats.norm.ppf(alpha_lower) z_alpha_upper = scipy.stats.norm.ppf(alpha_upper) if method == "bc": # See (11.33) in Efron, Hastie z_lower = 2 * z0 + z_alpha_lower z_upper = 2 * z0 + z_alpha_upper else: # bootstrap_method == "bca" # Estimate acceleration from data. See (11.40) in Efron, Hastie. a_num = np.nansum((theta - theta_hat) ** 3, axis=0) a_den = 6 * np.nansum((theta - theta_hat) ** 2, axis=0) ** 1.5 # If a=0, the method reduces to the non-accelerated bias correction a = np.divide(a_num, a_den, out=np.zeros_like(a_num), where=a_den != 0) # See (11.39) in Efron, Hastie fin = np.isfinite(z0) z_lower = np.copy(z0) s_lower = z0[fin] + z_alpha_lower z_lower[fin] = z0[fin] + s_lower / (1 - a[fin] * s_lower) z_upper = np.copy(z0) s_upper = z0[fin] + z_alpha_upper z_upper[fin] = z0[fin] + s_upper / (1 - a[fin] * s_upper) alpha_hat_lower = scipy.stats.norm.cdf(z_lower) alpha_hat_upper = scipy.stats.norm.cdf(z_upper) ci = np.empty((metric_size, 2)) for j in range(metric_size): ci[j] = np.nanquantile( theta[:, j], q=[alpha_hat_lower[j], alpha_hat_upper[j]], axis=0 ) ci = np.reshape(ci, (*metric_shape, 2)) else: raise ValueError(f"Unknown value for bootstrap_method: {method}") return ci
[docs]def invert_pl_function(x: np.ndarray, y: np.ndarray, t: np.ndarray) -> List[np.ndarray]: r""" Inverts piecewise linear function. The piecewise linear function :math:`f(x_i) = y_i` is defined the by the sequence of points :math:`(x_i, y_i)`. We assume that :math:`x` is an increasing vector, i.e., :math:`x_i \leq x_{i+1}`; the vector :math:`x` does not have to be strictly increasing, but if :math:`x_i = x_{i+1}`, then we assume that also :math:`y_i = y_{i+1}`. For a given target value :math:`t_j`, the function finds all values :math:`s_{j, k}`, such that :math:`f(s_{j, k}) = t_j`. Because the number of solutions can vary for different :math:`t_j`, the return type for :math:`s` is a list of the same length as :math:`t` with each entry an array. If the equation :math:`f(s) = t_j` has no solution, we return the closest point :math:`s_j`, i.e., .. math:: \|f(s_j) - t_j\| = \min_z \|f(z) - t_j\|\,. We always return only one solution in this case. Args: x: Increasing vector of points defining the function. y: Vector of same length as x defining the function values. t: Vector of points at which to invert the function. Returns: A list :math:`s` of the same length as :math:`t` of arrays such that for all :math:`j`, the array :math:`s_j` is strictly increasing and contains all solutions of the equation :math:`f(s) = t_j`. """ x = np.asarray(x) y = np.asarray(y) t = np.asarray(t) t_scalar = t.ndim == 0 # Is input a scalar? x = x[:, np.newaxis] # (N, 1) y = y[:, np.newaxis] # (N, 1) # We turn scalar t into a 1-dim array + extra dimension for broadcasting t = t[np.newaxis, np.newaxis] if t_scalar else t[np.newaxis, :] # (1, T) # Find out where we cross the threshold crossing_up = (y[:-1] <= t) & (y[1:] > t) crossing_down = (y[:-1] >= t) & (y[1:] < t) crossing = crossing_up | crossing_down # Get indices of crossings; Note that np.nonzero tests the array in row-major # order, meaning that after transposing for each t index, the s indices will # be already in increasing order. t_indices, s_indices = np.nonzero(crossing.T) # Find closest points to given t values in case we don't find exact solutions. min_ind = np.argmin(np.abs(y - t), axis=0) # (T,) s_min = x[min_ind] t = t[0] # We are done with broadcasting x = x[:, 0] y = y[:, 0] s = [[] for _ in t] # Find all solutions given the crossing indices for t_ind, j in zip(t_indices, s_indices): # Apply linear interpolation between x[j] and x[j+1] la = (t[t_ind] - y[j]) / (y[j + 1] - y[j]) z = (1 - la) * x[j] + la * x[j + 1] s[t_ind].append(z) # Deal with cases where we don't have solutions for j in range(len(s)): if len(s[j]) == 0: s[j].append(s_min[j]) # Convert to list of arrays s = [np.asarray(z) for z in s] if t_scalar: # Reduce back to scalar s = s[0] return s