Source code for statistics

"""
Collection of useful statistic related functionalities.
"""
import numpy as np


[docs]def mid(x): """ Midpoints of a given array :param x: array with dimension bigger 1 :return: all the midpoints as numpy array (shape: x.size -1) """ return (x[:-1] + x[1:]) / 2.
[docs]def mean_and_variance(y, weights=None): """ Weighted mean and variance of array y and weights :param y: array for which to calculate mean and variance :param weights: optional weights for weighted mean and variance :return: mean, weights (both like y dimensions) """ weights = weights if weights is not None else np.ones(y.shape) assert y.shape == weights.shape w_sum = sum(weights) if w_sum.sum() == 0: return np.nan, np.nan m = np.dot(y, weights) / w_sum v = np.dot((y - m)**2, weights) / w_sum return m, v
[docs]def quantile_1d(data, weights, quant): """ Compute the weighted quantile of a 1D numpy array. from https://github.com/nudomarinero/wquantiles/blob/master/weighted.py :param data: 1d-array for which to calculate mean and variance :param weights : 1d-array with weights for data (same shape of data) :param quant: quantile to compute, it must have a value between 0 and 1. :return: quantiles """ # Check the data if not isinstance(data, np.matrix): data = np.asarray(data) if not isinstance(weights, np.matrix): weights = np.asarray(weights) nd = data.ndim if nd != 1: raise TypeError("data must be a one dimensional array") ndw = weights.ndim if ndw != 1: raise TypeError("weights must be a one dimensional array") if data.shape != weights.shape: raise TypeError("the length of data and weights must be the same") if (quant > 1.) or (quant < 0.): raise ValueError("quantile must have a value between 0. and 1.") # Sort the data ind_sorted = np.argsort(data) sorted_data = data[ind_sorted] sorted_weights = weights[ind_sorted] # Compute the auxiliary arrays sn = np.cumsum(sorted_weights) assert np.sum(sn) > 0, "The sum of the weights must not be zero!" pn = (sn - 0.5 * sorted_weights) / np.sum(sorted_weights) # Get the value of the weighted median # noinspection PyTypeChecker return np.interp(quant, pn, sorted_data)
[docs]def quantile(data, weights, quant): # pylint: disable=R1710 """ Weighted quantile of an array with respect to the last axis. from https://github.com/nudomarinero/wquantiles/blob/master/weighted.py :param data: ndarray for which to calculate weighted quantile :param weights: ndarray with weights for data, it must have the same size of the last axis of data. :param quant: quantile to compute, it must have a value between 0 and 1. :return: weighted quantiles with respect to last axis """ # TODO: Allow to specify the axis nd = data.ndim assert nd > 0, "Data must have at least one dimension!" if nd == 1: return quantile_1d(data, weights, quant) n = data.shape assert n[-1] == weights.size, "Weights must have same size than last axis of data!" imr = data.reshape((-1, n[-1])) result = np.apply_along_axis(quantile_1d, -1, imr, weights, quant) return result.reshape(n[:-1])
[docs]def median(data, weights): """ Weighted median of an array with respect to the last axis. Alias for quantile(data, weights, 0.5). from https://github.com/nudomarinero/wquantiles/blob/master/weighted.py :param data: ndarray for which to calculate the weighted median :param weights: ndarray with weights for data, it must have the same size of the last axis of data. """ return quantile(data, weights, 0.5)
[docs]def binned_mean_and_variance(x, y, bins, weights=None): """ Calculates the mean and variance of y in the bins of x. This is effectively a ROOT.TProfile. :param x: data values that are used for binning (e.g. energies) :param y: data values that should be avaraged :param bins: bin borders return: <y>_i, sigma(y)_i : mean and variance of y in bins of x """ dig = np.digitize(x, bins) n = len(bins) - 1 my, vy = np.zeros(n), np.zeros(n) for i in range(n): idx = (dig == i+1) if not idx.any(): # check for empty bin my[i] = np.nan vy[i] = np.nan continue if weights is None: my[i] = np.mean(y[idx]) vy[i] = np.std(y[idx])**2 else: my[i], vy[i] = mean_and_variance(y[idx], weights[idx]) return my, vy
[docs]def sym_interval_around(x, xm, alpha=0.32): """ In a distribution represented by a set of samples, find the interval that contains (1-alpha)/2 to each the left and right of xm. If xm is too marginal to allow both sides to contain (1-alpha)/2, add the remaining fraction to the other side. :param x: data values in the distribution :param xm: symmetric center value for which to find the interval :param alpha: fraction that will be outside of the interval (default 0.32, corresponds to 68 percent quantile) :return: interval (lower, upper) which contains 1-alpha symmetric around xm """ assert (alpha > 0) & (alpha < 1) xt = x.copy() xt.sort() i = xt.searchsorted(xm) # index of central value n = len(x) # number of samples ns = int((1 - alpha) * n) # number of samples corresponding to 1-alpha i0 = i - ns/2 # index of lower and upper bound of interval i1 = i + ns/2 # if central value does not allow for (1-alpha)/2 on left side, add to right if i0 < 0: i0 = 0 i1 -= i0 # if central value does not allow for (1-alpha)/2 on right side, add to left if i1 >= n: i1 = n-1 i0 -= i1-n+1 return xt[int(i0)], xt[int(i1)]
[docs]def random_choice_multi(a, k, p): """ Pull multiple sets of k random samples out of a given array or list using individual probabilities for each set. :param a: data values to sample from, 1d array :param k: single value :param p: probability vectors, array of shape (N, len(a)), with N being the number of sets :return: array of shape (N, k) """ if not isinstance(a, np.ndarray): if isinstance(a, int): a = np.arange(a) else: a = np.array(a) assert p.shape[-1] == len(a), "Probabilites vectors do not match number of data samples." assert np.all((np.sum(p, axis=-1) - 1) < 1e-8), "Probabilities do not sum up to one" s = p.cumsum(axis=1) r = np.random.rand(p.shape[0], k, 1) q = np.expand_dims(s, 1) >= r n = q.argmax(axis=-1) n = np.asarray(a)[n] return n