2061 lines
		
	
	
		
			79 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			2061 lines
		
	
	
		
			79 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from collections import namedtuple
 | |
| from dataclasses import dataclass
 | |
| from math import comb
 | |
| import numpy as np
 | |
| import warnings
 | |
| from itertools import combinations
 | |
| import scipy.stats
 | |
| from scipy.optimize import shgo
 | |
| from . import distributions
 | |
| from ._common import ConfidenceInterval
 | |
| from ._continuous_distns import norm
 | |
| from scipy.special import gamma, kv, gammaln
 | |
| from scipy.fft import ifft
 | |
| from ._stats_pythran import _a_ij_Aij_Dij2
 | |
| from ._stats_pythran import (
 | |
|     _concordant_pairs as _P, _discordant_pairs as _Q
 | |
| )
 | |
| from ._axis_nan_policy import _axis_nan_policy_factory
 | |
| from scipy.stats import _stats_py
 | |
| 
 | |
# Public API of this module: only the names listed here are exported by a
# star-import of the module.
__all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd',
           'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp',
           'tukey_hsd', 'poisson_means_test']
 | |
| 
 | |
# Lightweight (statistic, pvalue) result tuple returned by
# `epps_singleton_2samp`.
Epps_Singleton_2sampResult = namedtuple(
    'Epps_Singleton_2sampResult',
    ['statistic', 'pvalue'],
)
 | |
| 
 | |
| 
 | |
@_axis_nan_policy_factory(Epps_Singleton_2sampResult, n_samples=2, too_small=4)
def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have more
        than one dimension. Samples can have different lengths, but both
        must have at least five observations.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function is
        to be evaluated. It should be positive distinct numbers. The default
        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
        one dimension; a single scalar is accepted and treated as one point.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    Raises
    ------
    ValueError
        If `x` or `y` has fewer than five observations or contains nonfinite
        values, if `t` has more than one dimension, or if `t` contains
        nonpositive elements.

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Testing whether two samples are generated by the same underlying
    distribution is a classical question in statistics. A widely used test is
    the Kolmogorov-Smirnov (KS) test which relies on the empirical
    distribution function. Epps and Singleton introduce a test based on the
    empirical characteristic function in [1]_.

    One advantage of the ES test compared to the KS test is that it does
    not assume a continuous distribution. In [1]_, the authors conclude
    that the test also has a higher power than the KS test in many
    examples. They recommend the use of the ES test for discrete samples as
    well as continuous samples with at least 25 observations each, whereas
    `anderson_ksamp` is recommended for smaller sample sizes in the
    continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic which follows a `chi2` distribution. If the sample size of both
    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
    applied to the test statistic.

    The default values of `t` are determined in [1]_ by considering
    various distributions and finding good values that lead to a high power
    of the test in general. Table III in [1]_ gives the optimal values for
    the distributions tested in that study. The values of `t` are scaled by
    the semi-interquartile range in the implementation, see [1]_.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.

    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.

    """
    # x and y are converted to arrays by the decorator
    t = np.asarray(t)
    # check if x and y are valid inputs
    nx, ny = len(x), len(y)
    if (nx < 5) or (ny < 5):
        raise ValueError('x and y should have at least 5 elements, but len(x) '
                         f'= {nx} and len(y) = {ny}.')
    if not np.isfinite(x).all():
        raise ValueError('x must not contain nonfinite values.')
    if not np.isfinite(y).all():
        raise ValueError('y must not contain nonfinite values.')
    n = nx + ny

    # check if t is valid
    if t.ndim > 1:
        raise ValueError(f't must be 1d, but t.ndim equals {t.ndim}.')
    if np.less_equal(t, 0).any():
        raise ValueError('t must contain positive elements only.')
    # Use `size` rather than `len`: a scalar (0-d) `t` passes the checks
    # above, but `len` is undefined for 0-d arrays.
    n_t = t.size

    # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
    # circular import
    from scipy.stats import iqr
    sigma = iqr(np.hstack((x, y))) / 2
    ts = np.reshape(t, (-1, 1)) / sigma

    # covariance estimation of ES test
    gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T  # shape = (nx, 2*n_t)
    gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
    cov_x = np.cov(gx.T, bias=True)  # the test uses biased cov-estimate
    cov_y = np.cov(gy.T, bias=True)
    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
    est_cov_inv = np.linalg.pinv(est_cov)
    r = np.linalg.matrix_rank(est_cov_inv)
    if r < 2*n_t:
        warnings.warn('Estimated covariance matrix does not have full rank. '
                      'This indicates a bad choice of the input t and the '
                      'test might not be consistent.', # see p. 183 in [1]_
                      stacklevel=2)

    # compute test statistic w distributed asympt. as chisquare with df=r
    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
    w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))

    # apply small-sample correction
    if (max(nx, ny) < 25):
        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
        w = corr * w

    chi2 = _stats_py._SimpleChi2(r)
    p = _stats_py._get_pvalue(w, chi2, alternative='greater',
                              symmetric=False, xp=np)

    return Epps_Singleton_2sampResult(w, p)
 | |
| 
 | |
| 
 | |
def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'):
    r"""
    Performs the Poisson means test, AKA the "E-test".

    This is a test of the null hypothesis that the difference between means of
    two Poisson distributions is `diff`. The samples are provided as the
    number of events `k1` and `k2` observed within measurement intervals
    (e.g. of time, space, number of observations) of sizes `n1` and `n2`.

    Parameters
    ----------
    k1 : int
        Number of events observed from distribution 1.
    n1 : float
        Size of sample from distribution 1.
    k2 : int
        Number of events observed from distribution 2.
    n2 : float
        Size of sample from distribution 2.
    diff : float, default=0
        The hypothesized difference in means between the distributions
        underlying the samples.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. The value is matched
        case-insensitively.
        The following options are available (default is 'two-sided'):

          * 'two-sided': the difference between distribution means is not
            equal to `diff`
          * 'less': the difference between distribution means is less than
            `diff`
          * 'greater': the difference between distribution means is greater
            than `diff`

    Returns
    -------
    statistic : float
        The test statistic (see [1]_ equation 3.3).
    pvalue : float
        The probability of achieving such an extreme value of the test
        statistic under the null hypothesis.

    Raises
    ------
    TypeError
        If `k1` or `k2` is not integer-valued.
    ValueError
        If `k1` or `k2` is negative, `n1` or `n2` is not positive, `diff` is
        negative, or `alternative` is not one of the supported options.

    Notes
    -----

    Let:

    .. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1)

    be a random variable independent of

    .. math:: X_2  \sim \mbox{Poisson}(\mathtt{n2}\lambda_2)

    and let ``k1`` and ``k2`` be the observed values of :math:`X_1`
    and :math:`X_2`, respectively. Then `poisson_means_test` uses the number
    of observed events ``k1`` and ``k2`` from samples of size ``n1`` and
    ``n2``, respectively, to test the null hypothesis that

    .. math::
       H_0: \lambda_1 - \lambda_2 = \mathtt{diff}

    A benefit of the E-test is that it has good power for small sample sizes,
    which can reduce sampling costs [1]_. It has been evaluated and determined
    to be more powerful than the comparable C-test, sometimes referred to as
    the Poisson exact test.

    References
    ----------
    .. [1]  Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for
       comparing two Poisson means. Journal of Statistical Planning and
       Inference, 119(1), 23-35.

    .. [2]  Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in
       testing samples from Poisson series: With an application to testing
       clover seed for dodder. Biometrika, 31(3/4), 313-323.

    Examples
    --------

    Suppose that a gardener wishes to test the number of dodder (weed) seeds
    in a sack of clover seeds that they buy from a seed company. It has
    previously been established that the number of dodder seeds in clover
    follows the Poisson distribution.

    A 100 gram sample is drawn from the sack before being shipped to the
    gardener. The sample is analyzed, and it is found to contain no dodder
    seeds; that is, `k1` is 0. However, upon arrival, the gardener draws
    another 100 gram sample from the sack. This time, three dodder seeds are
    found in the sample; that is, `k2` is 3. The gardener would like to
    know if the difference is significant and not due to chance. The
    null hypothesis is that the difference between the two samples is merely
    due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}`
    where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the
    difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`.
    The gardener selects a significance level of 5% to reject the null
    hypothesis in favor of the alternative [2]_.

    >>> import scipy.stats as stats
    >>> res = stats.poisson_means_test(0, 100, 3, 100)
    >>> res.statistic, res.pvalue
    (-1.7320508075688772, 0.08837900929018157)

    The p-value is .088, indicating a near 9% chance of observing a value of
    the test statistic at least this extreme under the null hypothesis. This
    exceeds 5%, so the gardener does not reject the null hypothesis as the
    difference cannot be regarded as significant at this level.
    """

    _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative)
    # Validation accepts any capitalization, so normalize before branching;
    # otherwise e.g. 'LESS' would silently fall into the 'greater' branch.
    alternative = alternative.lower()

    # "for a given k_1 and k_2, an estimate of \lambda_2 is given by" [1] (3.4)
    lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2))

    # "\hat{\lambda_{2k}} may be less than or equal to zero ... and in this
    # case the null hypothesis cannot be rejected ... [and] it is not necessary
    # to compute the p-value". [1] page 26 below eq. (3.6).
    if lmbd_hat2 <= 0:
        return _stats_py.SignificanceResult(0, 1)

    # The unbiased variance estimate [1] (3.2)
    var = k1 / (n1 ** 2) + k2 / (n2 ** 2)

    # The _observed_ pivot statistic from the input. It follows the
    # unnumbered equation following equation (3.3). This is used later in
    # comparison with the computed pivot statistics in an indicator function.
    t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var)

    # Equation (3.5) of [1] is lengthy, so it is broken into several parts,
    # beginning here. Note that the probability mass function of poisson is
    # exp^(-\mu)*\mu^k/k!, and it is called with shape \mu, noted here as
    # nlmbd_hat*. The strategy for evaluating the double summation in
    # (3.5) is to create two arrays of the values of the two products inside
    # the summation and then broadcast them together into a matrix, and then
    # sum across the entire matrix.

    # Compute constants (as seen in the first and second separated products in
    # (3.5).). (This is the shape (\mu) parameter of the poisson distribution.)
    nlmbd_hat1 = n1 * (lmbd_hat2 + diff)
    nlmbd_hat2 = n2 * lmbd_hat2

    # Determine summation bounds for tail ends of distribution rather than
    # summing to infinity. `x1*` is for the outer sum and `x2*` is the inner
    # sum.
    x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1)
    x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2)

    # Construct arrays to function as the x_1 and x_2 counters on the summation
    # in (3.5). `x1` is in columns and `x2` is in rows to allow for
    # broadcasting.
    x1 = np.arange(x1_lb, x1_ub + 1)
    x2 = np.arange(x2_lb, x2_ub + 1)[:, None]

    # These are the two products in equation (3.5) with `prob_x1` being the
    # first (left side) and `prob_x2` being the second (right side). (To
    # make as clear as possible: the 1st contains a "+ d" term, the 2nd does
    # not.)
    prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1)
    prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2)

    # compute constants for use in the "pivot statistic" per the
    # unnumbered equation following (3.3).
    lmbd_x1 = x1 / n1
    lmbd_x2 = x2 / n2
    lmbds_diff = lmbd_x1 - lmbd_x2 - diff
    var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2

    # This is the 'pivot statistic' for use in the indicator of the summation
    # (left side of "I[.]"). The (0, 0) cell divides by zero; the resulting
    # NaN compares False below and so is excluded from the sum.
    with np.errstate(invalid='ignore', divide='ignore'):
        t_x1x2 = lmbds_diff / np.sqrt(var_x1x2)

    # `[indicator]` implements the "I[.] ... the indicator function" per
    # the paragraph following equation (3.5).
    if alternative == 'two-sided':
        indicator = np.abs(t_x1x2) >= np.abs(t_k1k2)
    elif alternative == 'less':
        indicator = t_x1x2 <= t_k1k2
    else:
        indicator = t_x1x2 >= t_k1k2

    # Multiply all combinations of the products together, exclude terms
    # based on the `indicator` and then sum. (3.5)
    pvalue = np.sum((prob_x1 * prob_x2)[indicator])
    return _stats_py.SignificanceResult(t_k1k2, pvalue)


def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative):
    """Check for valid types and values of input to `poisson_means_test`.

    Raises
    ------
    TypeError
        If `k1` or `k2` is not integer-valued.
    ValueError
        If `k1` or `k2` is negative, `n1` or `n2` is not positive, `diff` is
        negative, or `alternative` (compared case-insensitively) is not one
        of 'two-sided', 'less', 'greater'.
    """
    if k1 != int(k1) or k2 != int(k2):
        raise TypeError('`k1` and `k2` must be integers.')

    count_err = '`k1` and `k2` must be greater than or equal to 0.'
    if k1 < 0 or k2 < 0:
        raise ValueError(count_err)

    if n1 <= 0 or n2 <= 0:
        raise ValueError('`n1` and `n2` must be greater than 0.')

    if diff < 0:
        raise ValueError('diff must be greater than or equal to 0.')

    alternatives = {'two-sided', 'less', 'greater'}
    if alternative.lower() not in alternatives:
        raise ValueError(f"Alternative must be one of '{alternatives}'.")
 | |
| 
 | |
| 
 | |
class CramerVonMisesResult:
    """Hold the statistic and p-value of a Cramér-von Mises test.

    Attributes
    ----------
    statistic : float
        The test statistic.
    pvalue : float
        The p-value.
    """

    def __init__(self, statistic, pvalue):
        self.statistic, self.pvalue = statistic, pvalue

    def __repr__(self):
        cls_name = type(self).__name__
        return "{}(statistic={}, pvalue={})".format(
            cls_name, self.statistic, self.pvalue)
 | |
| 
 | |
| 
 | |
| def _psi1_mod(x):
 | |
|     """
 | |
|     psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996).
 | |
|     This implements a modified version by excluding the term V(x) / 12
 | |
|     (here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x)
 | |
|     twice in _cdf_cvm.
 | |
| 
 | |
|     Implementation based on MAPLE code of Julian Faraway and R code of the
 | |
|     function pCvM in the package goftest (v1.1.1), permission granted
 | |
|     by Adrian Baddeley. Main difference in the implementation: the code
 | |
|     here keeps adding terms of the series until the terms are small enough.
 | |
|     """
 | |
| 
 | |
|     def _ed2(y):
 | |
|         z = y**2 / 4
 | |
|         b = kv(1/4, z) + kv(3/4, z)
 | |
|         return np.exp(-z) * (y/2)**(3/2) * b / np.sqrt(np.pi)
 | |
| 
 | |
|     def _ed3(y):
 | |
|         z = y**2 / 4
 | |
|         c = np.exp(-z) / np.sqrt(np.pi)
 | |
|         return c * (y/2)**(5/2) * (2*kv(1/4, z) + 3*kv(3/4, z) - kv(5/4, z))
 | |
| 
 | |
|     def _Ak(k, x):
 | |
|         m = 2*k + 1
 | |
|         sx = 2 * np.sqrt(x)
 | |
|         y1 = x**(3/4)
 | |
|         y2 = x**(5/4)
 | |
| 
 | |
|         e1 = m * gamma(k + 1/2) * _ed2((4 * k + 3)/sx) / (9 * y1)
 | |
|         e2 = gamma(k + 1/2) * _ed3((4 * k + 1) / sx) / (72 * y2)
 | |
|         e3 = 2 * (m + 2) * gamma(k + 3/2) * _ed3((4 * k + 5) / sx) / (12 * y2)
 | |
|         e4 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 1) / sx) / (144 * y1)
 | |
|         e5 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 5) / sx) / (144 * y1)
 | |
| 
 | |
|         return e1 + e2 + e3 + e4 + e5
 | |
| 
 | |
|     x = np.asarray(x)
 | |
|     tot = np.zeros_like(x, dtype='float')
 | |
|     cond = np.ones_like(x, dtype='bool')
 | |
|     k = 0
 | |
|     while np.any(cond):
 | |
|         z = -_Ak(k, x[cond]) / (np.pi * gamma(k + 1))
 | |
|         tot[cond] = tot[cond] + z
 | |
|         cond[cond] = np.abs(z) >= 1e-7
 | |
|         k += 1
 | |
| 
 | |
|     return tot
 | |
| 
 | |
| 
 | |
| def _cdf_cvm_inf(x):
 | |
|     """
 | |
|     Calculate the cdf of the Cramér-von Mises statistic (infinite sample size).
 | |
| 
 | |
|     See equation 1.2 in Csörgő, S. and Faraway, J. (1996).
 | |
| 
 | |
|     Implementation based on MAPLE code of Julian Faraway and R code of the
 | |
|     function pCvM in the package goftest (v1.1.1), permission granted
 | |
|     by Adrian Baddeley. Main difference in the implementation: the code
 | |
|     here keeps adding terms of the series until the terms are small enough.
 | |
| 
 | |
|     The function is not expected to be accurate for large values of x, say
 | |
|     x > 4, when the cdf is very close to 1.
 | |
|     """
 | |
|     x = np.asarray(x)
 | |
| 
 | |
|     def term(x, k):
 | |
|         # this expression can be found in [2], second line of (1.3)
 | |
|         u = np.exp(gammaln(k + 0.5) - gammaln(k+1)) / (np.pi**1.5 * np.sqrt(x))
 | |
|         y = 4*k + 1
 | |
|         q = y**2 / (16*x)
 | |
|         b = kv(0.25, q)
 | |
|         return u * np.sqrt(y) * np.exp(-q) * b
 | |
| 
 | |
|     tot = np.zeros_like(x, dtype='float')
 | |
|     cond = np.ones_like(x, dtype='bool')
 | |
|     k = 0
 | |
|     while np.any(cond):
 | |
|         z = term(x[cond], k)
 | |
|         tot[cond] = tot[cond] + z
 | |
|         cond[cond] = np.abs(z) >= 1e-7
 | |
|         k += 1
 | |
| 
 | |
|     return tot
 | |
| 
 | |
| 
 | |
def _cdf_cvm(x, n=None):
    """
    Calculate the cdf of the Cramér-von Mises statistic for a finite sample
    size n. If n is None, use the asymptotic cdf (n=inf).

    See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples,
    1.2 for the asymptotic cdf.

    The function is not expected to be accurate for large values of x, say
    x > 2, when the cdf is very close to 1 and it might return values > 1
    in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it
    is not accurate for small values of n, especially close to the bounds of
    the distribution's domain, [1/(12*n), n/3], where the value jumps to 0
    and 1, respectively. These are limitations of the approximation by Csörgő
    and Faraway (1996) implemented in this function.

    A 0-d input yields a scalar; otherwise an array shaped like `x` is
    returned.
    """
    x = np.asarray(x)
    if n is None:
        cdf = _cdf_cvm_inf(x)
    else:
        # support of the test statistic is [1/(12*n), n/3], see 1.1 in [2]
        lower, upper = 1./(12*n), n/3.
        inside = (lower < x) & (x < upper)
        cdf = np.zeros_like(x, dtype='float')
        pts = x[inside]
        # note: _psi1_mod does not include the term _cdf_cvm_inf(x) / 12
        # therefore, we need to add it here
        cdf[inside] = _cdf_cvm_inf(pts) * (1 + lower) + _psi1_mod(pts) / n
        # at or beyond the upper end of the support the cdf is exactly 1;
        # entries at or below the lower end keep their initial value 0
        cdf[x >= upper] = 1

    return cdf[()] if cdf.ndim == 0 else cdf
 | |
| 
 | |
| 
 | |
| def _cvm_result_to_tuple(res, _):
 | |
|     return res.statistic, res.pvalue
 | |
| 
 | |
| 
 | |
@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=1, too_small=1,
                          result_to_tuple=_cvm_result_to_tuple)
def cramervonmises(rvs, cdf, args=()):
    """Perform the one-sample Cramér-von Mises test for goodness of fit.

    This performs a test of the goodness of fit of a cumulative distribution
    function (cdf) :math:`F` compared to the empirical distribution function
    :math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are
    assumed to be independent and identically distributed ([1]_).
    The null hypothesis is that the :math:`X_i` have cumulative distribution
    :math:`F`.

    Parameters
    ----------
    rvs : array_like
        A 1-D array of observed values of the random variables :math:`X_i`.
        The sample must contain at least two observations.
    cdf : str or callable
        The cumulative distribution function :math:`F` to test the
        observations against. If a string, it should be the name of a
        distribution in `scipy.stats`. If a callable, that callable is used
        to calculate the cdf: ``cdf(x, *args) -> float``.
    args : tuple, optional
        Distribution parameters. These are assumed to be known; see Notes.

    Returns
    -------
    res : object with attributes
        statistic : float
            Cramér-von Mises statistic.
        pvalue : float
            The p-value.

    Raises
    ------
    ValueError
        If the sample contains fewer than two observations.

    See Also
    --------
    kstest, cramervonmises_2samp

    Notes
    -----
    .. versionadded:: 1.6.0

    The p-value relies on the approximation given by equation 1.8 in [2]_.
    It is important to keep in mind that the p-value is only accurate if
    one tests a simple hypothesis, i.e. the parameters of the reference
    distribution are known. If the parameters are estimated from the data
    (composite hypothesis), the computed p-value is not reliable.

    References
    ----------
    .. [1] Cramér-von Mises criterion, Wikipedia,
           https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion
    .. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic
           Distribution of Cramér-von Mises Statistics. Journal of the
           Royal Statistical Society, pp. 221-234.

    Examples
    --------

    Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs``
    were, in fact, drawn from the standard normal distribution. We choose a
    significance level of ``alpha=0.05``.

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng(165417232101553420507139617764912913465)
    >>> x = stats.norm.rvs(size=500, random_state=rng)
    >>> res = stats.cramervonmises(x, 'norm')
    >>> res.statistic, res.pvalue
    (0.1072085112565724, 0.5508482238203407)

    The p-value exceeds our chosen significance level, so we do not
    reject the null hypothesis that the observed sample is drawn from the
    standard normal distribution.

    Now suppose we wish to check whether the same samples shifted by 2.1 are
    consistent with being drawn from a normal distribution with a mean of 2.

    >>> y = x + 2.1
    >>> res = stats.cramervonmises(y, 'norm', args=(2,))
    >>> res.statistic, res.pvalue
    (0.8364446265294695, 0.00596286797008283)

    Here we have used the `args` keyword to specify the mean (``loc``)
    of the normal distribution to test the data against. This is equivalent
    to the following, in which we create a frozen normal distribution with
    mean 2, then pass its ``cdf`` method as an argument.

    >>> frozen_dist = stats.norm(loc=2)
    >>> res = stats.cramervonmises(y, frozen_dist.cdf)
    >>> res.statistic, res.pvalue
    (0.8364446265294695, 0.00596286797008283)

    In either case, we would reject the null hypothesis that the observed
    sample is drawn from a normal distribution with a mean of 2 (and default
    variance of 1) because the p-value is less than our chosen
    significance level.

    """
    # a distribution name is resolved to that distribution's cdf method
    if isinstance(cdf, str):
        cdf = getattr(distributions, cdf).cdf

    sorted_obs = np.sort(np.asarray(rvs))
    if sorted_obs.size <= 1:
        raise ValueError('The sample must contain at least two observations.')

    n = len(sorted_obs)
    # hypothesized cdf evaluated at the ordered observations
    cdfvals = cdf(sorted_obs, *args)

    # reference points (2i - 1) / (2n) for i = 1, ..., n
    u = (2*np.arange(1, n+1) - 1)/(2*n)
    w = 1/(12*n) + np.sum((u - cdfvals)**2)

    # avoid small negative values that can occur due to the approximation
    sf = 1. - _cdf_cvm(w, n)
    p = np.clip(sf, 0., None)

    return CramerVonMisesResult(statistic=w, pvalue=p)
 | |
| 
 | |
| 
 | |
| def _get_wilcoxon_distr(n):
 | |
|     """
 | |
|     Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum
 | |
|     of ranks of positive differences).
 | |
|     Returns an array with the probabilities of all the possible ranks
 | |
|     r = 0, ..., n*(n+1)/2
 | |
|     """
 | |
|     c = np.ones(1, dtype=np.float64)
 | |
|     for k in range(1, n + 1):
 | |
|         prev_c = c
 | |
|         c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.float64)
 | |
|         m = len(prev_c)
 | |
|         c[:m] = prev_c * 0.5
 | |
|         c[-m:] += prev_c * 0.5
 | |
|     return c
 | |
| 
 | |
| 
 | |
| def _get_wilcoxon_distr2(n):
 | |
|     """
 | |
|     Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum
 | |
|     of ranks of positive differences).
 | |
|     Returns an array with the probabilities of all the possible ranks
 | |
|     r = 0, ..., n*(n+1)/2
 | |
|     This is a slower reference function
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] 1. Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon
 | |
|         Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343.
 | |
|     """
 | |
|     ai = np.arange(1, n+1)[:, None]
 | |
|     t = n*(n+1)/2
 | |
|     q = 2*t
 | |
|     j = np.arange(q)
 | |
|     theta = 2*np.pi/q*j
 | |
|     phi_sp = np.prod(np.cos(theta*ai), axis=0)
 | |
|     phi_s = np.exp(1j*theta*t) * phi_sp
 | |
|     p = np.real(ifft(phi_s))
 | |
|     res = np.zeros(int(t)+1)
 | |
|     res[:-1:] = p[::2]
 | |
|     res[0] /= 2
 | |
|     res[-1] = res[0]
 | |
|     return res
 | |
| 
 | |
| 
 | |
| def _tau_b(A):
 | |
|     """Calculate Kendall's tau-b and p-value from contingency table."""
 | |
|     # See [2] 2.2 and 4.2
 | |
| 
 | |
|     # contingency table must be truly 2D
 | |
|     if A.shape[0] == 1 or A.shape[1] == 1:
 | |
|         return np.nan, np.nan
 | |
| 
 | |
|     NA = A.sum()
 | |
|     PA = _P(A)
 | |
|     QA = _Q(A)
 | |
|     Sri2 = (A.sum(axis=1)**2).sum()
 | |
|     Scj2 = (A.sum(axis=0)**2).sum()
 | |
|     denominator = (NA**2 - Sri2)*(NA**2 - Scj2)
 | |
| 
 | |
|     tau = (PA-QA)/(denominator)**0.5
 | |
| 
 | |
|     numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA)
 | |
|     s02_tau_b = numerator/denominator
 | |
|     if s02_tau_b == 0:  # Avoid divide by zero
 | |
|         return tau, 0
 | |
|     Z = tau/s02_tau_b**0.5
 | |
|     p = 2*norm.sf(abs(Z))  # 2-sided p-value
 | |
| 
 | |
|     return tau, p
 | |
| 
 | |
| 
 | |
| def _somers_d(A, alternative='two-sided'):
 | |
|     """Calculate Somers' D and p-value from contingency table."""
 | |
|     # See [3] page 1740
 | |
| 
 | |
|     # contingency table must be truly 2D
 | |
|     if A.shape[0] <= 1 or A.shape[1] <= 1:
 | |
|         return np.nan, np.nan
 | |
| 
 | |
|     NA = A.sum()
 | |
|     NA2 = NA**2
 | |
|     PA = _P(A)
 | |
|     QA = _Q(A)
 | |
|     Sri2 = (A.sum(axis=1)**2).sum()
 | |
| 
 | |
|     d = (PA - QA)/(NA2 - Sri2)
 | |
| 
 | |
|     S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA
 | |
| 
 | |
|     with np.errstate(divide='ignore'):
 | |
|         Z = (PA - QA)/(4*(S))**0.5
 | |
| 
 | |
|     norm = _stats_py._SimpleNormal()
 | |
|     p = _stats_py._get_pvalue(Z, norm, alternative, xp=np)
 | |
| 
 | |
|     return d, p
 | |
| 
 | |
| 
 | |
@dataclass
class SomersDResult:
    """Result record with a statistic, a p-value and a table.

    NOTE(review): presumably populated by `somersd`, with `table` being the
    contingency table the statistic was computed from -- confirm against the
    caller.
    """

    # value of the test statistic
    statistic: float
    # p-value associated with `statistic`
    pvalue: float
    # array stored alongside the statistic and p-value
    table: np.ndarray
 | |
| 
 | |
| 
 | |
def somersd(x, y=None, alternative='two-sided'):
    r"""Calculates Somers' D, an asymmetric measure of ordinal association.

    Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the
    correspondence between two rankings. Both statistics consider the
    difference between the number of concordant and discordant pairs in two
    rankings :math:`X` and :math:`Y`, and both are normalized such that values
    close to 1 indicate strong agreement and values close to -1 indicate
    strong disagreement. They differ in how they are normalized. To show the
    relationship, Somers' :math:`D` can be defined in terms of Kendall's
    :math:`\tau_a`:

    .. math::
        D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)}

    Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the
    second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of
    :math:`n` rankings can also be viewed as an :math:`r \times s` contingency
    table in which element :math:`i, j` is the number of rank pairs with rank
    :math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`.
    Accordingly, `somersd` also allows the input data to be supplied as a
    single, 2D contingency table instead of as two separate, 1D rankings.

    Note that the definition of Somers' :math:`D` is asymmetric: in general,
    :math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers'
    :math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent
    variable, and the "column" variable :math:`Y` is dependent. For Somers'
    :math:`D(X|Y)`, swap the input lists or transpose the input table.

    Parameters
    ----------
    x : array_like
        1D array of rankings, treated as the (row) independent variable.
        Alternatively, a 2D contingency table.
    y : array_like, optional
        If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the
        same length, treated as the (column) dependent variable; it is
        required in that case.
        If `x` is 2D, `y` is ignored.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the rank correlation is nonzero
        * 'less': the rank correlation is negative (less than zero)
        * 'greater':  the rank correlation is positive (greater than zero)

    Returns
    -------
    res : SomersDResult
        A `SomersDResult` object with the following fields:

            statistic : float
               The Somers' :math:`D` statistic.
            pvalue : float
               The p-value for a hypothesis test whose null
               hypothesis is an absence of association, :math:`D=0`.
               See notes for more information.
            table : 2D array
               The contingency table formed from rankings `x` and `y` (or the
               provided contingency table, if `x` is a 2D array)

        For consistency with other correlation functions, the statistic is
        also available as the attribute ``correlation``.

    Raises
    ------
    ValueError
        If `x` is 1D and `y` is missing or has a different number of
        elements; if a 2D `x` contains negative or non-integer entries or
        fewer than two nonzero entries; or if `x` is neither 1D nor 2D.

    See Also
    --------
    kendalltau : Calculates Kendall's tau, another correlation measure.
    weightedtau : Computes a weighted version of Kendall's tau.
    spearmanr : Calculates a Spearman rank-order correlation coefficient.
    pearsonr : Calculates a Pearson correlation coefficient.

    Notes
    -----
    This function follows the contingency table approach of [2]_ and
    [3]_. *p*-values are computed based on an asymptotic approximation of
    the test statistic distribution under the null hypothesis :math:`D=0`.

    Theoretically, hypothesis tests based on Kendall's :math:`\tau` and
    Somers' :math:`D` should be identical.
    However, the *p*-values returned by `kendalltau` are based
    on the null hypothesis of *independence* between :math:`X` and :math:`Y`
    (i.e. the population from which pairs in :math:`X` and :math:`Y` are
    sampled contains equal numbers of all possible pairs), which is more
    specific than the null hypothesis :math:`D=0` used here. If the null
    hypothesis of independence is desired, it is acceptable to use the
    *p*-value returned by `kendalltau` with the statistic returned by
    `somersd` and vice versa. For more information, see [2]_.

    Contingency tables are formatted according to the convention used by
    SAS and R: the first ranking supplied (``x``) is the "row" variable, and
    the second ranking supplied (``y``) is the "column" variable. This is
    opposite the convention of Somers' original paper [1]_.

    References
    ----------
    .. [1] Robert H. Somers, "A New Asymmetric Measure of Association for
           Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6,
           pp. 799--811, 1962.

    .. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of
           Tests for Correlation in Two-Way Contingency Tables", *Journal of
           the American Statistical Association* Vol. 72, No. 358, pp.
           309--315, 1977.

    .. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)",
           *SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009.

    .. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS
           Statistics Tutorials and Statistical Guides*,
           https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php,
           Accessed July 31, 2020.

    Examples
    --------
    We calculate Somers' D for the example given in [4]_, in which a hotel
    chain owner seeks to determine the association between hotel room
    cleanliness and customer satisfaction. The independent variable, hotel
    room cleanliness, is ranked on an ordinal scale: "below average (1)",
    "average (2)", or "above average (3)". The dependent variable, customer
    satisfaction, is ranked on a second scale: "very dissatisfied (1)",
    "moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)",
    "moderately satisfied (4)", or "very satisfied (5)". 189 customers
    respond to the survey, and the results are cast into a contingency table
    with the hotel room cleanliness as the "row" variable and customer
    satisfaction as the "column" variable.

    +-----+-----+-----+-----+-----+-----+
    |     | (1) | (2) | (3) | (4) | (5) |
    +=====+=====+=====+=====+=====+=====+
    | (1) | 27  | 25  | 14  | 7   | 0   |
    +-----+-----+-----+-----+-----+-----+
    | (2) | 7   | 14  | 18  | 35  | 12  |
    +-----+-----+-----+-----+-----+-----+
    | (3) | 1   | 3   | 2   | 7   | 17  |
    +-----+-----+-----+-----+-----+-----+

    For example, 27 customers assigned their room a cleanliness ranking of
    "below average (1)" and a corresponding satisfaction of "very
    dissatisfied (1)". We perform the analysis as follows.

    >>> from scipy.stats import somersd
    >>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
    >>> res = somersd(table)
    >>> res.statistic
    0.6032766111513396
    >>> res.pvalue
    1.0007091191074533e-27

    The value of the Somers' D statistic is approximately 0.6, indicating
    a positive correlation between room cleanliness and customer satisfaction
    in the sample.
    The *p*-value is very small, indicating a very small probability of
    observing such an extreme value of the statistic under the null
    hypothesis that the statistic of the entire population (from which
    our sample of 189 customers is drawn) is zero. This supports the
    alternative hypothesis that the true value of Somers' D for the population
    is nonzero.

    """
    x = np.array(x)
    if x.ndim == 1:
        # Without this check, ``np.array(None)`` (a 0-d array of size 1)
        # would either trigger the misleading "equal length" error below or
        # fail inside `crosstab` with a TypeError.
        if y is None:
            raise ValueError("`y` must be provided when `x` is a 1D array "
                             "of rankings.")
        y = np.array(y)
        if x.size != y.size:
            raise ValueError("Rankings must be of equal length.")
        table = scipy.stats.contingency.crosstab(x, y)[1]
    elif x.ndim == 2:
        if np.any(x < 0):
            raise ValueError("All elements of the contingency table must be "
                             "non-negative.")
        if np.any(x != x.astype(int)):
            raise ValueError("All elements of the contingency table must be "
                             "integer.")
        if x.nonzero()[0].size < 2:
            raise ValueError("At least two elements of the contingency table "
                             "must be nonzero.")
        table = x
    else:
        raise ValueError("x must be either a 1D or 2D array")
    # The table type is converted to a float to avoid an integer overflow
    d, p = _somers_d(table.astype(float), alternative)

    # add alias for consistency with other correlation functions
    res = SomersDResult(d, p, table)
    res.correlation = d
    return res
 | |
| 
 | |
| 
 | |
| # This could be combined with `_all_partitions` in `_resampling.py`
 | |
| def _all_partitions(nx, ny):
 | |
|     """
 | |
|     Partition a set of indices into two fixed-length sets in all possible ways
 | |
| 
 | |
|     Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and
 | |
|     ny in all possible ways (ignoring order of elements).
 | |
|     """
 | |
|     z = np.arange(nx+ny)
 | |
|     for c in combinations(z, nx):
 | |
|         x = np.array(c)
 | |
|         mask = np.ones(nx+ny, bool)
 | |
|         mask[x] = False
 | |
|         y = z[mask]
 | |
|         yield x, y
 | |
| 
 | |
| 
 | |
| def _compute_log_combinations(n):
 | |
|     """Compute all log combination of C(n, k)."""
 | |
|     gammaln_arr = gammaln(np.arange(n + 1) + 1)
 | |
|     return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1]
 | |
| 
 | |
| 
 | |
@dataclass
class BarnardExactResult:
    """Result container returned by `barnard_exact`.

    Attributes
    ----------
    statistic : float
        The Wald statistic of the observed table.
    pvalue : float
        The p-value of the test.
    """

    statistic: float
    pvalue: float
 | |
| 
 | |
| 
 | |
def barnard_exact(table, alternative="two-sided", pooled=True, n=32):
    r"""Run Barnard's exact test on a 2x2 contingency table.

    Parameters
    ----------
    table : array_like of ints
        The 2x2 contingency table to analyse. Its entries should be
        non-negative integers.

    alternative : {'two-sided', 'less', 'greater'}, optional
        Selects the null and alternative hypotheses, as explained in the
        Notes section below. The default is 'two-sided'.

    pooled : bool, optional
        If ``True`` (the default), the score statistic is computed with a
        pooled variance (as in Student's t-test, for example); if ``False``,
        an unpooled variance is used (as in Welch's t-test).

    n : int, optional
        Number of sampling points used to construct the sampling method.
        Must be positive. Because `scipy.stats.qmc.Sobol` is used to select
        the sample points, this value is automatically converted to the next
        higher power of 2. The default of 32 points gives good precision in
        most cases; more points come at a performance cost.

    Returns
    -------
    ber : BarnardExactResult
        A result object exposing the attributes below.

        statistic : float
            The Wald statistic of the observed table, computed with pooled
            or unpooled variance according to `pooled`.

        pvalue : float
            The probability, assuming the null hypothesis is true, of
            obtaining a distribution at least as extreme as the one that
            was actually observed.

    See Also
    --------
    chi2_contingency : Chi-square test of independence of variables in a
        contingency table.
    fisher_exact : Fisher exact test on a 2x2 contingency table.
    boschloo_exact : Boschloo's exact test on a 2x2 contingency table,
        which is an uniformly more powerful alternative to Fisher's exact test.

    Notes
    -----
    Barnard's test is an exact test for the analysis of contingency tables.
    It examines the association of two categorical variables and, for 2x2
    contingency tables, is a more powerful alternative than Fisher's exact
    test.

    Let :math:`X_0` denote the 2x2 matrix holding the observed sample, in
    which each column stores one binomial experiment (see the example
    below), and let :math:`p_1, p_2` be the theoretical binomial
    probabilities for :math:`x_{11}` and :math:`x_{12}`. Three different
    null hypotheses can be tested:

    - :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`,
      with `alternative` = "less"

    - :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`,
      with `alternative` = "greater"

    - :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`,
      with `alternative` = "two-sided" (the default)

    The test is built on the Wald statistic [3]_ with pooled or unpooled
    variance. Under the default assumption that both variances are equal
    (``pooled = True``), the statistic is:

    .. math::

        T(X) = \frac{
            \hat{p}_1 - \hat{p}_2
        }{
            \sqrt{
                \hat{p}(1 - \hat{p})
                (\frac{1}{c_1} +
                \frac{1}{c_2})
            }
        }

    where :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` are the
    estimators of :math:`p_1, p_2` and :math:`p`; the last one is the
    combined probability under the assumption that :math:`p_1 = p_2`.

    When that assumption is not made (``pooled = False``), the statistic is:

    .. math::

        T(X) = \frac{
            \hat{p}_1 - \hat{p}_2
        }{
            \sqrt{
                \frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} +
                \frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2}
            }
        }

    The p-value is then computed as:

    .. math::

        \sum
            \binom{c_1}{x_{11}}
            \binom{c_2}{x_{12}}
            \pi^{x_{11} + x_{12}}
            (1 - \pi)^{t - x_{11} - x_{12}}

    where the sum runs over all 2x2 contingency tables :math:`X` such that:

    * :math:`T(X) \leq T(X_0)` when `alternative` = "less",
    * :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or
    * :math:`|T(X)| \geq |T(X_0)|` when `alternative` = "two-sided".

    Above, :math:`c_1, c_2` are the sums of columns 1 and 2, and :math:`t`
    is the total (the sum of the four elements of the sample).

    The returned p-value is the maximum p-value taken over the nuisance
    parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`.

    If either column of `table` sums to zero, the statistic is reported as
    NaN and the p-value as 1.

    This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the
    number of sample points.

    References
    ----------
    .. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*.
           34.1/2 (1947): 123-138. :doi:`dpgkg3`

    .. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus
           unconditional exact tests for comparing two binomials."
           *Cytel Software Corporation* 675 (2003): 1-5.

    .. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test

    Examples
    --------
    An example use of Barnard's test is presented in [2]_.

        Consider the following example of a vaccine efficacy study
        (Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were
        inoculated with a recombinant DNA influenza vaccine and the 15 were
        inoculated with a placebo. Twelve of the 15 subjects in the placebo
        group (80%) eventually became infected with influenza whereas for the
        vaccine group, only 7 of the 15 subjects (47%) became infected. The
        data are tabulated as a 2 x 2 table::

                Vaccine  Placebo
            Yes     7        12
            No      8        3

    In statistical hypothesis testing, one usually fixes a threshold
    probability, or significance level, that decides whether the null
    hypothesis :math:`H_0` is rejected. Suppose we choose the common
    significance level of 5%.

    Our alternative hypothesis is that the vaccine lowers the chance of
    becoming infected with the virus; that is, the probability :math:`p_1` of
    catching the virus with the vaccine is *less than* the probability
    :math:`p_2` of catching the virus without the vaccine.  Therefore, we call
    `barnard_exact` with the ``alternative="less"`` option:

    >>> import scipy.stats as stats
    >>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less")
    >>> res.statistic
    -1.894
    >>> res.pvalue
    0.03407

    Under the null hypothesis that the vaccine does not lower the chance of
    becoming infected, the probability of obtaining test results at least as
    extreme as the observed data is approximately 3.4%. Since this p-value is
    below our chosen significance level, we have evidence to reject
    :math:`H_0` in favor of the alternative.

    Had we used Fisher's exact test instead:

    >>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less")
    >>> pvalue
    0.0640

    With the same 5% significance threshold, we would not have been able to
    reject the null hypothesis in favor of the alternative. As stated in
    [2]_, Barnard's test is uniformly more powerful than Fisher's exact test
    because Barnard's test does not condition on any margin. Fisher's test
    should only be used when both sets of marginals are fixed.

    """
    if n <= 0:
        message = (
            "Number of points `n` must be strictly positive, "
            f"found {n!r}"
        )
        raise ValueError(message)

    table = np.asarray(table, dtype=np.int64)

    if table.shape != (2, 2):
        raise ValueError("The input `table` must be of shape (2, 2).")

    if (table < 0).any():
        raise ValueError("All values in `table` must be nonnegative.")

    column_totals = table.sum(axis=0)
    if 0 in column_totals:
        # A column with both entries equal to zero: the p-value is 1 and
        # the score statistic is NaN.
        return BarnardExactResult(np.nan, 1.0)

    total_col_1, total_col_2 = column_totals

    # All possible success counts of each column, laid out so that
    # broadcasting enumerates every (x1, x2) combination.
    successes_1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(-1, 1)
    successes_2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(1, -1)
    successes_sum = successes_1 + successes_2

    # Wald statistic of every (x1, x2) combination.
    rate_1 = successes_1 / total_col_1
    rate_2 = successes_2 / total_col_2

    if pooled:
        rate = successes_sum / (total_col_1 + total_col_2)
        variances = rate * (1 - rate) * (1 / total_col_1 + 1 / total_col_2)
    else:
        variances = (rate_1 * (1 - rate_1) / total_col_1
                     + rate_2 * (1 - rate_2) / total_col_2)

    # Zero variances are expected; silence the resulting warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        wald_grid = (rate_1 - rate_2) / np.sqrt(variances)

    wald_grid[rate_1 == rate_2] = 0  # replaces the NaN entries

    observed = wald_grid[table[0, 0], table[0, 1]]

    # Boolean mask of the tables at least as extreme as the observed one.
    if alternative == "less":
        extreme = wald_grid <= observed
    elif alternative == "greater":
        extreme = wald_grid >= observed
    elif alternative == "two-sided":
        extreme = np.abs(wald_grid) >= abs(observed)
    else:
        raise ValueError(
            "`alternative` should be one of {'two-sided', 'less', 'greater'},"
            f" found {alternative!r}"
        )

    log_comb_1 = _compute_log_combinations(total_col_1)
    log_comb_2 = _compute_log_combinations(total_col_2)
    log_comb_grid = log_comb_1[successes_1] + log_comb_2[successes_2]

    optimum = shgo(
        _get_binomial_log_p_value_with_nuisance_param,
        args=(successes_sum, log_comb_grid, extreme),
        bounds=((0, 1),),
        n=n,
        sampling_method="sobol",
    )

    # `optimum.fun` is the negative log p-value, so it is converted back
    # to a probability before being returned.
    p_value = np.clip(np.exp(-optimum.fun), a_min=0, a_max=1)
    return BarnardExactResult(observed, p_value)
 | |
| 
 | |
| 
 | |
@dataclass
class BoschlooExactResult:
    """Result container returned by `boschloo_exact`.

    Attributes
    ----------
    statistic : float
        The statistic used in Boschloo's test, i.e. the p-value from
        Fisher's exact test.
    pvalue : float
        The p-value of Boschloo's test.
    """

    statistic: float
    pvalue: float
 | |
| 
 | |
| 
 | |
def boschloo_exact(table, alternative="two-sided", n=32):
    r"""Perform Boschloo's exact test on a 2x2 contingency table.

    Parameters
    ----------
    table : array_like of ints
        A 2x2 contingency table.  Elements should be non-negative integers.

    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the null and alternative hypotheses. Default is 'two-sided'.
        Please see explanations in the Notes section below.

    n : int, optional
        Number of sampling points used in the construction of the sampling
        method. Note that this argument will automatically be converted to
        the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
        select sample points. Default is 32. Must be positive. In most cases,
        32 points is enough to reach good precision. More points comes at
        performance cost.

    Returns
    -------
    ber : BoschlooExactResult
        A result object with the following attributes.

        statistic : float
            The statistic used in Boschloo's test; that is, the p-value
            from Fisher's exact test.

        pvalue : float
            P-value, the probability of obtaining a distribution at least as
            extreme as the one that was actually observed, assuming that the
            null hypothesis is true.

    Raises
    ------
    ValueError
        If `n` is not strictly positive, if `table` does not have shape
        (2, 2) or contains negative values, or if `alternative` is not one
        of the supported options.

    See Also
    --------
    chi2_contingency : Chi-square test of independence of variables in a
        contingency table.
    fisher_exact : Fisher exact test on a 2x2 contingency table.
    barnard_exact : Barnard's exact test, which is a more powerful alternative
        than Fisher's exact test for 2x2 contingency tables.

    Notes
    -----
    Boschloo's test is an exact test used in the analysis of contingency
    tables. It examines the association of two categorical variables, and
    is a uniformly more powerful alternative to Fisher's exact test
    for 2x2 contingency tables.

    Boschloo's exact test uses the p-value of Fisher's exact test as a
    statistic, and Boschloo's p-value is the probability under the null
    hypothesis of observing such an extreme value of this statistic.

    Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
    where each column stores the binomial experiment, as in the example
    below. Let's also define :math:`p_1, p_2` the theoretical binomial
    probabilities for  :math:`x_{11}` and :math:`x_{12}`. When using
    Boschloo exact test, we can assert three different alternative hypotheses:

    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`,
      with `alternative` = "less"

    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`,
      with `alternative` = "greater"

    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`,
      with `alternative` = "two-sided" (default)

    There are multiple conventions for computing a two-sided p-value when the
    null distribution is asymmetric. Here, we apply the convention that the
    p-value of a two-sided test is twice the minimum of the p-values of the
    one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a
    different convention, so for a given `table`, the statistic reported by
    `boschloo_exact` may differ from the p-value reported by `fisher_exact`
    when ``alternative='two-sided'``.

    If either column of `table` sums to zero, both the statistic and the
    p-value are reported as NaN.

    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] R.D. Boschloo. "Raised conditional level of significance for the
       2 x 2-table when testing the equality of two probabilities",
       Statistica Neerlandica, 24(1), 1970

    .. [2] "Boschloo's test", Wikipedia,
       https://en.wikipedia.org/wiki/Boschloo%27s_test

    .. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction",
       Human Resource Management, 43(4), 395-407, 2004,
       :doi:`10.1002/hrm.20032`.

    Examples
    --------
    In the following example, we consider the article "Employee
    attitudes and job satisfaction" [3]_
    which reports the results of a survey from 63 scientists and 117 college
    professors. Of the 63 scientists, 31 said they were very satisfied with
    their jobs, whereas 74 of the college professors were very satisfied
    with their work. Is this significant evidence that college
    professors are happier with their work than scientists?
    The following table summarizes the data mentioned above::

                         college professors   scientists
        Very Satisfied   74                     31
        Dissatisfied     43                     32

    When working with statistical hypothesis testing, we usually use a
    threshold probability or significance level upon which we decide
    to reject the null hypothesis :math:`H_0`. Suppose we choose the common
    significance level of 5%.

    Our alternative hypothesis is that college professors are truly more
    satisfied with their work than scientists. Therefore, we expect
    :math:`p_1` the proportion of very satisfied college professors to be
    greater than :math:`p_2`, the proportion of very satisfied scientists.
    We thus call `boschloo_exact` with the ``alternative="greater"`` option:

    >>> import scipy.stats as stats
    >>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater")
    >>> res.statistic
    0.0483
    >>> res.pvalue
    0.0355

    Under the null hypothesis that college professors and scientists are
    equally likely to be very satisfied with their work, the probability of
    obtaining test results at least as extreme as the observed data is
    approximately 3.55%.
    Since this p-value is less than our chosen significance level, we have
    evidence to reject :math:`H_0` in favor of the alternative hypothesis.

    """
    hypergeom = distributions.hypergeom

    if n <= 0:
        raise ValueError(
            "Number of points `n` must be strictly positive,"
            f" found {n!r}"
        )

    table = np.asarray(table, dtype=np.int64)

    if not table.shape == (2, 2):
        raise ValueError("The input `table` must be of shape (2, 2).")

    if np.any(table < 0):
        raise ValueError("All values in `table` must be nonnegative.")

    column_totals = table.sum(axis=0)
    if 0 in column_totals:
        # If both values in a column are zero, the test is undefined: both
        # the statistic and the p-value are reported as NaN.
        return BoschlooExactResult(np.nan, np.nan)

    total_col_1, total_col_2 = column_totals
    total = total_col_1 + total_col_2
    x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1)
    x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1)
    x1_sum_x2 = x1 + x2

    if alternative == 'less':
        pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T
    elif alternative == 'greater':
        # Same formula as the 'less' case, but with the second column.
        pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T
    elif alternative == 'two-sided':
        boschloo_less = boschloo_exact(table, alternative="less", n=n)
        boschloo_greater = boschloo_exact(table, alternative="greater", n=n)

        res = (
            boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue
            else boschloo_greater
        )

        # Two-sided p-value is defined as twice the minimum of the one-sided
        # p-values
        pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1)
        return BoschlooExactResult(res.statistic, pvalue)
    else:
        # The first part is deliberately NOT an f-string: the braces are
        # literal text listing the options (same message as `barnard_exact`).
        # As an f-string they would be evaluated and rendered as a tuple.
        msg = (
            "`alternative` should be one of {'two-sided', 'less', 'greater'},"
            f" found {alternative!r}"
        )
        raise ValueError(msg)

    fisher_stat = pvalues[table[0, 0], table[0, 1]]

    # fisher_stat * (1+1e-13) guards us from small numerical error. It is
    # equivalent to np.isclose with relative tol of 1e-13 and absolute tol of 0
    # For more thorough explanations, see gh-14178
    index_arr = pvalues <= fisher_stat * (1+1e-13)

    # `pvalues` was transposed above, so transpose the grids to match it.
    x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T
    x1_log_comb = _compute_log_combinations(total_col_1)
    x2_log_comb = _compute_log_combinations(total_col_2)
    x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]

    result = shgo(
        _get_binomial_log_p_value_with_nuisance_param,
        args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
        bounds=((0, 1),),
        n=n,
        sampling_method="sobol",
    )

    # result.fun is the negative log pvalue and therefore needs to be
    # changed before return
    p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
    return BoschlooExactResult(fisher_stat, p_value)
 | |
| 
 | |
| 
 | |
| def _get_binomial_log_p_value_with_nuisance_param(
 | |
|     nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr
 | |
| ):
 | |
|     r"""
 | |
|     Compute the log pvalue in respect of a nuisance parameter considering
 | |
|     a 2x2 sample space.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     nuisance_param : float
 | |
|         nuisance parameter used in the computation of the maximisation of
 | |
|         the p-value. Must be between 0 and 1
 | |
| 
 | |
|     x1_sum_x2 : ndarray
 | |
|         Sum of x1 and x2 inside barnard_exact
 | |
| 
 | |
|     x1_sum_x2_log_comb : ndarray
 | |
|         sum of the log combination of x1 and x2
 | |
| 
 | |
|     index_arr : ndarray of boolean
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     p_value : float
 | |
|         Return the maximum p-value considering every nuisance parameter
 | |
|         between 0 and 1
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
| 
 | |
|     Both Barnard's test and Boschloo's test iterate over a nuisance parameter
 | |
|     :math:`\pi \in [0, 1]` to find the maximum p-value. To search this
 | |
|     maxima, this function return the negative log pvalue with respect to the
 | |
|     nuisance parameter passed in params. This negative log p-value is then
 | |
|     used in `shgo` to find the minimum negative pvalue which is our maximum
 | |
|     pvalue.
 | |
| 
 | |
|     Also, to compute the different combination used in the
 | |
|     p-values' computation formula, this function uses `gammaln` which is
 | |
|     more tolerant for large value than `scipy.special.comb`. `gammaln` gives
 | |
|     a log combination. For the little precision loss, performances are
 | |
|     improved a lot.
 | |
|     """
 | |
|     t1, t2 = x1_sum_x2.shape
 | |
|     n = t1 + t2 - 2
 | |
|     with np.errstate(divide="ignore", invalid="ignore"):
 | |
|         log_nuisance = np.log(
 | |
|             nuisance_param,
 | |
|             out=np.zeros_like(nuisance_param),
 | |
|             where=nuisance_param >= 0,
 | |
|         )
 | |
|         log_1_minus_nuisance = np.log(
 | |
|             1 - nuisance_param,
 | |
|             out=np.zeros_like(nuisance_param),
 | |
|             where=1 - nuisance_param >= 0,
 | |
|         )
 | |
| 
 | |
|         nuisance_power_x1_x2 = log_nuisance * x1_sum_x2
 | |
|         nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0
 | |
| 
 | |
|         nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2)
 | |
|         nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0
 | |
| 
 | |
|         tmp_log_values_arr = (
 | |
|             x1_sum_x2_log_comb
 | |
|             + nuisance_power_x1_x2
 | |
|             + nuisance_power_n_minus_x1_x2
 | |
|         )
 | |
| 
 | |
|     tmp_values_from_index = tmp_log_values_arr[index_arr]
 | |
| 
 | |
|     # To avoid dividing by zero in log function and getting inf value,
 | |
|     # values are centered according to the max
 | |
|     max_value = tmp_values_from_index.max()
 | |
| 
 | |
|     # To have better result's precision, the log pvalue is taken here.
 | |
|     # Indeed, pvalue is included inside [0, 1] interval. Passing the
 | |
|     # pvalue to log makes the interval a lot bigger ([-inf, 0]), and thus
 | |
|     # help us to achieve better precision
 | |
|     with np.errstate(divide="ignore", invalid="ignore"):
 | |
|         log_probs = np.exp(tmp_values_from_index - max_value).sum()
 | |
|         log_pvalue = max_value + np.log(
 | |
|             log_probs,
 | |
|             out=np.full_like(log_probs, -np.inf),
 | |
|             where=log_probs > 0,
 | |
|         )
 | |
| 
 | |
|     # Since shgo find the minima, minus log pvalue is returned
 | |
|     return -log_pvalue
 | |
| 
 | |
| 
 | |
| def _pval_cvm_2samp_exact(s, m, n):
 | |
|     """
 | |
|     Compute the exact p-value of the Cramer-von Mises two-sample test
 | |
|     for a given value s of the test statistic.
 | |
|     m and n are the sizes of the samples.
 | |
| 
 | |
|     [1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for
 | |
|         the Cramér-Von Mises Two-Sample Test", J. Stat. Soft.,
 | |
|         vol. 17, no. 8, pp. 1-15, Dec. 2006.
 | |
|     [2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises
 | |
|         Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist.
 | |
|         33(3), 1148-1159, (September, 1962)
 | |
|     """
 | |
| 
 | |
|     # [1, p. 3]
 | |
|     lcm = np.lcm(m, n)
 | |
|     # [1, p. 4], below eq. 3
 | |
|     a = lcm // m
 | |
|     b = lcm // n
 | |
|     # Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$
 | |
|     # Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2]
 | |
|     mn = m * n
 | |
|     zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2)
 | |
| 
 | |
|     # bound maximum value that may appear in `gs` (remember both rows!)
 | |
|     zeta_bound = lcm**2 * (m + n)  # bound elements in row 1
 | |
|     combinations = comb(m + n, m)  # sum of row 2
 | |
|     max_gs = max(zeta_bound, combinations)
 | |
|     dtype = np.min_scalar_type(max_gs)
 | |
| 
 | |
|     # the frequency table of $g_{u, v}^+$ defined in [1, p. 6]
 | |
|     gs = ([np.array([[0], [1]], dtype=dtype)]
 | |
|           + [np.empty((2, 0), dtype=dtype) for _ in range(m)])
 | |
|     for u in range(n + 1):
 | |
|         next_gs = []
 | |
|         tmp = np.empty((2, 0), dtype=dtype)
 | |
|         for v, g in enumerate(gs):
 | |
|             # Calculate g recursively with eq. 11 in [1]. Even though it
 | |
|             # doesn't look like it, this also does 12/13 (all of Algorithm 1).
 | |
|             vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True)
 | |
|             tmp = np.concatenate([
 | |
|                 np.stack([vi, tmp[1, i0] + g[1, i1]]),
 | |
|                 np.delete(tmp, i0, 1),
 | |
|                 np.delete(g, i1, 1)
 | |
|             ], 1)
 | |
|             res = (a * v - b * u) ** 2
 | |
|             tmp[0] += res.astype(dtype)
 | |
|             next_gs.append(tmp)
 | |
|         gs = next_gs
 | |
|     value, freq = gs[m]
 | |
|     return np.float64(np.sum(freq[value >= zeta]) / combinations)
 | |
| 
 | |
| 
 | |
@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=2, too_small=1,
                          result_to_tuple=_cvm_result_to_tuple)
def cramervonmises_2samp(x, y, method='auto'):
    """Perform the two-sample Cramér-von Mises test for goodness of fit.

    This is the two-sample version of the Cramér-von Mises test ([1]_):
    given two independent samples :math:`X_1, ..., X_n` and
    :math:`Y_1, ..., Y_m`, the null hypothesis states that both samples
    were drawn from the same (unspecified) continuous distribution.

    Parameters
    ----------
    x : array_like
        A 1-D array of observed values of the random variables :math:`X_i`.
        Must contain at least two observations.
    y : array_like
        A 1-D array of observed values of the random variables :math:`Y_i`.
        Must contain at least two observations.
    method : {'auto', 'asymptotic', 'exact'}, optional
        How the p-value is computed; see Notes for details.
        The default is 'auto'.

    Returns
    -------
    res : object with attributes
        statistic : float
            Cramér-von Mises statistic.
        pvalue : float
            The p-value.

    See Also
    --------
    cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp

    Notes
    -----
    .. versionadded:: 1.7.0

    The statistic is computed according to equation 9 in [2]_. How the
    p-value is obtained depends on the keyword `method`:

    - ``asymptotic``: the p-value is approximated with the limiting
      distribution of the test statistic.
    - ``exact``: the exact p-value is computed by enumerating all
      possible combinations of the test statistic, see [2]_.

    With ``method='auto'``, the exact approach is chosen when both samples
    contain 20 observations or fewer; otherwise the asymptotic distribution
    is used.

    If the underlying distribution is not continuous, the p-value is likely to
    be conservative (Section 6.2 in [3]_). Ties are handled by using midranks
    when the data are ranked to compute the test statistic.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion
    .. [2] Anderson, T.W. (1962). On the distribution of the two-sample
           Cramer-von-Mises criterion. The Annals of Mathematical
           Statistics, pp. 1148-1159.
    .. [3] Conover, W.J., Practical Nonparametric Statistics, 1971.

    Examples
    --------

    Suppose we wish to test whether two samples generated by
    ``scipy.stats.norm.rvs`` have the same distribution. We choose a
    significance level of alpha=0.05.

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> x = stats.norm.rvs(size=100, random_state=rng)
    >>> y = stats.norm.rvs(size=70, random_state=rng)
    >>> res = stats.cramervonmises_2samp(x, y)
    >>> res.statistic, res.pvalue
    (0.29376470588235293, 0.1412873014573014)

    The p-value exceeds our chosen significance level, so we do not
    reject the null hypothesis that the observed samples are drawn from the
    same distribution.

    For small sample sizes, one can compute the exact p-values:

    >>> x = stats.norm.rvs(size=7, random_state=rng)
    >>> y = stats.t.rvs(df=2, size=6, random_state=rng)
    >>> res = stats.cramervonmises_2samp(x, y, method='exact')
    >>> res.statistic, res.pvalue
    (0.197802197802198, 0.31643356643356646)

    The p-value based on the asymptotic distribution is a good approximation
    even though the sample size is small.

    >>> res = stats.cramervonmises_2samp(x, y, method='asymptotic')
    >>> res.statistic, res.pvalue
    (0.197802197802198, 0.2966041181527128)

    Independent of the method, one would not reject the null hypothesis at the
    chosen significance level in this example.

    """
    x_sorted = np.sort(np.asarray(x))
    y_sorted = np.sort(np.asarray(y))

    if x_sorted.size <= 1 or y_sorted.size <= 1:
        raise ValueError('x and y must contain at least two observations.')
    if method not in ('auto', 'exact', 'asymptotic'):
        raise ValueError('method must be either auto, exact or asymptotic.')

    nx, ny = len(x_sorted), len(y_sorted)

    # 'auto' resolves to the exact computation only for small samples.
    if method == 'auto':
        method = 'asymptotic' if max(nx, ny) > 20 else 'exact'

    # Ranks of x and y within the pooled sample; ties receive the midrank
    # (see [1]).
    pooled = np.concatenate([x_sorted, y_sorted])
    pooled_ranks = scipy.stats.rankdata(pooled, method='average')
    ranks_x, ranks_y = pooled_ranks[:nx], pooled_ranks[nx:]

    # compute U (eq. 10 in [2])
    u = (nx * np.sum((ranks_x - np.arange(1, nx+1))**2)
         + ny * np.sum((ranks_y - np.arange(1, ny+1))**2))

    # compute T (eq. 9 in [2])
    k, N = nx*ny, nx + ny
    t = u / (k*N) - (4*k - 1)/(6*N)

    if method == 'exact':
        exact_p = _pval_cvm_2samp_exact(u, nx, ny)
        return CramerVonMisesResult(statistic=t, pvalue=exact_p)

    # Asymptotic branch: expected value and variance of T
    # (eq. 11 and 14 in [2]).
    et = (1 + 1/N)/6
    vt = (N+1) * (4*k*N - 3*(nx**2 + ny**2) - 2*k) / (45 * N**2 * 4 * k)

    # normalized statistic (eq. 15 in [2])
    tn = 1/6 + (t - et) / np.sqrt(45 * vt)

    # The distribution of tn is approximated by the limiting distribution
    # of the one-sample statistic. Below tn = 0.003 that CDF is smaller
    # than 1.28*1e-18, so 1.0 is returned directly.
    p = 1.0 if tn < 0.003 else max(0, 1. - _cdf_cvm_inf(tn))

    return CramerVonMisesResult(statistic=t, pvalue=p)
 | |
| 
 | |
| 
 | |
class TukeyHSDResult:
    """Result of `scipy.stats.tukey_hsd`.

    Attributes
    ----------
    statistic : float ndarray
        The computed statistic of the test for each comparison. The element
        at index ``(i, j)`` is the statistic for the comparison between groups
        ``i`` and ``j``.
    pvalue : float ndarray
        The associated p-value from the studentized range distribution. The
        element at index ``(i, j)`` is the p-value for the comparison
        between groups ``i`` and ``j``.

    Notes
    -----
    The string representation of this object displays the most recently
    calculated confidence interval, and if none have been previously
    calculated, it will evaluate ``confidence_interval()``.

    References
    ----------
    .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
           Method."
           https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
           28 November 2020.
    .. [2] P. A. Games and J. F. Howell, "Pairwise Multiple Comparison Procedures
           with Unequal N's and/or Variances: A Monte Carlo Study," Journal of
           Educational Statistics, vol. 1, no. 2, pp. 113-125, Jun. 1976,
           doi: https://doi.org/10.3102/10769986001002113.
    """

    def __init__(self, statistic, pvalue, _ntreatments, _df, _stand_err):
        self.statistic = statistic
        self.pvalue = pvalue
        # Private inputs kept so confidence intervals can be built on demand.
        self._ntreatments = _ntreatments
        self._df = _df
        self._stand_err = _stand_err
        # Most recently computed confidence interval and its level.
        self._ci = None
        self._ci_cl = None

    def __str__(self):
        # Note: `__str__` prints the confidence intervals from the most
        # recent call to `confidence_interval`. If it has not been called,
        # it will be called with the default CL of .95.
        if self._ci is None:
            self.confidence_interval(confidence_level=.95)

        title = ("Pairwise Group Comparisons"
                 f" ({self._ci_cl*100:.1f}% Confidence Interval)\n")
        header = "Comparison  Statistic  p-value  Lower CI  Upper CI\n"

        # One row per ordered pair of distinct groups; the pieces are
        # collected and joined once instead of growing a string with `+=`.
        ngroups = self.pvalue.shape[0]
        rows = [
            (f" ({i} - {j}) {self.statistic[i, j]:>10.3f}"
             f"{self.pvalue[i, j]:>10.3f}"
             f"{self._ci.low[i, j]:>10.3f}"
             f"{self._ci.high[i, j]:>10.3f}\n")
            for i in range(ngroups)
            for j in range(ngroups)
            if i != j
        ]
        return "".join([title, header, *rows])

    def confidence_interval(self, confidence_level=.95):
        """Compute the confidence interval for the specified confidence level.

        Parameters
        ----------
        confidence_level : float, optional
            Confidence level for the computed confidence intervals of the
            differences between group means. Must be strictly between 0
            and 1. Default is .95.

        Returns
        -------
        ci : ``ConfidenceInterval`` object
            The object has attributes ``low`` and ``high`` that hold the
            lower and upper bounds of the confidence intervals for each
            comparison. The high and low values are accessible for each
            comparison at index ``(i, j)`` between groups ``i`` and ``j``.

        Raises
        ------
        ValueError
            If `confidence_level` is not strictly between 0 and 1.

        References
        ----------
        .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1.
               Tukey's Method."
               https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
               28 November 2020.
        .. [2] P. A. Games and J. F. Howell, "Pairwise Multiple Comparison Procedures
               with Unequal N's and/or Variances: A Monte Carlo Study," Journal of
               Educational Statistics, vol. 1, no. 2, pp. 113-125, Jun. 1976,
               doi: https://doi.org/10.3102/10769986001002113.

        Examples
        --------
        >>> from scipy.stats import tukey_hsd
        >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
        >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
        >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
        >>> result = tukey_hsd(group0, group1, group2)
        >>> ci = result.confidence_interval()
        >>> ci.low
        array([[-3.649159, -8.249159, -3.909159],
               [ 0.950841, -3.649159,  0.690841],
               [-3.389159, -7.989159, -3.649159]])
        >>> ci.high
        array([[ 3.649159, -0.950841,  3.389159],
               [ 8.249159,  3.649159,  7.989159],
               [ 3.909159, -0.690841,  3.649159]])
        """
        # Reuse the cached interval when it was computed for the same level.
        if (self._ci is not None and self._ci_cl is not None and
                confidence_level == self._ci_cl):
            return self._ci

        if not 0 < confidence_level < 1:
            raise ValueError("Confidence level must be between 0 and 1.")
        # determine the critical value of the studentized range using the
        # appropriate confidence level, number of treatments, and degrees
        # of freedom. See [1] "Confidence limits for Tukey's method" / [2] p.117
        # "H0 was rejected if...". Note that in the cases of unequal sample sizes,
        # there will be a criterion for each group comparison.
        params = (confidence_level, self._ntreatments, self._df)
        srd = distributions.studentized_range.ppf(*params)
        # also called maximum critical value, the confidence_radius is the
        # studentized range critical value * the square root of mean square
        # error over the sample size.
        confidence_radius = srd * self._stand_err
        # the confidence limits are the
        # `mean_differences` +- `confidence_radius`
        upper_conf = self.statistic + confidence_radius
        lower_conf = self.statistic - confidence_radius
        self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf)
        self._ci_cl = confidence_level
        return self._ci
 | |
| 
 | |
| 
 | |
| def _tukey_hsd_iv(args, equal_var):
 | |
|     if (len(args)) < 2:
 | |
|         raise ValueError("There must be more than 1 treatment.")
 | |
|     if not isinstance(equal_var, bool):
 | |
|         raise TypeError("Expected a boolean value for 'equal_var'")
 | |
|     args = [np.asarray(arg) for arg in args]
 | |
|     for arg in args:
 | |
|         if arg.ndim != 1:
 | |
|             raise ValueError("Input samples must be one-dimensional.")
 | |
|         if arg.size <= 1:
 | |
|             raise ValueError("Input sample size must be greater than one.")
 | |
|         if np.isinf(arg).any():
 | |
|             raise ValueError("Input samples must be finite.")
 | |
|     return args
 | |
| 
 | |
| 
 | |
def tukey_hsd(*args, equal_var=True):
    """Perform Tukey's HSD test for equality of means over multiple treatments.

    Tukey's honestly significant difference (HSD) test performs pairwise
    comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`)
    assesses whether the true means underlying each sample are identical,
    Tukey's HSD is a post hoc test used to compare the mean of each sample
    to the mean of each other sample.

    The null hypothesis is that the distributions underlying the samples all
    have the same mean. The test statistic, computed for every possible
    pairing of samples, is simply the difference between the sample means.
    For each pair, the p-value is the probability under the null hypothesis
    (and other assumptions; see notes) of observing such an extreme value of
    the statistic, considering that many pairwise comparisons are being
    performed. Confidence intervals for the difference between each pair of
    means are also available.

    Parameters
    ----------
    sample1, sample2, ... : array_like
        The sample measurements for each group. There must be at least
        two arguments.
    equal_var: bool, optional
        If True (default) and equal sample size, perform Tukey-HSD test [6]_.
        If True and unequal sample size, perform Tukey-Kramer test [4]_.
        If False, perform Games-Howell test [7]_, which does not assume equal
        variances.

    Returns
    -------
    result : `~scipy.stats._result_classes.TukeyHSDResult` instance
        The return value is an object with the following attributes:

        statistic : float ndarray
            The computed statistic of the test for each comparison. The element
            at index ``(i, j)`` is the statistic for the comparison between
            groups ``i`` and ``j``.
        pvalue : float ndarray
            The computed p-value of the test for each comparison. The element
            at index ``(i, j)`` is the p-value for the comparison between
            groups ``i`` and ``j``.

        The object has the following methods:

        confidence_interval(confidence_level=0.95):
            Compute the confidence interval for the specified confidence level.

    See Also
    --------
    dunnett : performs comparison of means against a control group.

    Notes
    -----
    The use of this test relies on several assumptions.

    1. The observations are independent within and among groups.
    2. The observations within each group are normally distributed.
    3. The distributions from which the samples are drawn have the same finite
       variance.

    The original formulation of the test was for samples of equal size drawn
    from populations assumed to have equal variances [6]_. In case of unequal
    sample sizes, the test uses the Tukey-Kramer method [4]_. When equal
    variances are not assumed (``equal_var=False``), the test uses the
    Games-Howell method [7]_.

    References
    ----------
    .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
           Method."
           https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
           28 November 2020.
    .. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant
           Difference (HSD) Test."
           https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf
    .. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS
           Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm.
    .. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group
           Means with Unequal Numbers of Replications." Biometrics, vol. 12,
           no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469.
           Accessed 25 May 2021.
    .. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3.
           The ANOVA table and tests of hypotheses about means"
           https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm,
           2 June 2021.
    .. [6] Tukey, John W. "Comparing Individual Means in the Analysis of
           Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR,
           www.jstor.org/stable/3001913. Accessed 14 June 2021.
    .. [7] P. A. Games and J. F. Howell, "Pairwise Multiple Comparison Procedures
           with Unequal N's and/or Variances: A Monte Carlo Study," Journal of
           Educational Statistics, vol. 1, no. 2, pp. 113-125, Jun. 1976,
           doi: https://doi.org/10.3102/10769986001002113.


    Examples
    --------
    Here are some data comparing the time to relief of three brands of
    headache medicine, reported in minutes. Data adapted from [3]_.

    >>> import numpy as np
    >>> from scipy.stats import tukey_hsd
    >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
    >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
    >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]

    We would like to see if the means between any of the groups are
    significantly different. First, visually examine a box and whisker plot.

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(1, 1)
    >>> ax.boxplot([group0, group1, group2])
    >>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP
    >>> ax.set_ylabel("mean") # doctest: +SKIP
    >>> plt.show()

    The box and whisker plot shows overlap between the interquartile ranges
    of the groups, but we can apply the ``tukey_hsd`` test to determine
    whether the differences between means are significant. We set a
    significance level of .05 to reject the null hypothesis.

    >>> res = tukey_hsd(group0, group1, group2)
    >>> print(res)
    Pairwise Group Comparisons (95.0% Confidence Interval)
    Comparison  Statistic  p-value  Lower CI  Upper CI
     (0 - 1)     -4.600     0.014    -8.249    -0.951
     (0 - 2)     -0.260     0.980    -3.909     3.389
     (1 - 0)      4.600     0.014     0.951     8.249
     (1 - 2)      4.340     0.020     0.691     7.989
     (2 - 0)      0.260     0.980    -3.389     3.909
     (2 - 1)     -4.340     0.020    -7.989    -0.691

    The null hypothesis is that each group has the same mean. The p-value for
    comparisons between ``group0`` and ``group1`` as well as ``group1`` and
    ``group2`` do not exceed .05, so we reject the null hypothesis that they
    have the same means. The p-value of the comparison between ``group0``
    and ``group2`` exceeds .05, so we do not reject the null hypothesis that
    there is no significant difference between their means.

    We can also compute the confidence interval associated with our chosen
    confidence level.

    >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
    >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
    >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
    >>> res = tukey_hsd(group0, group1, group2)
    >>> conf = res.confidence_interval(confidence_level=.99)
    >>> for ((i, j), l) in np.ndenumerate(conf.low):
    ...     # filter out self comparisons
    ...     if i != j:
    ...         h = conf.high[i,j]
    ...         print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}")
    (0 - 1) -9.480  0.280
    (0 - 2) -5.140  4.620
    (1 - 0) -0.280  9.480
    (1 - 2) -0.540  9.220
    (2 - 0) -4.620  5.140
    (2 - 1) -9.220  0.540
    """
    samples = _tukey_hsd_iv(args, equal_var)
    ntreatments = len(samples)

    # Per-group summary statistics.
    group_means = np.asarray([np.mean(sample) for sample in samples])
    group_sizes = np.asarray([sample.size for sample in samples])
    group_vars = np.asarray([np.var(sample, ddof=1) for sample in samples])
    nobs = np.sum(group_sizes)

    if not equal_var:
        # Games-Howell: `stand_err` is the denominator of the Behrens-Fisher
        # statistic ($v$) with a factor of $\sqrt{2}$. Compare [7] p.116
        # "t-solution rejects H0 if...", [7] p. 117 "H0 was rejected", and
        # the definition of `t_stat` below.
        sj2_nj = group_vars / group_sizes
        si2_ni = sj2_nj[:, np.newaxis]
        stand_err = np.sqrt(si2_ni + sj2_nj) / 2**0.5

        # `df` is the Welch degree of freedom $\nu$.
        # See [7] p. 116 "and the degrees of freedom, $\nu$, are given by...".
        njm1 = group_sizes - 1
        nim1 = njm1[:, np.newaxis]
        df = (si2_ni + sj2_nj)**2 / (si2_ni**2 / nim1 + sj2_nj**2 / njm1)
    else:
        df = nobs - ntreatments

        # mean square error [5], sometimes called mean square error within.
        mse = np.sum(group_vars * (group_sizes - 1)) / df

        # The standard error is computed differently when treatments differ
        # in size. See ("Unequal sample sizes")[1].
        if np.unique(group_sizes).size == 1:
            # equally sized groups: a single value covers every
            # comparison [1].
            normalize = 2 / group_sizes[0]
        else:
            # differently sized groups: one variance value per comparison,
            # obtained as a matrix through broadcasting.
            # [3], verified against [4] (page 308).
            normalize = 1 / group_sizes + 1 / group_sizes[:, np.newaxis]

        # the standard error enters both the tukey criterion and the
        # p-values.
        stand_err = np.sqrt(normalize * mse / 2)

    # the test statistic is the matrix of pairwise mean differences.
    mean_differences = group_means[:, np.newaxis] - group_means

    # t-statistic passed to the survival function of the studentized range
    # to obtain the p-values.
    t_stat = np.abs(mean_differences) / stand_err
    pvalues = distributions.studentized_range.sf(t_stat, ntreatments, df)

    return TukeyHSDResult(mean_differences, pvalues, ntreatments,
                          df, stand_err)
 |