522 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			522 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| Contingency table functions (:mod:`scipy.stats.contingency`)
 | |
| ============================================================
 | |
| 
 | |
| Functions for creating and analyzing contingency tables.
 | |
| 
 | |
| .. currentmodule:: scipy.stats.contingency
 | |
| 
 | |
| .. autosummary::
 | |
|    :toctree: generated/
 | |
| 
 | |
|    chi2_contingency
 | |
|    relative_risk
 | |
|    odds_ratio
 | |
|    crosstab
 | |
|    association
 | |
| 
 | |
|    expected_freq
 | |
|    margins
 | |
| 
 | |
| """
 | |
| 
 | |
| 
 | |
| from functools import reduce
 | |
| import math
 | |
| import numpy as np
 | |
| from ._stats_py import power_divergence, _untabulate
 | |
| from ._relative_risk import relative_risk
 | |
| from ._crosstab import crosstab
 | |
| from ._odds_ratio import odds_ratio
 | |
| from scipy._lib._bunch import _make_tuple_bunch
 | |
| from scipy import stats
 | |
| 
 | |
| 
 | |
| __all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
 | |
|            'association', 'relative_risk', 'odds_ratio']
 | |
| 
 | |
| 
 | |
| def margins(a):
 | |
|     """Return a list of the marginal sums of the array `a`.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : ndarray
 | |
|         The array for which to compute the marginal sums.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     margsums : list of ndarrays
 | |
|         A list of length `a.ndim`.  `margsums[k]` is the result
 | |
|         of summing `a` over all axes except `k`; it has the same
 | |
|         number of dimensions as `a`, but the length of each axis
 | |
|         except axis `k` will be 1.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats.contingency import margins
 | |
| 
 | |
|     >>> a = np.arange(12).reshape(2, 6)
 | |
|     >>> a
 | |
|     array([[ 0,  1,  2,  3,  4,  5],
 | |
|            [ 6,  7,  8,  9, 10, 11]])
 | |
|     >>> m0, m1 = margins(a)
 | |
|     >>> m0
 | |
|     array([[15],
 | |
|            [51]])
 | |
|     >>> m1
 | |
|     array([[ 6,  8, 10, 12, 14, 16]])
 | |
| 
 | |
|     >>> b = np.arange(24).reshape(2,3,4)
 | |
|     >>> m0, m1, m2 = margins(b)
 | |
|     >>> m0
 | |
|     array([[[ 66]],
 | |
|            [[210]]])
 | |
|     >>> m1
 | |
|     array([[[ 60],
 | |
|             [ 92],
 | |
|             [124]]])
 | |
|     >>> m2
 | |
|     array([[[60, 66, 72, 78]]])
 | |
|     """
 | |
|     margsums = []
 | |
|     ranged = list(range(a.ndim))
 | |
|     for k in ranged:
 | |
|         marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
 | |
|         margsums.append(marg)
 | |
|     return margsums
 | |
| 
 | |
| 
 | |
| def expected_freq(observed):
 | |
|     """
 | |
|     Compute the expected frequencies from a contingency table.
 | |
| 
 | |
|     Given an n-dimensional contingency table of observed frequencies,
 | |
|     compute the expected frequencies for the table based on the marginal
 | |
|     sums under the assumption that the groups associated with each
 | |
|     dimension are independent.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     observed : array_like
 | |
|         The table of observed frequencies.  (While this function can handle
 | |
|         a 1-D array, that case is trivial.  Generally `observed` is at
 | |
|         least 2-D.)
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     expected : ndarray of float64
 | |
|         The expected frequencies, based on the marginal sums of the table.
 | |
|         Same shape as `observed`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats.contingency import expected_freq
 | |
|     >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
 | |
|     >>> expected_freq(observed)
 | |
|     array([[ 12.,  12.,  16.],
 | |
|            [ 18.,  18.,  24.]])
 | |
| 
 | |
|     """
 | |
|     # Typically `observed` is an integer array. If `observed` has a large
 | |
|     # number of dimensions or holds large values, some of the following
 | |
|     # computations may overflow, so we first switch to floating point.
 | |
|     observed = np.asarray(observed, dtype=np.float64)
 | |
| 
 | |
|     # Create a list of the marginal sums.
 | |
|     margsums = margins(observed)
 | |
| 
 | |
|     # Create the array of expected frequencies.  The shapes of the
 | |
|     # marginal sums returned by apply_over_axes() are just what we
 | |
|     # need for broadcasting in the following product.
 | |
|     d = observed.ndim
 | |
|     expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
 | |
|     return expected
 | |
| 
 | |
| 
 | |
| Chi2ContingencyResult = _make_tuple_bunch(
 | |
|     'Chi2ContingencyResult',
 | |
|     ['statistic', 'pvalue', 'dof', 'expected_freq'], []
 | |
| )
 | |
| 
 | |
| 
 | |
| def chi2_contingency(observed, correction=True, lambda_=None, *, method=None):
 | |
|     """Chi-square test of independence of variables in a contingency table.
 | |
| 
 | |
|     This function computes the chi-square statistic and p-value for the
 | |
|     hypothesis test of independence of the observed frequencies in the
 | |
|     contingency table [1]_ `observed`.  The expected frequencies are computed
 | |
|     based on the marginal sums under the assumption of independence; see
 | |
|     `scipy.stats.contingency.expected_freq`.  The number of degrees of
 | |
|     freedom is (expressed using numpy functions and attributes)::
 | |
| 
 | |
|         dof = observed.size - sum(observed.shape) + observed.ndim - 1
 | |
| 
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     observed : array_like
 | |
|         The contingency table. The table contains the observed frequencies
 | |
|         (i.e. number of occurrences) in each category.  In the two-dimensional
 | |
|         case, the table is often described as an "R x C table".
 | |
|     correction : bool, optional
 | |
|         If True, *and* the degrees of freedom is 1, apply Yates' correction
 | |
|         for continuity.  The effect of the correction is to adjust each
 | |
|         observed value by 0.5 towards the corresponding expected value.
 | |
|     lambda_ : float or str, optional
 | |
|         By default, the statistic computed in this test is Pearson's
 | |
|         chi-squared statistic [2]_.  `lambda_` allows a statistic from the
 | |
|         Cressie-Read power divergence family [3]_ to be used instead.  See
 | |
|         `scipy.stats.power_divergence` for details.
 | |
|     method : ResamplingMethod, optional
 | |
|         Defines the method used to compute the p-value. Compatible only with
 | |
|         `correction=False`,  default `lambda_`, and two-way tables.
 | |
|         If `method` is an instance of `PermutationMethod`/`MonteCarloMethod`,
 | |
|         the p-value is computed using
 | |
|         `scipy.stats.permutation_test`/`scipy.stats.monte_carlo_test` with the
 | |
|         provided configuration options and other appropriate settings.
 | |
|         Otherwise, the p-value is computed as documented in the notes.
 | |
|         Note that if `method` is an instance of `MonteCarloMethod`, the ``rvs``
 | |
|         attribute must be left unspecified; Monte Carlo samples are always drawn
 | |
|         using the ``rvs`` method of `scipy.stats.random_table`.
 | |
| 
 | |
|         .. versionadded:: 1.15.0
 | |
| 
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     res : Chi2ContingencyResult
 | |
|         An object containing attributes:
 | |
| 
 | |
|         statistic : float
 | |
|             The test statistic.
 | |
|         pvalue : float
 | |
|             The p-value of the test.
 | |
|         dof : int
 | |
|             The degrees of freedom. NaN if `method` is not ``None``.
 | |
|         expected_freq : ndarray, same shape as `observed`
 | |
|             The expected frequencies, based on the marginal sums of the table.
 | |
| 
 | |
|     See Also
 | |
|     --------
 | |
|     scipy.stats.contingency.expected_freq
 | |
|     scipy.stats.fisher_exact
 | |
|     scipy.stats.chisquare
 | |
|     scipy.stats.power_divergence
 | |
|     scipy.stats.barnard_exact
 | |
|     scipy.stats.boschloo_exact
 | |
|     :ref:`hypothesis_chi2_contingency` : Extended example
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     An often quoted guideline for the validity of this calculation is that
 | |
|     the test should be used only if the observed and expected frequencies
 | |
|     in each cell are at least 5.
 | |
| 
 | |
|     This is a test for the independence of different categories of a
 | |
|     population. The test is only meaningful when the dimension of
 | |
|     `observed` is two or more.  Applying the test to a one-dimensional
 | |
|     table will always result in `expected` equal to `observed` and a
 | |
|     chi-square statistic equal to 0.
 | |
| 
 | |
|     This function does not handle masked arrays, because the calculation
 | |
|     does not make sense with missing values.
 | |
| 
 | |
|     Like `scipy.stats.chisquare`, this function computes a chi-square
 | |
|     statistic; the convenience this function provides is to figure out the
 | |
|     expected frequencies and degrees of freedom from the given contingency
 | |
|     table. If these were already known, and if the Yates' correction was not
 | |
|     required, one could use `scipy.stats.chisquare`.  That is, if one calls::
 | |
| 
 | |
|         res = chi2_contingency(obs, correction=False)
 | |
| 
 | |
|     then the following is true::
 | |
| 
 | |
|         (res.statistic, res.pvalue) == stats.chisquare(obs.ravel(),
 | |
|                                                        f_exp=ex.ravel(),
 | |
|                                                        ddof=obs.size - 1 - dof)
 | |
| 
 | |
|     The `lambda_` argument was added in version 0.13.0 of scipy.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] "Contingency table",
 | |
|            https://en.wikipedia.org/wiki/Contingency_table
 | |
|     .. [2] "Pearson's chi-squared test",
 | |
|            https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
 | |
|     .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
 | |
|            Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
 | |
|            pp. 440-464.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     A two-way example (2 x 3):
 | |
| 
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats import chi2_contingency
 | |
|     >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
 | |
|     >>> res = chi2_contingency(obs)
 | |
|     >>> res.statistic
 | |
|     2.7777777777777777
 | |
|     >>> res.pvalue
 | |
|     0.24935220877729619
 | |
|     >>> res.dof
 | |
|     2
 | |
|     >>> res.expected_freq
 | |
|     array([[ 12.,  12.,  16.],
 | |
|            [ 18.,  18.,  24.]])
 | |
| 
 | |
|     Perform the test using the log-likelihood ratio (i.e. the "G-test")
 | |
|     instead of Pearson's chi-squared statistic.
 | |
| 
 | |
|     >>> res = chi2_contingency(obs, lambda_="log-likelihood")
 | |
|     >>> res.statistic
 | |
|     2.7688587616781319
 | |
|     >>> res.pvalue
 | |
|     0.25046668010954165
 | |
| 
 | |
|     A four-way example (2 x 2 x 2 x 2):
 | |
| 
 | |
|     >>> obs = np.array(
 | |
|     ...     [[[[12, 17],
 | |
|     ...        [11, 16]],
 | |
|     ...       [[11, 12],
 | |
|     ...        [15, 16]]],
 | |
|     ...      [[[23, 15],
 | |
|     ...        [30, 22]],
 | |
|     ...       [[14, 17],
 | |
|     ...        [15, 16]]]])
 | |
|     >>> res = chi2_contingency(obs)
 | |
|     >>> res.statistic
 | |
|     8.7584514426741897
 | |
|     >>> res.pvalue
 | |
|     0.64417725029295503
 | |
| 
 | |
|     When the sum of the elements in a two-way table is small, the p-value
 | |
|     produced by the default asymptotic approximation may be inaccurate.
 | |
|     Consider passing a `PermutationMethod` or `MonteCarloMethod` as the
 | |
|     `method` parameter with `correction=False`.
 | |
| 
 | |
|     >>> from scipy.stats import PermutationMethod
 | |
|     >>> obs = np.asarray([[12, 3],
 | |
|     ...                   [17, 16]])
 | |
|     >>> res = chi2_contingency(obs, correction=False)
 | |
|     >>> ref = chi2_contingency(obs, correction=False, method=PermutationMethod())
 | |
|     >>> res.pvalue, ref.pvalue
 | |
|     (0.0614122539870913, 0.1074)  # may vary
 | |
| 
 | |
|     For a more detailed example, see :ref:`hypothesis_chi2_contingency`.
 | |
| 
 | |
|     """
 | |
|     observed = np.asarray(observed)
 | |
|     if np.any(observed < 0):
 | |
|         raise ValueError("All values in `observed` must be nonnegative.")
 | |
|     if observed.size == 0:
 | |
|         raise ValueError("No data; `observed` has size 0.")
 | |
| 
 | |
|     expected = expected_freq(observed)
 | |
|     if np.any(expected == 0):
 | |
|         # Include one of the positions where expected is zero in
 | |
|         # the exception message.
 | |
|         zeropos = list(zip(*np.nonzero(expected == 0)))[0]
 | |
|         raise ValueError("The internally computed table of expected "
 | |
|                          f"frequencies has a zero element at {zeropos}.")
 | |
| 
 | |
|     if method is not None:
 | |
|         return _chi2_resampling_methods(observed, expected, correction, lambda_, method)
 | |
| 
 | |
|     # The degrees of freedom
 | |
|     dof = expected.size - sum(expected.shape) + expected.ndim - 1
 | |
| 
 | |
|     if dof == 0:
 | |
|         # Degenerate case; this occurs when `observed` is 1D (or, more
 | |
|         # generally, when it has only one nontrivial dimension).  In this
 | |
|         # case, we also have observed == expected, so chi2 is 0.
 | |
|         chi2 = 0.0
 | |
|         p = 1.0
 | |
|     else:
 | |
|         if dof == 1 and correction:
 | |
|             # Adjust `observed` according to Yates' correction for continuity.
 | |
|             # Magnitude of correction no bigger than difference; see gh-13875
 | |
|             diff = expected - observed
 | |
|             direction = np.sign(diff)
 | |
|             magnitude = np.minimum(0.5, np.abs(diff))
 | |
|             observed = observed + magnitude * direction
 | |
| 
 | |
|         chi2, p = power_divergence(observed, expected,
 | |
|                                    ddof=observed.size - 1 - dof, axis=None,
 | |
|                                    lambda_=lambda_)
 | |
| 
 | |
|     return Chi2ContingencyResult(chi2, p, dof, expected)
 | |
| 
 | |
| 
 | |
| def _chi2_resampling_methods(observed, expected, correction, lambda_, method):
 | |
| 
 | |
|     if observed.ndim != 2:
 | |
|         message = 'Use of `method` is only compatible with two-way tables.'
 | |
|         raise ValueError(message)
 | |
| 
 | |
|     if correction:
 | |
|         message = f'`{correction=}` is not compatible with `{method=}.`'
 | |
|         raise ValueError(message)
 | |
| 
 | |
|     if lambda_ is not None:
 | |
|         message = f'`{lambda_=}` is not compatible with `{method=}.`'
 | |
|         raise ValueError(message)
 | |
| 
 | |
|     if isinstance(method, stats.PermutationMethod):
 | |
|         res = _chi2_permutation_method(observed, expected, method)
 | |
|     elif isinstance(method, stats.MonteCarloMethod):
 | |
|         res = _chi2_monte_carlo_method(observed, expected, method)
 | |
|     else:
 | |
|         message = (f'`{method=}` not recognized; if provided, `method` must be an '
 | |
|                    'instance of `PermutationMethod` or `MonteCarloMethod`.')
 | |
|         raise ValueError(message)
 | |
| 
 | |
|     return Chi2ContingencyResult(res.statistic, res.pvalue, np.nan, expected)
 | |
| 
 | |
| 
 | |
| def _chi2_permutation_method(observed, expected, method):
 | |
|     x, y = _untabulate(observed)
 | |
|     # `permutation_test` with `permutation_type='pairings' permutes the order of `x`,
 | |
|     # which pairs observations in `x` with different observations in `y`.
 | |
|     def statistic(x):
 | |
|         # crosstab the resample and compute the statistic
 | |
|         table = crosstab(x, y)[1]
 | |
|         return np.sum((table - expected)**2/expected)
 | |
| 
 | |
|     return stats.permutation_test((x,), statistic, permutation_type='pairings',
 | |
|                                   alternative='greater', **method._asdict())
 | |
| 
 | |
| 
 | |
| def _chi2_monte_carlo_method(observed, expected, method):
 | |
|     method = method._asdict()
 | |
| 
 | |
|     if method.pop('rvs', None) is not None:
 | |
|         message = ('If the `method` argument of `chi2_contingency` is an '
 | |
|                    'instance of `MonteCarloMethod`, its `rvs` attribute '
 | |
|                    'must be unspecified. Use the `MonteCarloMethod` `rng` argument '
 | |
|                    'to control the random state.')
 | |
|         raise ValueError(message)
 | |
|     rng = np.random.default_rng(method.pop('rng', None))
 | |
| 
 | |
|     # `random_table.rvs` produces random contingency tables with the given marginals
 | |
|     # under the null hypothesis of independence
 | |
|     rowsums, colsums = stats.contingency.margins(observed)
 | |
|     X = stats.random_table(rowsums.ravel(), colsums.ravel(), seed=rng)
 | |
|     def rvs(size):
 | |
|         n_resamples = size[0]
 | |
|         return X.rvs(size=n_resamples).reshape(size)
 | |
| 
 | |
|     expected = expected.ravel()
 | |
|     def statistic(table, axis):
 | |
|         return np.sum((table - expected)**2/expected, axis=axis)
 | |
| 
 | |
|     return stats.monte_carlo_test(observed.ravel(), rvs, statistic,
 | |
|                                   alternative='greater', **method)
 | |
| 
 | |
| 
 | |
| def association(observed, method="cramer", correction=False, lambda_=None):
 | |
|     """Calculates degree of association between two nominal variables.
 | |
| 
 | |
|     The function provides the option for computing one of three measures of
 | |
|     association between two nominal variables from the data given in a 2d
 | |
|     contingency table: Tschuprow's T, Pearson's Contingency Coefficient
 | |
|     and Cramer's V.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     observed : array-like
 | |
|         The array of observed values
 | |
|     method : {"cramer", "tschuprow", "pearson"} (default = "cramer")
 | |
|         The association test statistic.
 | |
|     correction : bool, optional
 | |
|         Inherited from `scipy.stats.contingency.chi2_contingency()`
 | |
|     lambda_ : float or str, optional
 | |
|         Inherited from `scipy.stats.contingency.chi2_contingency()`
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float
 | |
|         Value of the test statistic
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient, all
 | |
|     measure the degree to which two nominal or ordinal variables are related,
 | |
|     or the level of their association. This differs from correlation, although
 | |
|     many often mistakenly consider them equivalent. Correlation measures in
 | |
|     what way two variables are related, whereas, association measures how
 | |
|     related the variables are. As such, association does not subsume
 | |
|     independent variables, and is rather a test of independence. A value of
 | |
|     1.0 indicates perfect association, and 0.0 means the variables have no
 | |
|     association.
 | |
| 
 | |
|     Both the Cramer's V and Tschuprow's T are extensions of the phi
 | |
|     coefficient.  Moreover, due to the close relationship between the
 | |
|     Cramer's V and Tschuprow's T the returned values can often be similar
 | |
|     or even equivalent.  They are likely to diverge more as the array shape
 | |
|     diverges from a 2x2.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] "Tschuprow's T",
 | |
|            https://en.wikipedia.org/wiki/Tschuprow's_T
 | |
|     .. [2] Tschuprow, A. A. (1939)
 | |
|            Principles of the Mathematical Theory of Correlation;
 | |
|            translated by M. Kantorowitsch. W. Hodge & Co.
 | |
|     .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
 | |
|     .. [4] "Nominal Association: Phi and Cramer's V",
 | |
|            http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
 | |
|     .. [5] Gingrich, Paul, "Association Between Variables",
 | |
|            http://uregina.ca/~gingrich/ch11a.pdf
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     An example with a 4x2 contingency table:
 | |
| 
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats.contingency import association
 | |
|     >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])
 | |
| 
 | |
|     Pearson's contingency coefficient
 | |
| 
 | |
|     >>> association(obs4x2, method="pearson")
 | |
|     0.18303298140595667
 | |
| 
 | |
|     Cramer's V
 | |
| 
 | |
|     >>> association(obs4x2, method="cramer")
 | |
|     0.18617813077483678
 | |
| 
 | |
|     Tschuprow's T
 | |
| 
 | |
|     >>> association(obs4x2, method="tschuprow")
 | |
|     0.14146478765062995
 | |
|     """
 | |
|     arr = np.asarray(observed)
 | |
|     if not np.issubdtype(arr.dtype, np.integer):
 | |
|         raise ValueError("`observed` must be an integer array.")
 | |
| 
 | |
|     if len(arr.shape) != 2:
 | |
|         raise ValueError("method only accepts 2d arrays")
 | |
| 
 | |
|     chi2_stat = chi2_contingency(arr, correction=correction,
 | |
|                                  lambda_=lambda_)
 | |
| 
 | |
|     phi2 = chi2_stat.statistic / arr.sum()
 | |
|     n_rows, n_cols = arr.shape
 | |
|     if method == "cramer":
 | |
|         value = phi2 / min(n_cols - 1, n_rows - 1)
 | |
|     elif method == "tschuprow":
 | |
|         value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
 | |
|     elif method == 'pearson':
 | |
|         value = phi2 / (1 + phi2)
 | |
|     else:
 | |
|         raise ValueError("Invalid argument value: 'method' argument must "
 | |
|                          "be 'cramer', 'tschuprow', or 'pearson'")
 | |
| 
 | |
|     return math.sqrt(value)
 |