3659 lines
		
	
	
		
			120 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			3659 lines
		
	
	
		
			120 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| An extension of scipy.stats._stats_py to support masked arrays
 | |
| 
 | |
| """
 | |
| # Original author (2007): Pierre GF Gerard-Marchant
 | |
| 
 | |
| 
 | |
| __all__ = ['argstoarray',
 | |
|            'count_tied_groups',
 | |
|            'describe',
 | |
|            'f_oneway', 'find_repeats','friedmanchisquare',
 | |
|            'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
 | |
|            'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
 | |
|            'ks_1samp', 'kstest',
 | |
|            'linregress',
 | |
|            'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
 | |
|            'normaltest',
 | |
|            'obrientransform',
 | |
|            'pearsonr','plotting_positions','pointbiserialr',
 | |
|            'rankdata',
 | |
|            'scoreatpercentile','sem',
 | |
|            'sen_seasonal_slopes','skew','skewtest','spearmanr',
 | |
|            'siegelslopes', 'theilslopes',
 | |
|            'tmax','tmean','tmin','trim','trimboth',
 | |
|            'trimtail','trima','trimr','trimmed_mean','trimmed_std',
 | |
|            'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
 | |
|            'ttest_ind','ttest_rel','tvar',
 | |
|            'variation',
 | |
|            'winsorize',
 | |
|            'brunnermunzel',
 | |
|            ]
 | |
| 
 | |
| import numpy as np
 | |
| from numpy import ndarray
 | |
| import numpy.ma as ma
 | |
| from numpy.ma import masked, nomask
 | |
| import math
 | |
| 
 | |
| import itertools
 | |
| import warnings
 | |
| from collections import namedtuple
 | |
| 
 | |
| from . import distributions
 | |
| from scipy._lib._util import _rename_parameter, _contains_nan
 | |
| from scipy._lib._bunch import _make_tuple_bunch
 | |
| import scipy.special as special
 | |
| import scipy.stats._stats_py
 | |
| import scipy.stats._stats_py as _stats_py
 | |
| 
 | |
| from ._stats_mstats_common import (
 | |
|         _find_repeats,
 | |
|         theilslopes as stats_theilslopes,
 | |
|         siegelslopes as stats_siegelslopes
 | |
|         )
 | |
| 
 | |
| 
 | |
| def _chk_asarray(a, axis):
 | |
|     # Always returns a masked array, raveled for axis=None
 | |
|     a = ma.asanyarray(a)
 | |
|     if axis is None:
 | |
|         a = ma.ravel(a)
 | |
|         outaxis = 0
 | |
|     else:
 | |
|         outaxis = axis
 | |
|     return a, outaxis
 | |
| 
 | |
| 
 | |
| def _chk2_asarray(a, b, axis):
 | |
|     a = ma.asanyarray(a)
 | |
|     b = ma.asanyarray(b)
 | |
|     if axis is None:
 | |
|         a = ma.ravel(a)
 | |
|         b = ma.ravel(b)
 | |
|         outaxis = 0
 | |
|     else:
 | |
|         outaxis = axis
 | |
|     return a, b, outaxis
 | |
| 
 | |
| 
 | |
| def _chk_size(a, b):
 | |
|     a = ma.asanyarray(a)
 | |
|     b = ma.asanyarray(b)
 | |
|     (na, nb) = (a.size, b.size)
 | |
|     if na != nb:
 | |
|         raise ValueError("The size of the input array should match!"
 | |
|                          f" ({na} <> {nb})")
 | |
|     return (a, b, na)
 | |
| 
 | |
| 
 | |
| def _ttest_finish(df, t, alternative):
 | |
|     """Common code between all 3 t-test functions."""
 | |
|     # We use ``stdtr`` directly here to preserve masked arrays
 | |
| 
 | |
|     if alternative == 'less':
 | |
|         pval = special._ufuncs.stdtr(df, t)
 | |
|     elif alternative == 'greater':
 | |
|         pval = special._ufuncs.stdtr(df, -t)
 | |
|     elif alternative == 'two-sided':
 | |
|         pval = special._ufuncs.stdtr(df, -np.abs(t))*2
 | |
|     else:
 | |
|         raise ValueError("alternative must be "
 | |
|                          "'less', 'greater' or 'two-sided'")
 | |
| 
 | |
|     if t.ndim == 0:
 | |
|         t = t[()]
 | |
|     if pval.ndim == 0:
 | |
|         pval = pval[()]
 | |
| 
 | |
|     return t, pval
 | |
| 
 | |
| 
 | |
| def argstoarray(*args):
 | |
|     """
 | |
|     Constructs a 2D array from a group of sequences.
 | |
| 
 | |
|     Sequences are filled with missing values to match the length of the longest
 | |
|     sequence.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     *args : sequences
 | |
|         Group of sequences.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     argstoarray : MaskedArray
 | |
|         A ( `m` x `n` ) masked array, where `m` is the number of arguments and
 | |
|         `n` the length of the longest argument.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     `numpy.ma.vstack` has identical behavior, but is called with a sequence
 | |
|     of sequences.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     A 2D masked array constructed from a group of sequences is returned.
 | |
| 
 | |
|     >>> from scipy.stats.mstats import argstoarray
 | |
|     >>> argstoarray([1, 2, 3], [4, 5, 6])
 | |
|     masked_array(
 | |
|      data=[[1.0, 2.0, 3.0],
 | |
|            [4.0, 5.0, 6.0]],
 | |
|      mask=[[False, False, False],
 | |
|            [False, False, False]],
 | |
|      fill_value=1e+20)
 | |
| 
 | |
|     The returned masked array filled with missing values when the lengths of
 | |
|     sequences are different.
 | |
| 
 | |
|     >>> argstoarray([1, 3], [4, 5, 6])
 | |
|     masked_array(
 | |
|      data=[[1.0, 3.0, --],
 | |
|            [4.0, 5.0, 6.0]],
 | |
|      mask=[[False, False,  True],
 | |
|            [False, False, False]],
 | |
|      fill_value=1e+20)
 | |
| 
 | |
|     """
 | |
|     if len(args) == 1 and not isinstance(args[0], ndarray):
 | |
|         output = ma.asarray(args[0])
 | |
|         if output.ndim != 2:
 | |
|             raise ValueError("The input should be 2D")
 | |
|     else:
 | |
|         n = len(args)
 | |
|         m = max([len(k) for k in args])
 | |
|         output = ma.array(np.empty((n,m), dtype=float), mask=True)
 | |
|         for (k,v) in enumerate(args):
 | |
|             output[k,:len(v)] = v
 | |
| 
 | |
|     output[np.logical_not(np.isfinite(output._data))] = masked
 | |
|     return output
 | |
| 
 | |
| 
 | |
| def find_repeats(arr):
 | |
|     """Find repeats in arr and return a tuple (repeats, repeat_count).
 | |
| 
 | |
|     The input is cast to float64. Masked values are discarded.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     arr : sequence
 | |
|         Input array. The array is flattened if it is not 1D.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     repeats : ndarray
 | |
|         Array of repeated values.
 | |
|     counts : ndarray
 | |
|         Array of counts.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> from scipy.stats import mstats
 | |
|     >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
 | |
|     (array([2.]), array([4]))
 | |
| 
 | |
|     In the above example, 2 repeats 4 times.
 | |
| 
 | |
|     >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
 | |
|     (array([4., 5.]), array([2, 2]))
 | |
| 
 | |
|     In the above example, both 4 and 5 repeat 2 times.
 | |
| 
 | |
|     """
 | |
|     # Make sure we get a copy. ma.compressed promises a "new array", but can
 | |
|     # actually return a reference.
 | |
|     compr = np.asarray(ma.compressed(arr), dtype=np.float64)
 | |
|     try:
 | |
|         need_copy = np.may_share_memory(compr, arr)
 | |
|     except AttributeError:
 | |
|         # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
 | |
|         # while in numpy 1.8.2 and above it just (correctly) returns False.
 | |
|         need_copy = False
 | |
|     if need_copy:
 | |
|         compr = compr.copy()
 | |
|     return _find_repeats(compr)
 | |
| 
 | |
| 
 | |
| def count_tied_groups(x, use_missing=False):
 | |
|     """
 | |
|     Counts the number of tied values.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : sequence
 | |
|         Sequence of data on which to counts the ties
 | |
|     use_missing : bool, optional
 | |
|         Whether to consider missing values as tied.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     count_tied_groups : dict
 | |
|         Returns a dictionary (nb of ties: nb of groups).
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> from scipy.stats import mstats
 | |
|     >>> import numpy as np
 | |
|     >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
 | |
|     >>> mstats.count_tied_groups(z)
 | |
|     {2: 1, 3: 2}
 | |
| 
 | |
|     In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).
 | |
| 
 | |
|     >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
 | |
|     >>> mstats.count_tied_groups(z)
 | |
|     {2: 2, 3: 1}
 | |
|     >>> z[[1,-1]] = np.ma.masked
 | |
|     >>> mstats.count_tied_groups(z, use_missing=True)
 | |
|     {2: 2, 3: 1}
 | |
| 
 | |
|     """
 | |
|     nmasked = ma.getmask(x).sum()
 | |
|     # We need the copy as find_repeats will overwrite the initial data
 | |
|     data = ma.compressed(x).copy()
 | |
|     (ties, counts) = find_repeats(data)
 | |
|     nties = {}
 | |
|     if len(ties):
 | |
|         nties = dict(zip(np.unique(counts), itertools.repeat(1)))
 | |
|         nties.update(dict(zip(*find_repeats(counts))))
 | |
| 
 | |
|     if nmasked and use_missing:
 | |
|         try:
 | |
|             nties[nmasked] += 1
 | |
|         except KeyError:
 | |
|             nties[nmasked] = 1
 | |
| 
 | |
|     return nties
 | |
| 
 | |
| 
 | |
| def rankdata(data, axis=None, use_missing=False):
 | |
|     """Returns the rank (also known as order statistics) of each data point
 | |
|     along the given axis.
 | |
| 
 | |
|     If some values are tied, their rank is averaged.
 | |
|     If some values are masked, their rank is set to 0 if use_missing is False,
 | |
|     or set to the average rank of the unmasked values if use_missing is True.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     data : sequence
 | |
|         Input data. The data is transformed to a masked array
 | |
|     axis : {None,int}, optional
 | |
|         Axis along which to perform the ranking.
 | |
|         If None, the array is first flattened. An exception is raised if
 | |
|         the axis is specified for arrays with a dimension larger than 2
 | |
|     use_missing : bool, optional
 | |
|         Whether the masked values have a rank of 0 (False) or equal to the
 | |
|         average rank of the unmasked values (True).
 | |
| 
 | |
|     """
 | |
|     def _rank1d(data, use_missing=False):
 | |
|         n = data.count()
 | |
|         rk = np.empty(data.size, dtype=float)
 | |
|         idx = data.argsort()
 | |
|         rk[idx[:n]] = np.arange(1,n+1)
 | |
| 
 | |
|         if use_missing:
 | |
|             rk[idx[n:]] = (n+1)/2.
 | |
|         else:
 | |
|             rk[idx[n:]] = 0
 | |
| 
 | |
|         repeats = find_repeats(data.copy())
 | |
|         for r in repeats[0]:
 | |
|             condition = (data == r).filled(False)
 | |
|             rk[condition] = rk[condition].mean()
 | |
|         return rk
 | |
| 
 | |
|     data = ma.array(data, copy=False)
 | |
|     if axis is None:
 | |
|         if data.ndim > 1:
 | |
|             return _rank1d(data.ravel(), use_missing).reshape(data.shape)
 | |
|         else:
 | |
|             return _rank1d(data, use_missing)
 | |
|     else:
 | |
|         return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)
 | |
| 
 | |
| 
 | |
| ModeResult = namedtuple('ModeResult', ('mode', 'count'))
 | |
| 
 | |
| 
 | |
| def mode(a, axis=0):
 | |
|     """
 | |
|     Returns an array of the modal (most common) value in the passed array.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         n-dimensional array of which to find mode(s).
 | |
|     axis : int or None, optional
 | |
|         Axis along which to operate. Default is 0. If None, compute over
 | |
|         the whole array `a`.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     mode : ndarray
 | |
|         Array of modal values.
 | |
|     count : ndarray
 | |
|         Array of counts for each mode.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details, see `scipy.stats.mode`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy import stats
 | |
|     >>> from scipy.stats import mstats
 | |
|     >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
 | |
|     >>> mstats.mode(m_arr)  # note that most zeros are masked
 | |
|     ModeResult(mode=array([1.]), count=array([2.]))
 | |
| 
 | |
|     """
 | |
|     return _mode(a, axis=axis, keepdims=True)
 | |
| 
 | |
| 
 | |
| def _mode(a, axis=0, keepdims=True):
 | |
|     # Don't want to expose `keepdims` from the public `mstats.mode`
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
| 
 | |
|     def _mode1D(a):
 | |
|         (rep,cnt) = find_repeats(a)
 | |
|         if not cnt.ndim:
 | |
|             return (0, 0)
 | |
|         elif cnt.size:
 | |
|             return (rep[cnt.argmax()], cnt.max())
 | |
|         else:
 | |
|             return (a.min(), 1)
 | |
| 
 | |
|     if axis is None:
 | |
|         output = _mode1D(ma.ravel(a))
 | |
|         output = (ma.array(output[0]), ma.array(output[1]))
 | |
|     else:
 | |
|         output = ma.apply_along_axis(_mode1D, axis, a)
 | |
|         if keepdims is None or keepdims:
 | |
|             newshape = list(a.shape)
 | |
|             newshape[axis] = 1
 | |
|             slices = [slice(None)] * output.ndim
 | |
|             slices[axis] = 0
 | |
|             modes = output[tuple(slices)].reshape(newshape)
 | |
|             slices[axis] = 1
 | |
|             counts = output[tuple(slices)].reshape(newshape)
 | |
|             output = (modes, counts)
 | |
|         else:
 | |
|             output = np.moveaxis(output, axis, 0)
 | |
| 
 | |
|     return ModeResult(*output)
 | |
| 
 | |
| 
 | |
| def _betai(a, b, x):
 | |
|     x = np.asanyarray(x)
 | |
|     x = ma.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
 | |
|     return special.betainc(a, b, x)
 | |
| 
 | |
| 
 | |
| def msign(x):
 | |
|     """Returns the sign of x, or 0 if x is masked."""
 | |
|     return ma.filled(np.sign(x), 0)
 | |
| 
 | |
| 
 | |
| def pearsonr(x, y):
 | |
|     r"""
 | |
|     Pearson correlation coefficient and p-value for testing non-correlation.
 | |
| 
 | |
|     The Pearson correlation coefficient [1]_ measures the linear relationship
 | |
|     between two datasets.  The calculation of the p-value relies on the
 | |
|     assumption that each dataset is normally distributed.  (See Kowalski [3]_
 | |
|     for a discussion of the effects of non-normality of the input on the
 | |
|     distribution of the correlation coefficient.)  Like other correlation
 | |
|     coefficients, this one varies between -1 and +1 with 0 implying no
 | |
|     correlation. Correlations of -1 or +1 imply an exact linear relationship.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : (N,) array_like
 | |
|         Input array.
 | |
|     y : (N,) array_like
 | |
|         Input array.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     r : float
 | |
|         Pearson's correlation coefficient.
 | |
|     p-value : float
 | |
|         Two-tailed p-value.
 | |
| 
 | |
|     Warns
 | |
|     -----
 | |
|     `~scipy.stats.ConstantInputWarning`
 | |
|         Raised if an input is a constant array.  The correlation coefficient
 | |
|         is not defined in this case, so ``np.nan`` is returned.
 | |
| 
 | |
|     `~scipy.stats.NearConstantInputWarning`
 | |
|         Raised if an input is "nearly" constant.  The array ``x`` is considered
 | |
|         nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
 | |
|         Numerical errors in the calculation ``x - mean(x)`` in this case might
 | |
|         result in an inaccurate calculation of r.
 | |
| 
 | |
|     See Also
 | |
|     --------
 | |
|     spearmanr : Spearman rank-order correlation coefficient.
 | |
|     kendalltau : Kendall's tau, a correlation measure for ordinal data.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     The correlation coefficient is calculated as follows:
 | |
| 
 | |
|     .. math::
 | |
| 
 | |
|         r = \frac{\sum (x - m_x) (y - m_y)}
 | |
|                  {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
 | |
| 
 | |
|     where :math:`m_x` is the mean of the vector x and :math:`m_y` is
 | |
|     the mean of the vector y.
 | |
| 
 | |
|     Under the assumption that x and y are drawn from
 | |
|     independent normal distributions (so the population correlation coefficient
 | |
|     is 0), the probability density function of the sample correlation
 | |
|     coefficient r is ([1]_, [2]_):
 | |
| 
 | |
|     .. math::
 | |
| 
 | |
|         f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}
 | |
| 
 | |
|     where n is the number of samples, and B is the beta function.  This
 | |
|     is sometimes referred to as the exact distribution of r.  This is
 | |
|     the distribution that is used in `pearsonr` to compute the p-value.
 | |
|     The distribution is a beta distribution on the interval [-1, 1],
 | |
|     with equal shape parameters a = b = n/2 - 1.  In terms of SciPy's
 | |
|     implementation of the beta distribution, the distribution of r is::
 | |
| 
 | |
|         dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
 | |
| 
 | |
|     The p-value returned by `pearsonr` is a two-sided p-value. The p-value
 | |
|     roughly indicates the probability of an uncorrelated system
 | |
|     producing datasets that have a Pearson correlation at least as extreme
 | |
|     as the one computed from these datasets. More precisely, for a
 | |
|     given sample with correlation coefficient r, the p-value is
 | |
|     the probability that abs(r') of a random sample x' and y' drawn from
 | |
|     the population with zero correlation would be greater than or equal
 | |
|     to abs(r). In terms of the object ``dist`` shown above, the p-value
 | |
|     for a given r and length n can be computed as::
 | |
| 
 | |
|         p = 2*dist.cdf(-abs(r))
 | |
| 
 | |
|     When n is 2, the above continuous distribution is not well-defined.
 | |
|     One can interpret the limit of the beta distribution as the shape
 | |
|     parameters a and b approach a = b = 0 as a discrete distribution with
 | |
|     equal probability masses at r = 1 and r = -1.  More directly, one
 | |
|     can observe that, given the data x = [x1, x2] and y = [y1, y2], and
 | |
|     assuming x1 != x2 and y1 != y2, the only possible values for r are 1
 | |
|     and -1.  Because abs(r') for any sample x' and y' with length 2 will
 | |
|     be 1, the two-sided p-value for a sample of length 2 is always 1.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] "Pearson correlation coefficient", Wikipedia,
 | |
|            https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
 | |
|     .. [2] Student, "Probable error of a correlation coefficient",
 | |
|            Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
 | |
|     .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
 | |
|            of the Sample Product-Moment Correlation Coefficient"
 | |
|            Journal of the Royal Statistical Society. Series C (Applied
 | |
|            Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy import stats
 | |
|     >>> from scipy.stats import mstats
 | |
|     >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
 | |
|     (-0.7426106572325057, 0.1505558088534455)
 | |
| 
 | |
|     There is a linear dependence between x and y if y = a + b*x + e, where
 | |
|     a,b are constants and e is a random error term, assumed to be independent
 | |
|     of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
 | |
|     e follow a normal distribution with mean zero and standard deviation s>0.
 | |
| 
 | |
|     >>> s = 0.5
 | |
|     >>> x = stats.norm.rvs(size=500)
 | |
|     >>> e = stats.norm.rvs(scale=s, size=500)
 | |
|     >>> y = x + e
 | |
|     >>> mstats.pearsonr(x, y)
 | |
|     (0.9029601878969703, 8.428978827629898e-185) # may vary
 | |
| 
 | |
|     This should be close to the exact value given by
 | |
| 
 | |
|     >>> 1/np.sqrt(1 + s**2)
 | |
|     0.8944271909999159
 | |
| 
 | |
|     For s=0.5, we observe a high level of correlation. In general, a large
 | |
|     variance of the noise reduces the correlation, while the correlation
 | |
|     approaches one as the variance of the error goes to zero.
 | |
| 
 | |
|     It is important to keep in mind that no correlation does not imply
 | |
|     independence unless (x, y) is jointly normal. Correlation can even be zero
 | |
|     when there is a very simple dependence structure: if X follows a
 | |
|     standard normal distribution, let y = abs(x). Note that the correlation
 | |
|     between x and y is zero. Indeed, since the expectation of x is zero,
 | |
|     cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
 | |
|     by symmetry. The following lines of code illustrate this observation:
 | |
| 
 | |
|     >>> y = np.abs(x)
 | |
|     >>> mstats.pearsonr(x, y)
 | |
|     (-0.016172891856853524, 0.7182823678751942) # may vary
 | |
| 
 | |
|     A non-zero correlation coefficient can be misleading. For example, if X has
 | |
|     a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
 | |
|     A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
 | |
|     implying a high level of correlation:
 | |
| 
 | |
|     >>> y = np.where(x < 0, x, 0)
 | |
|     >>> mstats.pearsonr(x, y)
 | |
|     (0.8537091583771509, 3.183461621422181e-143) # may vary
 | |
| 
 | |
|     This is unintuitive since there is no dependence of x and y if x is larger
 | |
|     than zero which happens in about half of the cases if we sample x and y.
 | |
|     """
 | |
|     (x, y, n) = _chk_size(x, y)
 | |
|     (x, y) = (x.ravel(), y.ravel())
 | |
|     # Get the common mask and the total nb of unmasked elements
 | |
|     m = ma.mask_or(ma.getmask(x), ma.getmask(y))
 | |
|     n -= m.sum()
 | |
|     df = n-2
 | |
|     if df < 0:
 | |
|         return (masked, masked)
 | |
| 
 | |
|     return scipy.stats._stats_py.pearsonr(
 | |
|                 ma.masked_array(x, mask=m).compressed(),
 | |
|                 ma.masked_array(y, mask=m).compressed())
 | |
| 
 | |
| 
 | |
| def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
 | |
|               alternative='two-sided'):
 | |
|     """
 | |
|     Calculates a Spearman rank-order correlation coefficient and the p-value
 | |
|     to test for non-correlation.
 | |
| 
 | |
|     The Spearman correlation is a nonparametric measure of the linear
 | |
|     relationship between two datasets. Unlike the Pearson correlation, the
 | |
|     Spearman correlation does not assume that both datasets are normally
 | |
|     distributed. Like other correlation coefficients, this one varies
 | |
|     between -1 and +1 with 0 implying no correlation. Correlations of -1 or
 | |
|     +1 imply a monotonic relationship. Positive correlations imply that
 | |
|     as `x` increases, so does `y`. Negative correlations imply that as `x`
 | |
|     increases, `y` decreases.
 | |
| 
 | |
|     Missing values are discarded pair-wise: if a value is missing in `x`, the
 | |
|     corresponding value in `y` is masked.
 | |
| 
 | |
|     The p-value roughly indicates the probability of an uncorrelated system
 | |
|     producing datasets that have a Spearman correlation at least as extreme
 | |
|     as the one computed from these datasets. The p-values are not entirely
 | |
|     reliable but are probably reasonable for datasets larger than 500 or so.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x, y : 1D or 2D array_like, y is optional
 | |
|         One or two 1-D or 2-D arrays containing multiple variables and
 | |
|         observations. When these are 1-D, each represents a vector of
 | |
|         observations of a single variable. For the behavior in the 2-D case,
 | |
|         see under ``axis``, below.
 | |
|     use_ties : bool, optional
 | |
|         DO NOT USE.  Does not do anything, keyword is only left in place for
 | |
|         backwards compatibility reasons.
 | |
|     axis : int or None, optional
 | |
|         If axis=0 (default), then each column represents a variable, with
 | |
|         observations in the rows. If axis=1, the relationship is transposed:
 | |
|         each row represents a variable, while the columns contain observations.
 | |
|         If axis=None, then both arrays will be raveled.
 | |
|     nan_policy : {'propagate', 'raise', 'omit'}, optional
 | |
|         Defines how to handle when input contains nan. 'propagate' returns nan,
 | |
|         'raise' throws an error, 'omit' performs the calculations ignoring nan
 | |
|         values. Default is 'propagate'.
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Defines the alternative hypothesis. Default is 'two-sided'.
 | |
|         The following options are available:
 | |
| 
 | |
|         * 'two-sided': the correlation is nonzero
 | |
|         * 'less': the correlation is negative (less than zero)
 | |
|         * 'greater':  the correlation is positive (greater than zero)
 | |
| 
 | |
|         .. versionadded:: 1.7.0
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     res : SignificanceResult
 | |
|         An object containing attributes:
 | |
| 
 | |
|         statistic : float or ndarray (2-D square)
 | |
|             Spearman correlation matrix or correlation coefficient (if only 2
 | |
|             variables are given as parameters). Correlation matrix is square
 | |
|             with length equal to total number of variables (columns or rows) in
 | |
|             ``a`` and ``b`` combined.
 | |
|         pvalue : float
 | |
|             The p-value for a hypothesis test whose null hypothesis
 | |
|             is that two sets of data are linearly uncorrelated. See
 | |
|             `alternative` above for alternative hypotheses. `pvalue` has the
 | |
|             same shape as `statistic`.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     [CRCProbStat2000] section 14.7
 | |
| 
 | |
|     """
 | |
|     if not use_ties:
 | |
|         raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
 | |
| 
 | |
|     # Always returns a masked array, raveled if axis=None
 | |
|     x, axisout = _chk_asarray(x, axis)
 | |
|     if y is not None:
 | |
|         # Deal only with 2-D `x` case.
 | |
|         y, _ = _chk_asarray(y, axis)
 | |
|         if axisout == 0:
 | |
|             x = ma.column_stack((x, y))
 | |
|         else:
 | |
|             x = ma.vstack((x, y))
 | |
| 
 | |
|     if axisout == 1:
 | |
|         # To simplify the code that follow (always use `n_obs, n_vars` shape)
 | |
|         x = x.T
 | |
| 
 | |
|     if nan_policy == 'omit':
 | |
|         x = ma.masked_invalid(x)
 | |
| 
 | |
|     def _spearmanr_2cols(x):
 | |
|         # Mask the same observations for all variables, and then drop those
 | |
|         # observations (can't leave them masked, rankdata is weird).
 | |
|         x = ma.mask_rowcols(x, axis=0)
 | |
|         x = x[~x.mask.any(axis=1), :]
 | |
| 
 | |
|         # If either column is entirely NaN or Inf
 | |
|         if not np.any(x.data):
 | |
|             res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
 | |
|             res.correlation = np.nan
 | |
|             return res
 | |
| 
 | |
|         m = ma.getmask(x)
 | |
|         n_obs = x.shape[0]
 | |
|         dof = n_obs - 2 - int(m.sum(axis=0)[0])
 | |
|         if dof < 0:
 | |
|             raise ValueError("The input must have at least 3 entries!")
 | |
| 
 | |
|         # Gets the ranks and rank differences
 | |
|         x_ranked = rankdata(x, axis=0)
 | |
|         rs = ma.corrcoef(x_ranked, rowvar=False).data
 | |
| 
 | |
|         # rs can have elements equal to 1, so avoid zero division warnings
 | |
|         with np.errstate(divide='ignore'):
 | |
|             # clip the small negative values possibly caused by rounding
 | |
|             # errors before taking the square root
 | |
|             t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
 | |
| 
 | |
|         t, prob = _ttest_finish(dof, t, alternative)
 | |
| 
 | |
|         # For backwards compatibility, return scalars when comparing 2 columns
 | |
|         if rs.shape == (2, 2):
 | |
|             res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
 | |
|                                                            prob[1, 0])
 | |
|             res.correlation = rs[1, 0]
 | |
|             return res
 | |
|         else:
 | |
|             res = scipy.stats._stats_py.SignificanceResult(rs, prob)
 | |
|             res.correlation = rs
 | |
|             return res
 | |
| 
 | |
|     # Need to do this per pair of variables, otherwise the dropped observations
 | |
|     # in a third column mess up the result for a pair.
 | |
|     n_vars = x.shape[1]
 | |
|     if n_vars == 2:
 | |
|         return _spearmanr_2cols(x)
 | |
|     else:
 | |
|         rs = np.ones((n_vars, n_vars), dtype=float)
 | |
|         prob = np.zeros((n_vars, n_vars), dtype=float)
 | |
|         for var1 in range(n_vars - 1):
 | |
|             for var2 in range(var1+1, n_vars):
 | |
|                 result = _spearmanr_2cols(x[:, [var1, var2]])
 | |
|                 rs[var1, var2] = result.correlation
 | |
|                 rs[var2, var1] = result.correlation
 | |
|                 prob[var1, var2] = result.pvalue
 | |
|                 prob[var2, var1] = result.pvalue
 | |
| 
 | |
|         res = scipy.stats._stats_py.SignificanceResult(rs, prob)
 | |
|         res.correlation = rs
 | |
|         return res
 | |
| 
 | |
| 
 | |
| def _kendall_p_exact(n, c, alternative='two-sided'):
 | |
| 
 | |
|     # Use the fact that distribution is symmetric: always calculate a CDF in
 | |
|     # the left tail.
 | |
|     # This will be the one-sided p-value if `c` is on the side of
 | |
|     # the null distribution predicted by the alternative hypothesis.
 | |
|     # The two-sided p-value will be twice this value.
 | |
|     # If `c` is on the other side of the null distribution, we'll need to
 | |
|     # take the complement and add back the probability mass at `c`.
 | |
|     in_right_tail = (c >= (n*(n-1))//2 - c)
 | |
|     alternative_greater = (alternative == 'greater')
 | |
|     c = int(min(c, (n*(n-1))//2 - c))
 | |
| 
 | |
|     # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
 | |
|     # (4th Edition), Charles Griffin & Co., 1970.
 | |
|     if n <= 0:
 | |
|         raise ValueError(f'n ({n}) must be positive')
 | |
|     elif c < 0 or 4*c > n*(n-1):
 | |
|         raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
 | |
|     elif n == 1:
 | |
|         prob = 1.0
 | |
|         p_mass_at_c = 1
 | |
|     elif n == 2:
 | |
|         prob = 1.0
 | |
|         p_mass_at_c = 0.5
 | |
|     elif c == 0:
 | |
|         prob = 2.0/math.factorial(n) if n < 171 else 0.0
 | |
|         p_mass_at_c = prob/2
 | |
|     elif c == 1:
 | |
|         prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
 | |
|         p_mass_at_c = (n-1)/math.factorial(n)
 | |
|     elif 4*c == n*(n-1) and alternative == 'two-sided':
 | |
|         # I'm sure there's a simple formula for p_mass_at_c in this
 | |
|         # case, but I don't know it. Use generic formula for one-sided p-value.
 | |
|         prob = 1.0
 | |
|     elif n < 171:
 | |
|         new = np.zeros(c+1)
 | |
|         new[0:2] = 1.0
 | |
|         for j in range(3,n+1):
 | |
|             new = np.cumsum(new)
 | |
|             if j <= c:
 | |
|                 new[j:] -= new[:c+1-j]
 | |
|         prob = 2.0*np.sum(new)/math.factorial(n)
 | |
|         p_mass_at_c = new[-1]/math.factorial(n)
 | |
|     else:
 | |
|         new = np.zeros(c+1)
 | |
|         new[0:2] = 1.0
 | |
|         for j in range(3, n+1):
 | |
|             new = np.cumsum(new)/j
 | |
|             if j <= c:
 | |
|                 new[j:] -= new[:c+1-j]
 | |
|         prob = np.sum(new)
 | |
|         p_mass_at_c = new[-1]/2
 | |
| 
 | |
|     if alternative != 'two-sided':
 | |
|         # if the alternative hypothesis and alternative agree,
 | |
|         # one-sided p-value is half the two-sided p-value
 | |
|         if in_right_tail == alternative_greater:
 | |
|             prob /= 2
 | |
|         else:
 | |
|             prob = 1 - prob/2 + p_mass_at_c
 | |
| 
 | |
|     prob = np.clip(prob, 0, 1)
 | |
| 
 | |
|     return prob
 | |
| 
 | |
| 
 | |
| def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
 | |
|                alternative='two-sided'):
 | |
|     """
 | |
|     Computes Kendall's rank correlation tau on two variables *x* and *y*.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : sequence
 | |
|         First data list (for example, time).
 | |
|     y : sequence
 | |
|         Second data list.
 | |
|     use_ties : {True, False}, optional
 | |
|         Whether ties correction should be performed.
 | |
|     use_missing : {False, True}, optional
 | |
|         Whether missing data should be allocated a rank of 0 (False) or the
 | |
|         average rank (True)
 | |
|     method : {'auto', 'asymptotic', 'exact'}, optional
 | |
|         Defines which method is used to calculate the p-value [1]_.
 | |
|         'asymptotic' uses a normal approximation valid for large samples.
 | |
|         'exact' computes the exact p-value, but can only be used if no ties
 | |
|         are present. As the sample size increases, the 'exact' computation
 | |
|         time may grow and the result may lose some precision.
 | |
|         'auto' is the default and selects the appropriate
 | |
|         method based on a trade-off between speed and accuracy.
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Defines the alternative hypothesis. Default is 'two-sided'.
 | |
|         The following options are available:
 | |
| 
 | |
|         * 'two-sided': the rank correlation is nonzero
 | |
|         * 'less': the rank correlation is negative (less than zero)
 | |
|         * 'greater':  the rank correlation is positive (greater than zero)
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     res : SignificanceResult
 | |
|         An object containing attributes:
 | |
| 
 | |
|         statistic : float
 | |
|            The tau statistic.
 | |
|         pvalue : float
 | |
|            The p-value for a hypothesis test whose null hypothesis is
 | |
|            an absence of association, tau = 0.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
 | |
|            Charles Griffin & Co., 1970.
 | |
| 
 | |
|     """
 | |
|     (x, y, n) = _chk_size(x, y)
 | |
|     (x, y) = (x.flatten(), y.flatten())
 | |
|     m = ma.mask_or(ma.getmask(x), ma.getmask(y))
 | |
|     if m is not nomask:
 | |
|         x = ma.array(x, mask=m, copy=True)
 | |
|         y = ma.array(y, mask=m, copy=True)
 | |
|         # need int() here, otherwise numpy defaults to 32 bit
 | |
|         # integer on all Windows architectures, causing overflow.
 | |
|         # int() will keep it infinite precision.
 | |
|         n -= int(m.sum())
 | |
| 
 | |
|     if n < 2:
 | |
|         res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
 | |
|         res.correlation = np.nan
 | |
|         return res
 | |
| 
 | |
|     rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
 | |
|     ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
 | |
|     idx = rx.argsort()
 | |
|     (rx, ry) = (rx[idx], ry[idx])
 | |
|     C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
 | |
|                 for i in range(len(ry)-1)], dtype=float)
 | |
|     D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum()
 | |
|                 for i in range(len(ry)-1)], dtype=float)
 | |
|     xties = count_tied_groups(x)
 | |
|     yties = count_tied_groups(y)
 | |
|     if use_ties:
 | |
|         corr_x = np.sum([v*k*(k-1) for (k,v) in xties.items()], dtype=float)
 | |
|         corr_y = np.sum([v*k*(k-1) for (k,v) in yties.items()], dtype=float)
 | |
|         denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
 | |
|     else:
 | |
|         denom = n*(n-1)/2.
 | |
|     tau = (C-D) / denom
 | |
| 
 | |
|     if method == 'exact' and (xties or yties):
 | |
|         raise ValueError("Ties found, exact method cannot be used.")
 | |
| 
 | |
|     if method == 'auto':
 | |
|         if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
 | |
|             method = 'exact'
 | |
|         else:
 | |
|             method = 'asymptotic'
 | |
| 
 | |
|     if not xties and not yties and method == 'exact':
 | |
|         prob = _kendall_p_exact(n, C, alternative)
 | |
| 
 | |
|     elif method == 'asymptotic':
 | |
|         var_s = n*(n-1)*(2*n+5)
 | |
|         if use_ties:
 | |
|             var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in xties.items()])
 | |
|             var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in yties.items()])
 | |
|             v1 = (np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *
 | |
|                   np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float))
 | |
|             v1 /= 2.*n*(n-1)
 | |
|             if n > 2:
 | |
|                 v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in xties.items()],
 | |
|                             dtype=float) * \
 | |
|                      np.sum([v*k*(k-1)*(k-2) for (k,v) in yties.items()],
 | |
|                             dtype=float)
 | |
|                 v2 /= 9.*n*(n-1)*(n-2)
 | |
|             else:
 | |
|                 v2 = 0
 | |
|         else:
 | |
|             v1 = v2 = 0
 | |
| 
 | |
|         var_s /= 18.
 | |
|         var_s += (v1 + v2)
 | |
|         z = (C-D)/np.sqrt(var_s)
 | |
|         prob = scipy.stats._stats_py._get_pvalue(z, distributions.norm, alternative)
 | |
|     else:
 | |
|         raise ValueError("Unknown method "+str(method)+" specified, please "
 | |
|                          "use auto, exact or asymptotic.")
 | |
| 
 | |
|     res = scipy.stats._stats_py.SignificanceResult(tau[()], prob[()])
 | |
|     res.correlation = tau
 | |
|     return res
 | |
| 
 | |
| 
 | |
| def kendalltau_seasonal(x):
 | |
|     """
 | |
|     Computes a multivariate Kendall's rank correlation tau, for seasonal data.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : 2-D ndarray
 | |
|         Array of seasonal data, with seasons in columns.
 | |
| 
 | |
|     """
 | |
|     x = ma.array(x, subok=True, copy=False, ndmin=2)
 | |
|     (n,m) = x.shape
 | |
|     n_p = x.count(0)
 | |
| 
 | |
|     S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
 | |
|     S_tot = S_szn.sum()
 | |
| 
 | |
|     n_tot = x.count()
 | |
|     ties = count_tied_groups(x.compressed())
 | |
|     corr_ties = sum(v*k*(k-1) for (k,v) in ties.items())
 | |
|     denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.
 | |
| 
 | |
|     R = rankdata(x, axis=0, use_missing=True)
 | |
|     K = ma.empty((m,m), dtype=int)
 | |
|     covmat = ma.empty((m,m), dtype=float)
 | |
|     denom_szn = ma.empty(m, dtype=float)
 | |
|     for j in range(m):
 | |
|         ties_j = count_tied_groups(x[:,j].compressed())
 | |
|         corr_j = sum(v*k*(k-1) for (k,v) in ties_j.items())
 | |
|         cmb = n_p[j]*(n_p[j]-1)
 | |
|         for k in range(j,m,1):
 | |
|             K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
 | |
|                          for i in range(n))
 | |
|             covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() -
 | |
|                            n*(n_p[j]+1)*(n_p[k]+1))/3.
 | |
|             K[k,j] = K[j,k]
 | |
|             covmat[k,j] = covmat[j,k]
 | |
| 
 | |
|         denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.
 | |
| 
 | |
|     var_szn = covmat.diagonal()
 | |
| 
 | |
|     z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
 | |
|     z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
 | |
|     z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())
 | |
| 
 | |
|     prob_szn = special.erfc(abs(z_szn.data)/np.sqrt(2))
 | |
|     prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
 | |
|     prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))
 | |
| 
 | |
|     chi2_tot = (z_szn*z_szn).sum()
 | |
|     chi2_trd = m * z_szn.mean()**2
 | |
|     output = {'seasonal tau': S_szn/denom_szn,
 | |
|               'global tau': S_tot/denom_tot,
 | |
|               'global tau (alt)': S_tot/denom_szn.sum(),
 | |
|               'seasonal p-value': prob_szn,
 | |
|               'global p-value (indep)': prob_tot_ind,
 | |
|               'global p-value (dep)': prob_tot_dep,
 | |
|               'chi2 total': chi2_tot,
 | |
|               'chi2 trend': chi2_trd,
 | |
|               }
 | |
|     return output
 | |
| 
 | |
| 
 | |
| PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
 | |
|                                                            'pvalue'))
 | |
| 
 | |
| 
 | |
| def pointbiserialr(x, y):
 | |
|     """Calculates a point biserial correlation coefficient and its p-value.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : array_like of bools
 | |
|         Input array.
 | |
|     y : array_like
 | |
|         Input array.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     correlation : float
 | |
|         R value
 | |
|     pvalue : float
 | |
|         2-tailed p-value
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     Missing values are considered pair-wise: if a value is missing in x,
 | |
|     the corresponding value in y is masked.
 | |
| 
 | |
|     For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.
 | |
| 
 | |
|     """
 | |
|     x = ma.fix_invalid(x, copy=True).astype(bool)
 | |
|     y = ma.fix_invalid(y, copy=True).astype(float)
 | |
|     # Get rid of the missing data
 | |
|     m = ma.mask_or(ma.getmask(x), ma.getmask(y))
 | |
|     if m is not nomask:
 | |
|         unmask = np.logical_not(m)
 | |
|         x = x[unmask]
 | |
|         y = y[unmask]
 | |
| 
 | |
|     n = len(x)
 | |
|     # phat is the fraction of x values that are True
 | |
|     phat = x.sum() / float(n)
 | |
|     y0 = y[~x]  # y-values where x is False
 | |
|     y1 = y[x]  # y-values where x is True
 | |
|     y0m = y0.mean()
 | |
|     y1m = y1.mean()
 | |
| 
 | |
|     rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()
 | |
| 
 | |
|     df = n-2
 | |
|     t = rpb*ma.sqrt(df/(1.0-rpb**2))
 | |
|     prob = _betai(0.5*df, 0.5, df/(df+t*t))
 | |
| 
 | |
|     return PointbiserialrResult(rpb, prob)
 | |
| 
 | |
| 
 | |
| def linregress(x, y=None):
 | |
|     r"""
 | |
|     Calculate a linear least-squares regression for two sets of measurements.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x, y : array_like
 | |
|         Two sets of measurements.  Both arrays should have the same length N.  If
 | |
|         only `x` is given (and ``y=None``), then it must be a two-dimensional
 | |
|         array where one dimension has length 2.  The two sets of measurements
 | |
|         are then found by splitting the array along the length-2 dimension. In
 | |
|         the case where ``y=None`` and `x` is a 2xN array, ``linregress(x)`` is
 | |
|         equivalent to ``linregress(x[0], x[1])``.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     result : ``LinregressResult`` instance
 | |
|         The return value is an object with the following attributes:
 | |
| 
 | |
|         slope : float
 | |
|             Slope of the regression line.
 | |
|         intercept : float
 | |
|             Intercept of the regression line.
 | |
|         rvalue : float
 | |
|             The Pearson correlation coefficient. The square of ``rvalue``
 | |
|             is equal to the coefficient of determination.
 | |
|         pvalue : float
 | |
|             The p-value for a hypothesis test whose null hypothesis is
 | |
|             that the slope is zero, using Wald Test with t-distribution of
 | |
|             the test statistic. See `alternative` above for alternative
 | |
|             hypotheses.
 | |
|         stderr : float
 | |
|             Standard error of the estimated slope (gradient), under the
 | |
|             assumption of residual normality.
 | |
|         intercept_stderr : float
 | |
|             Standard error of the estimated intercept, under the assumption
 | |
|             of residual normality.
 | |
| 
 | |
|     See Also
 | |
|     --------
 | |
|     scipy.optimize.curve_fit :
 | |
|         Use non-linear least squares to fit a function to data.
 | |
|     scipy.optimize.leastsq :
 | |
|         Minimize the sum of squares of a set of equations.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     Missing values are considered pair-wise: if a value is missing in `x`,
 | |
|     the corresponding value in `y` is masked.
 | |
| 
 | |
|     For compatibility with older versions of SciPy, the return value acts
 | |
|     like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
 | |
|     ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::
 | |
| 
 | |
|         slope, intercept, r, p, se = linregress(x, y)
 | |
| 
 | |
|     With that style, however, the standard error of the intercept is not
 | |
|     available.  To have access to all the computed values, including the
 | |
|     standard error of the intercept, use the return value as an object
 | |
|     with attributes, e.g.::
 | |
| 
 | |
|         result = linregress(x, y)
 | |
|         print(result.intercept, result.intercept_stderr)
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> import matplotlib.pyplot as plt
 | |
|     >>> from scipy import stats
 | |
|     >>> rng = np.random.default_rng()
 | |
| 
 | |
|     Generate some data:
 | |
| 
 | |
|     >>> x = rng.random(10)
 | |
|     >>> y = 1.6*x + rng.random(10)
 | |
| 
 | |
|     Perform the linear regression:
 | |
| 
 | |
|     >>> res = stats.mstats.linregress(x, y)
 | |
| 
 | |
|     Coefficient of determination (R-squared):
 | |
| 
 | |
|     >>> print(f"R-squared: {res.rvalue**2:.6f}")
 | |
|     R-squared: 0.717533
 | |
| 
 | |
|     Plot the data along with the fitted line:
 | |
| 
 | |
|     >>> plt.plot(x, y, 'o', label='original data')
 | |
|     >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
 | |
|     >>> plt.legend()
 | |
|     >>> plt.show()
 | |
| 
 | |
|     Calculate 95% confidence interval on slope and intercept:
 | |
| 
 | |
|     >>> # Two-sided inverse Students t-distribution
 | |
|     >>> # p - probability, df - degrees of freedom
 | |
|     >>> from scipy.stats import t
 | |
|     >>> tinv = lambda p, df: abs(t.ppf(p/2, df))
 | |
| 
 | |
|     >>> ts = tinv(0.05, len(x)-2)
 | |
|     >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
 | |
|     slope (95%): 1.453392 +/- 0.743465
 | |
|     >>> print(f"intercept (95%): {res.intercept:.6f}"
 | |
|     ...       f" +/- {ts*res.intercept_stderr:.6f}")
 | |
|     intercept (95%): 0.616950 +/- 0.544475
 | |
| 
 | |
|     """
 | |
|     if y is None:
 | |
|         x = ma.array(x)
 | |
|         if x.shape[0] == 2:
 | |
|             x, y = x
 | |
|         elif x.shape[1] == 2:
 | |
|             x, y = x.T
 | |
|         else:
 | |
|             raise ValueError("If only `x` is given as input, "
 | |
|                              "it has to be of shape (2, N) or (N, 2), "
 | |
|                              f"provided shape was {x.shape}")
 | |
|     else:
 | |
|         x = ma.array(x)
 | |
|         y = ma.array(y)
 | |
| 
 | |
|     x = x.flatten()
 | |
|     y = y.flatten()
 | |
| 
 | |
|     if np.amax(x) == np.amin(x) and len(x) > 1:
 | |
|         raise ValueError("Cannot calculate a linear regression "
 | |
|                          "if all x values are identical")
 | |
| 
 | |
|     m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
 | |
|     if m is not nomask:
 | |
|         x = ma.array(x, mask=m)
 | |
|         y = ma.array(y, mask=m)
 | |
|         if np.any(~m):
 | |
|             result = _stats_py.linregress(x.data[~m], y.data[~m])
 | |
|         else:
 | |
|             # All data is masked
 | |
|             result = _stats_py.LinregressResult(slope=None, intercept=None,
 | |
|                                                 rvalue=None, pvalue=None,
 | |
|                                                 stderr=None,
 | |
|                                                 intercept_stderr=None)
 | |
|     else:
 | |
|         result = _stats_py.linregress(x.data, y.data)
 | |
| 
 | |
|     return result
 | |
| 
 | |
| 
 | |
| def theilslopes(y, x=None, alpha=0.95, method='separate'):
 | |
|     r"""
 | |
|     Computes the Theil-Sen estimator for a set of points (x, y).
 | |
| 
 | |
|     `theilslopes` implements a method for robust linear regression.  It
 | |
|     computes the slope as the median of all slopes between paired values.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     y : array_like
 | |
|         Dependent variable.
 | |
|     x : array_like or None, optional
 | |
|         Independent variable. If None, use ``arange(len(y))`` instead.
 | |
|     alpha : float, optional
 | |
|         Confidence degree between 0 and 1. Default is 95% confidence.
 | |
|         Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
 | |
|         interpreted as "find the 90% confidence interval".
 | |
|     method : {'joint', 'separate'}, optional
 | |
|         Method to be used for computing estimate for intercept.
 | |
|         Following methods are supported,
 | |
| 
 | |
|             * 'joint': Uses np.median(y - slope * x) as intercept.
 | |
|             * 'separate': Uses np.median(y) - slope * np.median(x)
 | |
|                           as intercept.
 | |
| 
 | |
|         The default is 'separate'.
 | |
| 
 | |
|         .. versionadded:: 1.8.0
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     result : ``TheilslopesResult`` instance
 | |
|         The return value is an object with the following attributes:
 | |
| 
 | |
|         slope : float
 | |
|             Theil slope.
 | |
|         intercept : float
 | |
|             Intercept of the Theil line.
 | |
|         low_slope : float
 | |
|             Lower bound of the confidence interval on `slope`.
 | |
|         high_slope : float
 | |
|             Upper bound of the confidence interval on `slope`.
 | |
| 
 | |
|     See Also
 | |
|     --------
 | |
|     siegelslopes : a similar technique using repeated medians
 | |
| 
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `theilslopes`, see `scipy.stats.theilslopes`.
 | |
| 
 | |
|     """
 | |
|     y = ma.asarray(y).flatten()
 | |
|     if x is None:
 | |
|         x = ma.arange(len(y), dtype=float)
 | |
|     else:
 | |
|         x = ma.asarray(x).flatten()
 | |
|         if len(x) != len(y):
 | |
|             raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
 | |
| 
 | |
|     m = ma.mask_or(ma.getmask(x), ma.getmask(y))
 | |
|     y._mask = x._mask = m
 | |
|     # Disregard any masked elements of x or y
 | |
|     y = y.compressed()
 | |
|     x = x.compressed().astype(float)
 | |
|     # We now have unmasked arrays so can use `scipy.stats.theilslopes`
 | |
|     return stats_theilslopes(y, x, alpha=alpha, method=method)
 | |
| 
 | |
| 
 | |
| def siegelslopes(y, x=None, method="hierarchical"):
 | |
|     r"""
 | |
|     Computes the Siegel estimator for a set of points (x, y).
 | |
| 
 | |
|     `siegelslopes` implements a method for robust linear regression
 | |
|     using repeated medians to fit a line to the points (x, y).
 | |
|     The method is robust to outliers with an asymptotic breakdown point
 | |
|     of 50%.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     y : array_like
 | |
|         Dependent variable.
 | |
|     x : array_like or None, optional
 | |
|         Independent variable. If None, use ``arange(len(y))`` instead.
 | |
|     method : {'hierarchical', 'separate'}
 | |
|         If 'hierarchical', estimate the intercept using the estimated
 | |
|         slope ``slope`` (default option).
 | |
|         If 'separate', estimate the intercept independent of the estimated
 | |
|         slope. See Notes for details.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     result : ``SiegelslopesResult`` instance
 | |
|         The return value is an object with the following attributes:
 | |
| 
 | |
|         slope : float
 | |
|             Estimate of the slope of the regression line.
 | |
|         intercept : float
 | |
|             Estimate of the intercept of the regression line.
 | |
| 
 | |
|     See Also
 | |
|     --------
 | |
|     theilslopes : a similar technique without repeated medians
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.
 | |
| 
 | |
|     """
 | |
|     y = ma.asarray(y).ravel()
 | |
|     if x is None:
 | |
|         x = ma.arange(len(y), dtype=float)
 | |
|     else:
 | |
|         x = ma.asarray(x).ravel()
 | |
|         if len(x) != len(y):
 | |
|             raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
 | |
| 
 | |
|     m = ma.mask_or(ma.getmask(x), ma.getmask(y))
 | |
|     y._mask = x._mask = m
 | |
|     # Disregard any masked elements of x or y
 | |
|     y = y.compressed()
 | |
|     x = x.compressed().astype(float)
 | |
|     # We now have unmasked arrays so can use `scipy.stats.siegelslopes`
 | |
|     return stats_siegelslopes(y, x, method=method)
 | |
| 
 | |
| 
 | |
| SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult',
 | |
|                                             ['intra_slope', 'inter_slope'])
 | |
| 
 | |
| 
 | |
| def sen_seasonal_slopes(x):
 | |
|     r"""
 | |
|     Computes seasonal Theil-Sen and Kendall slope estimators.
 | |
| 
 | |
|     The seasonal generalization of Sen's slope computes the slopes between all
 | |
|     pairs of values within a "season" (column) of a 2D array. It returns an
 | |
|     array containing the median of these "within-season" slopes for each
 | |
|     season (the Theil-Sen slope estimator of each season), and it returns the
 | |
|     median of the within-season slopes across all seasons (the seasonal Kendall
 | |
|     slope estimator).
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : 2D array_like
 | |
|         Each column of `x` contains measurements of the dependent variable
 | |
|         within a season. The independent variable (usually time) of each season
 | |
|         is assumed to be ``np.arange(x.shape[0])``.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     result : ``SenSeasonalSlopesResult`` instance
 | |
|         The return value is an object with the following attributes:
 | |
| 
 | |
|         intra_slope : ndarray
 | |
|             For each season, the Theil-Sen slope estimator: the median of
 | |
|             within-season slopes.
 | |
|         inter_slope : float
 | |
|             The seasonal Kendall slope estimator: the median of within-season
 | |
|             slopes *across all* seasons.
 | |
| 
 | |
|     See Also
 | |
|     --------
 | |
|     theilslopes : the analogous function for non-seasonal data
 | |
|     scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     The slopes :math:`d_{ijk}` within season :math:`i` are:
 | |
| 
 | |
|     .. math::
 | |
| 
 | |
|         d_{ijk} = \frac{x_{ij} - x_{ik}}
 | |
|                             {j - k}
 | |
| 
 | |
|     for pairs of distinct integer indices :math:`j, k` of :math:`x`.
 | |
| 
 | |
|     Element :math:`i` of the returned `intra_slope` array is the median of the
 | |
|     :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
 | |
|     estimator of season :math:`i`. The returned `inter_slope` value, better
 | |
|     known as the seasonal Kendall slope estimator, is the median of the
 | |
|     :math:`d_{ijk}` over all :math:`i, j, k`.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
 | |
|            "Techniques of trend analysis for monthly water quality data."
 | |
|            *Water Resources Research* 18.1 (1982): 107-121.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     Suppose we have 100 observations of a dependent variable for each of four
 | |
|     seasons:
 | |
| 
 | |
|     >>> import numpy as np
 | |
|     >>> rng = np.random.default_rng()
 | |
|     >>> x = rng.random(size=(100, 4))
 | |
| 
 | |
|     We compute the seasonal slopes as:
 | |
| 
 | |
|     >>> from scipy import stats
 | |
|     >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)
 | |
| 
 | |
|     If we define a function to compute all slopes between observations within
 | |
|     a season:
 | |
| 
 | |
|     >>> def dijk(yi):
 | |
|     ...     n = len(yi)
 | |
|     ...     x = np.arange(n)
 | |
|     ...     dy = yi - yi[:, np.newaxis]
 | |
|     ...     dx = x - x[:, np.newaxis]
 | |
|     ...     # we only want unique pairs of distinct indices
 | |
|     ...     mask = np.triu(np.ones((n, n), dtype=bool), k=1)
 | |
|     ...     return dy[mask]/dx[mask]
 | |
| 
 | |
|     then element ``i`` of ``intra_slope`` is the median of ``dijk[x[:, i]]``:
 | |
| 
 | |
|     >>> i = 2
 | |
|     >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
 | |
|     True
 | |
| 
 | |
|     and ``inter_slope`` is the median of the values returned by ``dijk`` for
 | |
|     all seasons:
 | |
| 
 | |
|     >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
 | |
|     >>> np.allclose(np.median(all_slopes), inter_slope)
 | |
|     True
 | |
| 
 | |
|     Because the data are randomly generated, we would expect the median slopes
 | |
|     to be nearly zero both within and across all seasons, and indeed they are:
 | |
| 
 | |
|     >>> intra_slope.data
 | |
|     array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])
 | |
|     >>> inter_slope
 | |
|     -0.0010511779872922058
 | |
| 
 | |
|     """
 | |
|     x = ma.array(x, subok=True, copy=False, ndmin=2)
 | |
|     (n,_) = x.shape
 | |
|     # Get list of slopes per season
 | |
|     szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
 | |
|                             for i in range(n)])
 | |
|     szn_medslopes = ma.median(szn_slopes, axis=0)
 | |
|     medslope = ma.median(szn_slopes, axis=None)
 | |
|     return SenSeasonalSlopesResult(szn_medslopes, medslope)
 | |
| 
 | |
| 
 | |
| Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
 | |
|     """
 | |
|     Calculates the T-test for the mean of ONE group of scores.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         sample observation
 | |
|     popmean : float or array_like
 | |
|         expected value in null hypothesis, if array_like than it must have the
 | |
|         same shape as `a` excluding the axis dimension
 | |
|     axis : int or None, optional
 | |
|         Axis along which to compute test. If None, compute over the whole
 | |
|         array `a`.
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Defines the alternative hypothesis.
 | |
|         The following options are available (default is 'two-sided'):
 | |
| 
 | |
|         * 'two-sided': the mean of the underlying distribution of the sample
 | |
|           is different than the given population mean (`popmean`)
 | |
|         * 'less': the mean of the underlying distribution of the sample is
 | |
|           less than the given population mean (`popmean`)
 | |
|         * 'greater': the mean of the underlying distribution of the sample is
 | |
|           greater than the given population mean (`popmean`)
 | |
| 
 | |
|         .. versionadded:: 1.7.0
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float or array
 | |
|         t-statistic
 | |
|     pvalue : float or array
 | |
|         The p-value
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     if a.size == 0:
 | |
|         return (np.nan, np.nan)
 | |
| 
 | |
|     x = a.mean(axis=axis)
 | |
|     v = a.var(axis=axis, ddof=1)
 | |
|     n = a.count(axis=axis)
 | |
|     # force df to be an array for masked division not to throw a warning
 | |
|     df = ma.asanyarray(n - 1.0)
 | |
|     svar = ((n - 1.0) * v) / df
 | |
|     with np.errstate(divide='ignore', invalid='ignore'):
 | |
|         t = (x - popmean) / ma.sqrt(svar / n)
 | |
| 
 | |
|     t, prob = _ttest_finish(df, t, alternative)
 | |
|     return Ttest_1sampResult(t, prob)
 | |
| 
 | |
| 
 | |
| ttest_onesamp = ttest_1samp
 | |
| 
 | |
| 
 | |
| Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
 | |
|     """
 | |
|     Calculates the T-test for the means of TWO INDEPENDENT samples of scores.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a, b : array_like
 | |
|         The arrays must have the same shape, except in the dimension
 | |
|         corresponding to `axis` (the first, by default).
 | |
|     axis : int or None, optional
 | |
|         Axis along which to compute test. If None, compute over the whole
 | |
|         arrays, `a`, and `b`.
 | |
|     equal_var : bool, optional
 | |
|         If True, perform a standard independent 2 sample test that assumes equal
 | |
|         population variances.
 | |
|         If False, perform Welch's t-test, which does not assume equal population
 | |
|         variance.
 | |
| 
 | |
|         .. versionadded:: 0.17.0
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Defines the alternative hypothesis.
 | |
|         The following options are available (default is 'two-sided'):
 | |
| 
 | |
|         * 'two-sided': the means of the distributions underlying the samples
 | |
|           are unequal.
 | |
|         * 'less': the mean of the distribution underlying the first sample
 | |
|           is less than the mean of the distribution underlying the second
 | |
|           sample.
 | |
|         * 'greater': the mean of the distribution underlying the first
 | |
|           sample is greater than the mean of the distribution underlying
 | |
|           the second sample.
 | |
| 
 | |
|         .. versionadded:: 1.7.0
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float or array
 | |
|         The calculated t-statistic.
 | |
|     pvalue : float or array
 | |
|         The p-value.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.
 | |
| 
 | |
|     """
 | |
|     a, b, axis = _chk2_asarray(a, b, axis)
 | |
| 
 | |
|     if a.size == 0 or b.size == 0:
 | |
|         return Ttest_indResult(np.nan, np.nan)
 | |
| 
 | |
|     (x1, x2) = (a.mean(axis), b.mean(axis))
 | |
|     (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
 | |
|     (n1, n2) = (a.count(axis), b.count(axis))
 | |
| 
 | |
|     if equal_var:
 | |
|         # force df to be an array for masked division not to throw a warning
 | |
|         df = ma.asanyarray(n1 + n2 - 2.0)
 | |
|         svar = ((n1-1)*v1+(n2-1)*v2) / df
 | |
|         denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2))  # n-D computation here!
 | |
|     else:
 | |
|         vn1 = v1/n1
 | |
|         vn2 = v2/n2
 | |
|         with np.errstate(divide='ignore', invalid='ignore'):
 | |
|             df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
 | |
| 
 | |
|         # If df is undefined, variances are zero.
 | |
|         # It doesn't matter what df is as long as it is not NaN.
 | |
|         df = np.where(np.isnan(df), 1, df)
 | |
|         denom = ma.sqrt(vn1 + vn2)
 | |
| 
 | |
|     with np.errstate(divide='ignore', invalid='ignore'):
 | |
|         t = (x1-x2) / denom
 | |
| 
 | |
|     t, prob = _ttest_finish(df, t, alternative)
 | |
|     return Ttest_indResult(t, prob)
 | |
| 
 | |
| 
 | |
| Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def ttest_rel(a, b, axis=0, alternative='two-sided'):
 | |
|     """
 | |
|     Calculates the T-test on TWO RELATED samples of scores, a and b.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a, b : array_like
 | |
|         The arrays must have the same shape.
 | |
|     axis : int or None, optional
 | |
|         Axis along which to compute test. If None, compute over the whole
 | |
|         arrays, `a`, and `b`.
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Defines the alternative hypothesis.
 | |
|         The following options are available (default is 'two-sided'):
 | |
| 
 | |
|         * 'two-sided': the means of the distributions underlying the samples
 | |
|           are unequal.
 | |
|         * 'less': the mean of the distribution underlying the first sample
 | |
|           is less than the mean of the distribution underlying the second
 | |
|           sample.
 | |
|         * 'greater': the mean of the distribution underlying the first
 | |
|           sample is greater than the mean of the distribution underlying
 | |
|           the second sample.
 | |
| 
 | |
|         .. versionadded:: 1.7.0
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float or array
 | |
|         t-statistic
 | |
|     pvalue : float or array
 | |
|         two-tailed p-value
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.
 | |
| 
 | |
|     """
 | |
|     a, b, axis = _chk2_asarray(a, b, axis)
 | |
|     if len(a) != len(b):
 | |
|         raise ValueError('unequal length arrays')
 | |
| 
 | |
|     if a.size == 0 or b.size == 0:
 | |
|         return Ttest_relResult(np.nan, np.nan)
 | |
| 
 | |
|     n = a.count(axis)
 | |
|     df = ma.asanyarray(n-1.0)
 | |
|     d = (a-b).astype('d')
 | |
|     dm = d.mean(axis)
 | |
|     v = d.var(axis=axis, ddof=1)
 | |
|     denom = ma.sqrt(v / n)
 | |
|     with np.errstate(divide='ignore', invalid='ignore'):
 | |
|         t = dm / denom
 | |
| 
 | |
|     t, prob = _ttest_finish(df, t, alternative)
 | |
|     return Ttest_relResult(t, prob)
 | |
| 
 | |
| 
 | |
| MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
 | |
|                                                        'pvalue'))
 | |
| 
 | |
| 
 | |
| def mannwhitneyu(x,y, use_continuity=True):
 | |
|     """
 | |
|     Computes the Mann-Whitney statistic
 | |
| 
 | |
|     Missing values in `x` and/or `y` are discarded.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : sequence
 | |
|         Input
 | |
|     y : sequence
 | |
|         Input
 | |
|     use_continuity : {True, False}, optional
 | |
|         Whether a continuity correction (1/2.) should be taken into account.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float
 | |
|         The minimum of the Mann-Whitney statistics
 | |
|     pvalue : float
 | |
|         Approximate two-sided p-value assuming a normal distribution.
 | |
| 
 | |
|     """
 | |
|     x = ma.asarray(x).compressed().view(ndarray)
 | |
|     y = ma.asarray(y).compressed().view(ndarray)
 | |
|     ranks = rankdata(np.concatenate([x,y]))
 | |
|     (nx, ny) = (len(x), len(y))
 | |
|     nt = nx + ny
 | |
|     U = ranks[:nx].sum() - nx*(nx+1)/2.
 | |
|     U = max(U, nx*ny - U)
 | |
|     u = nx*ny - U
 | |
| 
 | |
|     mu = (nx*ny)/2.
 | |
|     sigsq = (nt**3 - nt)/12.
 | |
|     ties = count_tied_groups(ranks)
 | |
|     sigsq -= sum(v*(k**3-k) for (k,v) in ties.items())/12.
 | |
|     sigsq *= nx*ny/float(nt*(nt-1))
 | |
| 
 | |
|     if use_continuity:
 | |
|         z = (U - 1/2. - mu) / ma.sqrt(sigsq)
 | |
|     else:
 | |
|         z = (U - mu) / ma.sqrt(sigsq)
 | |
| 
 | |
|     prob = special.erfc(abs(z)/np.sqrt(2))
 | |
|     return MannwhitneyuResult(u, prob)
 | |
| 
 | |
| 
 | |
| KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def kruskal(*args):
 | |
|     """
 | |
|     Compute the Kruskal-Wallis H-test for independent samples
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     sample1, sample2, ... : array_like
 | |
|        Two or more arrays with the sample measurements can be given as
 | |
|        arguments.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float
 | |
|        The Kruskal-Wallis H statistic, corrected for ties
 | |
|     pvalue : float
 | |
|        The p-value for the test using the assumption that H has a chi
 | |
|        square distribution
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `kruskal`, see `scipy.stats.kruskal`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> from scipy.stats.mstats import kruskal
 | |
| 
 | |
|     Random samples from three different brands of batteries were tested
 | |
|     to see how long the charge lasted. Results were as follows:
 | |
| 
 | |
|     >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
 | |
|     >>> b = [6.9, 7.0, 6.1, 7.9]
 | |
|     >>> c = [7.2, 6.9, 6.1, 6.5]
 | |
| 
 | |
|     Test the hypothesis that the distribution functions for all of the brands'
 | |
|     durations are identical. Use 5% level of significance.
 | |
| 
 | |
|     >>> kruskal(a, b, c)
 | |
|     KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)
 | |
| 
 | |
|     The null hypothesis is rejected at the 5% level of significance
 | |
|     because the returned p-value is less than the critical value of 5%.
 | |
| 
 | |
|     """
 | |
|     output = argstoarray(*args)
 | |
|     ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
 | |
|     sumrk = ranks.sum(-1)
 | |
|     ngrp = ranks.count(-1)
 | |
|     ntot = ranks.count()
 | |
|     H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
 | |
|     # Tie correction
 | |
|     ties = count_tied_groups(ranks)
 | |
|     T = 1. - sum(v*(k**3-k) for (k,v) in ties.items())/float(ntot**3-ntot)
 | |
|     if T == 0:
 | |
|         raise ValueError('All numbers are identical in kruskal')
 | |
| 
 | |
|     H /= T
 | |
|     df = len(output) - 1
 | |
|     prob = distributions.chi2.sf(H, df)
 | |
|     return KruskalResult(H, prob)
 | |
| 
 | |
| 
 | |
| kruskalwallis = kruskal
 | |
| 
 | |
| 
 | |
| @_rename_parameter("mode", "method")
 | |
| def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'):
 | |
|     """
 | |
|     Computes the Kolmogorov-Smirnov test on one sample of masked values.
 | |
| 
 | |
|     Missing values in `x` are discarded.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x : array_like
 | |
|         a 1-D array of observations of random variables.
 | |
|     cdf : str or callable
 | |
|         If a string, it should be the name of a distribution in `scipy.stats`.
 | |
|         If a callable, that callable is used to calculate the cdf.
 | |
|     args : tuple, sequence, optional
 | |
|         Distribution parameters, used if `cdf` is a string.
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Indicates the alternative hypothesis.  Default is 'two-sided'.
 | |
|     method : {'auto', 'exact', 'asymp'}, optional
 | |
|         Defines the method used for calculating the p-value.
 | |
|         The following options are available (default is 'auto'):
 | |
| 
 | |
|           * 'auto' : use 'exact' for small size arrays, 'asymp' for large
 | |
|           * 'exact' : use approximation to exact distribution of test statistic
 | |
|           * 'asymp' : use asymptotic distribution of test statistic
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     d : float
 | |
|         Value of the Kolmogorov Smirnov test
 | |
|     p : float
 | |
|         Corresponding p-value.
 | |
| 
 | |
|     """
 | |
|     alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
 | |
|        alternative.lower()[0], alternative)
 | |
|     return scipy.stats._stats_py.ks_1samp(
 | |
|         x, cdf, args=args, alternative=alternative, method=method)
 | |
| 
 | |
| 
 | |
| @_rename_parameter("mode", "method")
 | |
| def ks_2samp(data1, data2, alternative="two-sided", method='auto'):
 | |
|     """
 | |
|     Computes the Kolmogorov-Smirnov test on two samples.
 | |
| 
 | |
|     Missing values in `x` and/or `y` are discarded.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     data1 : array_like
 | |
|         First data set
 | |
|     data2 : array_like
 | |
|         Second data set
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Indicates the alternative hypothesis.  Default is 'two-sided'.
 | |
|     method : {'auto', 'exact', 'asymp'}, optional
 | |
|         Defines the method used for calculating the p-value.
 | |
|         The following options are available (default is 'auto'):
 | |
| 
 | |
|           * 'auto' : use 'exact' for small size arrays, 'asymp' for large
 | |
|           * 'exact' : use approximation to exact distribution of test statistic
 | |
|           * 'asymp' : use asymptotic distribution of test statistic
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     d : float
 | |
|         Value of the Kolmogorov Smirnov test
 | |
|     p : float
 | |
|         Corresponding p-value.
 | |
| 
 | |
|     """
 | |
|     # Ideally this would be accomplished by
 | |
|     # ks_2samp = scipy.stats._stats_py.ks_2samp
 | |
|     # but the circular dependencies between _mstats_basic and stats prevent that.
 | |
|     alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
 | |
|        alternative.lower()[0], alternative)
 | |
|     return scipy.stats._stats_py.ks_2samp(data1, data2,
 | |
|                                           alternative=alternative,
 | |
|                                           method=method)
 | |
| 
 | |
| 
 | |
| ks_twosamp = ks_2samp
 | |
| 
 | |
| 
 | |
| @_rename_parameter("mode", "method")
 | |
| def kstest(data1, data2, args=(), alternative='two-sided', method='auto'):
 | |
|     """
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     data1 : array_like
 | |
|     data2 : str, callable or array_like
 | |
|     args : tuple, sequence, optional
 | |
|         Distribution parameters, used if `data1` or `data2` are strings.
 | |
|     alternative : str, as documented in stats.kstest
 | |
|     method : str, as documented in stats.kstest
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     tuple of (K-S statistic, probability)
 | |
| 
 | |
|     """
 | |
|     return scipy.stats._stats_py.kstest(data1, data2, args,
 | |
|                                         alternative=alternative, method=method)
 | |
| 
 | |
| 
 | |
| def trima(a, limits=None, inclusive=(True,True)):
 | |
|     """
 | |
|     Trims an array by masking the data outside some given limits.
 | |
| 
 | |
|     Returns a masked version of the input array.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         Input array.
 | |
|     limits : {None, tuple}, optional
 | |
|         Tuple of (lower limit, upper limit) in absolute values.
 | |
|         Values of the input array lower (greater) than the lower (upper) limit
 | |
|         will be masked.  A limit is None indicates an open interval.
 | |
|     inclusive : (bool, bool) tuple, optional
 | |
|         Tuple of (lower flag, upper flag), indicating whether values exactly
 | |
|         equal to the lower (upper) limit are allowed.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> from scipy.stats.mstats import trima
 | |
|     >>> import numpy as np
 | |
| 
 | |
|     >>> a = np.arange(10)
 | |
| 
 | |
|     The interval is left-closed and right-open, i.e., `[2, 8)`.
 | |
|     Trim the array by keeping only values in the interval.
 | |
| 
 | |
|     >>> trima(a, limits=(2, 8), inclusive=(True, False))
 | |
|     masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
 | |
|                  mask=[ True,  True, False, False, False, False, False, False,
 | |
|                         True,  True],
 | |
|            fill_value=999999)
 | |
| 
 | |
|     """
 | |
|     a = ma.asarray(a)
 | |
|     a.unshare_mask()
 | |
|     if (limits is None) or (limits == (None, None)):
 | |
|         return a
 | |
| 
 | |
|     (lower_lim, upper_lim) = limits
 | |
|     (lower_in, upper_in) = inclusive
 | |
|     condition = False
 | |
|     if lower_lim is not None:
 | |
|         if lower_in:
 | |
|             condition |= (a < lower_lim)
 | |
|         else:
 | |
|             condition |= (a <= lower_lim)
 | |
| 
 | |
|     if upper_lim is not None:
 | |
|         if upper_in:
 | |
|             condition |= (a > upper_lim)
 | |
|         else:
 | |
|             condition |= (a >= upper_lim)
 | |
| 
 | |
|     a[condition.filled(True)] = masked
 | |
|     return a
 | |
| 
 | |
| 
 | |
| def trimr(a, limits=None, inclusive=(True, True), axis=None):
 | |
|     """
 | |
|     Trims an array by masking some proportion of the data on each end.
 | |
|     Returns a masked version of the input array.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : sequence
 | |
|         Input array.
 | |
|     limits : {None, tuple}, optional
 | |
|         Tuple of the percentages to cut on each side of the array, with respect
 | |
|         to the number of unmasked data, as floats between 0. and 1.
 | |
|         Noting n the number of unmasked data before trimming, the
 | |
|         (n*limits[0])th smallest data and the (n*limits[1])th largest data are
 | |
|         masked, and the total number of unmasked data after trimming is
 | |
|         n*(1.-sum(limits)).  The value of one limit can be set to None to
 | |
|         indicate an open interval.
 | |
|     inclusive : {(True,True) tuple}, optional
 | |
|         Tuple of flags indicating whether the number of data being masked on
 | |
|         the left (right) end should be truncated (True) or rounded (False) to
 | |
|         integers.
 | |
|     axis : {None,int}, optional
 | |
|         Axis along which to trim. If None, the whole array is trimmed, but its
 | |
|         shape is maintained.
 | |
| 
 | |
|     """
 | |
|     def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
 | |
|         n = a.count()
 | |
|         idx = a.argsort()
 | |
|         if low_limit:
 | |
|             if low_inclusive:
 | |
|                 lowidx = int(low_limit*n)
 | |
|             else:
 | |
|                 lowidx = int(np.round(low_limit*n))
 | |
|             a[idx[:lowidx]] = masked
 | |
|         if up_limit is not None:
 | |
|             if up_inclusive:
 | |
|                 upidx = n - int(n*up_limit)
 | |
|             else:
 | |
|                 upidx = n - int(np.round(n*up_limit))
 | |
|             a[idx[upidx:]] = masked
 | |
|         return a
 | |
| 
 | |
|     a = ma.asarray(a)
 | |
|     a.unshare_mask()
 | |
|     if limits is None:
 | |
|         return a
 | |
| 
 | |
|     # Check the limits
 | |
|     (lolim, uplim) = limits
 | |
|     errmsg = "The proportion to cut from the %s should be between 0. and 1."
 | |
|     if lolim is not None:
 | |
|         if lolim > 1. or lolim < 0:
 | |
|             raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
 | |
|     if uplim is not None:
 | |
|         if uplim > 1. or uplim < 0:
 | |
|             raise ValueError(errmsg % 'end' + f"(got {uplim})")
 | |
| 
 | |
|     (loinc, upinc) = inclusive
 | |
| 
 | |
|     if axis is None:
 | |
|         shp = a.shape
 | |
|         return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
 | |
|     else:
 | |
|         return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
 | |
| 
 | |
| 
 | |
| trimdoc = """
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : sequence
 | |
|         Input array
 | |
|     limits : {None, tuple}, optional
 | |
|         If `relative` is False, tuple (lower limit, upper limit) in absolute values.
 | |
|         Values of the input array lower (greater) than the lower (upper) limit are
 | |
|         masked.
 | |
| 
 | |
|         If `relative` is True, tuple (lower percentage, upper percentage) to cut
 | |
|         on each side of the  array, with respect to the number of unmasked data.
 | |
| 
 | |
|         Noting n the number of unmasked data before trimming, the (n*limits[0])th
 | |
|         smallest data and the (n*limits[1])th largest data are masked, and the
 | |
|         total number of unmasked data after trimming is n*(1.-sum(limits))
 | |
|         In each case, the value of one limit can be set to None to indicate an
 | |
|         open interval.
 | |
| 
 | |
|         If limits is None, no trimming is performed
 | |
|     inclusive : {(bool, bool) tuple}, optional
 | |
|         If `relative` is False, tuple indicating whether values exactly equal
 | |
|         to the absolute limits are allowed.
 | |
|         If `relative` is True, tuple indicating whether the number of data
 | |
|         being masked on each side should be rounded (True) or truncated
 | |
|         (False).
 | |
|     relative : bool, optional
 | |
|         Whether to consider the limits as absolute values (False) or proportions
 | |
|         to cut (True).
 | |
|     axis : int, optional
 | |
|         Axis along which to trim.
 | |
| """
 | |
| 
 | |
| 
 | |
| def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
 | |
|     """
 | |
|     Trims an array by masking the data outside some given limits.
 | |
| 
 | |
|     Returns a masked version of the input array.
 | |
| 
 | |
|     %s
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> from scipy.stats.mstats import trim
 | |
|     >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
 | |
|     >>> print(trim(z,(3,8)))
 | |
|     [-- -- 3 4 5 6 7 8 -- --]
 | |
|     >>> print(trim(z,(0.1,0.2),relative=True))
 | |
|     [-- 2 3 4 5 6 7 8 -- --]
 | |
| 
 | |
|     """
 | |
|     if relative:
 | |
|         return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
 | |
|     else:
 | |
|         return trima(a, limits=limits, inclusive=inclusive)
 | |
| 
 | |
| 
 | |
| if trim.__doc__:
 | |
|     trim.__doc__ = trim.__doc__ % trimdoc
 | |
| 
 | |
| 
 | |
| def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
 | |
|     """
 | |
|     Trims the smallest and largest data values.
 | |
| 
 | |
|     Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
 | |
|     ``int(proportiontocut * n)`` largest values of data along the given axis,
 | |
|     where n is the number of unmasked values before trimming.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     data : ndarray
 | |
|         Data to trim.
 | |
|     proportiontocut : float, optional
 | |
|         Percentage of trimming (as a float between 0 and 1).
 | |
|         If n is the number of unmasked values before trimming, the number of
 | |
|         values after trimming is ``(1 - 2*proportiontocut) * n``.
 | |
|         Default is 0.2.
 | |
|     inclusive : {(bool, bool) tuple}, optional
 | |
|         Tuple indicating whether the number of data being masked on each side
 | |
|         should be rounded (True) or truncated (False).
 | |
|     axis : int, optional
 | |
|         Axis along which to perform the trimming.
 | |
|         If None, the input array is first flattened.
 | |
| 
 | |
|     """
 | |
|     return trimr(data, limits=(proportiontocut,proportiontocut),
 | |
|                  inclusive=inclusive, axis=axis)
 | |
| 
 | |
| 
 | |
| def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
 | |
|              axis=None):
 | |
|     """
 | |
|     Trims the data by masking values from one tail.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     data : array_like
 | |
|         Data to trim.
 | |
|     proportiontocut : float, optional
 | |
|         Percentage of trimming. If n is the number of unmasked values
 | |
|         before trimming, the number of values after trimming is
 | |
|         ``(1 - proportiontocut) * n``.  Default is 0.2.
 | |
|     tail : {'left','right'}, optional
 | |
|         If 'left' the `proportiontocut` lowest values will be masked.
 | |
|         If 'right' the `proportiontocut` highest values will be masked.
 | |
|         Default is 'left'.
 | |
|     inclusive : {(bool, bool) tuple}, optional
 | |
|         Tuple indicating whether the number of data being masked on each side
 | |
|         should be rounded (True) or truncated (False).  Default is
 | |
|         (True, True).
 | |
|     axis : int, optional
 | |
|         Axis along which to perform the trimming.
 | |
|         If None, the input array is first flattened.  Default is None.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     trimtail : ndarray
 | |
|         Returned array of same shape as `data` with masked tail values.
 | |
| 
 | |
|     """
 | |
|     tail = str(tail).lower()[0]
 | |
|     if tail == 'l':
 | |
|         limits = (proportiontocut,None)
 | |
|     elif tail == 'r':
 | |
|         limits = (None, proportiontocut)
 | |
|     else:
 | |
|         raise TypeError("The tail argument should be in ('left','right')")
 | |
| 
 | |
|     return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
 | |
| 
 | |
| 
 | |
| trim1 = trimtail
 | |
| 
 | |
| 
 | |
| def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
 | |
|                  axis=None):
 | |
|     """Returns the trimmed mean of the data along the given axis.
 | |
| 
 | |
|     %s
 | |
| 
 | |
|     """
 | |
|     if (not isinstance(limits,tuple)) and isinstance(limits,float):
 | |
|         limits = (limits, limits)
 | |
|     if relative:
 | |
|         return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
 | |
|     else:
 | |
|         return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
 | |
| 
 | |
| 
 | |
| if trimmed_mean.__doc__:
 | |
|     trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc
 | |
| 
 | |
| 
 | |
| def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
 | |
|                 axis=None, ddof=0):
 | |
|     """Returns the trimmed variance of the data along the given axis.
 | |
| 
 | |
|     %s
 | |
|     ddof : {0,integer}, optional
 | |
|         Means Delta Degrees of Freedom. The denominator used during computations
 | |
|         is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
 | |
|         biased estimate of the variance.
 | |
| 
 | |
|     """
 | |
|     if (not isinstance(limits,tuple)) and isinstance(limits,float):
 | |
|         limits = (limits, limits)
 | |
|     if relative:
 | |
|         out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
 | |
|     else:
 | |
|         out = trima(a,limits=limits,inclusive=inclusive)
 | |
| 
 | |
|     return out.var(axis=axis, ddof=ddof)
 | |
| 
 | |
| 
 | |
| if trimmed_var.__doc__:
 | |
|     trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc
 | |
| 
 | |
| 
 | |
| def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
 | |
|                 axis=None, ddof=0):
 | |
|     """Returns the trimmed standard deviation of the data along the given axis.
 | |
| 
 | |
|     %s
 | |
|     ddof : {0,integer}, optional
 | |
|         Means Delta Degrees of Freedom. The denominator used during computations
 | |
|         is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
 | |
|         biased estimate of the variance.
 | |
| 
 | |
|     """
 | |
|     if (not isinstance(limits,tuple)) and isinstance(limits,float):
 | |
|         limits = (limits, limits)
 | |
|     if relative:
 | |
|         out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
 | |
|     else:
 | |
|         out = trima(a,limits=limits,inclusive=inclusive)
 | |
|     return out.std(axis=axis,ddof=ddof)
 | |
| 
 | |
| 
 | |
| if trimmed_std.__doc__:
 | |
|     trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc
 | |
| 
 | |
| 
 | |
| def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
 | |
|     """
 | |
|     Returns the standard error of the trimmed mean along the given axis.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : sequence
 | |
|         Input array
 | |
|     limits : {(0.1,0.1), tuple of float}, optional
 | |
|         tuple (lower percentage, upper percentage) to cut  on each side of the
 | |
|         array, with respect to the number of unmasked data.
 | |
| 
 | |
|         If n is the number of unmasked data before trimming, the values
 | |
|         smaller than ``n * limits[0]`` and the values larger than
 | |
|         ``n * `limits[1]`` are masked, and the total number of unmasked
 | |
|         data after trimming is ``n * (1.-sum(limits))``.  In each case,
 | |
|         the value of one limit can be set to None to indicate an open interval.
 | |
|         If `limits` is None, no trimming is performed.
 | |
|     inclusive : {(bool, bool) tuple} optional
 | |
|         Tuple indicating whether the number of data being masked on each side
 | |
|         should be rounded (True) or truncated (False).
 | |
|     axis : int, optional
 | |
|         Axis along which to trim.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     trimmed_stde : scalar or ndarray
 | |
| 
 | |
|     """
 | |
|     def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
 | |
|         "Returns the standard error of the trimmed mean for a 1D input data."
 | |
|         n = a.count()
 | |
|         idx = a.argsort()
 | |
|         if low_limit:
 | |
|             if low_inclusive:
 | |
|                 lowidx = int(low_limit*n)
 | |
|             else:
 | |
|                 lowidx = np.round(low_limit*n)
 | |
|             a[idx[:lowidx]] = masked
 | |
|         if up_limit is not None:
 | |
|             if up_inclusive:
 | |
|                 upidx = n - int(n*up_limit)
 | |
|             else:
 | |
|                 upidx = n - np.round(n*up_limit)
 | |
|             a[idx[upidx:]] = masked
 | |
|         a[idx[:lowidx]] = a[idx[lowidx]]
 | |
|         a[idx[upidx:]] = a[idx[upidx-1]]
 | |
|         winstd = a.std(ddof=1)
 | |
|         return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
 | |
| 
 | |
|     a = ma.array(a, copy=True, subok=True)
 | |
|     a.unshare_mask()
 | |
|     if limits is None:
 | |
|         return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
 | |
|     if (not isinstance(limits,tuple)) and isinstance(limits,float):
 | |
|         limits = (limits, limits)
 | |
| 
 | |
|     # Check the limits
 | |
|     (lolim, uplim) = limits
 | |
|     errmsg = "The proportion to cut from the %s should be between 0. and 1."
 | |
|     if lolim is not None:
 | |
|         if lolim > 1. or lolim < 0:
 | |
|             raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
 | |
|     if uplim is not None:
 | |
|         if uplim > 1. or uplim < 0:
 | |
|             raise ValueError(errmsg % 'end' + f"(got {uplim})")
 | |
| 
 | |
|     (loinc, upinc) = inclusive
 | |
|     if (axis is None):
 | |
|         return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
 | |
|     else:
 | |
|         if a.ndim > 2:
 | |
|             raise ValueError(f"Array 'a' must be at most two dimensional, "
 | |
|                              f"but got a.ndim = {a.ndim}")
 | |
|         return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
 | |
|                                    lolim,uplim,loinc,upinc)
 | |
| 
 | |
| 
 | |
| def _mask_to_limits(a, limits, inclusive):
 | |
|     """Mask an array for values outside of given limits.
 | |
| 
 | |
|     This is primarily a utility function.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array
 | |
|     limits : (float or None, float or None)
 | |
|     A tuple consisting of the (lower limit, upper limit).  Values in the
 | |
|     input array less than the lower limit or greater than the upper limit
 | |
|     will be masked out. None implies no limit.
 | |
|     inclusive : (bool, bool)
 | |
|     A tuple consisting of the (lower flag, upper flag).  These flags
 | |
|     determine whether values exactly equal to lower or upper are allowed.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     A MaskedArray.
 | |
| 
 | |
|     Raises
 | |
|     ------
 | |
|     A ValueError if there are no values within the given limits.
 | |
|     """
 | |
|     lower_limit, upper_limit = limits
 | |
|     lower_include, upper_include = inclusive
 | |
|     am = ma.MaskedArray(a)
 | |
|     if lower_limit is not None:
 | |
|         if lower_include:
 | |
|             am = ma.masked_less(am, lower_limit)
 | |
|         else:
 | |
|             am = ma.masked_less_equal(am, lower_limit)
 | |
| 
 | |
|     if upper_limit is not None:
 | |
|         if upper_include:
 | |
|             am = ma.masked_greater(am, upper_limit)
 | |
|         else:
 | |
|             am = ma.masked_greater_equal(am, upper_limit)
 | |
| 
 | |
|     if am.count() == 0:
 | |
|         raise ValueError("No array values within given limits")
 | |
| 
 | |
|     return am
 | |
| 
 | |
| 
 | |
| def tmean(a, limits=None, inclusive=(True, True), axis=None):
 | |
|     """
 | |
|     Compute the trimmed mean.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         Array of values.
 | |
|     limits : None or (lower limit, upper limit), optional
 | |
|         Values in the input array less than the lower limit or greater than the
 | |
|         upper limit will be ignored.  When limits is None (default), then all
 | |
|         values are used.  Either of the limit values in the tuple can also be
 | |
|         None representing a half-open interval.
 | |
|     inclusive : (bool, bool), optional
 | |
|         A tuple consisting of the (lower flag, upper flag).  These flags
 | |
|         determine whether values exactly equal to the lower or upper limits
 | |
|         are included.  The default value is (True, True).
 | |
|     axis : int or None, optional
 | |
|         Axis along which to operate. If None, compute over the
 | |
|         whole array. Default is None.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     tmean : float
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `tmean`, see `scipy.stats.tmean`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats import mstats
 | |
|     >>> a = np.array([[6, 8, 3, 0],
 | |
|     ...               [3, 9, 1, 2],
 | |
|     ...               [8, 7, 8, 2],
 | |
|     ...               [5, 6, 0, 2],
 | |
|     ...               [4, 5, 5, 2]])
 | |
|     ...
 | |
|     ...
 | |
|     >>> mstats.tmean(a, (2,5))
 | |
|     3.3
 | |
|     >>> mstats.tmean(a, (2,5), axis=0)
 | |
|     masked_array(data=[4.0, 5.0, 4.0, 2.0],
 | |
|                  mask=[False, False, False, False],
 | |
|            fill_value=1e+20)
 | |
| 
 | |
|     """
 | |
|     return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
 | |
| 
 | |
| 
 | |
| def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
 | |
|     """
 | |
|     Compute the trimmed variance
 | |
| 
 | |
|     This function computes the sample variance of an array of values,
 | |
|     while ignoring values which are outside of given `limits`.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         Array of values.
 | |
|     limits : None or (lower limit, upper limit), optional
 | |
|         Values in the input array less than the lower limit or greater than the
 | |
|         upper limit will be ignored. When limits is None, then all values are
 | |
|         used. Either of the limit values in the tuple can also be None
 | |
|         representing a half-open interval.  The default value is None.
 | |
|     inclusive : (bool, bool), optional
 | |
|         A tuple consisting of the (lower flag, upper flag).  These flags
 | |
|         determine whether values exactly equal to the lower or upper limits
 | |
|         are included.  The default value is (True, True).
 | |
|     axis : int or None, optional
 | |
|         Axis along which to operate. If None, compute over the
 | |
|         whole array. Default is zero.
 | |
|     ddof : int, optional
 | |
|         Delta degrees of freedom. Default is 1.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     tvar : float
 | |
|         Trimmed variance.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `tvar`, see `scipy.stats.tvar`.
 | |
| 
 | |
|     """
 | |
|     a = a.astype(float).ravel()
 | |
|     if limits is None:
 | |
|         n = (~a.mask).sum()  # todo: better way to do that?
 | |
|         return np.ma.var(a) * n/(n-1.)
 | |
|     am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
 | |
| 
 | |
|     return np.ma.var(am, axis=axis, ddof=ddof)
 | |
| 
 | |
| 
 | |
| def tmin(a, lowerlimit=None, axis=0, inclusive=True):
 | |
|     """
 | |
|     Compute the trimmed minimum
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         array of values
 | |
|     lowerlimit : None or float, optional
 | |
|         Values in the input array less than the given limit will be ignored.
 | |
|         When lowerlimit is None, then all values are used. The default value
 | |
|         is None.
 | |
|     axis : int or None, optional
 | |
|         Axis along which to operate. Default is 0. If None, compute over the
 | |
|         whole array `a`.
 | |
|     inclusive : {True, False}, optional
 | |
|         This flag determines whether values exactly equal to the lower limit
 | |
|         are included.  The default value is True.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     tmin : float, int or ndarray
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `tmin`, see `scipy.stats.tmin`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats import mstats
 | |
|     >>> a = np.array([[6, 8, 3, 0],
 | |
|     ...               [3, 2, 1, 2],
 | |
|     ...               [8, 1, 8, 2],
 | |
|     ...               [5, 3, 0, 2],
 | |
|     ...               [4, 7, 5, 2]])
 | |
|     ...
 | |
|     >>> mstats.tmin(a, 5)
 | |
|     masked_array(data=[5, 7, 5, --],
 | |
|                  mask=[False, False, False,  True],
 | |
|            fill_value=999999)
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     am = trima(a, (lowerlimit, None), (inclusive, False))
 | |
|     return ma.minimum.reduce(am, axis)
 | |
| 
 | |
| 
 | |
| def tmax(a, upperlimit=None, axis=0, inclusive=True):
 | |
|     """
 | |
|     Compute the trimmed maximum
 | |
| 
 | |
|     This function computes the maximum value of an array along a given axis,
 | |
|     while ignoring values larger than a specified upper limit.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         array of values
 | |
|     upperlimit : None or float, optional
 | |
|         Values in the input array greater than the given limit will be ignored.
 | |
|         When upperlimit is None, then all values are used. The default value
 | |
|         is None.
 | |
|     axis : int or None, optional
 | |
|         Axis along which to operate. Default is 0. If None, compute over the
 | |
|         whole array `a`.
 | |
|     inclusive : {True, False}, optional
 | |
|         This flag determines whether values exactly equal to the upper limit
 | |
|         are included.  The default value is True.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     tmax : float, int or ndarray
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `tmax`, see `scipy.stats.tmax`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats import mstats
 | |
|     >>> a = np.array([[6, 8, 3, 0],
 | |
|     ...               [3, 9, 1, 2],
 | |
|     ...               [8, 7, 8, 2],
 | |
|     ...               [5, 6, 0, 2],
 | |
|     ...               [4, 5, 5, 2]])
 | |
|     ...
 | |
|     ...
 | |
|     >>> mstats.tmax(a, 4)
 | |
|     masked_array(data=[4, --, 3, 2],
 | |
|                  mask=[False,  True, False, False],
 | |
|            fill_value=999999)
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     am = trima(a, (None, upperlimit), (False, inclusive))
 | |
|     return ma.maximum.reduce(am, axis)
 | |
| 
 | |
| 
 | |
| def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
 | |
|     """
 | |
|     Compute the trimmed standard error of the mean.
 | |
| 
 | |
|     This function finds the standard error of the mean for given
 | |
|     values, ignoring values outside the given `limits`.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         array of values
 | |
|     limits : None or (lower limit, upper limit), optional
 | |
|         Values in the input array less than the lower limit or greater than the
 | |
|         upper limit will be ignored. When limits is None, then all values are
 | |
|         used. Either of the limit values in the tuple can also be None
 | |
|         representing a half-open interval.  The default value is None.
 | |
|     inclusive : (bool, bool), optional
 | |
|         A tuple consisting of the (lower flag, upper flag).  These flags
 | |
|         determine whether values exactly equal to the lower or upper limits
 | |
|         are included.  The default value is (True, True).
 | |
|     axis : int or None, optional
 | |
|         Axis along which to operate. If None, compute over the
 | |
|         whole array. Default is zero.
 | |
|     ddof : int, optional
 | |
|         Delta degrees of freedom. Default is 1.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     tsem : float
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `tsem`, see `scipy.stats.tsem`.
 | |
| 
 | |
|     """
 | |
|     a = ma.asarray(a).ravel()
 | |
|     if limits is None:
 | |
|         n = float(a.count())
 | |
|         return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
 | |
| 
 | |
|     am = trima(a.ravel(), limits, inclusive)
 | |
|     sd = np.sqrt(am.var(axis=axis, ddof=ddof))
 | |
|     return sd / np.sqrt(am.count())
 | |
| 
 | |
| 
 | |
| def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
 | |
|               axis=None, nan_policy='propagate'):
 | |
|     """Returns a Winsorized version of the input array.
 | |
| 
 | |
|     The (limits[0])th lowest values are set to the (limits[0])th percentile,
 | |
|     and the (limits[1])th highest values are set to the (1 - limits[1])th
 | |
|     percentile.
 | |
|     Masked values are skipped.
 | |
| 
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : sequence
 | |
|         Input array.
 | |
|     limits : {None, tuple of float}, optional
 | |
|         Tuple of the percentages to cut on each side of the array, with respect
 | |
|         to the number of unmasked data, as floats between 0. and 1.
 | |
|         Noting n the number of unmasked data before trimming, the
 | |
|         (n*limits[0])th smallest data and the (n*limits[1])th largest data are
 | |
|         masked, and the total number of unmasked data after trimming
 | |
|         is n*(1.-sum(limits)) The value of one limit can be set to None to
 | |
|         indicate an open interval.
 | |
|     inclusive : {(True, True) tuple}, optional
 | |
|         Tuple indicating whether the number of data being masked on each side
 | |
|         should be truncated (True) or rounded (False).
 | |
|     inplace : {False, True}, optional
 | |
|         Whether to winsorize in place (True) or to use a copy (False)
 | |
|     axis : {None, int}, optional
 | |
|         Axis along which to trim. If None, the whole array is trimmed, but its
 | |
|         shape is maintained.
 | |
|     nan_policy : {'propagate', 'raise', 'omit'}, optional
 | |
|         Defines how to handle when input contains nan.
 | |
|         The following options are available (default is 'propagate'):
 | |
| 
 | |
|           * 'propagate': allows nan values and may overwrite or propagate them
 | |
|           * 'raise': throws an error
 | |
|           * 'omit': performs the calculations ignoring nan values
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     This function is applied to reduce the effect of possibly spurious outliers
 | |
|     by limiting the extreme values.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats.mstats import winsorize
 | |
| 
 | |
|     A shuffled array contains integers from 1 to 10.
 | |
| 
 | |
|     >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
 | |
| 
 | |
|     The 10% of the lowest value (i.e., ``1``) and the 20% of the highest
 | |
|     values (i.e., ``9`` and ``10``) are replaced.
 | |
| 
 | |
|     >>> winsorize(a, limits=[0.1, 0.2])
 | |
|     masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6],
 | |
|                  mask=False,
 | |
|            fill_value=999999)
 | |
| 
 | |
|     """
 | |
|     def _winsorize1D(a, low_limit, up_limit, low_include, up_include,
 | |
|                      contains_nan, nan_policy):
 | |
|         n = a.count()
 | |
|         idx = a.argsort()
 | |
|         if contains_nan:
 | |
|             nan_count = np.count_nonzero(np.isnan(a))
 | |
|         if low_limit:
 | |
|             if low_include:
 | |
|                 lowidx = int(low_limit * n)
 | |
|             else:
 | |
|                 lowidx = np.round(low_limit * n).astype(int)
 | |
|             if contains_nan and nan_policy == 'omit':
 | |
|                 lowidx = min(lowidx, n-nan_count-1)
 | |
|             a[idx[:lowidx]] = a[idx[lowidx]]
 | |
|         if up_limit is not None:
 | |
|             if up_include:
 | |
|                 upidx = n - int(n * up_limit)
 | |
|             else:
 | |
|                 upidx = n - np.round(n * up_limit).astype(int)
 | |
|             if contains_nan and nan_policy == 'omit':
 | |
|                 a[idx[upidx:-nan_count]] = a[idx[upidx - 1]]
 | |
|             else:
 | |
|                 a[idx[upidx:]] = a[idx[upidx - 1]]
 | |
|         return a
 | |
| 
 | |
|     contains_nan = _contains_nan(a, nan_policy)
 | |
|     # We are going to modify a: better make a copy
 | |
|     a = ma.array(a, copy=np.logical_not(inplace))
 | |
| 
 | |
|     if limits is None:
 | |
|         return a
 | |
|     if (not isinstance(limits, tuple)) and isinstance(limits, float):
 | |
|         limits = (limits, limits)
 | |
| 
 | |
|     # Check the limits
 | |
|     (lolim, uplim) = limits
 | |
|     errmsg = "The proportion to cut from the %s should be between 0. and 1."
 | |
|     if lolim is not None:
 | |
|         if lolim > 1. or lolim < 0:
 | |
|             raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
 | |
|     if uplim is not None:
 | |
|         if uplim > 1. or uplim < 0:
 | |
|             raise ValueError(errmsg % 'end' + f"(got {uplim})")
 | |
| 
 | |
|     (loinc, upinc) = inclusive
 | |
| 
 | |
|     if axis is None:
 | |
|         shp = a.shape
 | |
|         return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc,
 | |
|                             contains_nan, nan_policy).reshape(shp)
 | |
|     else:
 | |
|         return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
 | |
|                                    upinc, contains_nan, nan_policy)
 | |
| 
 | |
| 
 | |
| def moment(a, moment=1, axis=0):
 | |
|     """
 | |
|     Calculates the nth moment about the mean for a sample.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|        data
 | |
|     moment : int, optional
 | |
|        order of central moment that is returned
 | |
|     axis : int or None, optional
 | |
|        Axis along which the central moment is computed. Default is 0.
 | |
|        If None, compute over the whole array `a`.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     n-th central moment : ndarray or float
 | |
|        The appropriate moment along the given axis or over all values if axis
 | |
|        is None. The denominator for the moment calculation is the number of
 | |
|        observations, no degrees of freedom correction is done.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details about `moment`, see `scipy.stats.moment`.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     if a.size == 0:
 | |
|         moment_shape = list(a.shape)
 | |
|         del moment_shape[axis]
 | |
|         dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
 | |
|         # empty array, return nan(s) with shape matching `moment`
 | |
|         out_shape = (moment_shape if np.isscalar(moment)
 | |
|                      else [len(moment)] + moment_shape)
 | |
|         if len(out_shape) == 0:
 | |
|             return dtype(np.nan)
 | |
|         else:
 | |
|             return ma.array(np.full(out_shape, np.nan, dtype=dtype))
 | |
| 
 | |
|     # for array_like moment input, return a value for each.
 | |
|     if not np.isscalar(moment):
 | |
|         mean = a.mean(axis, keepdims=True)
 | |
|         mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
 | |
|         return ma.array(mmnt)
 | |
|     else:
 | |
|         return _moment(a, moment, axis)
 | |
| 
 | |
| 
 | |
| # Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
 | |
| def _moment(a, moment, axis, *, mean=None):
 | |
|     if np.abs(moment - np.round(moment)) > 0:
 | |
|         raise ValueError("All moment parameters must be integers")
 | |
| 
 | |
|     if moment == 0 or moment == 1:
 | |
|         # By definition the zeroth moment about the mean is 1, and the first
 | |
|         # moment is 0.
 | |
|         shape = list(a.shape)
 | |
|         del shape[axis]
 | |
|         dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
 | |
| 
 | |
|         if len(shape) == 0:
 | |
|             return dtype(1.0 if moment == 0 else 0.0)
 | |
|         else:
 | |
|             return (ma.ones(shape, dtype=dtype) if moment == 0
 | |
|                     else ma.zeros(shape, dtype=dtype))
 | |
|     else:
 | |
|         # Exponentiation by squares: form exponent sequence
 | |
|         n_list = [moment]
 | |
|         current_n = moment
 | |
|         while current_n > 2:
 | |
|             if current_n % 2:
 | |
|                 current_n = (current_n-1)/2
 | |
|             else:
 | |
|                 current_n /= 2
 | |
|             n_list.append(current_n)
 | |
| 
 | |
|         # Starting point for exponentiation by squares
 | |
|         mean = a.mean(axis, keepdims=True) if mean is None else mean
 | |
|         a_zero_mean = a - mean
 | |
|         if n_list[-1] == 1:
 | |
|             s = a_zero_mean.copy()
 | |
|         else:
 | |
|             s = a_zero_mean**2
 | |
| 
 | |
|         # Perform multiplications
 | |
|         for n in n_list[-2::-1]:
 | |
|             s = s**2
 | |
|             if n % 2:
 | |
|                 s *= a_zero_mean
 | |
|         return s.mean(axis)
 | |
| 
 | |
| 
 | |
| def variation(a, axis=0, ddof=0):
 | |
|     """
 | |
|     Compute the coefficient of variation.
 | |
| 
 | |
|     The coefficient of variation is the standard deviation divided by the
 | |
|     mean.  This function is equivalent to::
 | |
| 
 | |
|         np.std(x, axis=axis, ddof=ddof) / np.mean(x)
 | |
| 
 | |
|     The default for ``ddof`` is 0, but many definitions of the coefficient
 | |
|     of variation use the square root of the unbiased sample variance
 | |
|     for the sample standard deviation, which corresponds to ``ddof=1``.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         Input array.
 | |
|     axis : int or None, optional
 | |
|         Axis along which to calculate the coefficient of variation. Default
 | |
|         is 0. If None, compute over the whole array `a`.
 | |
|     ddof : int, optional
 | |
|         Delta degrees of freedom.  Default is 0.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     variation : ndarray
 | |
|         The calculated variation along the requested axis.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details about `variation`, see `scipy.stats.variation`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats.mstats import variation
 | |
|     >>> a = np.array([2,8,4])
 | |
|     >>> variation(a)
 | |
|     0.5345224838248487
 | |
|     >>> b = np.array([2,8,3,4])
 | |
|     >>> c = np.ma.masked_array(b, mask=[0,0,1,0])
 | |
|     >>> variation(c)
 | |
|     0.5345224838248487
 | |
| 
 | |
|     In the example above, it can be seen that this works the same as
 | |
|     `scipy.stats.variation` except 'stats.mstats.variation' ignores masked
 | |
|     array elements.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     return a.std(axis, ddof=ddof)/a.mean(axis)
 | |
| 
 | |
| 
 | |
| def skew(a, axis=0, bias=True):
 | |
|     """
 | |
|     Computes the skewness of a data set.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : ndarray
 | |
|         data
 | |
|     axis : int or None, optional
 | |
|         Axis along which skewness is calculated. Default is 0.
 | |
|         If None, compute over the whole array `a`.
 | |
|     bias : bool, optional
 | |
|         If False, then the calculations are corrected for statistical bias.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     skewness : ndarray
 | |
|         The skewness of values along an axis, returning 0 where all values are
 | |
|         equal.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details about `skew`, see `scipy.stats.skew`.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a,axis)
 | |
|     mean = a.mean(axis, keepdims=True)
 | |
|     m2 = _moment(a, 2, axis, mean=mean)
 | |
|     m3 = _moment(a, 3, axis, mean=mean)
 | |
|     zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
 | |
|     with np.errstate(all='ignore'):
 | |
|         vals = ma.where(zero, 0, m3 / m2**1.5)
 | |
| 
 | |
|     if not bias and zero is not ma.masked and m2 is not ma.masked:
 | |
|         n = a.count(axis)
 | |
|         can_correct = ~zero & (n > 2)
 | |
|         if can_correct.any():
 | |
|             n = np.extract(can_correct, n)
 | |
|             m2 = np.extract(can_correct, m2)
 | |
|             m3 = np.extract(can_correct, m3)
 | |
|             nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
 | |
|             np.place(vals, can_correct, nval)
 | |
|     return vals
 | |
| 
 | |
| 
 | |
| def kurtosis(a, axis=0, fisher=True, bias=True):
 | |
|     """
 | |
|     Computes the kurtosis (Fisher or Pearson) of a dataset.
 | |
| 
 | |
|     Kurtosis is the fourth central moment divided by the square of the
 | |
|     variance. If Fisher's definition is used, then 3.0 is subtracted from
 | |
|     the result to give 0.0 for a normal distribution.
 | |
| 
 | |
|     If bias is False then the kurtosis is calculated using k statistics to
 | |
|     eliminate bias coming from biased moment estimators
 | |
| 
 | |
|     Use `kurtosistest` to see if result is close enough to normal.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array
 | |
|         data for which the kurtosis is calculated
 | |
|     axis : int or None, optional
 | |
|         Axis along which the kurtosis is calculated. Default is 0.
 | |
|         If None, compute over the whole array `a`.
 | |
|     fisher : bool, optional
 | |
|         If True, Fisher's definition is used (normal ==> 0.0). If False,
 | |
|         Pearson's definition is used (normal ==> 3.0).
 | |
|     bias : bool, optional
 | |
|         If False, then the calculations are corrected for statistical bias.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     kurtosis : array
 | |
|         The kurtosis of values along an axis. If all values are equal,
 | |
|         return -3 for Fisher's definition and 0 for Pearson's definition.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details about `kurtosis`, see `scipy.stats.kurtosis`.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     mean = a.mean(axis, keepdims=True)
 | |
|     m2 = _moment(a, 2, axis, mean=mean)
 | |
|     m4 = _moment(a, 4, axis, mean=mean)
 | |
|     zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
 | |
|     with np.errstate(all='ignore'):
 | |
|         vals = ma.where(zero, 0, m4 / m2**2.0)
 | |
| 
 | |
|     if not bias and zero is not ma.masked and m2 is not ma.masked:
 | |
|         n = a.count(axis)
 | |
|         can_correct = ~zero & (n > 3)
 | |
|         if can_correct.any():
 | |
|             n = np.extract(can_correct, n)
 | |
|             m2 = np.extract(can_correct, m2)
 | |
|             m4 = np.extract(can_correct, m4)
 | |
|             nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
 | |
|             np.place(vals, can_correct, nval+3.0)
 | |
|     if fisher:
 | |
|         return vals - 3
 | |
|     else:
 | |
|         return vals
 | |
| 
 | |
| 
 | |
| DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
 | |
|                                                'variance', 'skewness',
 | |
|                                                'kurtosis'))
 | |
| 
 | |
| 
 | |
| def describe(a, axis=0, ddof=0, bias=True):
 | |
|     """
 | |
|     Computes several descriptive statistics of the passed array.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         Data array
 | |
|     axis : int or None, optional
 | |
|         Axis along which to calculate statistics. Default 0. If None,
 | |
|         compute over the whole array `a`.
 | |
|     ddof : int, optional
 | |
|         degree of freedom (default 0); note that default ddof is different
 | |
|         from the same routine in stats.describe
 | |
|     bias : bool, optional
 | |
|         If False, then the skewness and kurtosis calculations are corrected for
 | |
|         statistical bias.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     nobs : int
 | |
|         (size of the data (discarding missing values)
 | |
| 
 | |
|     minmax : (int, int)
 | |
|         min, max
 | |
| 
 | |
|     mean : float
 | |
|         arithmetic mean
 | |
| 
 | |
|     variance : float
 | |
|         unbiased variance
 | |
| 
 | |
|     skewness : float
 | |
|         biased skewness
 | |
| 
 | |
|     kurtosis : float
 | |
|         biased kurtosis
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats.mstats import describe
 | |
|     >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
 | |
|     >>> describe(ma)
 | |
|     DescribeResult(nobs=np.int64(3), minmax=(masked_array(data=0,
 | |
|                  mask=False,
 | |
|            fill_value=999999), masked_array(data=2,
 | |
|                  mask=False,
 | |
|            fill_value=999999)), mean=np.float64(1.0),
 | |
|            variance=np.float64(0.6666666666666666),
 | |
|            skewness=masked_array(data=0., mask=False, fill_value=1e+20),
 | |
|             kurtosis=np.float64(-1.5))
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     n = a.count(axis)
 | |
|     mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis))
 | |
|     m = a.mean(axis)
 | |
|     v = a.var(axis, ddof=ddof)
 | |
|     sk = skew(a, axis, bias=bias)
 | |
|     kurt = kurtosis(a, axis, bias=bias)
 | |
| 
 | |
|     return DescribeResult(n, mm, m, v, sk, kurt)
 | |
| 
 | |
| 
 | |
| def stde_median(data, axis=None):
 | |
|     """Returns the McKean-Schrader estimate of the standard error of the sample
 | |
|     median along the given axis. masked values are discarded.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     data : ndarray
 | |
|         Data to trim.
 | |
|     axis : {None,int}, optional
 | |
|         Axis along which to perform the trimming.
 | |
|         If None, the input array is first flattened.
 | |
| 
 | |
|     """
 | |
|     def _stdemed_1D(data):
 | |
|         data = np.sort(data.compressed())
 | |
|         n = len(data)
 | |
|         z = 2.5758293035489004
 | |
|         k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
 | |
|         return ((data[n-k] - data[k-1])/(2.*z))
 | |
| 
 | |
|     data = ma.array(data, copy=False, subok=True)
 | |
|     if (axis is None):
 | |
|         return _stdemed_1D(data)
 | |
|     else:
 | |
|         if data.ndim > 2:
 | |
|             raise ValueError(f"Array 'data' must be at most two dimensional, "
 | |
|                              f"but got data.ndim = {data.ndim}")
 | |
|         return ma.apply_along_axis(_stdemed_1D, axis, data)
 | |
| 
 | |
| 
 | |
| SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def skewtest(a, axis=0, alternative='two-sided'):
 | |
|     """
 | |
|     Tests whether the skew is different from the normal distribution.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         The data to be tested
 | |
|     axis : int or None, optional
 | |
|        Axis along which statistics are calculated. Default is 0.
 | |
|        If None, compute over the whole array `a`.
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Defines the alternative hypothesis. Default is 'two-sided'.
 | |
|         The following options are available:
 | |
| 
 | |
|         * 'two-sided': the skewness of the distribution underlying the sample
 | |
|           is different from that of the normal distribution (i.e. 0)
 | |
|         * 'less': the skewness of the distribution underlying the sample
 | |
|           is less than that of the normal distribution
 | |
|         * 'greater': the skewness of the distribution underlying the sample
 | |
|           is greater than that of the normal distribution
 | |
| 
 | |
|         .. versionadded:: 1.7.0
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : array_like
 | |
|         The computed z-score for this test.
 | |
|     pvalue : array_like
 | |
|         A p-value for the hypothesis test
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details about `skewtest`, see `scipy.stats.skewtest`.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     if axis is None:
 | |
|         a = a.ravel()
 | |
|         axis = 0
 | |
|     b2 = skew(a,axis)
 | |
|     n = a.count(axis)
 | |
|     if np.min(n) < 8:
 | |
|         raise ValueError(f"skewtest is not valid with less than 8 samples; "
 | |
|                          f"{np.min(n)} samples were given.")
 | |
| 
 | |
|     y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
 | |
|     beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
 | |
|     W2 = -1 + ma.sqrt(2*(beta2-1))
 | |
|     delta = 1/ma.sqrt(0.5*ma.log(W2))
 | |
|     alpha = ma.sqrt(2.0/(W2-1))
 | |
|     y = ma.where(y == 0, 1, y)
 | |
|     Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
 | |
|     pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
 | |
| 
 | |
|     return SkewtestResult(Z[()], pvalue[()])
 | |
| 
 | |
| 
 | |
| KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def kurtosistest(a, axis=0, alternative='two-sided'):
 | |
|     """
 | |
|     Tests whether a dataset has normal kurtosis
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         array of the sample data
 | |
|     axis : int or None, optional
 | |
|        Axis along which to compute test. Default is 0. If None,
 | |
|        compute over the whole array `a`.
 | |
|     alternative : {'two-sided', 'less', 'greater'}, optional
 | |
|         Defines the alternative hypothesis.
 | |
|         The following options are available (default is 'two-sided'):
 | |
| 
 | |
|         * 'two-sided': the kurtosis of the distribution underlying the sample
 | |
|           is different from that of the normal distribution
 | |
|         * 'less': the kurtosis of the distribution underlying the sample
 | |
|           is less than that of the normal distribution
 | |
|         * 'greater': the kurtosis of the distribution underlying the sample
 | |
|           is greater than that of the normal distribution
 | |
| 
 | |
|         .. versionadded:: 1.7.0
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : array_like
 | |
|         The computed z-score for this test.
 | |
|     pvalue : array_like
 | |
|         The p-value for the hypothesis test
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details about `kurtosistest`, see `scipy.stats.kurtosistest`.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     n = a.count(axis=axis)
 | |
|     if np.min(n) < 5:
 | |
|         raise ValueError(f"kurtosistest requires at least 5 observations; "
 | |
|                          f"{np.min(n)} observations were given.")
 | |
|     if np.min(n) < 20:
 | |
|         warnings.warn(f"kurtosistest only valid for n>=20 ... continuing "
 | |
|                       f"anyway, n={np.min(n)}", stacklevel=2)
 | |
| 
 | |
|     b2 = kurtosis(a, axis, fisher=False)
 | |
|     E = 3.0*(n-1) / (n+1)
 | |
|     varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
 | |
|     x = (b2-E)/ma.sqrt(varb2)
 | |
|     sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
 | |
|                                                         (n*(n-2)*(n-3)))
 | |
|     A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
 | |
|     term1 = 1 - 2./(9.0*A)
 | |
|     denom = 1 + x*ma.sqrt(2/(A-4.0))
 | |
|     if np.ma.isMaskedArray(denom):
 | |
|         # For multi-dimensional array input
 | |
|         denom[denom == 0.0] = masked
 | |
|     elif denom == 0.0:
 | |
|         denom = masked
 | |
| 
 | |
|     term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
 | |
|                         -ma.power(-(1-2.0/A)/denom, 1/3.0))
 | |
|     Z = (term1 - term2) / np.sqrt(2/(9.0*A))
 | |
|     pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
 | |
| 
 | |
|     return KurtosistestResult(Z[()], pvalue[()])
 | |
| 
 | |
| 
 | |
| NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def normaltest(a, axis=0):
 | |
|     """
 | |
|     Tests whether a sample differs from a normal distribution.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         The array containing the data to be tested.
 | |
|     axis : int or None, optional
 | |
|         Axis along which to compute test. Default is 0. If None,
 | |
|         compute over the whole array `a`.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float or array
 | |
|         ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
 | |
|         ``k`` is the z-score returned by `kurtosistest`.
 | |
|     pvalue : float or array
 | |
|        A 2-sided chi squared probability for the hypothesis test.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details about `normaltest`, see `scipy.stats.normaltest`.
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     s, _ = skewtest(a, axis)
 | |
|     k, _ = kurtosistest(a, axis)
 | |
|     k2 = s*s + k*k
 | |
| 
 | |
|     return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
 | |
| 
 | |
| 
 | |
| def mquantiles(a, prob=(.25, .5, .75), alphap=.4, betap=.4, axis=None,
 | |
|                limit=()):
 | |
|     """
 | |
|     Computes empirical quantiles for a data array.
 | |
| 
 | |
|     Samples quantile are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
 | |
|     where ``x[j]`` is the j-th order statistic, and gamma is a function of
 | |
|     ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
 | |
|     ``g = n*p + m - j``.
 | |
| 
 | |
|     Reinterpreting the above equations to compare to **R** lead to the
 | |
|     equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``
 | |
| 
 | |
|     Typical values of (alphap,betap) are:
 | |
|         - (0,1)    : ``p(k) = k/n`` : linear interpolation of cdf
 | |
|           (**R** type 4)
 | |
|         - (.5,.5)  : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
 | |
|           (**R** type 5)
 | |
|         - (0,0)    : ``p(k) = k/(n+1)`` :
 | |
|           (**R** type 6)
 | |
|         - (1,1)    : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
 | |
|           (**R** type 7, **R** default)
 | |
|         - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
 | |
|           The resulting quantile estimates are approximately median-unbiased
 | |
|           regardless of the distribution of x.
 | |
|           (**R** type 8)
 | |
|         - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
 | |
|           The resulting quantile estimates are approximately unbiased
 | |
|           if x is normally distributed
 | |
|           (**R** type 9)
 | |
|         - (.4,.4)  : approximately quantile unbiased (Cunnane)
 | |
|         - (.35,.35): APL, used with PWM
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         Input data, as a sequence or array of dimension at most 2.
 | |
|     prob : array_like, optional
 | |
|         List of quantiles to compute.
 | |
|     alphap : float, optional
 | |
|         Plotting positions parameter, default is 0.4.
 | |
|     betap : float, optional
 | |
|         Plotting positions parameter, default is 0.4.
 | |
|     axis : int, optional
 | |
|         Axis along which to perform the trimming.
 | |
|         If None (default), the input array is first flattened.
 | |
|     limit : tuple, optional
 | |
|         Tuple of (lower, upper) values.
 | |
|         Values of `a` outside this open interval are ignored.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     mquantiles : MaskedArray
 | |
|         An array containing the calculated quantiles.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     This formulation is very similar to **R** except the calculation of
 | |
|     ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
 | |
|     with each type.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] *R* statistical software: https://www.r-project.org/
 | |
|     .. [2] *R* ``quantile`` function:
 | |
|             http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats.mstats import mquantiles
 | |
|     >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
 | |
|     >>> mquantiles(a)
 | |
|     array([ 19.2,  40. ,  42.8])
 | |
| 
 | |
|     Using a 2D array, specifying axis and limit.
 | |
| 
 | |
|     >>> data = np.array([[   6.,    7.,    1.],
 | |
|     ...                  [  47.,   15.,    2.],
 | |
|     ...                  [  49.,   36.,    3.],
 | |
|     ...                  [  15.,   39.,    4.],
 | |
|     ...                  [  42.,   40., -999.],
 | |
|     ...                  [  41.,   41., -999.],
 | |
|     ...                  [   7., -999., -999.],
 | |
|     ...                  [  39., -999., -999.],
 | |
|     ...                  [  43., -999., -999.],
 | |
|     ...                  [  40., -999., -999.],
 | |
|     ...                  [  36., -999., -999.]])
 | |
|     >>> print(mquantiles(data, axis=0, limit=(0, 50)))
 | |
|     [[19.2  14.6   1.45]
 | |
|      [40.   37.5   2.5 ]
 | |
|      [42.8  40.05  3.55]]
 | |
| 
 | |
|     >>> data[:, 2] = -999.
 | |
|     >>> print(mquantiles(data, axis=0, limit=(0, 50)))
 | |
|     [[19.200000000000003 14.6 --]
 | |
|      [40.0 37.5 --]
 | |
|      [42.800000000000004 40.05 --]]
 | |
| 
 | |
|     """
 | |
|     def _quantiles1D(data,m,p):
 | |
|         x = np.sort(data.compressed())
 | |
|         n = len(x)
 | |
|         if n == 0:
 | |
|             return ma.array(np.empty(len(p), dtype=float), mask=True)
 | |
|         elif n == 1:
 | |
|             return ma.array(np.resize(x, p.shape), mask=nomask)
 | |
|         aleph = (n*p + m)
 | |
|         k = np.floor(aleph.clip(1, n-1)).astype(int)
 | |
|         gamma = (aleph-k).clip(0,1)
 | |
|         return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
 | |
| 
 | |
|     data = ma.array(a, copy=False)
 | |
|     if data.ndim > 2:
 | |
|         raise TypeError("Array should be 2D at most !")
 | |
| 
 | |
|     if limit:
 | |
|         condition = (limit[0] < data) & (data < limit[1])
 | |
|         data[~condition.filled(True)] = masked
 | |
| 
 | |
|     p = np.atleast_1d(np.asarray(prob))
 | |
|     m = alphap + p*(1.-alphap-betap)
 | |
|     # Computes quantiles along axis (or globally)
 | |
|     if (axis is None):
 | |
|         return _quantiles1D(data, m, p)
 | |
| 
 | |
|     return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
 | |
| 
 | |
| 
 | |
| def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
 | |
|     """Calculate the score at the given 'per' percentile of the
 | |
|     sequence a.  For example, the score at per=50 is the median.
 | |
| 
 | |
|     This function is a shortcut to mquantile
 | |
| 
 | |
|     """
 | |
|     if (per < 0) or (per > 100.):
 | |
|         raise ValueError(f"The percentile should be between 0. and 100. ! (got {per})")
 | |
| 
 | |
|     return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
 | |
|                       limit=limit, axis=0).squeeze()
 | |
| 
 | |
| 
 | |
| def plotting_positions(data, alpha=0.4, beta=0.4):
 | |
|     """
 | |
|     Returns plotting positions (or empirical percentile points) for the data.
 | |
| 
 | |
|     Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
 | |
|         - i is the rank order statistics
 | |
|         - n is the number of unmasked values along the given axis
 | |
|         - `alpha` and `beta` are two parameters.
 | |
| 
 | |
|     Typical values for `alpha` and `beta` are:
 | |
|         - (0,1)    : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
 | |
|         - (.5,.5)  : ``p(k) = (k-1/2.)/n``, piecewise linear function
 | |
|           (R, type 5)
 | |
|         - (0,0)    : ``p(k) = k/(n+1)``, Weibull (R type 6)
 | |
|         - (1,1)    : ``p(k) = (k-1)/(n-1)``, in this case,
 | |
|           ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
 | |
|         - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
 | |
|           ``p(k) ~ median[F(x[k])]``.
 | |
|           The resulting quantile estimates are approximately median-unbiased
 | |
|           regardless of the distribution of x. (R type 8)
 | |
|         - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
 | |
|           The resulting quantile estimates are approximately unbiased
 | |
|           if x is normally distributed (R type 9)
 | |
|         - (.4,.4)  : approximately quantile unbiased (Cunnane)
 | |
|         - (.35,.35): APL, used with PWM
 | |
|         - (.3175, .3175): used in scipy.stats.probplot
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     data : array_like
 | |
|         Input data, as a sequence or array of dimension at most 2.
 | |
|     alpha : float, optional
 | |
|         Plotting positions parameter. Default is 0.4.
 | |
|     beta : float, optional
 | |
|         Plotting positions parameter. Default is 0.4.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     positions : MaskedArray
 | |
|         The calculated plotting positions.
 | |
| 
 | |
|     """
 | |
|     data = ma.array(data, copy=False).reshape(1,-1)
 | |
|     n = data.count()
 | |
|     plpos = np.empty(data.size, dtype=float)
 | |
|     plpos[n:] = 0
 | |
|     plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
 | |
|                                           (n + 1.0 - alpha - beta))
 | |
|     return ma.array(plpos, mask=data._mask)
 | |
| 
 | |
| 
 | |
| meppf = plotting_positions
 | |
| 
 | |
| 
 | |
| def obrientransform(*args):
 | |
|     """
 | |
|     Computes a transform on input data (any number of columns).  Used to
 | |
|     test for homogeneity of variance prior to running one-way stats.  Each
 | |
|     array in ``*args`` is one level of a factor.  If an `f_oneway()` run on
 | |
|     the transformed data and found significant, variances are unequal.   From
 | |
|     Maxwell and Delaney, p.112.
 | |
| 
 | |
|     Returns: transformed data for use in an ANOVA
 | |
|     """
 | |
|     data = argstoarray(*args).T
 | |
|     v = data.var(axis=0,ddof=1)
 | |
|     m = data.mean(0)
 | |
|     n = data.count(0).astype(float)
 | |
|     # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
 | |
|     data -= m
 | |
|     data **= 2
 | |
|     data *= (n-1.5)*n
 | |
|     data -= 0.5*v*(n-1)
 | |
|     data /= (n-1.)*(n-2.)
 | |
|     if not ma.allclose(v,data.mean(0)):
 | |
|         raise ValueError("Lack of convergence in obrientransform.")
 | |
| 
 | |
|     return data
 | |
| 
 | |
| 
 | |
| def sem(a, axis=0, ddof=1):
 | |
|     """
 | |
|     Calculates the standard error of the mean of the input array.
 | |
| 
 | |
|     Also sometimes called standard error of measurement.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     a : array_like
 | |
|         An array containing the values for which the standard error is
 | |
|         returned.
 | |
|     axis : int or None, optional
 | |
|         If axis is None, ravel `a` first. If axis is an integer, this will be
 | |
|         the axis over which to operate. Defaults to 0.
 | |
|     ddof : int, optional
 | |
|         Delta degrees-of-freedom. How many degrees of freedom to adjust
 | |
|         for bias in limited samples relative to the population estimate
 | |
|         of variance. Defaults to 1.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     s : ndarray or float
 | |
|         The standard error of the mean in the sample(s), along the input axis.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     The default value for `ddof` changed in scipy 0.15.0 to be consistent with
 | |
|     `scipy.stats.sem` as well as with the most common definition used (like in
 | |
|     the R documentation).
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     Find standard error along the first axis:
 | |
| 
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy import stats
 | |
|     >>> a = np.arange(20).reshape(5,4)
 | |
|     >>> print(stats.mstats.sem(a))
 | |
|     [2.8284271247461903 2.8284271247461903 2.8284271247461903
 | |
|      2.8284271247461903]
 | |
| 
 | |
|     Find standard error across the whole array, using n degrees of freedom:
 | |
| 
 | |
|     >>> print(stats.mstats.sem(a, axis=None, ddof=0))
 | |
|     1.2893796958227628
 | |
| 
 | |
|     """
 | |
|     a, axis = _chk_asarray(a, axis)
 | |
|     n = a.count(axis=axis)
 | |
|     s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n)
 | |
|     return s
 | |
| 
 | |
| 
 | |
| F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def f_oneway(*args):
 | |
|     """
 | |
|     Performs a 1-way ANOVA, returning an F-value and probability given
 | |
|     any number of groups.  From Heiman, pp.394-7.
 | |
| 
 | |
|     Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
 | |
|     one per treatment group.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float
 | |
|         The computed F-value of the test.
 | |
|     pvalue : float
 | |
|         The associated p-value from the F-distribution.
 | |
| 
 | |
|     """
 | |
|     # Construct a single array of arguments: each row is a group
 | |
|     data = argstoarray(*args)
 | |
|     ngroups = len(data)
 | |
|     ntot = data.count()
 | |
|     sstot = (data**2).sum() - (data.sum())**2/float(ntot)
 | |
|     ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
 | |
|     sswg = sstot-ssbg
 | |
|     dfbg = ngroups-1
 | |
|     dfwg = ntot - ngroups
 | |
|     msb = ssbg/float(dfbg)
 | |
|     msw = sswg/float(dfwg)
 | |
|     f = msb/msw
 | |
|     prob = special.fdtrc(dfbg, dfwg, f)  # equivalent to stats.f.sf
 | |
| 
 | |
|     return F_onewayResult(f, prob)
 | |
| 
 | |
| 
 | |
| FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
 | |
|                                      ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def friedmanchisquare(*args):
 | |
|     """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
 | |
|     This function calculates the Friedman Chi-square test for repeated measures
 | |
|     and returns the result, along with the associated probability value.
 | |
| 
 | |
|     Each input is considered a given group. Ideally, the number of treatments
 | |
|     among each group should be equal. If this is not the case, only the first
 | |
|     n treatments are taken into account, where n is the number of treatments
 | |
|     of the smallest group.
 | |
|     If a group has some missing values, the corresponding treatments are masked
 | |
|     in the other groups.
 | |
|     The test statistic is corrected for ties.
 | |
| 
 | |
|     Masked values in one group are propagated to the other groups.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float
 | |
|         the test statistic.
 | |
|     pvalue : float
 | |
|         the associated p-value.
 | |
| 
 | |
|     """
 | |
|     data = argstoarray(*args).astype(float)
 | |
|     k = len(data)
 | |
|     if k < 3:
 | |
|         raise ValueError(f"Less than 3 groups ({k}): the Friedman test "
 | |
|                          f"is NOT appropriate.")
 | |
| 
 | |
|     ranked = ma.masked_values(rankdata(data, axis=0), 0)
 | |
|     if ranked._mask is not nomask:
 | |
|         ranked = ma.mask_cols(ranked)
 | |
|         ranked = ranked.compressed().reshape(k,-1).view(ndarray)
 | |
|     else:
 | |
|         ranked = ranked._data
 | |
|     (k,n) = ranked.shape
 | |
|     # Ties correction
 | |
|     repeats = [find_repeats(row) for row in ranked.T]
 | |
|     ties = np.array([y for x, y in repeats if x.size > 0])
 | |
|     tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
 | |
| 
 | |
|     ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
 | |
|     chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
 | |
| 
 | |
|     return FriedmanchisquareResult(chisq,
 | |
|                                    distributions.chi2.sf(chisq, k-1))
 | |
| 
 | |
| 
 | |
| BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))
 | |
| 
 | |
| 
 | |
| def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
 | |
|     """
 | |
|     Compute the Brunner-Munzel test on samples x and y.
 | |
| 
 | |
|     Any missing values in `x` and/or `y` are discarded.
 | |
| 
 | |
|     The Brunner-Munzel test is a nonparametric test of the null hypothesis that
 | |
|     when values are taken one by one from each group, the probabilities of
 | |
|     getting large values in both groups are equal.
 | |
|     Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the
 | |
|     assumption of equivariance of two groups. Note that this does not assume
 | |
|     the distributions are same. This test works on two independent samples,
 | |
|     which may have different sizes.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     x, y : array_like
 | |
|         Array of samples, should be one-dimensional.
 | |
|     alternative : 'less', 'two-sided', or 'greater', optional
 | |
|         Whether to get the p-value for the one-sided hypothesis ('less'
 | |
|         or 'greater') or for the two-sided hypothesis ('two-sided').
 | |
|         Defaults value is 'two-sided' .
 | |
|     distribution : 't' or 'normal', optional
 | |
|         Whether to get the p-value by t-distribution or by standard normal
 | |
|         distribution.
 | |
|         Defaults value is 't' .
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     statistic : float
 | |
|         The Brunner-Munzer W statistic.
 | |
|     pvalue : float
 | |
|         p-value assuming an t distribution. One-sided or
 | |
|         two-sided, depending on the choice of `alternative` and `distribution`.
 | |
| 
 | |
|     See Also
 | |
|     --------
 | |
|     mannwhitneyu : Mann-Whitney rank test on two samples.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`.
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     >>> from scipy.stats.mstats import brunnermunzel
 | |
|     >>> import numpy as np
 | |
|     >>> x1 = [1, 2, np.nan, np.nan, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1]
 | |
|     >>> x2 = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4]
 | |
|     >>> brunnermunzel(x1, x2)
 | |
|     BrunnerMunzelResult(statistic=1.4723186918922935, pvalue=0.15479415300426624)  # may vary
 | |
| 
 | |
|     """  # noqa: E501
 | |
|     x = ma.asarray(x).compressed().view(ndarray)
 | |
|     y = ma.asarray(y).compressed().view(ndarray)
 | |
|     nx = len(x)
 | |
|     ny = len(y)
 | |
|     if nx == 0 or ny == 0:
 | |
|         return BrunnerMunzelResult(np.nan, np.nan)
 | |
|     rankc = rankdata(np.concatenate((x,y)))
 | |
|     rankcx = rankc[0:nx]
 | |
|     rankcy = rankc[nx:nx+ny]
 | |
|     rankcx_mean = np.mean(rankcx)
 | |
|     rankcy_mean = np.mean(rankcy)
 | |
|     rankx = rankdata(x)
 | |
|     ranky = rankdata(y)
 | |
|     rankx_mean = np.mean(rankx)
 | |
|     ranky_mean = np.mean(ranky)
 | |
| 
 | |
|     Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
 | |
|     Sx /= nx - 1
 | |
|     Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
 | |
|     Sy /= ny - 1
 | |
| 
 | |
|     wbfn = nx * ny * (rankcy_mean - rankcx_mean)
 | |
|     wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
 | |
| 
 | |
|     if distribution == "t":
 | |
|         df_numer = np.power(nx * Sx + ny * Sy, 2.0)
 | |
|         df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
 | |
|         df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
 | |
|         df = df_numer / df_denom
 | |
|         p = distributions.t.cdf(wbfn, df)
 | |
|     elif distribution == "normal":
 | |
|         p = distributions.norm.cdf(wbfn)
 | |
|     else:
 | |
|         raise ValueError(
 | |
|             "distribution should be 't' or 'normal'")
 | |
| 
 | |
|     if alternative == "greater":
 | |
|         pass
 | |
|     elif alternative == "less":
 | |
|         p = 1 - p
 | |
|     elif alternative == "two-sided":
 | |
|         p = 2 * np.min([p, 1-p])
 | |
|     else:
 | |
|         raise ValueError(
 | |
|             "alternative should be 'less', 'greater' or 'two-sided'")
 | |
| 
 | |
|     return BrunnerMunzelResult(wbfn, p)
 |