336 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			336 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import numpy as np
 | |
| from scipy.special import betainc
 | |
| from scipy._lib._array_api import xp_ravel, array_namespace, xp_promote
 | |
| import scipy._lib.array_api_extra as xpx
 | |
| from scipy.stats._axis_nan_policy import _broadcast_arrays, _contains_nan
 | |
| from scipy.stats._stats_py import _length_nonmasked
 | |
| 
 | |
| 
 | |
| def _quantile_iv(x, p, method, axis, nan_policy, keepdims):
 | |
|     xp = array_namespace(x, p)
 | |
| 
 | |
|     if not xp.isdtype(xp.asarray(x).dtype, ('integral', 'real floating')):
 | |
|         raise ValueError("`x` must have real dtype.")
 | |
| 
 | |
|     if not xp.isdtype(xp.asarray(p).dtype, 'real floating'):
 | |
|         raise ValueError("`p` must have real floating dtype.")
 | |
| 
 | |
|     x, p = xp_promote(x, p, force_floating=True, xp=xp)
 | |
|     dtype = x.dtype
 | |
| 
 | |
|     axis_none = axis is None
 | |
|     ndim = max(x.ndim, p.ndim)
 | |
|     if axis_none:
 | |
|         x = xp_ravel(x)
 | |
|         p = xp_ravel(p)
 | |
|         axis = 0
 | |
|     elif np.iterable(axis) or int(axis) != axis:
 | |
|         message = "`axis` must be an integer or None."
 | |
|         raise ValueError(message)
 | |
|     elif (axis >= ndim) or (axis < -ndim):
 | |
|         message = "`axis` is not compatible with the shapes of the inputs."
 | |
|         raise ValueError(message)
 | |
|     axis = int(axis)
 | |
| 
 | |
|     methods = {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
 | |
|                'hazen', 'interpolated_inverted_cdf', 'linear',
 | |
|                'median_unbiased', 'normal_unbiased', 'weibull',
 | |
|                'harrell-davis'}
 | |
|     if method not in methods:
 | |
|         message = f"`method` must be one of {methods}"
 | |
|         raise ValueError(message)
 | |
| 
 | |
|     contains_nans = _contains_nan(x, nan_policy, xp_omit_okay=True, xp=xp)
 | |
| 
 | |
|     if keepdims not in {None, True, False}:
 | |
|         message = "If specified, `keepdims` must be True or False."
 | |
|         raise ValueError(message)
 | |
| 
 | |
|     # If data has length zero along `axis`, the result will be an array of NaNs just
 | |
|     # as if the data had length 1 along axis and were filled with NaNs. This is treated
 | |
|     # naturally below whether `nan_policy` is `'propagate'` or `'omit'`.
 | |
|     if x.shape[axis] == 0:
 | |
|         shape = list(x.shape)
 | |
|         shape[axis] = 1
 | |
|         x = xp.full(shape, xp.asarray(xp.nan, dtype=dtype))
 | |
| 
 | |
|     y = xp.sort(x, axis=axis)
 | |
|     y, p = _broadcast_arrays((y, p), axis=axis)
 | |
| 
 | |
|     if (keepdims is False) and (p.shape[axis] != 1):
 | |
|         message = "`keepdims` may be False only if the length of `p` along `axis` is 1."
 | |
|         raise ValueError(message)
 | |
|     keepdims = (p.shape[axis] != 1) if keepdims is None else keepdims
 | |
| 
 | |
|     y = xp.moveaxis(y, axis, -1)
 | |
|     p = xp.moveaxis(p, axis, -1)
 | |
| 
 | |
|     n = _length_nonmasked(y, -1, xp=xp, keepdims=True)
 | |
|     n = xp.asarray(n, dtype=dtype)
 | |
|     if contains_nans:
 | |
|         nans = xp.isnan(y)
 | |
| 
 | |
|         # Note that if length along `axis` were 0 to begin with,
 | |
|         # it is now length 1 and filled with NaNs.
 | |
|         if nan_policy == 'propagate':
 | |
|             nan_out = xp.any(nans, axis=-1)
 | |
|         else:  # 'omit'
 | |
|             non_nan = xp.astype(~nans, xp.uint64)
 | |
|             n_int = xp.sum(non_nan, axis=-1, keepdims=True)
 | |
|             n = xp.astype(n_int, dtype)
 | |
|             # NaNs are produced only if slice is empty after removing NaNs
 | |
|             nan_out = xp.any(n == 0, axis=-1)
 | |
|             n = xpx.at(n, nan_out).set(y.shape[-1])  # avoids pytorch/pytorch#146211
 | |
| 
 | |
|         if xp.any(nan_out):
 | |
|             y = xp.asarray(y, copy=True)  # ensure writable
 | |
|             y = xpx.at(y, nan_out).set(xp.nan)
 | |
|         elif xp.any(nans) and method == 'harrell-davis':
 | |
|             y = xp.asarray(y, copy=True)  # ensure writable
 | |
|             y = xpx.at(y, nans).set(0)  # any non-nan will prevent NaN from propagating
 | |
| 
 | |
|     p_mask = (p > 1) | (p < 0) | xp.isnan(p)
 | |
|     if xp.any(p_mask):
 | |
|         p = xp.asarray(p, copy=True)
 | |
|         p = xpx.at(p, p_mask).set(0.5)  # these get NaN-ed out at the end
 | |
| 
 | |
|     return y, p, method, axis, nan_policy, keepdims, n, axis_none, ndim, p_mask, xp
 | |
| 
 | |
| 
 | |
def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims=None):
    """
    Compute the p-th quantile of the data along the specified axis.

    Parameters
    ----------
    x : array_like of real numbers
        Data array.
    p : array_like of float
        Probability or sequence of probabilities of the quantiles to compute.
        Values must be between 0 and 1 (inclusive); elements of the output
        corresponding with values of `p` that are NaN or outside this range
        are NaN.
        Must have length 1 along `axis` if ``keepdims=False``.
    method : str, default: 'linear'
        The method to use for estimating the quantile.
        The available options, numbered as they appear in [1]_, are:

        1. 'inverted_cdf'
        2. 'averaged_inverted_cdf'
        3. 'closest_observation'
        4. 'interpolated_inverted_cdf'
        5. 'hazen'
        6. 'weibull'
        7. 'linear'  (default)
        8. 'median_unbiased'
        9. 'normal_unbiased'

        'harrell-davis' is also available to compute the quantile estimate
        according to [2]_.
        See Notes for details.
    axis : int or None, default: 0
        Axis along which the quantiles are computed.
        ``None`` ravels both `x` and `p` before performing the calculation,
        without checking whether the original shapes were compatible.
    nan_policy : str, default: 'propagate'
        Defines how to handle NaNs in the input data `x`.

        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
          which the statistic is computed, the corresponding slice of the output
          will contain NaN(s).
        - ``omit``: NaNs will be omitted when performing the calculation.
          If insufficient data remains in the axis slice along which the
          statistic is computed, the corresponding slice of the output will
          contain NaN(s).
        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.

        This option concerns only NaNs in `x`. NaNs in `p` do not raise an
        error; the corresponding elements of the output are NaN.
    keepdims : bool, optional
        Consider the case in which `x` is 1-D and `p` is a scalar: the quantile
        is a reducing statistic, and the default behavior is to return a scalar.
        If `keepdims` is set to True, the axis will not be reduced away, and the
        result will be a 1-D array with one element.

        The general case is more subtle, since multiple quantiles may be
        requested for each axis-slice of `x`. For instance, if both `x` and `p`
        are 1-D and ``p.size > 1``, no axis can be reduced away; there must be an
        axis to contain the number of quantiles given by ``p.size``. Therefore:

        - By default, the axis will be reduced away if possible (i.e. if there is
          exactly one element of `p` per axis-slice of `x`).
        - If `keepdims` is set to True, the axis will not be reduced away.
        - If `keepdims` is set to False, the axis will be reduced away
          if possible, and an error will be raised otherwise.

    Returns
    -------
    quantile : scalar or ndarray
        The resulting quantile(s). The dtype is the result dtype of `x` and `p`.

    Raises
    ------
    ValueError
        If `x` does not have a real dtype, `p` does not have a real floating
        dtype, `axis` is not an integer or None or is incompatible with the
        shapes of the inputs, `method` is not one of the options listed
        above, `keepdims` is not None, True, or False, or `keepdims` is False
        and the length of `p` along `axis` is not 1.

    Notes
    -----
    Given a sample `x` from an underlying distribution, `quantile` provides a
    nonparametric estimate of the inverse cumulative distribution function.

    By default, this is done by interpolating between adjacent elements in
    ``y``, a sorted copy of `x`::

        (1-g)*y[j] + g*y[j+1]

    where the index ``j`` and coefficient ``g`` are the integral and
    fractional components of ``p * (n-1)``, and ``n`` is the number of
    elements in the sample.

    This is a special case of Equation 1 of H&F [1]_. More generally,

    - ``j = (p*n + m - 1) // 1``, and
    - ``g = (p*n + m - 1) % 1``,

    where ``m`` may be defined according to several different conventions.
    The preferred convention may be selected using the ``method`` parameter:

    =============================== =============== ===============
    ``method``                      number in H&F   ``m``
    =============================== =============== ===============
    ``interpolated_inverted_cdf``   4               ``0``
    ``hazen``                       5               ``1/2``
    ``weibull``                     6               ``p``
    ``linear`` (default)            7               ``1 - p``
    ``median_unbiased``             8               ``p/3 + 1/3``
    ``normal_unbiased``             9               ``p/4 + 3/8``
    =============================== =============== ===============

    Note that indices ``j`` and ``j + 1`` are clipped to the range ``0`` to
    ``n - 1`` when the results of the formula would be outside the allowed
    range of non-negative indices. When ``j`` is clipped to zero, ``g`` is
    set to zero as well. The ``-1`` in the formulas for ``j`` and ``g``
    accounts for Python's 0-based indexing.

    The table above includes only the estimators from [1]_ that are continuous
    functions of probability `p` (estimators 4-9). SciPy also provides the
    three discontinuous estimators from [1]_ (estimators 1-3), where ``j`` is
    defined as above, ``m`` is defined as follows, and ``g`` is ``0`` when
    ``index = p*n + m - 1`` is less than ``0`` and otherwise is defined below.

    1. ``inverted_cdf``: ``m = 0`` and ``g = int(index - j > 0)``
    2. ``averaged_inverted_cdf``: ``m = 0`` and
       ``g = (1 + int(index - j > 0)) / 2``
    3. ``closest_observation``: ``m = -1/2`` and
       ``g = 1 - int((index == j) & (j%2 == 1))``

    A different strategy for computing quantiles from [2]_, ``method='harrell-davis'``,
    uses a weighted combination of all elements. The weights are computed as:

    .. math::

        w_{n, i} = I_{i/n}(a, b) - I_{(i - 1)/n}(a, b)

    where :math:`n` is the number of elements in the sample,
    :math:`i` are the indices :math:`1, 2, ..., n-1, n` of the sorted elements,
    :math:`a = p (n + 1)`, :math:`b = (1 - p)(n + 1)`,
    :math:`p` is the probability of the quantile, and
    :math:`I` is the regularized, lower incomplete beta function
    (`scipy.special.betainc`).

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> x = np.asarray([[10, 8, 7, 5, 4],
    ...                 [0, 1, 2, 3, 5]])

    Take the median along the last axis.

    >>> stats.quantile(x, 0.5, axis=-1)
    array([7.,  2.])

    Take a different quantile along each axis.

    >>> stats.quantile(x, [[0.25], [0.75]], axis=-1, keepdims=True)
    array([[5.],
           [3.]])

    Take multiple quantiles along each axis.

    >>> stats.quantile(x, [0.25, 0.75], axis=-1)
    array([[5., 8.],
           [1., 3.]])

    References
    ----------
    .. [1] R. J. Hyndman and Y. Fan,
       "Sample quantiles in statistical packages,"
       The American Statistician, 50(4), pp. 361-365, 1996
    .. [2] Harrell, Frank E., and C. E. Davis.
       "A new distribution-free quantile estimator."
       Biometrika 69.3 (1982): 635-640.

    """
    # Input validation / standardization. After this call, `y` is sorted, the
    # working axis of `y` and `p` is the last one, and invalid entries of `p`
    # have been replaced by a placeholder (flagged in `p_mask`).
    temp = _quantile_iv(x, p, method, axis, nan_policy, keepdims)
    y, p, method, axis, nan_policy, keepdims, n, axis_none, ndim, p_mask, xp = temp

    # `method` has already been validated, so anything other than
    # 'harrell-davis' is one of the nine Hyndman & Fan estimators.
    if method == 'harrell-davis':
        res = _quantile_hd(y, p, n, xp)
    else:
        res = _quantile_hf(y, p, n, method, xp)

    # Results computed from placeholder probabilities are not meaningful.
    res = xpx.at(res, p_mask).set(xp.nan)

    # Reshape per axis/keepdims
    if axis_none and keepdims:
        # Inputs were raveled; restore the original dimensionality with
        # singleton axes in front of the (last) quantile axis.
        shape = (1,)*(ndim - 1) + res.shape
        res = xp.reshape(res, shape)
        axis = -1

    res = xp.moveaxis(res, -1, axis)

    if not keepdims:
        res = xp.squeeze(res, axis=axis)

    return res[()] if res.ndim == 0 else res
 | |
| 
 | |
| 
 | |
| def _quantile_hf(y, p, n, method, xp):
 | |
|     ms = dict(inverted_cdf=0, averaged_inverted_cdf=0, closest_observation=-0.5,
 | |
|               interpolated_inverted_cdf=0, hazen=0.5, weibull=p, linear=1 - p,
 | |
|               median_unbiased=p/3 + 1/3, normal_unbiased=p/4 + 3/8)
 | |
|     m = ms[method]
 | |
|     jg = p*n + m - 1
 | |
|     j = jg // 1
 | |
|     g = jg % 1
 | |
|     if method == 'inverted_cdf':
 | |
|         g = xp.astype((g > 0), jg.dtype)
 | |
|     elif method == 'averaged_inverted_cdf':
 | |
|         g = (1 + xp.astype((g > 0), jg.dtype)) / 2
 | |
|     elif method == 'closest_observation':
 | |
|         g = (1 - xp.astype((g == 0) & (j % 2 == 1), jg.dtype))
 | |
|     if method in {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation'}:
 | |
|         g = xp.asarray(g)
 | |
|         g = xpx.at(g, jg < 0).set(0)
 | |
| 
 | |
|     g[j < 0] = 0
 | |
|     j = xp.clip(j, 0., n - 1)
 | |
|     jp1 = xp.clip(j + 1, 0., n - 1)
 | |
| 
 | |
|     return ((1 - g) * xp.take_along_axis(y, xp.astype(j, xp.int64), axis=-1)
 | |
|             + g * xp.take_along_axis(y, xp.astype(jp1, xp.int64), axis=-1))
 | |
| 
 | |
| 
 | |
| def _quantile_hd(y, p, n, xp):
 | |
|     # RE axis handling: We need to perform a reducing operation over rows of `y` for
 | |
|     # each element in the corresponding row of `p` (a la Cartesian product). Strategy:
 | |
|     # move rows of `p` to an axis at the front that is orthogonal to all the rest,
 | |
|     # perform the reducing operating over the last axis, then move the front axis back
 | |
|     # to the end.
 | |
|     p = xp.moveaxis(p, -1, 0)[..., xp.newaxis]
 | |
|     a = p * (n + 1)
 | |
|     b = (1 - p) * (n + 1)
 | |
|     i = xp.arange(y.shape[-1] + 1, dtype=y.dtype)
 | |
|     w = betainc(a, b, i / n)
 | |
|     w = w[..., 1:] - w[..., :-1]
 | |
|     w = xpx.at(w, xp.isnan(w)).set(0)
 | |
|     res = xp.vecdot(w, y, axis=-1)
 | |
|     return xp.moveaxis(res, 0, -1)
 |