| """
 | |
| Created on Fri Apr  2 09:06:05 2021
 | |
| 
 | |
| @author: matth
 | |
| """
 | |
| 
 | |
| import math
 | |
| import numpy as np
 | |
| from scipy import special
 | |
| from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
 | |
| from scipy._lib._array_api import array_namespace, xp_promote
 | |
| from scipy._lib import array_api_extra as xpx
 | |
| 
 | |
| __all__ = ['entropy', 'differential_entropy']
 | |
| 
 | |
| 
 | |
| @_axis_nan_policy_factory(
 | |
|     lambda x: x,
 | |
|     n_samples=lambda kwgs: (
 | |
|         2 if ("qk" in kwgs and kwgs["qk"] is not None)
 | |
|         else 1
 | |
|     ),
 | |
|     n_outputs=1, result_to_tuple=lambda x, _: (x,), paired=True,
 | |
|     too_small=-1  # entropy doesn't have too small inputs
 | |
| )
 | |
| def entropy(pk: np.typing.ArrayLike,
 | |
|             qk: np.typing.ArrayLike | None = None,
 | |
|             base: float | None = None,
 | |
|             axis: int = 0
 | |
|             ) -> np.number | np.ndarray:
 | |
|     """
 | |
|     Calculate the Shannon entropy/relative entropy of given distribution(s).
 | |
| 
 | |
|     If only probabilities `pk` are given, the Shannon entropy is calculated as
 | |
|     ``H = -sum(pk * log(pk))``.
 | |
| 
 | |
|     If `qk` is not None, then compute the relative entropy
 | |
|     ``D = sum(pk * log(pk / qk))``. This quantity is also known
 | |
|     as the Kullback-Leibler divergence.
 | |
| 
 | |
|     This routine will normalize `pk` and `qk` if they don't sum to 1.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     pk : array_like
 | |
|         Defines the (discrete) distribution. Along each axis-slice of ``pk``,
 | |
|         element ``i`` is the  (possibly unnormalized) probability of event
 | |
|         ``i``.
 | |
|     qk : array_like, optional
 | |
|         Sequence against which the relative entropy is computed. Should be in
 | |
|         the same format as `pk`.
 | |
|     base : float, optional
 | |
|         The logarithmic base to use, defaults to ``e`` (natural logarithm).
 | |
|     axis : int, optional
 | |
|         The axis along which the entropy is calculated. Default is 0.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     S : {float, array_like}
 | |
|         The calculated entropy.
 | |
| 
 | |
|     Notes
 | |
|     -----
 | |
|     Informally, the Shannon entropy quantifies the expected uncertainty
 | |
|     inherent in the possible outcomes of a discrete random variable.
 | |
|     For example,
 | |
|     if messages consisting of sequences of symbols from a set are to be
 | |
|     encoded and transmitted over a noiseless channel, then the Shannon entropy
 | |
|     ``H(pk)`` gives a tight lower bound for the average number of units of
 | |
|     information needed per symbol if the symbols occur with frequencies
 | |
|     governed by the discrete distribution `pk` [1]_. The choice of base
 | |
|     determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
 | |
| 
 | |
|     The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
 | |
|     number of units of information needed per symbol if the encoding is
 | |
|     optimized for the probability distribution `qk` instead of the true
 | |
|     distribution `pk`. Informally, the relative entropy quantifies the expected
 | |
|     excess in surprise experienced if one believes the true distribution is
 | |
|     `qk` when it is actually `pk`.
 | |
| 
 | |
|     A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
 | |
|     equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
 | |
|     the formula ``CE = -sum(pk * log(qk))``. It gives the average
 | |
|     number of units of information needed per symbol if an encoding is
 | |
|     optimized for the probability distribution `qk` when the true distribution
 | |
|     is `pk`. It is not computed directly by `entropy`, but it can be computed
 | |
|     using two calls to the function (see Examples).
 | |
| 
 | |
|     See [2]_ for more information.
 | |
| 
 | |
|     References
 | |
|     ----------
 | |
|     .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
 | |
|            Bell System Technical Journal, 27: 379-423.
 | |
|            https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
 | |
|     .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
 | |
|            Theory (Wiley Series in Telecommunications and Signal Processing).
 | |
|            Wiley-Interscience, USA.
 | |
| 
 | |
| 
 | |
|     Examples
 | |
|     --------
 | |
|     The outcome of a fair coin is the most uncertain:
 | |
| 
 | |
|     >>> import numpy as np
 | |
|     >>> from scipy.stats import entropy
 | |
|     >>> base = 2  # work in units of bits
 | |
|     >>> pk = np.array([1/2, 1/2])  # fair coin
 | |
|     >>> H = entropy(pk, base=base)
 | |
|     >>> H
 | |
|     1.0
 | |
|     >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
 | |
|     True
 | |
| 
 | |
|     The outcome of a biased coin is less uncertain:
 | |
| 
 | |
|     >>> qk = np.array([9/10, 1/10])  # biased coin
 | |
|     >>> entropy(qk, base=base)
 | |
|     0.46899559358928117
 | |
| 
 | |
|     The relative entropy between the fair coin and biased coin is calculated
 | |
|     as:
 | |
| 
 | |
|     >>> D = entropy(pk, qk, base=base)
 | |
|     >>> D
 | |
|     0.7369655941662062
 | |
|     >>> np.isclose(D, np.sum(pk * np.log(pk/qk)) / np.log(base), rtol=4e-16, atol=0)
 | |
|     True
 | |
| 
 | |
    The cross entropy can be calculated as the sum of the entropy and
    relative entropy:

    >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
    >>> CE
    1.736965594166206
    >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
    True

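    The entropy is computed along `axis` (default 0), so a 2-D input gives
    one value per axis-slice. For example, stacking the fair and biased
    coins as columns:

    >>> pk2 = np.stack([pk, qk], axis=-1)  # one coin per column
    >>> entropy(pk2, base=base, axis=0)
    array([1.        , 0.46899559])
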
|     """
 | |
|     if base is not None and base <= 0:
 | |
|         raise ValueError("`base` must be a positive number or `None`.")
 | |
| 
 | |
|     xp = array_namespace(pk) if qk is None else array_namespace(pk, qk)
 | |
| 
 | |
|     pk = xp.asarray(pk)
 | |
|     with np.errstate(invalid='ignore'):
 | |
|         pk = 1.0*pk / xp.sum(pk, axis=axis, keepdims=True)  # type: ignore[operator]
 | |
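    # `special.entr(pk)` computes -pk*log(pk) elementwise (0 where pk == 0);
    # `special.rel_entr(pk, qk)` computes pk*log(pk/qk) elementwise.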
    if qk is None:
        vec = special.entr(pk)
    else:
        qk = xp.asarray(qk)
        pk, qk = _broadcast_arrays((pk, qk), axis=None, xp=xp)  # don't ignore any axes
        sum_kwargs = dict(axis=axis, keepdims=True)
        qk = 1.0*qk / xp.sum(qk, **sum_kwargs)  # type: ignore[operator, call-overload]
        vec = special.rel_entr(pk, qk)
    S = xp.sum(vec, axis=axis)
    if base is not None:
        S /= math.log(base)
    return S


def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
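    # Mirrors the `window_length` validation in `differential_entropy`;
    # `_axis_nan_policy_factory` uses this to flag samples that are too small
    # for the given (or default) window length.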
    values = samples[0]
    n = values.shape[axis]
    window_length = kwargs.get("window_length",
                               math.floor(math.sqrt(n) + 0.5))
    if not 2 <= 2 * window_length < n:
        return True
    return False


@_axis_nan_policy_factory(
    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,),
    too_small=_differential_entropy_is_too_small
)
def differential_entropy(
    values: np.typing.ArrayLike,
    *,
    window_length: int | None = None,
    base: float | None = None,
    axis: int = 0,
    method: str = "auto",
) -> np.number | np.ndarray:
    r"""Given a sample of a distribution, estimate the differential entropy.

    Several estimation methods are available using the `method` parameter. By
    default, a method is selected based on the size of the sample.

    Parameters
    ----------
    values : sequence
        Sample from a continuous distribution.
    window_length : int, optional
        Window length for computing Vasicek estimate. Must be an integer
        between 1 and half of the sample size. If ``None`` (the default), it
        uses the heuristic value

        .. math::
            \left \lfloor \sqrt{n} + 0.5 \right \rfloor

        where :math:`n` is the sample size. This heuristic was originally
        proposed in [2]_ and has become common in the literature.
    base : float, optional
        The logarithmic base to use, defaults to ``e`` (natural logarithm).
    axis : int, optional
        The axis along which the differential entropy is calculated.
        Default is 0.
    method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
        The method used to estimate the differential entropy from the sample.
        Default is ``'auto'``.  See Notes for more information.

    Returns
    -------
    entropy : float
        The calculated differential entropy.

    Notes
    -----
    This function will converge to the true differential entropy in the limit

    .. math::
        n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0

    where :math:`n` is the sample size and :math:`m` is the ``window_length``.

    The optimal choice of ``window_length`` for a given sample size depends on
    the (unknown) distribution. Typically, the smoother the density of the
    distribution, the larger the optimal value of ``window_length`` [1]_.

    The following options are available for the `method` parameter.

    * ``'vasicek'`` uses the estimator presented in [1]_. This is
      one of the first and most influential estimators of differential entropy.
    * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
      is not only consistent but, under some conditions, asymptotically normal.
    * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
      in simulation to have smaller bias and mean squared error than
      the Vasicek estimator.
    * ``'correa'`` uses the estimator presented in [5]_ based on local linear
      regression. In a simulation study, it had consistently smaller mean
      squared error than the Vasicek estimator, but it is more expensive to
      compute.
    * ``'auto'`` selects the method automatically (default). Currently,
      this selects ``'van es'`` for very small samples (10 or fewer),
      ``'ebrahimi'`` for moderate sample sizes (11-1000), and ``'vasicek'``
      for larger samples, but this behavior is subject to change in future
      versions.

    All estimators are implemented as described in [6]_.
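
    For reference, with the sample sorted as
    :math:`X_{(1)} \le X_{(2)} \le \dots \le X_{(n)}` and window length
    :math:`m`, the Vasicek estimate computed here is

    .. math::
        \hat{H}_{V} = \frac{1}{n} \sum_{i=1}^{n} \log \left(
        \frac{n}{2m} \left( X_{(i+m)} - X_{(i-m)} \right) \right)

    where :math:`X_{(i)} = X_{(1)}` for :math:`i < 1` and
    :math:`X_{(i)} = X_{(n)}` for :math:`i > n`.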

    References
    ----------
    .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
           Journal of the Royal Statistical Society:
           Series B (Methodological), 38(1), 54-59.
    .. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
           goodness-of-fit test for exponentiality. Communications in
           Statistics-Theory and Methods, 28(5), 1183-1202.
    .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
           class of statistics based on spacings. Scandinavian Journal of
           Statistics, 61-72.
    .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
           of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
    .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
           in Statistics-Theory and Methods, 24(10), 2439-2449.
    .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
           Annals of Data Science, 2(2), 231-241.
           https://link.springer.com/article/10.1007/s40745-015-0045-9

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import differential_entropy, norm

    Entropy of a standard normal distribution:

    >>> rng = np.random.default_rng()
    >>> values = rng.standard_normal(100)
    >>> differential_entropy(values)
    1.3407817436640392

    Compare with the true entropy:

    >>> float(norm.entropy())
    1.4189385332046727

    For several sample sizes between 5 and 1000, compare the accuracy of
    the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
    compare the root mean squared error (over 1000 trials) between the estimate
    and the true differential entropy of the distribution.

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>>
    >>>
    >>> def rmse(res, expected):
    ...     '''Root mean squared error'''
    ...     return np.sqrt(np.mean((res - expected)**2))
    >>>
    >>>
    >>> a, b = np.log10(5), np.log10(1000)
    >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
    >>> reps = 1000  # number of repetitions for each sample size
    >>> expected = stats.expon.entropy()
    >>>
    >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
    >>> for method in method_errors:
    ...     for n in ns:
    ...        rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
    ...        res = stats.differential_entropy(rvs, method=method, axis=-1)
    ...        error = rmse(res, expected)
    ...        method_errors[method].append(error)
    >>>
    >>> for method, errors in method_errors.items():
    ...     plt.loglog(ns, errors, label=method)
    >>>
    >>> plt.legend()
    >>> plt.xlabel('sample size')
    >>> plt.ylabel('RMSE (1000 trials)')
    >>> plt.title('Entropy Estimator Error (Exponential Distribution)')

|     """
 | |
|     xp = array_namespace(values)
 | |
|     values = xp_promote(values, force_floating=True, xp=xp)
 | |
|     values = xp.moveaxis(values, axis, -1)
 | |
|     n = values.shape[-1]  # type: ignore[union-attr]
 | |
| 
 | |
|     if window_length is None:
 | |
|         window_length = math.floor(math.sqrt(n) + 0.5)
 | |
| 
 | |
|     if not 2 <= 2 * window_length < n:
 | |
|         raise ValueError(
 | |
|             f"Window length ({window_length}) must be positive and less "
 | |
|             f"than half the sample size ({n}).",
 | |
|         )
 | |
| 
 | |
|     if base is not None and base <= 0:
 | |
|         raise ValueError("`base` must be a positive number or `None`.")
 | |
| 
 | |
|     sorted_data = xp.sort(values, axis=-1)
 | |
| 
 | |
|     methods = {"vasicek": _vasicek_entropy,
 | |
|                "van es": _van_es_entropy,
 | |
|                "correa": _correa_entropy,
 | |
|                "ebrahimi": _ebrahimi_entropy,
 | |
|                "auto": _vasicek_entropy}
 | |
|     method = method.lower()
 | |
|     if method not in methods:
 | |
|         message = f"`method` must be one of {set(methods)}"
 | |
|         raise ValueError(message)
 | |
| 
 | |
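    # Resolve 'auto' to a concrete estimator based on the sample size; the
    # thresholds below match the behavior documented in the Notes section.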
|     if method == "auto":
 | |
|         if n <= 10:
 | |
|             method = 'van es'
 | |
|         elif n <= 1000:
 | |
|             method = 'ebrahimi'
 | |
|         else:
 | |
|             method = 'vasicek'
 | |
| 
 | |
|     res = methods[method](sorted_data, window_length, xp=xp)
 | |
| 
 | |
|     if base is not None:
 | |
|         res /= math.log(base)
 | |
| 
 | |
|     # avoid dtype changes due to data-apis/array-api-compat#152
 | |
|     # can be removed when data-apis/array-api-compat#152 is resolved
 | |
|     return xp.astype(res, values.dtype)  # type: ignore[union-attr]
 | |
| 
 | |
| 
 | |
| def _pad_along_last_axis(X, m, *, xp):
 | |
|     """Pad the data for computing the rolling window difference."""
 | |
|     # scales a  bit better than method in _vasicek_like_entropy
 | |
|     shape = X.shape[:-1] + (m,)
 | |
|     Xl = xp.broadcast_to(X[..., :1], shape)  # :1 vs 0 to maintain shape
 | |
|     Xr = xp.broadcast_to(X[..., -1:], shape)
 | |
|     return xp.concat((Xl, X, Xr), axis=-1)
 | |
| 
 | |
| 
 | |
| def _vasicek_entropy(X, m, *, xp):
 | |
|     """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
 | |
|     n = X.shape[-1]
 | |
|     X = _pad_along_last_axis(X, m, xp=xp)
 | |
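    # After padding, X[..., 2*m:] - X[..., :-2*m] gives the 2*m-spacings
    # X_{(i+m)} - X_{(i-m)}, with the order statistics clipped at the ends.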
    differences = X[..., 2 * m:] - X[..., : -2 * m:]
    logs = xp.log(n/(2*m) * differences)
    return xp.mean(logs, axis=-1)


def _van_es_entropy(X, m, *, xp):
    """Compute the van Es estimator as described in [6]."""
    # No equation number, but referred to as HVE_mn.
    # Typo: there should be a log within the summation.
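    # HVE = 1/(n-m) * sum_{i=1}^{n-m} log((n+1)/m * (X_{(i+m)} - X_{(i)}))
    #       + sum_{k=m}^{n} 1/k + log(m) - log(n+1)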
    n = X.shape[-1]
    difference = X[..., m:] - X[..., :-m]
    term1 = 1/(n-m) * xp.sum(xp.log((n+1)/m * difference), axis=-1)
    k = xp.arange(m, n+1, dtype=term1.dtype)
    return term1 + xp.sum(1/k) + math.log(m) - math.log(n+1)


def _ebrahimi_entropy(X, m, *, xp):
    """Compute the Ebrahimi estimator as described in [6]."""
    # No equation number, but referred to as HE_mn
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m, xp=xp)

    differences = X[..., 2 * m:] - X[..., : -2 * m:]

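    # The Ebrahimi weights ci correct for the truncated spacings at the
    # boundaries: ci = 1 + (i-1)/m for i <= m, ci = 2 in the interior, and
    # ci = 1 + (n-i)/m for i >= n-m+1.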
    i = xp.arange(1, n+1, dtype=X.dtype)
    ci = xp.where(i <= m, 1 + (i - 1)/m, 2.)
    cond = i >= n - m + 1
    ci = xpx.at(ci, cond).set(1 + (n - i[cond])/m)

    logs = xp.log(n * differences / (ci * m))
    return xp.mean(logs, axis=-1)


def _correa_entropy(X, m, *, xp):
    """Compute the Correa estimator as described in [6]."""
    # No equation number, but referred to as HC_mn
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m, xp=xp)

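    # num/den is the local least-squares slope of the empirical CDF (j/n
    # against X_{(j)}) over the window j = i-m, ..., i+m, i.e. a local density
    # estimate; the entropy estimate is -mean(log(density)).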
    i = xp.arange(1, n+1)
    dj = xp.arange(-m, m+1)[:, None]
    j = i + dj
    j0 = j + m - 1  # 0-indexed version of j

    Xibar = xp.mean(X[..., j0], axis=-2, keepdims=True)
    difference = X[..., j0] - Xibar
    num = xp.sum(difference*dj, axis=-2)  # dj is j - i
    den = n*xp.sum(difference**2, axis=-2)
    return -xp.mean(xp.log(num/den), axis=-1)