796 lines
		
	
	
		
			32 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			796 lines
		
	
	
		
			32 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import builtins
 | |
| from warnings import catch_warnings, simplefilter
 | |
| import numpy as np
 | |
| from operator import index
 | |
| from collections import namedtuple
 | |
| 
 | |
# Names exported by ``from <this module> import *``.
__all__ = [
    'binned_statistic',
    'binned_statistic_2d',
    'binned_statistic_dd',
]
 | |
| 
 | |
| 
 | |
# Result container returned by `binned_statistic`.
BinnedStatisticResult = namedtuple(
    'BinnedStatisticResult',
    ['statistic', 'bin_edges', 'binnumber'],
)
 | |
| 
 | |
| 
 | |
def binned_statistic(x, values, statistic='mean',
                     bins=10, range=None):
    """
    Compute a binned statistic for one or more sets of data.

    This is a generalization of a histogram function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values (or set of values) within each bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  This must be
        the same shape as `x`, or a set of sequences - each the same shape as
        `x`.  If `values` is a set of sequences, the statistic will be computed
        on each independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'std' : compute the standard deviation within each bin. This
            is implicitly calculated with ddof=0.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'min' : compute the minimum of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each bin.
            Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : int or sequence of scalars, optional
        If `bins` is an int, it defines the number of equal-width bins in the
        given range (10 by default).  If `bins` is a sequence, it defines the
        bin edges, including the rightmost edge, allowing for non-uniform bin
        widths.  Values in `x` that are smaller than lowest bin edge are
        assigned to bin number 0, values beyond the highest bin are assigned to
        ``bins[-1]``.  If the bin edges are specified, the number of bins will
        be, (nx = len(bins)-1).
    range : (float, float) or [(float, float)], optional
        The lower and upper range of the bins.  If not provided, range
        is simply ``(x.min(), x.max())``.  Values outside the range are
        ignored.

    Returns
    -------
    statistic : array
        The values of the selected statistic in each bin.
    bin_edges : array of dtype float
        Return the bin edges ``(length(statistic)+1)``.
    binnumber : 1-D ndarray of ints
        Indices of the bins (corresponding to `bin_edges`) in which each value
        of `x` belongs.  Same length as `values`.  A binnumber of `i` means the
        corresponding value is between (bin_edges[i-1], bin_edges[i]).

    See Also
    --------
    numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd

    Notes
    -----
    All but the last (righthand-most) bin is half-open.  In other words, if
    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
    but excluding 2) and the second ``[2, 3)``.  The last bin, however, is
    ``[3, 4]``, which *includes* 4.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    First some basic examples:

    Create two evenly spaced bins in the range of the given sample, and sum the
    corresponding values in each of those bins:

    >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
    BinnedStatisticResult(statistic=array([4. , 4.5]),
            bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))

    Multiple arrays of values can also be passed.  The statistic is calculated
    on each set independently:

    >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
    BinnedStatisticResult(statistic=array([[4. , 4.5],
           [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
           binnumber=array([1, 1, 1, 2, 2]))

    >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
    ...                        bins=3)
    BinnedStatisticResult(statistic=array([1., 2., 4.]),
            bin_edges=array([1., 2., 3., 4.]),
            binnumber=array([1, 2, 1, 2, 3]))

    As a second example, we now generate some random data of sailing boat speed
    as a function of wind speed, and then determine how fast our boat is for
    certain wind speeds:

    >>> rng = np.random.default_rng()
    >>> windspeed = 8 * rng.random(500)
    >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
    ...                 boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
    >>> plt.figure()
    >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
    ...            label='binned statistic of data')
    >>> plt.legend()

    Now we can use ``binnumber`` to select all datapoints with a windspeed
    below 1:

    >>> low_boatspeed = boatspeed[binnumber == 0]

    As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
    plot of a distribution that shows the mean and distribution around that
    mean per bin, on top of a regular histogram and the probability
    distribution function:

    >>> x = np.linspace(0, 5, num=500)
    >>> x_pdf = stats.maxwell.pdf(x)
    >>> samples = stats.maxwell.rvs(size=10000)

    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
    ...         statistic='mean', bins=25)
    >>> bin_width = (bin_edges[1] - bin_edges[0])
    >>> bin_centers = bin_edges[1:] - bin_width/2

    >>> plt.figure()
    >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
    ...          alpha=0.2, label='histogram of data')
    >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
    ...            label='binned statistic of data')
    >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
    >>> plt.legend(fontsize=10)
    >>> plt.show()

    """
    # A scalar `bins` has no len(); it is treated like a length-1
    # specification and forwarded to `binned_statistic_dd` untouched.
    try:
        bins_len = len(bins)
    except TypeError:
        bins_len = 1

    if bins_len != 1:
        # A sequence of edges: wrap it as the edges of the single dimension.
        bins = [np.asarray(bins, float)]

    # A bare (low, high) pair becomes a one-entry, per-dimension list.
    if range is not None and len(range) == 2:
        range = [range]

    stat, edges, binnumbers = binned_statistic_dd(
        [x], values, statistic, bins, range)

    # Only one dimension was binned, so expose its edges directly.
    return BinnedStatisticResult(stat, edges[0], binnumbers)
 | |
| 
 | |
| 
 | |
# Result container returned by `binned_statistic_2d`.
BinnedStatistic2dResult = namedtuple(
    'BinnedStatistic2dResult',
    ['statistic', 'x_edge', 'y_edge', 'binnumber'],
)
 | |
| 
 | |
| 
 | |
def binned_statistic_2d(x, y, values, statistic='mean',
                        bins=10, range=None, expand_binnumbers=False):
    """
    Compute a bidimensional binned statistic for one or more sets of data.

    This is a generalization of a histogram2d function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values (or set of values) within each bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned along the first dimension.
    y : (N,) array_like
        A sequence of values to be binned along the second dimension.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  This must be
        the same shape as `x`, or a list of sequences - each with the same
        shape as `x`.  If `values` is such a list, the statistic will be
        computed on each independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'std' : compute the standard deviation within each bin. This
            is implicitly calculated with ddof=0.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'min' : compute the minimum of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each bin.
            Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : int or [int, int] or array_like or [array, array], optional
        The bin specification:

          * the number of bins for the two dimensions (nx = ny = bins),
          * the number of bins in each dimension (nx, ny = bins),
          * the bin edges for the two dimensions (x_edge = y_edge = bins),
          * the bin edges in each dimension (x_edge, y_edge = bins).

        If the bin edges are specified, the number of bins will be,
        (nx = len(x_edge)-1, ny = len(y_edge)-1).

    range : (2,2) array_like, optional
        The leftmost and rightmost edges of the bins along each dimension
        (if not specified explicitly in the `bins` parameters):
        [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
        considered outliers and not tallied in the histogram.
    expand_binnumbers : bool, optional
        'False' (default): the returned `binnumber` is a shape (N,) array of
        linearized bin indices.
        'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
        ndarray, where each row gives the bin numbers in the corresponding
        dimension.
        See the `binnumber` returned value, and the `Examples` section.

        .. versionadded:: 0.17.0

    Returns
    -------
    statistic : (nx, ny) ndarray
        The values of the selected statistic in each two-dimensional bin.
    x_edge : (nx + 1) ndarray
        The bin edges along the first dimension.
    y_edge : (ny + 1) ndarray
        The bin edges along the second dimension.
    binnumber : (N,) array of ints or (2,N) ndarray of ints
        This assigns to each element of `sample` an integer that represents the
        bin in which this observation falls.  The representation depends on the
        `expand_binnumbers` argument.  See `Notes` for details.


    See Also
    --------
    numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd

    Notes
    -----
    Binedges:
    All but the last (righthand-most) bin is half-open.  In other words, if
    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
    but excluding 2) and the second ``[2, 3)``.  The last bin, however, is
    ``[3, 4]``, which *includes* 4.

    `binnumber`:
    This returned argument assigns to each element of `sample` an integer that
    represents the bin in which it belongs.  The representation depends on the
    `expand_binnumbers` argument. If 'False' (default): The returned
    `binnumber` is a shape (N,) array of linearized indices mapping each
    element of `sample` to its corresponding bin (using row-major ordering).
    Note that the returned linearized bin indices are used for an array with
    extra bins on the outer binedges to capture values outside of the defined
    bin bounds.
    If 'True': The returned `binnumber` is a shape (2,N) ndarray where
    each row indicates bin placements for each dimension respectively.  In each
    dimension, a binnumber of `i` means the corresponding value is between
    (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> from scipy import stats

    Calculate the counts with explicit bin-edges:

    >>> x = [0.1, 0.1, 0.1, 0.6]
    >>> y = [2.1, 2.6, 2.1, 2.1]
    >>> binx = [0.0, 0.5, 1.0]
    >>> biny = [2.0, 2.5, 3.0]
    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
    >>> ret.statistic
    array([[2., 1.],
           [1., 0.]])

    The bin in which each sample is placed is given by the `binnumber`
    returned parameter.  By default, these are the linearized bin indices:

    >>> ret.binnumber
    array([5, 6, 5, 9])

    The bin indices can also be expanded into separate entries for each
    dimension using the `expand_binnumbers` parameter:

    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
    ...                                 expand_binnumbers=True)
    >>> ret.binnumber
    array([[1, 1, 1, 2],
           [1, 2, 1, 1]])

    Which shows that the first three elements belong in the xbin 1, and the
    fourth into xbin 2; and so on for y.

    """
    # This code is based on np.histogram2d

    # A scalar `bins` has no len(); it is treated like a length-1
    # specification and forwarded as-is.
    try:
        bins_len = len(bins)
    except TypeError:
        bins_len = 1

    if bins_len not in (1, 2):
        # A single sequence of edges: use the same edges for both dimensions.
        shared_edges = np.asarray(bins, float)
        bins = [shared_edges, shared_edges]

    stat, edges, binnumbers = binned_statistic_dd(
        [x, y], values, statistic, bins, range,
        expand_binnumbers=expand_binnumbers)

    x_edge, y_edge = edges[0], edges[1]
    return BinnedStatistic2dResult(stat, x_edge, y_edge, binnumbers)
 | |
| 
 | |
| 
 | |
# Result container returned by `binned_statistic_dd`.
BinnedStatisticddResult = namedtuple(
    'BinnedStatisticddResult',
    ['statistic', 'bin_edges', 'binnumber'],
)
 | |
| 
 | |
| 
 | |
| def _bincount(x, weights):
 | |
|     if np.iscomplexobj(weights):
 | |
|         a = np.bincount(x, np.real(weights))
 | |
|         b = np.bincount(x, np.imag(weights))
 | |
|         z = a + b*1j
 | |
| 
 | |
|     else:
 | |
|         z = np.bincount(x, weights)
 | |
|     return z
 | |
| 
 | |
| 
 | |
def binned_statistic_dd(sample, values, statistic='mean',
                        bins=10, range=None, expand_binnumbers=False,
                        binned_statistic_result=None):
    """
    Compute a multidimensional binned statistic for a set of data.

    This is a generalization of a histogramdd function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values within each bin.

    Parameters
    ----------
    sample : array_like
        Data to histogram passed as a sequence of D arrays of length N, or
        as an (N,D) array.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  This must be
        the same shape as `sample`, or a list of sequences - each with the
        same shape as `sample`.  If `values` is such a list, the statistic
        will be computed on each independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'std' : compute the standard deviation within each bin. This
            is implicitly calculated with ddof=0. Empty bins will be
            represented by NaN; a bin holding a single value has a standard
            deviation of 0.
          * 'min' : compute the minimum of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each bin.
            Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : sequence or positive int, optional
        The bin specification must be in one of the following forms:

          * A sequence of arrays describing the bin edges along each dimension.
          * The number of bins for each dimension (nx, ny, ... = bins).
          * The number of bins for all dimensions (nx = ny = ... = bins).
    range : sequence, optional
        A sequence of lower and upper bin edges to be used if the edges are
        not given explicitly in `bins`. Defaults to the minimum and maximum
        values along each dimension.
    expand_binnumbers : bool, optional
        'False' (default): the returned `binnumber` is a shape (N,) array of
        linearized bin indices.
        'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
        ndarray, where each row gives the bin numbers in the corresponding
        dimension.
        See the `binnumber` returned value, and the `Examples` section of
        `binned_statistic_2d`.
    binned_statistic_result : BinnedStatisticddResult, optional
        Result of a previous call to the function in order to reuse bin edges
        and bin numbers with new values and/or a different statistic.
        To reuse bin numbers, `expand_binnumbers` must have been set to False
        (the default)

        .. versionadded:: 0.17.0

    Returns
    -------
    statistic : ndarray, shape(nx1, nx2, nx3,...)
        The values of the selected statistic in each bin.
    bin_edges : list of ndarrays
        A list of D arrays describing the (nxi + 1) bin edges for each
        dimension.
    binnumber : (N,) array of ints or (D,N) ndarray of ints
        This assigns to each element of `sample` an integer that represents the
        bin in which this observation falls.  The representation depends on the
        `expand_binnumbers` argument.  See `Notes` for details.

    Raises
    ------
    ValueError
        If `statistic` is neither callable nor one of the known statistic
        names, or if `bins` is an integer and `sample` contains non-finite
        values.
    AttributeError
        If the number of `values` elements does not match the length of each
        `sample` dimension (not checked for ``statistic='count'``), or if
        `bins` is a sequence whose length differs from the number of
        dimensions of `sample`.
    RuntimeError
        If the shape of the computed result does not match the number of
        bins (internal consistency check).

    See Also
    --------
    numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d

    Notes
    -----
    Binedges:
    All but the last (righthand-most) bin is half-open in each dimension.  In
    other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
    ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``.  The
    last bin, however, is ``[3, 4]``, which *includes* 4.

    `binnumber`:
    This returned argument assigns to each element of `sample` an integer that
    represents the bin in which it belongs.  The representation depends on the
    `expand_binnumbers` argument. If 'False' (default): The returned
    `binnumber` is a shape (N,) array of linearized indices mapping each
    element of `sample` to its corresponding bin (using row-major ordering).
    If 'True': The returned `binnumber` is a shape (D,N) ndarray where
    each row indicates bin placements for each dimension respectively.  In each
    dimension, a binnumber of `i` means the corresponding value is between
    (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.mplot3d import Axes3D

    Take an array of 600 (x, y) coordinates as an example.
    `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
    of dimension `D+1` is required.

    >>> mu = np.array([0., 1.])
    >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
    >>> multinormal = stats.multivariate_normal(mu, sigma)
    >>> data = multinormal.rvs(size=600, random_state=235412)
    >>> data.shape
    (600, 2)

    Create bins and count how many arrays fall in each bin:

    >>> N = 60
    >>> x = np.linspace(-3, 3, N)
    >>> y = np.linspace(-3, 4, N)
    >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
    ...                                 statistic='count')
    >>> bincounts = ret.statistic

    Set the volume and the location of bars:

    >>> dx = x[1] - x[0]
    >>> dy = y[1] - y[0]
    >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
    >>> z = 0

    >>> bincounts = bincounts.ravel()
    >>> x = x.ravel()
    >>> y = y.ravel()

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111, projection='3d')
    >>> with np.errstate(divide='ignore'):   # silence random axes3d warning
    ...     ax.bar3d(x, y, z, dx, dy, bincounts)

    Reuse bin numbers and bin edges with new values:

    >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
    ...                                  binned_statistic_result=ret,
    ...                                  statistic='mean')
    """
    # Reject anything that is neither a callable nor a known statistic name.
    known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
    if not callable(statistic) and statistic not in known_stats:
        raise ValueError(f'invalid statistic {statistic!r}')

    try:
        bins = index(bins)
    except TypeError:
        # bins is not an integer
        pass
    # If bins was an integer-like object, now it is an actual Python int.

    # NOTE: for _bin_edges(), see e.g. gh-11365
    if isinstance(bins, int) and not np.isfinite(sample).all():
        raise ValueError(f'{sample!r} contains non-finite values.')

    # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
    # `Dlen` is the length of elements along each dimension.
    # This code is based on np.histogramdd
    try:
        # `sample` is an ND-array.
        Dlen, Ndim = sample.shape
    except (AttributeError, ValueError):
        # `sample` is a sequence of 1D arrays.
        # Transpose so that rows are observations and columns are dimensions.
        sample = np.atleast_2d(sample).T
        Dlen, Ndim = sample.shape

    # Store initial shape of `values` to preserve it in the output
    values = np.asarray(values)
    input_shape = list(values.shape)
    # Make sure that `values` is 2D to iterate over rows
    values = np.atleast_2d(values)
    Vdim, Vlen = values.shape

    # Make sure `values` match `sample`
    # ('count' does not reference `values`, so its length is not checked).
    if statistic != 'count' and Vlen != Dlen:
        raise AttributeError('The number of `values` elements must match the '
                             'length of each `sample` dimension.')

    # A sequence `bins` must have one entry per dimension; a scalar `bins`
    # (no len()) is repeated for every dimension.
    try:
        M = len(bins)
        if M != Ndim:
            raise AttributeError('The dimension of bins must be equal '
                                 'to the dimension of the sample x.')
    except TypeError:
        bins = Ndim * [bins]

    if binned_statistic_result is None:
        nbin, edges, dedges = _bin_edges(sample, bins, range)
        binnumbers = _bin_numbers(sample, nbin, edges, dedges)
    else:
        # Reuse the edges and bin numbers of a previous call instead of
        # recomputing them from `sample`, `bins` and `range`.
        edges = binned_statistic_result.bin_edges
        nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
        # +1 for outlier bins
        # NOTE(review): `dedges` is not read again in this function after
        # this branch.
        dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
        binnumbers = binned_statistic_result.binnumber

    # Avoid overflow with double precision. Complex `values` -> `complex128`.
    result_type = np.result_type(values, np.float64)
    # One row per set of values, one column per linearized bin
    # (outlier bins included).
    result = np.empty([Vdim, nbin.prod()], dtype=result_type)

    if statistic in {'mean', np.mean}:
        result.fill(np.nan)
        flatcount = _bincount(binnumbers, None)
        # Only bins that received at least one point are written; the rest
        # keep the NaN fill.
        a = flatcount.nonzero()
        for vv in builtins.range(Vdim):
            flatsum = _bincount(binnumbers, values[vv])
            result[vv, a] = flatsum[a] / flatcount[a]
    elif statistic in {'std', np.std}:
        result.fill(np.nan)
        flatcount = _bincount(binnumbers, None)
        a = flatcount.nonzero()
        for vv in builtins.range(Vdim):
            flatsum = _bincount(binnumbers, values[vv])
            # Deviation of every value from the mean of its own bin.
            delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
            # delta*conj(delta) is the squared magnitude, so complex values
            # are handled as well.
            std = np.sqrt(
                _bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
            )
            result[vv, a] = std
        # Keep only the real part of the result.
        result = np.real(result)
    elif statistic == 'count':
        # Counts are always stored as float64, whatever the dtype of `values`.
        result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
        result.fill(0)
        flatcount = _bincount(binnumbers, None)
        # `flatcount` may be shorter than the number of bins; the remaining
        # bins keep the 0 fill.
        a = np.arange(len(flatcount))
        result[:, a] = flatcount[np.newaxis, :]
    elif statistic in {'sum', np.sum}:
        result.fill(0)
        for vv in builtins.range(Vdim):
            flatsum = _bincount(binnumbers, values[vv])
            a = np.arange(len(flatsum))
            result[vv, a] = flatsum
    elif statistic in {'median', np.median}:
        result.fill(np.nan)
        for vv in builtins.range(Vdim):
            # Sort by bin number first, then by value within each bin.
            i = np.lexsort((values[vv], binnumbers))
            # `j` is the start of each bin's run in the sorted order and
            # `counts` is the length of that run.
            _, j, counts = np.unique(binnumbers[i],
                                     return_index=True, return_counts=True)
            # Position of the middle of each run; a half-integer position
            # means the two neighbouring values are averaged below.
            mid = j + (counts - 1) / 2
            mid_a = values[vv, i][np.floor(mid).astype(int)]
            mid_b = values[vv, i][np.ceil(mid).astype(int)]
            medians = (mid_a + mid_b) / 2
            result[vv, binnumbers[i][j]] = medians
    elif statistic in {'min', np.min}:
        result.fill(np.nan)
        for vv in builtins.range(Vdim):
            # Relies on the last assignment winning when a bin number
            # repeats in `binnumbers[i]`.
            i = np.argsort(values[vv])[::-1]  # Reversed so the min is last
            result[vv, binnumbers[i]] = values[vv, i]
    elif statistic in {'max', np.max}:
        result.fill(np.nan)
        for vv in builtins.range(Vdim):
            # Ascending order, so the max of each bin is assigned last.
            i = np.argsort(values[vv])
            result[vv, binnumbers[i]] = values[vv, i]
    elif callable(statistic):
        # `null` is the fill value for empty bins: statistic([]), or NaN if
        # calling the statistic on an empty input raises.
        with np.errstate(invalid='ignore'), catch_warnings():
            simplefilter("ignore", RuntimeWarning)
            try:
                null = statistic([])
            except Exception:
                null = np.nan
        if np.iscomplexobj(null):
            result = result.astype(np.complex128)
        result.fill(null)
        try:
            _calc_binned_statistic(
                Vdim, binnumbers, result, values, statistic
            )
        except ValueError:
            # Retry with a complex result array.
            # NOTE(review): presumably the ValueError is raised by
            # `_calc_binned_statistic` when the statistic returns a complex
            # value for a real `result` - confirm against that helper.
            result = result.astype(np.complex128)
            _calc_binned_statistic(
                Vdim, binnumbers, result, values, statistic
            )

    # Shape into a proper matrix
    result = result.reshape(np.append(Vdim, nbin))

    # Remove outliers (indices 0 and -1 for each bin-dimension).
    core = tuple([slice(None)] + Ndim * [slice(1, -1)])
    result = result[core]

    # Unravel binnumbers into an ndarray, each row the bins for each dimension
    if expand_binnumbers and Ndim > 1:
        binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))

    # Sanity check: every dimension must have lost exactly its two outlier
    # bins.
    if np.any(result.shape[1:] != nbin - 2):
        raise RuntimeError('Internal Shape Error')

    # Reshape to have output (`result`) match input (`values`) shape
    result = result.reshape(input_shape[:-1] + list(nbin-2))

    return BinnedStatisticddResult(result, edges, binnumbers)
 | |
| 
 | |
| 
 | |
def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
    """Evaluate `stat_func` on every occupied bin and store it in `result`.

    Parameters
    ----------
    Vdim : int
        Number of rows of `values` (and of `result`) to process.
    bin_numbers : array_like of int
        Flattened bin index of each sample.
    result : ndarray
        Output array, indexed as ``result[row, bin]``; modified in place.
        Only bins that actually occur in `bin_numbers` are written.
    values : ndarray
        Data array, indexed as ``values[row, sample]``.
    stat_func : callable
        Called with a 1-D array holding the values that fall in one bin.

    Raises
    ------
    ValueError
        If `stat_func` returns a complex value while `result` is not a
        complex array.
    """
    occupied_bins = np.unique(bin_numbers)
    # `result` keeps its dtype for the whole call, so test it only once.
    result_is_real = not np.iscomplexobj(result)

    for row in builtins.range(Vdim):
        members_by_bin = _create_binned_data(bin_numbers, occupied_bins,
                                             values, row)
        for bin_id in occupied_bins:
            members = np.array(members_by_bin[bin_id])
            stat = stat_func(members)
            if result_is_real and np.iscomplexobj(stat):
                raise ValueError("The statistic function returns complex ")
            result[row, bin_id] = stat
 | |
| 
 | |
| 
 | |
| def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
 | |
|     """ Create hashmap of bin ids to values in bins
 | |
|     key: bin number
 | |
|     value: list of binned data
 | |
|     """
 | |
|     bin_map = dict()
 | |
|     for i in unique_bin_numbers:
 | |
|         bin_map[i] = []
 | |
|     for i in builtins.range(len(bin_numbers)):
 | |
|         bin_map[bin_numbers[i]].append(values[vv, i])
 | |
|     return bin_map
 | |
| 
 | |
| 
 | |
| def _bin_edges(sample, bins=None, range=None):
 | |
|     """ Create edge arrays
 | |
|     """
 | |
|     Dlen, Ndim = sample.shape
 | |
| 
 | |
|     nbin = np.empty(Ndim, int)    # Number of bins in each dimension
 | |
|     edges = Ndim * [None]         # Bin edges for each dim (will be 2D array)
 | |
|     dedges = Ndim * [None]        # Spacing between edges (will be 2D array)
 | |
| 
 | |
|     # Select range for each dimension
 | |
|     # Used only if number of bins is given.
 | |
|     if range is None:
 | |
|         smin = np.atleast_1d(np.array(sample.min(axis=0), float))
 | |
|         smax = np.atleast_1d(np.array(sample.max(axis=0), float))
 | |
|     else:
 | |
|         if len(range) != Ndim:
 | |
|             raise ValueError(
 | |
|                 f"range given for {len(range)} dimensions; {Ndim} required")
 | |
|         smin = np.empty(Ndim)
 | |
|         smax = np.empty(Ndim)
 | |
|         for i in builtins.range(Ndim):
 | |
|             if range[i][1] < range[i][0]:
 | |
|                 raise ValueError(
 | |
|                     f"In {f'dimension {i + 1} of ' if Ndim > 1 else ''}range,"
 | |
|                     " start must be <= stop")
 | |
|             smin[i], smax[i] = range[i]
 | |
| 
 | |
|     # Make sure the bins have a finite width.
 | |
|     for i in builtins.range(len(smin)):
 | |
|         if smin[i] == smax[i]:
 | |
|             smin[i] = smin[i] - .5
 | |
|             smax[i] = smax[i] + .5
 | |
| 
 | |
|     # Preserve sample floating point precision in bin edges
 | |
|     edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
 | |
|                    else float)
 | |
| 
 | |
|     # Create edge arrays
 | |
|     for i in builtins.range(Ndim):
 | |
|         if np.isscalar(bins[i]):
 | |
|             nbin[i] = bins[i] + 2  # +2 for outlier bins
 | |
|             edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
 | |
|                                    dtype=edges_dtype)
 | |
|         else:
 | |
|             edges[i] = np.asarray(bins[i], edges_dtype)
 | |
|             nbin[i] = len(edges[i]) + 1  # +1 for outlier bins
 | |
|         dedges[i] = np.diff(edges[i])
 | |
| 
 | |
|     nbin = np.asarray(nbin)
 | |
| 
 | |
|     return nbin, edges, dedges
 | |
| 
 | |
| 
 | |
| def _bin_numbers(sample, nbin, edges, dedges):
 | |
|     """Compute the bin number each sample falls into, in each dimension
 | |
|     """
 | |
|     Dlen, Ndim = sample.shape
 | |
| 
 | |
|     sampBin = [
 | |
|         np.digitize(sample[:, i], edges[i])
 | |
|         for i in range(Ndim)
 | |
|     ]
 | |
| 
 | |
|     # Using `digitize`, values that fall on an edge are put in the right bin.
 | |
|     # For the rightmost bin, we want values equal to the right
 | |
|     # edge to be counted in the last bin, and not as an outlier.
 | |
|     for i in range(Ndim):
 | |
|         # Find the rounding precision
 | |
|         dedges_min = dedges[i].min()
 | |
|         if dedges_min == 0:
 | |
|             raise ValueError('The smallest edge difference is numerically 0.')
 | |
|         decimal = int(-np.log10(dedges_min)) + 6
 | |
|         # Find which points are on the rightmost edge.
 | |
|         on_edge = np.where((sample[:, i] >= edges[i][-1]) &
 | |
|                            (np.around(sample[:, i], decimal) ==
 | |
|                             np.around(edges[i][-1], decimal)))[0]
 | |
|         # Shift these points one bin to the left.
 | |
|         sampBin[i][on_edge] -= 1
 | |
| 
 | |
|     # Compute the sample indices in the flattened statistic matrix.
 | |
|     binnumbers = np.ravel_multi_index(sampBin, nbin)
 | |
| 
 | |
|     return binnumbers
 |