551 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			551 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import warnings
 | |
| import numpy as np
 | |
| 
 | |
| from scipy._lib._util import check_random_state, MapWrapper, rng_integers, _contains_nan
 | |
| from scipy._lib._bunch import _make_tuple_bunch
 | |
| from scipy.spatial.distance import cdist
 | |
| from scipy.ndimage import _measurements
 | |
| 
 | |
| from ._stats import _local_correlations  # type: ignore[import-not-found]
 | |
| from . import distributions
 | |
| 
 | |
| __all__ = ['multiscale_graphcorr']
 | |
| 
 | |
| # FROM MGCPY: https://github.com/neurodata/mgcpy
 | |
| 
 | |
| 
 | |
class _ParallelP:
    """Picklable callable computing one permuted MGC statistic per call.

    Instances bundle the data matrices and the per-replication random
    states so the object can be shipped to worker processes by a map-like
    executor; calling it with a replication index returns the MGC
    statistic for one random permutation of `y`.
    """

    def __init__(self, x, y, random_states):
        self.x = x
        self.y = y
        self.random_states = random_states

    def __call__(self, index):
        # Shuffle rows and columns of the y distance matrix with a single
        # shared ordering so it remains a valid distance matrix, then
        # recompute the statistic under that permutation.
        order = self.random_states[index].permutation(self.y.shape[0])
        shuffled_y = self.y[order][:, order]

        # Only the statistic (first element) feeds the null distribution.
        return _mgc_stat(self.x, shuffled_y)[0]
 | |
| 
 | |
| 
 | |
def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None):
    r"""Compute a permutation-test p-value for the MGC statistic.

    Parameters
    ----------
    x, y : ndarray
        `x` and `y` have shapes ``(n, p)`` and ``(n, q)``.
    stat : float
        The sample test statistic.
    reps : int, optional
        The number of replications used to estimate the null when using the
        permutation test. The default is 1000 replications.
    workers : int or map-like callable, optional
        If `workers` is an int the population is subdivided into `workers`
        sections and evaluated in parallel (uses
        `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
        available to the Process. Alternatively supply a map-like callable,
        such as `multiprocessing.Pool.map` for evaluating the population in
        parallel. This evaluation is carried out as `workers(func, iterable)`.
        Requires that `func` be pickleable.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        If `random_state` is None (or `np.random`), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance then that instance is used.

    Returns
    -------
    pvalue : float
        The sample test p-value.
    null_dist : list
        The approximated null distribution.

    """
    # Pre-draw one independent RandomState per replication so results are
    # reproducible no matter how `workers` schedules the calls (change to
    # new parallel random number capabilities in numpy >= 1.17+).
    rng = check_random_state(random_state)
    seeds = (rng_integers(rng, 1 << 32, size=4, dtype=np.uint32)
             for _ in range(reps))
    random_states = [np.random.RandomState(seed) for seed in seeds]

    # Evaluate the permuted statistics, possibly in parallel.
    permuted_stat = _ParallelP(x=x, y=y, random_states=random_states)
    with MapWrapper(workers) as pmap:
        null_dist = np.array(list(pmap(permuted_stat, range(reps))))

    # Permutation p-value with the +1 correction so it can never be 0.
    pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)

    return pvalue, null_dist
 | |
| 
 | |
| 
 | |
| def _euclidean_dist(x):
 | |
|     return cdist(x, x)
 | |
| 
 | |
| 
 | |
# Result type returned by `multiscale_graphcorr`: a namedtuple-like bunch
# with fields ``statistic``, ``pvalue`` and ``mgc_dict`` (no extra fields).
MGCResult = _make_tuple_bunch('MGCResult',
                              ['statistic', 'pvalue', 'mgc_dict'], [])
 | |
| 
 | |
| 
 | |
def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
                         workers=1, is_twosamp=False, random_state=None):
    r"""Computes the Multiscale Graph Correlation (MGC) test statistic.

    Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
    one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
    the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
    called the "scale". A priori, however, it is not known which scales will be
    most informative. So, MGC computes all distance pairs, and then efficiently
    computes the distance correlations for all scales. The local correlations
    illustrate which scales are relatively informative about the relationship.
    The key, therefore, to successfully discover and decipher relationships
    between disparate data modalities is to adaptively determine which scales
    are the most informative, and the geometric implication for the most
    informative scales. Doing so not only provides an estimate of whether the
    modalities are related, but also provides insight into how the
    determination was made. This is especially important in high-dimensional
    data, where simple visualizations do not reveal relationships to the
    unaided human eye. Characterizations of this implementation in particular
    have been derived from and benchmarked within [2]_.

    Parameters
    ----------
    x, y : ndarray
        If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
        the number of samples and `p` and `q` are the number of dimensions,
        then the MGC independence test will be run.  Alternatively, ``x`` and
        ``y`` can have shapes ``(n, n)`` if they are distance or similarity
        matrices, and ``compute_distance`` must be set to ``None``. If ``x``
        and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
        two-sample MGC test will be run.
    compute_distance : callable, optional
        A function that computes the distance or similarity among the samples
        within each data matrix. Set to ``None`` if ``x`` and ``y`` are
        already distance matrices. The default uses the euclidean norm metric.
        If you are calling a custom function, either create the distance
        matrix before-hand or create a function of the form
        ``compute_distance(x)`` where `x` is the data matrix for which
        pairwise distances are calculated.
    reps : int, optional
        The number of replications used to estimate the null when using the
        permutation test. The default is ``1000``.
    workers : int or map-like callable, optional
        If ``workers`` is an int the population is subdivided into ``workers``
        sections and evaluated in parallel (uses ``multiprocessing.Pool
        <multiprocessing>``). Supply ``-1`` to use all cores available to the
        Process. Alternatively supply a map-like callable, such as
        ``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
        This evaluation is carried out as ``workers(func, iterable)``.
        Requires that `func` be pickleable. The default is ``1``.
    is_twosamp : bool, optional
        If `True`, a two sample test will be run. If ``x`` and ``y`` have
        shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
        set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
        ``(n, p)`` and a two sample test is desired. The default is ``False``.
        Note that this will not run if inputs are distance matrices.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        If `random_state` is None (or `np.random`), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance then that instance is used.

    Returns
    -------
    res : MGCResult
        An object containing attributes:

        statistic : float
            The sample MGC test statistic within ``[-1, 1]``.
        pvalue : float
            The p-value obtained via permutation.
        mgc_dict : dict
            Contains additional useful results:

                - mgc_map : ndarray
                    A 2D representation of the latent geometry of the
                    relationship.
                - opt_scale : (int, int)
                    The estimated optimal scale as a ``(x, y)`` pair.
                - null_dist : list
                    The null distribution derived from the permuted matrices.

    See Also
    --------
    pearsonr : Pearson correlation coefficient and p-value for testing
               non-correlation.
    kendalltau : Calculates Kendall's tau.
    spearmanr : Calculates a Spearman rank-order correlation coefficient.

    Notes
    -----
    A description of the process of MGC and applications on neuroscience data
    can be found in [1]_. It is performed using the following steps:

    #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
       modified to be mean zero columnwise. This results in two
       :math:`n \times n` distance matrices :math:`A` and :math:`B` (the
       centering and unbiased modification) [3]_.

    #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,

       * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
         are calculated for each property. Here, :math:`G_k (i, j)` indicates
         the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
         and :math:`H_l (i, j)` indicates the :math:`l` smallest values of
         the :math:`i`-th row of :math:`B`

       * Let :math:`\circ` denote the entry-wise matrix product, then local
         correlations are summed and normalized using the following statistic:

    .. math::

        c^{kl} = \frac{\sum_{ij} A G_k B H_l}
                      {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}

    #. The MGC test statistic is the smoothed optimal local correlation of
       :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
       (which essentially sets all isolated large correlations to 0 and leaves
       connected large correlations the same as before, see [3]_). MGC is,

    .. math::

        MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
                                                    \right)

    The test statistic returns a value between :math:`(-1, 1)` since it is
    normalized.

    The p-value returned is calculated using a permutation test. This process
    is completed by first randomly permuting :math:`y` to estimate the null
    distribution and then calculating the probability of observing a test
    statistic, under the null, at least as extreme as the observed test
    statistic.

    MGC requires at least 5 samples to run with reliable results. It can also
    handle high-dimensional data sets.
    In addition, by manipulating the input data matrices, the two-sample
    testing problem can be reduced to the independence testing problem [4]_.
    Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n`
    :math:`p \times m`, data matrix :math:`X` and :math:`Y` can be created as
    follows:

    .. math::

        X = [U | V] \in \mathcal{R}^{p \times (n + m)}
        Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}

    Then, the MGC statistic can be calculated as normal. This methodology can
    be extended to similar tests such as distance correlation [4]_.

    .. versionadded:: 1.4.0

    References
    ----------
    .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
           Maggioni, M., & Shen, C. (2019). Discovering and deciphering
           relationships across disparate data modalities. ELife.
    .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
           Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
           mgcpy: A Comprehensive High Dimensional Independence Testing Python
           Package. :arXiv:`1907.02088`
    .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
           correlation to multiscale graph correlation. Journal of the American
           Statistical Association.
    .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
           Distance and Kernel Methods for Hypothesis Testing.
           :arXiv:`1806.05514`

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import multiscale_graphcorr
    >>> x = np.arange(100)
    >>> y = x
    >>> res = multiscale_graphcorr(x, y)
    >>> res.statistic, res.pvalue
    (1.0, 0.001)

    To run an unpaired two-sample test,

    >>> x = np.arange(100)
    >>> y = np.arange(79)
    >>> res = multiscale_graphcorr(x, y)
    >>> res.statistic, res.pvalue  # doctest: +SKIP
    (0.033258146255703246, 0.023)

    or, if shape of the inputs are the same,

    >>> x = np.arange(100)
    >>> y = x
    >>> res = multiscale_graphcorr(x, y, is_twosamp=True)
    >>> res.statistic, res.pvalue  # doctest: +SKIP
    (-0.008021809890200488, 1.0)

    """
    if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
        raise ValueError("x and y must be ndarrays")

    # convert arrays of type (n,) to (n, 1)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    elif x.ndim != 2:
        raise ValueError(f"Expected a 2-D array `x`, found shape {x.shape}")
    if y.ndim == 1:
        y = y[:, np.newaxis]
    elif y.ndim != 2:
        raise ValueError(f"Expected a 2-D array `y`, found shape {y.shape}")

    nx, px = x.shape
    ny, py = y.shape

    # check for NaNs
    _contains_nan(x, nan_policy='raise')
    _contains_nan(y, nan_policy='raise')

    # check for positive or negative infinity and raise error
    if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
        raise ValueError("Inputs contain infinities")

    if nx != ny:
        if px == py:
            # unequal sample counts with matching dimensionality imply the
            # unpaired two-sample test; reshape x and y for two sample testing
            is_twosamp = True
        else:
            raise ValueError("Shape mismatch, x and y must have shape [n, p] "
                             "and [n, q] or have shape [n, p] and [m, p].")

    if nx < 5 or ny < 5:
        raise ValueError("MGC requires at least 5 samples to give reasonable "
                         "results.")

    # convert x and y to float
    x = x.astype(np.float64)
    y = y.astype(np.float64)

    # compute_distance must be None (precomputed distances) or a callable
    if not callable(compute_distance) and compute_distance is not None:
        raise ValueError("Compute_distance must be a function.")

    # reps must be a non-negative integer; fewer than 1000 replications makes
    # the permutation p-value resolution coarse, so warn in that case.
    # NOTE(review): the check permits reps == 0 even though the message says
    # "greater than 0" — confirm intended lower bound.
    if not isinstance(reps, int) or reps < 0:
        raise ValueError("Number of reps must be an integer greater than 0.")
    elif reps < 1000:
        msg = ("The number of replications is low (under 1000), and p-value "
               "calculations may be unreliable. Use the p-value result, with "
               "caution!")
        warnings.warn(msg, RuntimeWarning, stacklevel=2)

    if is_twosamp:
        if compute_distance is None:
            raise ValueError("Cannot run if inputs are distance matrices")
        x, y = _two_sample_transform(x, y)

    if compute_distance is not None:
        # compute distance matrices for x and y
        x = compute_distance(x)
        y = compute_distance(y)

    # calculate MGC stat
    stat, stat_dict = _mgc_stat(x, y)
    stat_mgc_map = stat_dict["stat_mgc_map"]
    opt_scale = stat_dict["opt_scale"]

    # calculate permutation MGC p-value
    pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers,
                                   random_state=random_state)

    # save all stats (other than stat/p-value) in dictionary
    mgc_dict = {"mgc_map": stat_mgc_map,
                "opt_scale": opt_scale,
                "null_dist": null_dist}

    # create result object with alias for backward compatibility
    res = MGCResult(stat, pvalue, mgc_dict)
    res.stat = stat
    return res
 | |
| 
 | |
| 
 | |
def _mgc_stat(distx, disty):
    r"""Helper function that calculates the MGC stat. See above for use.

    Parameters
    ----------
    distx, disty : ndarray
        `distx` and `disty` have shapes ``(n, p)`` and ``(n, q)`` or
        ``(n, n)`` and ``(n, n)``
        if distance matrices.

    Returns
    -------
    stat : float
        The sample MGC test statistic within ``[-1, 1]``.
    stat_dict : dict
        Contains additional useful results with the following keys:

            - stat_mgc_map : ndarray
                MGC-map of the statistics.
            - opt_scale : (float, float)
                The estimated optimal scale as a ``(x, y)`` pair.

    """
    # calculate MGC map and optimal scale
    stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')

    n, m = stat_mgc_map.shape
    if m == 1 or n == 1:
        # the global scale at is the statistic calculated at maximal nearest
        # neighbors. There is not enough local scale to search over, so
        # default to global scale
        # NOTE(review): the shape unpacks as (n, m) but the lookup below is
        # ``[m - 1][n - 1]``; this branch only runs when one dimension is 1,
        # so confirm the index ordering against `_local_correlations` output.
        stat = stat_mgc_map[m - 1][n - 1]
        opt_scale = m * n
    else:
        # sample size minus one, forwarded to the thresholding step's
        # beta-approximation cutoff
        samp_size = len(distx) - 1

        # threshold to find connected region of significant local correlations
        sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)

        # maximum within the significant region
        stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)

    stat_dict = {"stat_mgc_map": stat_mgc_map,
                 "opt_scale": opt_scale}

    return stat, stat_dict
 | |
| 
 | |
| 
 | |
| def _threshold_mgc_map(stat_mgc_map, samp_size):
 | |
|     r"""
 | |
|     Finds a connected region of significance in the MGC-map by thresholding.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     stat_mgc_map : ndarray
 | |
|         All local correlations within ``[-1,1]``.
 | |
|     samp_size : int
 | |
|         The sample size of original data.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     sig_connect : ndarray
 | |
|         A binary matrix with 1's indicating the significant region.
 | |
| 
 | |
|     """
 | |
|     m, n = stat_mgc_map.shape
 | |
| 
 | |
|     # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
 | |
|     # with varying levels of performance. Threshold is based on a beta
 | |
|     # approximation.
 | |
|     per_sig = 1 - (0.02 / samp_size)  # Percentile to consider as significant
 | |
|     threshold = samp_size * (samp_size - 3)/4 - 1/2  # Beta approximation
 | |
|     threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
 | |
| 
 | |
|     # the global scale at is the statistic calculated at maximal nearest
 | |
|     # neighbors. Threshold is the maximum on the global and local scales
 | |
|     threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
 | |
| 
 | |
|     # find the largest connected component of significant correlations
 | |
|     sig_connect = stat_mgc_map > threshold
 | |
|     if np.sum(sig_connect) > 0:
 | |
|         sig_connect, _ = _measurements.label(sig_connect)
 | |
|         _, label_counts = np.unique(sig_connect, return_counts=True)
 | |
| 
 | |
|         # skip the first element in label_counts, as it is count(zeros)
 | |
|         max_label = np.argmax(label_counts[1:]) + 1
 | |
|         sig_connect = sig_connect == max_label
 | |
|     else:
 | |
|         sig_connect = np.array([[False]])
 | |
| 
 | |
|     return sig_connect
 | |
| 
 | |
| 
 | |
| def _smooth_mgc_map(sig_connect, stat_mgc_map):
 | |
|     """Finds the smoothed maximal within the significant region R.
 | |
| 
 | |
|     If area of R is too small it returns the last local correlation. Otherwise,
 | |
|     returns the maximum within significant_connected_region.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     sig_connect : ndarray
 | |
|         A binary matrix with 1's indicating the significant region.
 | |
|     stat_mgc_map : ndarray
 | |
|         All local correlations within ``[-1, 1]``.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     stat : float
 | |
|         The sample MGC statistic within ``[-1, 1]``.
 | |
|     opt_scale: (float, float)
 | |
|         The estimated optimal scale as an ``(x, y)`` pair.
 | |
| 
 | |
|     """
 | |
|     m, n = stat_mgc_map.shape
 | |
| 
 | |
|     # the global scale at is the statistic calculated at maximal nearest
 | |
|     # neighbors. By default, statistic and optimal scale are global.
 | |
|     stat = stat_mgc_map[m - 1][n - 1]
 | |
|     opt_scale = [m, n]
 | |
| 
 | |
|     if np.linalg.norm(sig_connect) != 0:
 | |
|         # proceed only when the connected region's area is sufficiently large
 | |
|         # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
 | |
|         # with varying levels of performance
 | |
|         if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
 | |
|             max_corr = max(stat_mgc_map[sig_connect])
 | |
| 
 | |
|             # find all scales within significant_connected_region that maximize
 | |
|             # the local correlation
 | |
|             max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
 | |
| 
 | |
|             if max_corr >= stat:
 | |
|                 stat = max_corr
 | |
| 
 | |
|                 k, l = max_corr_index
 | |
|                 one_d_indices = k * n + l  # 2D to 1D indexing
 | |
|                 k = np.max(one_d_indices) // n
 | |
|                 l = np.max(one_d_indices) % n
 | |
|                 opt_scale = [k+1, l+1]  # adding 1s to match R indexing
 | |
| 
 | |
|     return stat, opt_scale
 | |
| 
 | |
| 
 | |
| def _two_sample_transform(u, v):
 | |
|     """Helper function that concatenates x and y for two sample MGC stat.
 | |
| 
 | |
|     See above for use.
 | |
| 
 | |
|     Parameters
 | |
|     ----------
 | |
|     u, v : ndarray
 | |
|         `u` and `v` have shapes ``(n, p)`` and ``(m, p)``.
 | |
| 
 | |
|     Returns
 | |
|     -------
 | |
|     x : ndarray
 | |
|         Concatenate `u` and `v` along the ``axis = 0``. `x` thus has shape
 | |
|         ``(2n, p)``.
 | |
|     y : ndarray
 | |
|         Label matrix for `x` where 0 refers to samples that comes from `u` and
 | |
|         1 refers to samples that come from `v`. `y` thus has shape ``(2n, 1)``.
 | |
| 
 | |
|     """
 | |
|     nx = u.shape[0]
 | |
|     ny = v.shape[0]
 | |
|     x = np.concatenate([u, v], axis=0)
 | |
|     y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
 | |
|     return x, y
 |