# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
#    D_n = sup_x{|F_n(x) - F(x)|},
#    F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
#    F(x) is the CDF of a probability distribution.
#
# Exact methods:
# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
#   or a recursion algorithm due to Pomeranz[2].
# Marsaglia, Tsang & Wang[3] gave a computationally efficient way to perform
#   the Durbin algorithm.
#   D_n >= d <==>  D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
#   Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
#   For d > 0.5, the latter intersection probability is 0.
#
# Approximate methods:
# For d close to 0.5, ignoring that intersection term may still give a
#   reasonable approximation.
# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
#   Kolmogorov's initial asymptotic, suitable for large d. (See
#   scipy.special.kolmogorov for that asymptotic.)
# Pelz-Good[6] used the functional equation for Jacobi theta functions to
#   transform the Li-Chien/Korolyuk formula into a computational form
#   suitable for small d.
#
# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
#   the above approaches, and that selection logic is used here.
#
# Other approaches:
# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
# Moscovich and Nadler[9] use FFTs to compute the convolutions.
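
# For orientation (an illustrative aside, not part of the algorithm): for
# d > 0.5 the intersection term above vanishes, so
# Prob(D_n >= d) = 2*Prob(D_n+ >= d), which scipy.special.smirnov computes
# directly, e.g.
#
#   >>> import scipy.special
#   >>> 2 * scipy.special.smirnov(10, 0.6)  # Prob(D_10 >= 0.6), exact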

# References:
# [1] Durbin J (1968).
#     "The Probability that the Sample Distribution Function Lies Between Two
#     Parallel Straight Lines."
#     Annals of Mathematical Statistics, 39, 398-411.
# [2] Pomeranz J (1974).
#     "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
#     Small Samples (Algorithm 487)."
#     Communications of the ACM, 17(12), 703-704.
# [3] Marsaglia G, Tsang WW, Wang J (2003).
#     "Evaluating Kolmogorov's Distribution."
#     Journal of Statistical Software, 8(18), 1-4.
# [4] Li-Chien C (1956).
#     "On the exact distribution of the statistics of A. N. Kolmogorov and
#     their asymptotic expansion."
#     Acta Mathematica Sinica, 6, 55-81.
# [5] Korolyuk VS (1960).
#     "Asymptotic analysis of the distribution of the maximum deviation in
#     the Bernoulli scheme."
#     Theory of Probability and Its Applications, 4, 339-366.
# [6] Pelz W, Good IJ (1976).
#     "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
#     Statistic."
#     Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
# [7] Simard R, L'Ecuyer P (2011).
#     "Computing the Two-Sided Kolmogorov-Smirnov Distribution."
#     Journal of Statistical Software, 39(11), 1-18.
# [8] Carvalho L (2015).
#     "An Improved Evaluation of Kolmogorov's Distribution."
#     Journal of Statistical Software, Code Snippets, 65(3), 1-8.
# [9] Moscovich A, Nadler B (2017).
#     "Fast calculation of boundary crossing probabilities for Poisson
#     processes."
#     Statistics & Probability Letters, 123, 177-182.


import numpy as np
import scipy.special
import scipy.special._ufuncs as scu
import scipy.optimize  # used by _kolmogni's root-finding step below
from scipy.stats._finite_differences import _derivative

_E128 = 128
_EP128 = np.ldexp(np.longdouble(1), _E128)
_EM128 = np.ldexp(np.longdouble(1), -_E128)

_SQRT2PI = np.sqrt(2 * np.pi)
_LOG_2PI = np.log(2 * np.pi)
_MIN_LOG = -708
_SQRT3 = np.sqrt(3)
_PI_SQUARED = np.pi ** 2
_PI_FOUR = np.pi ** 4
_PI_SIX = np.pi ** 6

# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
# then the Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...,1.
_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
                    -1.9175269175269175269e-3, 8.4175084175084175084e-4,
                    -5.952380952380952381e-4, 7.9365079365079365079e-4,
                    -2.7777777777777777778e-3, 8.3333333333333333333e-2]
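# (Check on the convention: the last entry is the j=1 term,
#  B_2/(2*1) = (1/6)/2 = 1/12 = 8.3333...e-2.)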


def _log_nfactorial_div_n_pow_n(n):
    # Computes log(n! / n**n)
    #    = log((n-1)! / n**(n-1))
    # Uses Stirling's approximation, but removes n*log(n) up-front to
    # avoid subtractive cancellation.
    #    = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
    rn = 1.0/n
    return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
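

# Sanity sketch (illustrative): for moderate n the Stirling-based value can
# be checked against direct evaluation in the log domain, e.g.
#
#   >>> n = 20
#   >>> np.log(np.prod(np.arange(1.0, n + 1) / n))  # direct log(n!/n^n)
#   >>> _log_nfactorial_div_n_pow_n(n)              # expected to agree closely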


def _clip_prob(p):
    """Clips a probability to the range 0<=p<=1."""
    return np.clip(p, 0.0, 1.0)


def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
    """Selects either the CDF or SF, then clips to the range 0<=p<=1."""
    p = np.where(cdf, cdfprob, sfprob)
    return _clip_prob(p)


def _kolmogn_DMTW(n, d, cdf=True):
    r"""Computes the Kolmogorov CDF:  Pr(D_n <= d) using the MTW approach to
    the Durbin matrix algorithm.

    Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
    """
    # Write d = (k-h)/n, where k is a positive integer and 0 <= h < 1.
    # Generate the initial matrix H of size m*m where m = 2k-1.
    # Compute the k-th row of (n!/n^n) * H^n, scaling intermediate results.
    # Requires memory O(m^2) and computation O(m^2 log(n)).
    # Most suitable for small m.
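    # Worked instance (illustrative): n=10, d=0.274 gives n*d = 2.74, so
    # k = ceil(2.74) = 3, h = 3 - 2.74 = 0.26, and m = 2*3 - 1 = 5.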

    if d >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf)
    nd = n * d
    if nd <= 0.5:
        return _select_and_clip_prob(0.0, 1.0, cdf)
    k = int(np.ceil(nd))
    h = k - nd
    m = 2 * k - 1

    H = np.zeros([m, m])

    # Initialize: v is the first column (and last row) of H
    #  v[j] = (1 - h^(j+1))/(j+1)!  (except for v[-1])
    #  w[j] = 1/j!
    # q = k-th row of H (actually i!/n^i * H^i)
    intm = np.arange(1, m + 1)
    v = 1.0 - h ** intm
    w = np.empty(m)
    fac = 1.0
    for j in intm:
        w[j - 1] = fac
        fac /= j  # This might underflow.  Isn't a problem.
        v[j - 1] *= fac
    tt = max(2 * h - 1.0, 0)**m - 2*h**m
    v[-1] = (1.0 + tt) * fac

    for i in range(1, m):
        H[i - 1:, i] = w[:m - i + 1]
    H[:, 0] = v
    H[-1, :] = np.flip(v, axis=0)

    Hpwr = np.eye(np.shape(H)[0])  # Holds intermediate powers of H
    nn = n
    expnt = 0  # Scaling of Hpwr
    Hexpnt = 0  # Scaling of H
    while nn > 0:
        if nn % 2:
            Hpwr = np.matmul(Hpwr, H)
            expnt += Hexpnt
        H = np.matmul(H, H)
        Hexpnt *= 2
        # Scale as needed.
        if np.abs(H[k - 1, k - 1]) > _EP128:
            H /= _EP128
            Hexpnt += _E128
        nn = nn // 2

    p = Hpwr[k - 1, k - 1]

    # Multiply by n!/n^n
    for i in range(1, n + 1):
        p = i * p / n
        if np.abs(p) < _EM128:
            p *= _EP128
            expnt -= _E128

    # unscale
    if expnt != 0:
        p = np.ldexp(p, expnt)

    return _select_and_clip_prob(p, 1.0-p, cdf)
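

# Usage sketch (illustrative): for x >= 0.5 the exact identity
# Prob(D_n <= x) = 1 - 2*Prob(D_n+ >= x) provides an independent check of
# the matrix computation:
#
#   >>> _kolmogn_DMTW(10, 0.55)
#   >>> 1 - 2 * scipy.special.smirnov(10, 0.55)  # expected to agree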


def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
    """Compute the endpoints of the interval for row i."""
    if i == 0:
        j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
    else:
        # i + 1 = 2*ip1div2 + ip1mod2
        ip1div2, ip1mod2 = divmod(i + 1, 2)
        if ip1mod2 == 0:  # i is odd
            if ip1div2 == n + 1:
                j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
            else:
                j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
        else:
            j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1

    return max(j1 + 2, 0), min(j2, n)


def _kolmogn_Pomeranz(n, x, cdf=True):
    r"""Computes Pr(D_n <= x) using the Pomeranz recursion algorithm.

    Pomeranz (1974) [2]
    """

    # V is an n*(2n+2) matrix.
    # Each row is the convolution of the previous row and probabilities from a
    #  Poisson distribution.
    # The desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
    # Only two rows are needed at any given stage:
    #  - Call them V0 and V1.
    #  - Swap each iteration.
    # Only a few (contiguous) entries in each row can be non-zero.
    #  - Keep track of start and end (j1 and j2 below).
    #  - V0s and V1s track the start in the two rows.
    # Scale intermediate results as needed.
    # Only a few different Poisson distributions can occur.
    t = n * x
    ll = int(np.floor(t))
    f = 1.0 * (t - ll)  # fractional part of t
    g = min(f, 1.0 - f)
    ceilf = (1 if f > 0 else 0)
    roundf = (1 if f > 0.5 else 0)
    npwrs = 2 * (ll + 1)    # Maximum number of powers needed in convolutions
    gpower = np.empty(npwrs)  # gpower = (g/n)^m/m!
    twogpower = np.empty(npwrs)  # twogpower = (2g/n)^m/m!
    onem2gpower = np.empty(npwrs)  # onem2gpower = ((1-2g)/n)^m/m!
    # gpower etc. are *almost* Poisson probs, just missing the normalizing factor.

    gpower[0] = 1.0
    twogpower[0] = 1.0
    onem2gpower[0] = 1.0
    expnt = 0
    g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
    for m in range(1, npwrs):
        gpower[m] = gpower[m - 1] * g_over_n / m
        twogpower[m] = twogpower[m - 1] * two_g_over_n / m
        onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m

    V0 = np.zeros([npwrs])
    V1 = np.zeros([npwrs])
    V1[0] = 1  # first row
    V0s, V1s = 0, 0  # start indices of the two rows

    j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
    for i in range(1, 2 * n + 2):
        # Preserve j1, V1, V1s, V0s from the last iteration.
        k1 = j1
        V0, V1 = V1, V0
        V0s, V1s = V1s, V0s
        V1.fill(0.0)
        j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
        if i == 1 or i == 2 * n + 1:
            pwrs = gpower
        else:
            pwrs = (twogpower if i % 2 else onem2gpower)
        ln2 = j2 - k1 + 1
        if ln2 > 0:
            conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
            conv_start = j1 - k1  # First index to use from conv
            conv_len = j2 - j1 + 1  # Number of entries to use from conv
            V1[:conv_len] = conv[conv_start:conv_start + conv_len]
            # Scale to avoid underflow.
            if 0 < np.max(V1) < _EM128:
                V1 *= _EP128
                expnt -= _E128
            V1s = V0s + j1 - k1

    # Multiply by n!
    ans = V1[n - V1s]
    for m in range(1, n + 1):
        if np.abs(ans) > _EP128:
            ans *= _EM128
            expnt += _E128
        ans *= m

    # Undo any intermediate scaling.
    if expnt != 0:
        ans = np.ldexp(ans, expnt)
    ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
    return ans
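

# Consistency sketch (illustrative): the Pomeranz recursion and the Durbin
# matrix approach both compute the exact CDF, so mid-range inputs can be
# cross-checked against each other:
#
#   >>> _kolmogn_Pomeranz(20, 0.2)  # expected to match _kolmogn_DMTW(20, 0.2)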


def _kolmogn_PelzGood(n, x, cdf=True):
    """Computes the Pelz-Good approximation to Prob(Dn <= x), with 0<=x<=1.

    Start with the Li-Chien, Korolyuk approximation:
        Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
    where z = x*sqrt(n).
    Transform each K_i(z) using Jacobi theta functions into a form suitable
    for small z.
    Pelz-Good (1976). [6]
    """
    if x <= 0.0:
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
    if x >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf=cdf)

    z = np.sqrt(n) * x
    zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6

    qlog = -_PI_SQUARED / 8 / zsquared
    if qlog < _MIN_LOG:  # z ~ 0.041743441416853426
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)

    q = np.exp(qlog)

    # Coefficients of the terms in the sums for K1, K2 and K3
    k1a = -zsquared
    k1b = _PI_SQUARED / 4

    k2a = 6 * zsix + 2 * zfour
    k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
    k2c = _PI_FOUR * (1 - 2 * zsquared) / 16

    k3d = _PI_SIX * (5 - 30 * zsquared) / 64
    k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
    k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
    k3a = -30 * zsix - 90 * z**8

    K0to3 = np.zeros(4)
    # Use a Horner scheme to evaluate sum c_i q^(i^2),
    # which reduces to a sum over odd integers.
    maxk = int(np.ceil(16 * z / np.pi))
    for k in range(maxk, 0, -1):
        m = 2 * k - 1
        msquared, mfour, msix = m**2, m**4, m**6
        qpower = np.power(q, 8 * k)
        coeffs = np.array([1.0,
                           k1a + k1b*msquared,
                           k2a + k2b*msquared + k2c*mfour,
                           k3a + k3b*msquared + k3c*mfour + k3d*msix])
        K0to3 *= qpower
        K0to3 += coeffs
    K0to3 *= q
    K0to3 *= _SQRT2PI
    # z**10 > 0 as z > 0.04
    K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])

    # Now do the other sum over the other terms, all integers k:
    # K_2:  (pi^2 k^2) q^(k^2),
    # K_3:  (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
    # Don't expect much subtractive cancellation, so use direct calculation.
    q = np.exp(-_PI_SQUARED / 2 / zsquared)
    ks = np.arange(maxk, 0, -1)
    ksquared = ks ** 2
    sqrt3z = _SQRT3 * z
    kspi = np.pi * ks
    qpwers = q ** ksquared
    k2extra = np.sum(ksquared * qpwers)
    k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
    K0to3[2] += k2extra
    k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
    k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
    K0to3[3] += k3extra
    powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
    K0to3 /= powers_of_n

    if not cdf:
        K0to3 *= -1
        K0to3[0] += 1

    Ksum = sum(K0to3)
    return Ksum
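

# Accuracy sketch (illustrative): Pelz-Good is an asymptotic approximation,
# so for moderate n it can be compared against the exact routines above:
#
#   >>> _kolmogn_PelzGood(100, 0.1)   # approximation
#   >>> _kolmogn_Pomeranz(100, 0.1)   # exact, for comparison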


def _kolmogn(n, x, cdf=True):
    """Computes the CDF (or SF) for the two-sided Kolmogorov-Smirnov statistic.

    x must be of type float, n of type integer.

    Simard & L'Ecuyer (2011) [7].
    """
    if np.isnan(n):
        return n  # Keep the same type of nan
    if int(n) != n or n <= 0:
        return np.nan
    if x >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
    if x <= 0.0:
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
    t = n * x
    if t <= 1.0:  # Ruben-Gambino: 1/2n <= x <= 1/n
        if t <= 0.5:
            return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
        if n <= 140:
            prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
        else:
            prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
        return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
    if t >= n - 1:  # Ruben-Gambino
        prob = 2 * (1.0 - x)**n
        return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
    if x >= 0.5:  # Exact: 2 * smirnov
        prob = 2 * scipy.special.smirnov(n, x)
        return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)

    nxsquared = t * x
    if n <= 140:
        if nxsquared <= 0.754693:
            prob = _kolmogn_DMTW(n, x, cdf=True)
            return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
        if nxsquared <= 4:
            prob = _kolmogn_Pomeranz(n, x, cdf=True)
            return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
        # Now use the Miller approximation of 2 * smirnov.
        prob = 2 * scipy.special.smirnov(n, x)
        return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)

    # Split CDF and SF as they have different cutoffs on nxsquared.
    if not cdf:
        if nxsquared >= 370.0:
            return 0.0
        if nxsquared >= 2.2:
            prob = 2 * scipy.special.smirnov(n, x)
            return _clip_prob(prob)
        # Fall through and compute the SF as 1.0-CDF
    if nxsquared >= 18.0:
        cdfprob = 1.0
    elif n <= 100000 and n * x**1.5 <= 1.4:
        cdfprob = _kolmogn_DMTW(n, x, cdf=True)
    else:
        cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
    return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
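

# Dispatch sketch (illustrative): the same call transparently selects a
# method based on (n, x):
#
#   >>> _kolmogn(100, 0.008)  # t <= 1: Ruben-Gambino closed form
#   >>> _kolmogn(100, 0.06)   # small n*x**2: Durbin/MTW matrix algorithm
#   >>> _kolmogn(100, 0.15)   # moderate n*x**2: Pomeranz recursion
#   >>> _kolmogn(100, 0.6)    # x >= 0.5: exact via 2 * smirnov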


def _kolmogn_p(n, x):
    """Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.

    x must be of type float, n of type integer.
    """
    if np.isnan(n):
        return n  # Keep the same type of nan
    if int(n) != n or n <= 0:
        return np.nan
    if x >= 1.0 or x <= 0:
        return 0
    t = n * x
    if t <= 1.0:
        # Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
        if t <= 0.5:
            return 0.0
        if n <= 140:
            prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
        else:
            prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
        return prd * 2 * n**2
    if t >= n - 1:
        # Ruben-Gambino: 1-2(1-x)**n -> 2n*(1-x)**(n-1)
        return 2 * (1.0 - x) ** (n-1) * n
    if x >= 0.5:
        return 2 * scipy.stats.ksone.pdf(x, n)

    # Just take a small delta.
    # Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer i,
    # as the CDF is piecewise a degree-n polynomial.
    # It has knots at 1/n, 2/n, ..., (n-1)/n
    # and is not a C-infinity function at the knots.
    delta = x / 2.0**16
    delta = min(delta, x - 1.0/n)
    delta = min(delta, 0.5 - x)

    def _kk(_x):
        return kolmogn(n, _x)

    return _derivative(_kk, x, dx=delta, order=5)
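

# Derivative sketch (illustrative): away from the closed-form regions, the
# PDF is a centered finite difference of the CDF, so the two should be
# consistent:
#
#   >>> _kolmogn_p(10, 0.3)  # ~ d/dx kolmogn(10, x) at x = 0.3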


def _kolmogni(n, p, q):
    """Computes the PPF/ISF of kolmogn.

    n of type integer, n >= 1
    p is the CDF, q the SF, p+q=1
    """
    if np.isnan(n):
        return n  # Keep the same type of nan
    if int(n) != n or n <= 0:
        return np.nan
    if p <= 0:
        return 1.0/n
    if q <= 0:
        return 1.0
    delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
    if delta <= 1.0/n:
        return (delta + 1.0 / n) / 2
    x = -np.expm1(np.log(q/2.0)/n)
    if x >= 1 - 1.0/n:
        return x
    x1 = scu._kolmogci(p)/np.sqrt(n)
    x1 = min(x1, 1.0 - 1.0/n)

    def _f(x):
        return _kolmogn(n, x) - p

    return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
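

# Inversion sketch (illustrative): _kolmogni and _kolmogn are inverses, so a
# round trip should recover the input:
#
#   >>> p = _kolmogn(10, 0.3)
#   >>> _kolmogni(10, p, 1 - p)  # expected ~0.3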


def kolmogn(n, x, cdf=True):
    """Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.

    The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
    for a sample of size n drawn from a distribution with CDF F(t), where
    :math:`D_n = sup_t |F_n(t) - F(t)|`, and
    :math:`F_n(t)` is the Empirical Cumulative Distribution Function of the sample.

    Parameters
    ----------
    n : integer, array_like
        the number of samples
    x : float, array_like
        The K-S statistic, float between 0 and 1
    cdf : bool, optional
        whether to compute the CDF (default=True) or the SF.

    Returns
    -------
    cdf : ndarray
        CDF (or SF if cdf is False) at the specified locations.

    The return value has the shape obtained by broadcasting n and x.
    """
    it = np.nditer([n, x, cdf, None], flags=['zerosize_ok'],
                   op_dtypes=[None, np.float64, np.bool_, np.float64])
    for _n, _x, _cdf, z in it:
        if np.isnan(_n):
            z[...] = _n
            continue
        if int(_n) != _n:
            raise ValueError(f'n is not integral: {_n}')
        z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
    result = it.operands[-1]
    return result
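

# Usage sketch (illustrative): n and x broadcast, so several statistics can
# be evaluated at once:
#
#   >>> kolmogn(100, [0.05, 0.1, 0.2])  # CDF at three points
#   >>> kolmogn(100, 0.1, cdf=False)    # survival function instead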


def kolmognp(n, x):
    """Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.

    Parameters
    ----------
    n : integer, array_like
        the number of samples
    x : float, array_like
        The K-S statistic, float between 0 and 1

    Returns
    -------
    pdf : ndarray
        The PDF at the specified locations

    The return value has the shape obtained by broadcasting n and x.
    """
    it = np.nditer([n, x, None])
    for _n, _x, z in it:
        if np.isnan(_n):
            z[...] = _n
            continue
        if int(_n) != _n:
            raise ValueError(f'n is not integral: {_n}')
        z[...] = _kolmogn_p(int(_n), _x)
    result = it.operands[-1]
    return result


def kolmogni(n, q, cdf=True):
    """Computes the PPF (or ISF) for the two-sided Kolmogorov-Smirnov distribution.

    Parameters
    ----------
    n : integer, array_like
        the number of samples
    q : float, array_like
        Probabilities, float between 0 and 1
    cdf : bool, optional
        whether to compute the PPF (default=True) or the ISF.

    Returns
    -------
    ppf : ndarray
        PPF (or ISF if cdf is False) at the specified locations

    The return value has the shape obtained by broadcasting n and q.
    """
    it = np.nditer([n, q, cdf, None])
    for _n, _q, _cdf, z in it:
        if np.isnan(_n):
            z[...] = _n
            continue
        if int(_n) != _n:
            raise ValueError(f'n is not integral: {_n}')
        _pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
        z[...] = _kolmogni(int(_n), _pcdf, _psf)
    result = it.operands[-1]
    return result
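

# Round-trip sketch (illustrative): kolmogni inverts kolmogn, e.g.
#
#   >>> x = kolmogni(100, 0.95)  # 95th percentile of D_100
#   >>> kolmogn(100, x)          # expected ~0.95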