11# cython: boundscheck=False, wraparound=False, cdivision=True
22
33from libc.math cimport (
4+ fabs,
45 round ,
56 signbit,
67 sqrt,
6061 float64_t MAXfloat64 = np.inf
6162
6263 float64_t NaN = < float64_t> np.nan
64+ float64_t EpsF64 = np.finfo(np.float64).eps
65+
66+ # Consider an operation ill-conditioned if
67+ # it will only have up to 3 significant digits in base 10 remaining.
68+ # https://en.wikipedia.org/wiki/Condition_number
69+ float64_t InvCondTol = EpsF64 * 1e3
6370
6471cdef bint is_monotonic_increasing_start_end_bounds(
6572 ndarray[int64_t, ndim= 1 ] start, ndarray[int64_t, ndim= 1 ] end
@@ -482,75 +489,74 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
482489
483490
484491cdef float64_t calc_skew(int64_t minp, int64_t nobs,
485- float64_t x , float64_t xx , float64_t xxx ,
492+ float64_t mean , float64_t m2 , float64_t m3 ,
486493 int64_t num_consecutive_same_value
487494 ) noexcept nogil:
488495 cdef:
489496 float64_t result, dnobs
490- float64_t A, B, C, R
497+ float64_t moments_ratio, correction
491498
492499 if nobs >= minp:
493500 dnobs = < float64_t> nobs
494- A = x / dnobs
495- B = xx / dnobs - A * A
496- C = xxx / dnobs - A * A * A - 3 * A * B
497501
498502 if nobs < 3 :
499503 result = NaN
500504 # GH 42064 46431
501505 # uniform case, force result to be 0
502506 elif num_consecutive_same_value >= nobs:
503507 result = 0.0
504- # #18044: with uniform distribution, floating issue will
505- # cause B != 0. and cause the result is a very
508+ # #18044: with degenerate distribution, floating issue will
509+ # cause m2 != 0. and cause the result is a very
506510 # large number.
507511 #
508512 # in core/nanops.py nanskew/nankurt call the function
509513 # _zero_out_fperr(m2) to fix floating error.
510514 # if the variance is less than 1e-14, it could be
511515 # treat as zero, here we follow the original
512- # skew/kurt behaviour to check B <= 1e-14
513- elif B <= 1e-14 :
516+ # skew/kurt behaviour to check m2 <= n * 1e-14
517+ elif m2 <= dnobs * 1e-14 :
514518 result = NaN
515519 else :
516- R = sqrt(B )
517- result = ((sqrt( dnobs * ( dnobs - 1. )) * C) /
518- ((dnobs - 2 ) * R * R * R))
520+ moments_ratio = m3 / (m2 * sqrt(m2) )
521+ correction = dnobs * sqrt(( dnobs - 1 )) / (dnobs - 2 )
522+ result = moments_ratio * correction
519523 else :
520524 result = NaN
521525
522526 return result
523527
524528
525529cdef void add_skew(float64_t val, int64_t * nobs,
526- float64_t * x, float64_t * xx,
527- float64_t * xxx,
528- float64_t * compensation_x,
529- float64_t * compensation_xx,
530- float64_t * compensation_xxx,
530+ float64_t * mean, float64_t * m2,
531+ float64_t * m3,
532+ bint * numerically_unstable,
531533 int64_t * num_consecutive_same_value,
532534 float64_t * prev_value,
533535 ) noexcept nogil:
534536 """ add a value from the skew calc """
535537 cdef:
536- float64_t y, t
538+ float64_t n, delta, delta_n, term1, m3_update, new_m3
539+
540+ # Formulas adapted from
541+ # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
537542
538543 # Not NaN
539544 if val == val:
540- nobs[0 ] = nobs[0 ] + 1
541-
542- y = val - compensation_x[0 ]
543- t = x[0 ] + y
544- compensation_x[0 ] = t - x[0 ] - y
545- x[0 ] = t
546- y = val * val - compensation_xx[0 ]
547- t = xx[0 ] + y
548- compensation_xx[0 ] = t - xx[0 ] - y
549- xx[0 ] = t
550- y = val * val * val - compensation_xxx[0 ]
551- t = xxx[0 ] + y
552- compensation_xxx[0 ] = t - xxx[0 ] - y
553- xxx[0 ] = t
545+ nobs[0 ] += 1
546+ n = < float64_t> (nobs[0 ])
547+ delta = val - mean[0 ]
548+ delta_n = delta / n
549+ term1 = delta * delta_n * (n - 1.0 )
550+
551+ m3_update = delta_n * (term1 * (n - 2.0 ) - 3.0 * m2[0 ])
552+ new_m3 = m3[0 ] + m3_update
553+ if (fabs(m3_update) + fabs(m3[0 ])) * InvCondTol > fabs(new_m3):
554+ # possible catastrophic cancellation
555+ numerically_unstable[0 ] = True
556+
557+ m3[0 ] = new_m3
558+ m2[0 ] += term1
559+ mean[0 ] += delta_n
554560
555561 # GH#42064, record num of same values to remove floating point artifacts
556562 if val == prev_value[0 ]:
@@ -562,116 +568,112 @@ cdef void add_skew(float64_t val, int64_t *nobs,
562568
563569
564570cdef void remove_skew(float64_t val, int64_t * nobs,
565- float64_t * x, float64_t * xx,
566- float64_t * xxx,
567- float64_t * compensation_x,
568- float64_t * compensation_xx,
569- float64_t * compensation_xxx) noexcept nogil:
571+ float64_t * mean, float64_t * m2,
572+ float64_t * m3,
573+ bint * numerically_unstable) noexcept nogil:
570574 """ remove a value from the skew calc """
571575 cdef:
572- float64_t y, t
576+ float64_t n, delta, delta_n, term1, m3_update, new_m3
577+
578+ # This is the online update for the central moments
579+ # when we remove an observation.
580+ #
581+ # δ = x - m_{n+1}
582+ # m_{n} = m_{n+1} - (δ / n)
583+ # m²_n = Σ_{i=1}^{n+1}(x_i - m_{n})² - (x - m_{n})² # uses new mean
584+ # = m²_{n+1} - (δ²/n)*(n+1)
585+ # m³_n = Σ_{i=1}^{n+1}(x_i - m_{n})³ - (x - m_{n})³ # uses new mean
586+ # = m³_{n+1} - (δ³/n²)*(n+1)*(n+2) + 3 * m²_{n+1}*(δ/n)
573587
574588 # Not NaN
575589 if val == val:
576- nobs[0 ] = nobs[0 ] - 1
590+ nobs[0 ] -= 1
591+ n = < float64_t> (nobs[0 ])
592+ delta = val - mean[0 ]
593+ delta_n = delta / n
594+ term1 = delta_n * delta * (n + 1.0 )
577595
578- y = - val - compensation_x[0 ]
579- t = x[0 ] + y
580- compensation_x[0 ] = t - x[0 ] - y
581- x[0 ] = t
582- y = - val * val - compensation_xx[0 ]
583- t = xx[0 ] + y
584- compensation_xx[0 ] = t - xx[0 ] - y
585- xx[0 ] = t
586- y = - val * val * val - compensation_xxx[0 ]
587- t = xxx[0 ] + y
588- compensation_xxx[0 ] = t - xxx[0 ] - y
589- xxx[0 ] = t
596+ m3_update = delta_n * (term1 * (n + 2.0 ) - 3.0 * m2[0 ])
597+ new_m3 = m3[0 ] - m3_update
598+
599+ if (fabs(m3_update) + fabs(m3[0 ])) * InvCondTol > fabs(new_m3):
600+ # possible catastrophic cancellation
601+ numerically_unstable[0 ] = True
602+
603+ m3[0 ] = new_m3
604+ m2[0 ] -= term1
605+ mean[0 ] -= delta_n
590606
591607
592- def roll_skew (ndarray[ float64_t] values , ndarray[int64_t] start ,
608+ def roll_skew (const float64_t[: ] values , ndarray[int64_t] start ,
593609 ndarray[int64_t] end , int64_t minp ) -> np.ndarray:
594610 cdef:
595611 Py_ssize_t i , j
596- float64_t val , min_val , mean_val , sum_val = 0
597- float64_t compensation_xxx_add , compensation_xxx_remove
598- float64_t compensation_xx_add , compensation_xx_remove
599- float64_t compensation_x_add , compensation_x_remove
600- float64_t x , xx , xxx
612+ float64_t val
613+ float64_t mean , m2 , m3
601614 float64_t prev_value
602- int64_t nobs = 0 , N = len (start), V = len (values), nobs_mean = 0
615+ int64_t nobs = 0 , N = len (start)
603616 int64_t s , e , num_consecutive_same_value
604- ndarray[float64_t] output , values_copy
617+ ndarray[float64_t] output
605618 bint is_monotonic_increasing_bounds
619+ bint requires_recompute , numerically_unstable = False
606620
607621 minp = max (minp, 3 )
608622 is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
609623 start, end
610624 )
611625 output = np.empty(N, dtype = np.float64)
612- min_val = np.nanmin(values)
613- values_copy = np.copy(values)
614626
615627 with nogil:
616- for i in range(0, V ):
617- val = values_copy[i]
618- if val == val:
619- nobs_mean += 1
620- sum_val += val
621- mean_val = sum_val / nobs_mean
622- # Other cases would lead to imprecision for smallest values
623- if min_val - mean_val > - 1e5 :
624- mean_val = round (mean_val)
625- for i in range (0 , V):
626- values_copy[i] = values_copy[i] - mean_val
627-
628628 for i in range(0, N ):
629629
630630 s = start[i]
631631 e = end[i]
632632
633633 # Over the first window, observations can only be added
634634 # never removed
635- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1 ]:
636-
637- prev_value = values[s]
638- num_consecutive_same_value = 0
639-
640- compensation_xxx_add = compensation_xxx_remove = 0
641- compensation_xx_add = compensation_xx_remove = 0
642- compensation_x_add = compensation_x_remove = 0
643- x = xx = xxx = 0
644- nobs = 0
645- for j in range (s, e):
646- val = values_copy[j]
647- add_skew(val, & nobs, & x, & xx, & xxx, & compensation_x_add,
648- & compensation_xx_add, & compensation_xxx_add,
649- & num_consecutive_same_value, & prev_value)
650-
651- else :
635+ requires_recompute = (
636+ i == 0
637+ or not is_monotonic_increasing_bounds
638+ or s >= end[i - 1 ]
639+ )
652640
641+ if not requires_recompute:
653642 # After the first window, observations can both be added
654643 # and removed
655644 # calculate deletes
656645 for j in range (start[i - 1 ], s):
657- val = values_copy[j]
658- remove_skew(val, & nobs, & x, & xx, & xxx, & compensation_x_remove,
659- & compensation_xx_remove, & compensation_xxx_remove)
646+ val = values[j]
647+ remove_skew(val, & nobs, & mean, & m2, & m3, & numerically_unstable)
660648
661649 # calculate adds
662650 for j in range (end[i - 1 ], e):
663- val = values_copy[j]
664- add_skew(val, & nobs, & x, & xx, & xxx, & compensation_x_add,
665- & compensation_xx_add, & compensation_xxx_add,
651+ val = values[j]
652+ add_skew(val, & nobs, & mean, & m2, & m3, & numerically_unstable,
666653 & num_consecutive_same_value, & prev_value)
667654
668- output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
655+ if requires_recompute or numerically_unstable:
656+
657+ prev_value = values[s]
658+ num_consecutive_same_value = 0
659+
660+ mean = m2 = m3 = 0.0
661+ nobs = 0
662+
663+ for j in range (s, e):
664+ val = values[j]
665+ add_skew(val, & nobs, & mean, & m2, & m3, & numerically_unstable,
666+ & num_consecutive_same_value, & prev_value)
667+
668+ numerically_unstable = False
669+
670+ output[i] = calc_skew(minp, nobs, mean, m2, m3, num_consecutive_same_value)
669671
670672 if not is_monotonic_increasing_bounds:
671673 nobs = 0
672- x = 0.0
673- xx = 0.0
674- xxx = 0.0
674+ mean = 0.0
675+ m2 = 0.0
676+ m3 = 0.0
675677
676678 return output
677679
0 commit comments