1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
def hampel(vals_orig, k=7, t0=3):
    """Replace outliers in a series with the rolling median (Hampel filter).

    A value is treated as an outlier when its absolute distance from the
    centered rolling median exceeds ``t0 * 1.4826 * MAD``, where MAD is the
    median absolute deviation of the same window.

    Parameters
    ----------
    vals_orig : pandas.Series
        Values to filter. The input is not modified; a copy is returned.
    k : int, default 7
        Window size including the sample itself (7 means 3 values on either
        side of the sample).
    t0 : float, default 3
        Multiplier applied to the scaled MAD to obtain the outlier threshold.

    Returns
    -------
    pandas.Series
        Copy of ``vals_orig`` in which every detected outlier has been
        replaced by the rolling median at that position.

    Notes
    -----
    * Positions whose window is not completely filled with valid values
      (for example near both ends of the series) get a NaN median and NaN
      threshold, so they are never flagged and are returned unchanged.
    * When the MAD of a window is 0.0 the threshold is 0.0 as well, so any
      value that differs from the window median is replaced. The MAD can be
      0.0 even when the values in the window are not all equal.
    """

    def _median_abs_deviation(window):
        # Receives a plain NumPy array because of raw=True below.
        return np.median(np.abs(window - np.median(window)))

    vals = vals_orig.copy()

    # Conventional factor that scales the MAD so it is comparable to a
    # standard deviation for normally distributed data.
    scale = 1.4826

    windows = vals.rolling(window=k, center=True)
    rolling_median = windows.median()
    # raw=True hands each window over as an ndarray instead of building a
    # Series per window; the reduction only uses NumPy functions.
    rolling_mad = windows.apply(_median_abs_deviation, raw=True)

    threshold = t0 * scale * rolling_mad
    difference = (vals - rolling_median).abs()

    # Comparisons involving NaN evaluate to False, so positions without a
    # full window are left untouched.
    outlier_idx = difference > threshold
    vals[outlier_idx] = rolling_median[outlier_idx]
    return vals
|