criterion/stats/univariate/outliers/
tukey.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
//! Tukey's method
//!
//! The original method uses two "fences" to classify the data. All the observations "inside" the
//! fences are considered "normal", and the rest are considered outliers.
//!
//! The fences are computed from the quartiles of the sample, according to the following formula:
//!
//! ``` ignore
//! // q1, q3 are the first and third quartiles
//! let iqr = q3 - q1;  // The interquartile range
//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);  // the "fences"
//!
//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
//! ```
//!
//! The classifier provided here adds two extra outer fences:
//!
//! ``` ignore
//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr);  // the outer "fences"
//! ```
//!
//! The extra fences add a sense of "severity" to the classification. Data points outside of the
//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
//! "normal" data.
//!
//! Some ASCII art for the visually oriented people:
//!
//! ``` ignore
//!          LOW-ish                NORMAL-ish                 HIGH-ish
//!         x   |       +    |  o o  o    o   o o  o  |        +   |   x
//!             f3           f1                       f2           f4
//!
//! Legend:
//! o: "normal" data (not an outlier)
//! +: "mild" outlier
//! x: "severe" outlier
//! ```

use std::iter::IntoIterator;
use std::ops::{Deref, Index};
use std::slice;

use crate::stats::float::Float;
use crate::stats::univariate::Sample;

use self::Label::*;

/// A classified/labeled sample.
///
/// The labeled data can be accessed using the indexing operator. The order of the data points is
/// retained.
///
/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
/// pair.
#[derive(Clone, Copy)]
pub struct LabeledSample<'a, A>
where
    A: Float,
{
    fences: (A, A, A, A),
    sample: &'a Sample<A>,
}

impl<'a, A> LabeledSample<'a, A>
where
    A: Float,
{
    /// Returns the number of data points per label
    ///
    /// - Time: `O(length)`
    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
    pub fn count(&self) -> (usize, usize, usize, usize, usize) {
        let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);

        for (_, label) in self {
            match label {
                LowSevere => {
                    los += 1;
                }
                LowMild => {
                    lom += 1;
                }
                NotAnOutlier => {
                    noa += 1;
                }
                HighMild => {
                    him += 1;
                }
                HighSevere => {
                    his += 1;
                }
            }
        }

        (los, lom, noa, him, his)
    }

    /// Returns the fences used to classify the outliers
    pub fn fences(&self) -> (A, A, A, A) {
        self.fences
    }

    /// Returns an iterator over the labeled data
    pub fn iter(&self) -> Iter<'a, A> {
        Iter {
            fences: self.fences,
            iter: self.sample.iter(),
        }
    }
}

impl<'a, A> Deref for LabeledSample<'a, A>
where
    A: Float,
{
    type Target = Sample<A>;

    fn deref(&self) -> &Sample<A> {
        self.sample
    }
}

// FIXME Use the `IndexGet` trait
impl<'a, A> Index<usize> for LabeledSample<'a, A>
where
    A: Float,
{
    type Output = Label;

    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
    fn index(&self, i: usize) -> &Label {
        static LOW_SEVERE: Label = LowSevere;
        static LOW_MILD: Label = LowMild;
        static HIGH_MILD: Label = HighMild;
        static HIGH_SEVERE: Label = HighSevere;
        static NOT_AN_OUTLIER: Label = NotAnOutlier;

        let x = self.sample[i];
        let (lost, lomt, himt, hist) = self.fences;

        if x < lost {
            &LOW_SEVERE
        } else if x > hist {
            &HIGH_SEVERE
        } else if x < lomt {
            &LOW_MILD
        } else if x > himt {
            &HIGH_MILD
        } else {
            &NOT_AN_OUTLIER
        }
    }
}

impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A>
where
    A: Float,
{
    type Item = (A, Label);
    type IntoIter = Iter<'a, A>;

    fn into_iter(self) -> Iter<'a, A> {
        self.iter()
    }
}

/// Iterator over the labeled data
pub struct Iter<'a, A>
where
    A: Float,
{
    fences: (A, A, A, A),
    iter: slice::Iter<'a, A>,
}

impl<'a, A> Iterator for Iter<'a, A>
where
    A: Float,
{
    type Item = (A, Label);

    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
    fn next(&mut self) -> Option<(A, Label)> {
        self.iter.next().map(|&x| {
            let (lost, lomt, himt, hist) = self.fences;

            let label = if x < lost {
                LowSevere
            } else if x > hist {
                HighSevere
            } else if x < lomt {
                LowMild
            } else if x > himt {
                HighMild
            } else {
                NotAnOutlier
            };

            (x, label)
        })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}

/// Labels used to classify outliers
pub enum Label {
    /// A "mild" outlier in the "high" spectrum
    HighMild,
    /// A "severe" outlier in the "high" spectrum
    HighSevere,
    /// A "mild" outlier in the "low" spectrum
    LowMild,
    /// A "severe" outlier in the "low" spectrum
    LowSevere,
    /// A normal data point
    NotAnOutlier,
}

impl Label {
    /// Checks if the data point has an "unusually" high value
    pub fn is_high(&self) -> bool {
        matches!(*self, HighMild | HighSevere)
    }

    /// Checks if the data point is labeled as a "mild" outlier
    pub fn is_mild(&self) -> bool {
        matches!(*self, HighMild | LowMild)
    }

    /// Checks if the data point has an "unusually" low value
    pub fn is_low(&self) -> bool {
        matches!(*self, LowMild | LowSevere)
    }

    /// Checks if the data point is labeled as an outlier
    pub fn is_outlier(&self) -> bool {
        matches!(*self, NotAnOutlier)
    }

    /// Checks if the data point is labeled as a "severe" outlier
    pub fn is_severe(&self) -> bool {
        matches!(*self, HighSevere | LowSevere)
    }
}

/// Classifies the sample, and returns a labeled sample.
///
/// - Time: `O(N log N) where N = length`
pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A>
where
    A: Float,
    usize: cast::From<A, Output = Result<usize, cast::Error>>,
{
    let (q1, _, q3) = sample.percentiles().quartiles();
    let iqr = q3 - q1;

    // Mild
    let k_m = A::cast(1.5_f32);
    // Severe
    let k_s = A::cast(3);

    LabeledSample {
        fences: (
            q1 - k_s * iqr,
            q1 - k_m * iqr,
            q3 + k_m * iqr,
            q3 + k_s * iqr,
        ),
        sample,
    }
}