criterion/stats/univariate/outliers/
tukey.rs

1//! Tukey's method
2//!
3//! The original method uses two "fences" to classify the data. All the observations "inside" the
4//! fences are considered "normal", and the rest are considered outliers.
5//!
6//! The fences are computed from the quartiles of the sample, according to the following formula:
7//!
8//! ``` ignore
9//! // q1, q3 are the first and third quartiles
10//! let iqr = q3 - q1;  // The interquartile range
11//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);  // the "fences"
12//!
13//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
14//! ```
15//!
16//! The classifier provided here adds two extra outer fences:
17//!
18//! ``` ignore
19//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr);  // the outer "fences"
20//! ```
21//!
22//! The extra fences add a sense of "severity" to the classification. Data points outside of the
23//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
24//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
25//! "normal" data.
26//!
27//! Some ASCII art for the visually oriented people:
28//!
29//! ``` ignore
30//!          LOW-ish                NORMAL-ish                 HIGH-ish
31//!         x   |       +    |  o o  o    o   o o  o  |        +   |   x
32//!             f3           f1                       f2           f4
33//!
34//! Legend:
35//! o: "normal" data (not an outlier)
36//! +: "mild" outlier
37//! x: "severe" outlier
38//! ```
39
40use std::iter::IntoIterator;
41use std::ops::{Deref, Index};
42use std::slice;
43
44use crate::stats::float::Float;
45use crate::stats::univariate::Sample;
46
47use self::Label::*;
48
49/// A classified/labeled sample.
50///
51/// The labeled data can be accessed using the indexing operator. The order of the data points is
52/// retained.
53///
54/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
55/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
56/// pair.
57#[derive(Clone, Copy)]
58pub struct LabeledSample<'a, A>
59where
60    A: Float,
61{
62    fences: (A, A, A, A),
63    sample: &'a Sample<A>,
64}
65
66impl<'a, A> LabeledSample<'a, A>
67where
68    A: Float,
69{
70    /// Returns the number of data points per label
71    ///
72    /// - Time: `O(length)`
73    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
74    pub fn count(&self) -> (usize, usize, usize, usize, usize) {
75        let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);
76
77        for (_, label) in self {
78            match label {
79                LowSevere => {
80                    los += 1;
81                }
82                LowMild => {
83                    lom += 1;
84                }
85                NotAnOutlier => {
86                    noa += 1;
87                }
88                HighMild => {
89                    him += 1;
90                }
91                HighSevere => {
92                    his += 1;
93                }
94            }
95        }
96
97        (los, lom, noa, him, his)
98    }
99
100    /// Returns the fences used to classify the outliers
101    pub fn fences(&self) -> (A, A, A, A) {
102        self.fences
103    }
104
105    /// Returns an iterator over the labeled data
106    pub fn iter(&self) -> Iter<'a, A> {
107        Iter {
108            fences: self.fences,
109            iter: self.sample.iter(),
110        }
111    }
112}
113
114impl<'a, A> Deref for LabeledSample<'a, A>
115where
116    A: Float,
117{
118    type Target = Sample<A>;
119
120    fn deref(&self) -> &Sample<A> {
121        self.sample
122    }
123}
124
125// FIXME Use the `IndexGet` trait
126impl<'a, A> Index<usize> for LabeledSample<'a, A>
127where
128    A: Float,
129{
130    type Output = Label;
131
132    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
133    fn index(&self, i: usize) -> &Label {
134        static LOW_SEVERE: Label = LowSevere;
135        static LOW_MILD: Label = LowMild;
136        static HIGH_MILD: Label = HighMild;
137        static HIGH_SEVERE: Label = HighSevere;
138        static NOT_AN_OUTLIER: Label = NotAnOutlier;
139
140        let x = self.sample[i];
141        let (lost, lomt, himt, hist) = self.fences;
142
143        if x < lost {
144            &LOW_SEVERE
145        } else if x > hist {
146            &HIGH_SEVERE
147        } else if x < lomt {
148            &LOW_MILD
149        } else if x > himt {
150            &HIGH_MILD
151        } else {
152            &NOT_AN_OUTLIER
153        }
154    }
155}
156
157impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A>
158where
159    A: Float,
160{
161    type Item = (A, Label);
162    type IntoIter = Iter<'a, A>;
163
164    fn into_iter(self) -> Iter<'a, A> {
165        self.iter()
166    }
167}
168
169/// Iterator over the labeled data
170pub struct Iter<'a, A>
171where
172    A: Float,
173{
174    fences: (A, A, A, A),
175    iter: slice::Iter<'a, A>,
176}
177
178impl<'a, A> Iterator for Iter<'a, A>
179where
180    A: Float,
181{
182    type Item = (A, Label);
183
184    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
185    fn next(&mut self) -> Option<(A, Label)> {
186        self.iter.next().map(|&x| {
187            let (lost, lomt, himt, hist) = self.fences;
188
189            let label = if x < lost {
190                LowSevere
191            } else if x > hist {
192                HighSevere
193            } else if x < lomt {
194                LowMild
195            } else if x > himt {
196                HighMild
197            } else {
198                NotAnOutlier
199            };
200
201            (x, label)
202        })
203    }
204
205    fn size_hint(&self) -> (usize, Option<usize>) {
206        self.iter.size_hint()
207    }
208}
209
210/// Labels used to classify outliers
211pub enum Label {
212    /// A "mild" outlier in the "high" spectrum
213    HighMild,
214    /// A "severe" outlier in the "high" spectrum
215    HighSevere,
216    /// A "mild" outlier in the "low" spectrum
217    LowMild,
218    /// A "severe" outlier in the "low" spectrum
219    LowSevere,
220    /// A normal data point
221    NotAnOutlier,
222}
223
224impl Label {
225    /// Checks if the data point has an "unusually" high value
226    pub fn is_high(&self) -> bool {
227        matches!(*self, HighMild | HighSevere)
228    }
229
230    /// Checks if the data point is labeled as a "mild" outlier
231    pub fn is_mild(&self) -> bool {
232        matches!(*self, HighMild | LowMild)
233    }
234
235    /// Checks if the data point has an "unusually" low value
236    pub fn is_low(&self) -> bool {
237        matches!(*self, LowMild | LowSevere)
238    }
239
240    /// Checks if the data point is labeled as an outlier
241    pub fn is_outlier(&self) -> bool {
242        matches!(*self, NotAnOutlier)
243    }
244
245    /// Checks if the data point is labeled as a "severe" outlier
246    pub fn is_severe(&self) -> bool {
247        matches!(*self, HighSevere | LowSevere)
248    }
249}
250
251/// Classifies the sample, and returns a labeled sample.
252///
253/// - Time: `O(N log N) where N = length`
254pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A>
255where
256    A: Float,
257    usize: cast::From<A, Output = Result<usize, cast::Error>>,
258{
259    let (q1, _, q3) = sample.percentiles().quartiles();
260    let iqr = q3 - q1;
261
262    // Mild
263    let k_m = A::cast(1.5_f32);
264    // Severe
265    let k_s = A::cast(3);
266
267    LabeledSample {
268        fences: (
269            q1 - k_s * iqr,
270            q1 - k_m * iqr,
271            q3 + k_m * iqr,
272            q3 + k_s * iqr,
273        ),
274        sample,
275    }
276}