criterion/stats/univariate/outliers/
tukey.rs

1//! Tukey's method
2//!
3//! The original method uses two "fences" to classify the data. All the observations "inside" the
4//! fences are considered "normal", and the rest are considered outliers.
5//!
6//! The fences are computed from the quartiles of the sample, according to the following formula:
7//!
8//! ``` ignore
9//! // q1, q3 are the first and third quartiles
10//! let iqr = q3 - q1;  // The interquartile range
11//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);  // the "fences"
12//!
13//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
14//! ```
15//!
16//! The classifier provided here adds two extra outer fences:
17//!
18//! ``` ignore
19//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr);  // the outer "fences"
20//! ```
21//!
22//! The extra fences add a sense of "severity" to the classification. Data points outside of the
23//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
24//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
25//! "normal" data.
26//!
27//! Some ASCII art for the visually oriented people:
28//!
29//! ``` ignore
30//!          LOW-ish                NORMAL-ish                 HIGH-ish
31//!         x   |       +    |  o o  o    o   o o  o  |        +   |   x
32//!             f3           f1                       f2           f4
33//!
34//! Legend:
35//! o: "normal" data (not an outlier)
36//! +: "mild" outlier
37//! x: "severe" outlier
38//! ```
39
40use std::ops::{Deref, Index};
41use std::slice;
42
43use crate::stats::float::Float;
44use crate::stats::univariate::Sample;
45
46use self::Label::*;
47
48/// A classified/labeled sample.
49///
50/// The labeled data can be accessed using the indexing operator. The order of the data points is
51/// retained.
52///
53/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
54/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
55/// pair.
56#[derive(Clone, Copy)]
57pub struct LabeledSample<'a, A>
58where
59    A: Float,
60{
61    fences: (A, A, A, A),
62    sample: &'a Sample<A>,
63}
64
65impl<'a, A> LabeledSample<'a, A>
66where
67    A: Float,
68{
69    /// Returns the number of data points per label
70    ///
71    /// - Time: `O(length)`
72    #[allow(clippy::similar_names)]
73    pub fn count(&self) -> (usize, usize, usize, usize, usize) {
74        let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);
75
76        for (_, label) in self {
77            match label {
78                LowSevere => {
79                    los += 1;
80                }
81                LowMild => {
82                    lom += 1;
83                }
84                NotAnOutlier => {
85                    noa += 1;
86                }
87                HighMild => {
88                    him += 1;
89                }
90                HighSevere => {
91                    his += 1;
92                }
93            }
94        }
95
96        (los, lom, noa, him, his)
97    }
98
99    /// Returns the fences used to classify the outliers
100    pub fn fences(&self) -> (A, A, A, A) {
101        self.fences
102    }
103
104    /// Returns an iterator over the labeled data
105    pub fn iter(&self) -> Iter<'a, A> {
106        Iter {
107            fences: self.fences,
108            iter: self.sample.iter(),
109        }
110    }
111}
112
113impl<'a, A> Deref for LabeledSample<'a, A>
114where
115    A: Float,
116{
117    type Target = Sample<A>;
118
119    fn deref(&self) -> &Sample<A> {
120        self.sample
121    }
122}
123
124// FIXME Use the `IndexGet` trait
125impl<'a, A> Index<usize> for LabeledSample<'a, A>
126where
127    A: Float,
128{
129    type Output = Label;
130
131    #[allow(clippy::similar_names)]
132    fn index(&self, i: usize) -> &Label {
133        static LOW_SEVERE: Label = LowSevere;
134        static LOW_MILD: Label = LowMild;
135        static HIGH_MILD: Label = HighMild;
136        static HIGH_SEVERE: Label = HighSevere;
137        static NOT_AN_OUTLIER: Label = NotAnOutlier;
138
139        let x = self.sample[i];
140        let (lost, lomt, himt, hist) = self.fences;
141
142        if x < lost {
143            &LOW_SEVERE
144        } else if x > hist {
145            &HIGH_SEVERE
146        } else if x < lomt {
147            &LOW_MILD
148        } else if x > himt {
149            &HIGH_MILD
150        } else {
151            &NOT_AN_OUTLIER
152        }
153    }
154}
155
156impl<'a, A> IntoIterator for &LabeledSample<'a, A>
157where
158    A: Float,
159{
160    type Item = (A, Label);
161    type IntoIter = Iter<'a, A>;
162
163    fn into_iter(self) -> Iter<'a, A> {
164        self.iter()
165    }
166}
167
168/// Iterator over the labeled data
169pub struct Iter<'a, A>
170where
171    A: Float,
172{
173    fences: (A, A, A, A),
174    iter: slice::Iter<'a, A>,
175}
176
177impl<'a, A> Iterator for Iter<'a, A>
178where
179    A: Float,
180{
181    type Item = (A, Label);
182
183    #[allow(clippy::similar_names)]
184    fn next(&mut self) -> Option<(A, Label)> {
185        self.iter.next().map(|&x| {
186            let (lost, lomt, himt, hist) = self.fences;
187
188            let label = if x < lost {
189                LowSevere
190            } else if x > hist {
191                HighSevere
192            } else if x < lomt {
193                LowMild
194            } else if x > himt {
195                HighMild
196            } else {
197                NotAnOutlier
198            };
199
200            (x, label)
201        })
202    }
203
204    fn size_hint(&self) -> (usize, Option<usize>) {
205        self.iter.size_hint()
206    }
207}
208
209/// Labels used to classify outliers
210pub enum Label {
211    /// A "mild" outlier in the "high" spectrum
212    HighMild,
213    /// A "severe" outlier in the "high" spectrum
214    HighSevere,
215    /// A "mild" outlier in the "low" spectrum
216    LowMild,
217    /// A "severe" outlier in the "low" spectrum
218    LowSevere,
219    /// A normal data point
220    NotAnOutlier,
221}
222
223impl Label {
224    /// Checks if the data point has an "unusually" high value
225    pub fn is_high(&self) -> bool {
226        matches!(*self, HighMild | HighSevere)
227    }
228
229    /// Checks if the data point is labeled as a "mild" outlier
230    pub fn is_mild(&self) -> bool {
231        matches!(*self, HighMild | LowMild)
232    }
233
234    /// Checks if the data point has an "unusually" low value
235    pub fn is_low(&self) -> bool {
236        matches!(*self, LowMild | LowSevere)
237    }
238
239    /// Checks if the data point is labeled as an outlier
240    pub fn is_outlier(&self) -> bool {
241        !matches!(*self, NotAnOutlier)
242    }
243
244    /// Checks if the data point is labeled as a "severe" outlier
245    pub fn is_severe(&self) -> bool {
246        matches!(*self, HighSevere | LowSevere)
247    }
248}
249
250/// Classifies the sample, and returns a labeled sample.
251///
252/// - Time: `O(N log N) where N = length`
253pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A>
254where
255    A: Float,
256    usize: cast::From<A, Output = Result<usize, cast::Error>>,
257{
258    let (q1, _, q3) = sample.percentiles().quartiles();
259    let iqr = q3 - q1;
260
261    // Mild
262    let k_m = A::cast(1.5_f32);
263    // Severe
264    let k_s = A::cast(3);
265
266    LabeledSample {
267        fences: (
268            q1 - k_s * iqr,
269            q1 - k_m * iqr,
270            q3 + k_m * iqr,
271            q3 + k_s * iqr,
272        ),
273        sample,
274    }
275}