criterion/stats/univariate/outliers/tukey.rs
1//! Tukey's method
2//!
3//! The original method uses two "fences" to classify the data. All the observations "inside" the
4//! fences are considered "normal", and the rest are considered outliers.
5//!
6//! The fences are computed from the quartiles of the sample, according to the following formula:
7//!
8//! ``` ignore
9//! // q1, q3 are the first and third quartiles
10//! let iqr = q3 - q1; // The interquartile range
11//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr); // the "fences"
12//!
13//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
14//! ```
15//!
16//! The classifier provided here adds two extra outer fences:
17//!
18//! ``` ignore
19//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr); // the outer "fences"
20//! ```
21//!
22//! The extra fences add a sense of "severity" to the classification. Data points outside of the
23//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
24//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
25//! "normal" data.
26//!
27//! Some ASCII art for the visually oriented people:
28//!
29//! ``` ignore
30//! LOW-ish NORMAL-ish HIGH-ish
31//! x | + | o o o o o o o | + | x
32//! f3 f1 f2 f4
33//!
34//! Legend:
35//! o: "normal" data (not an outlier)
36//! +: "mild" outlier
37//! x: "severe" outlier
38//! ```
39
40use std::ops::{Deref, Index};
41use std::slice;
42
43use crate::stats::float::Float;
44use crate::stats::univariate::Sample;
45
46use self::Label::*;
47
48/// A classified/labeled sample.
49///
50/// The labeled data can be accessed using the indexing operator. The order of the data points is
51/// retained.
52///
53/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
54/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
55/// pair.
56#[derive(Clone, Copy)]
57pub struct LabeledSample<'a, A>
58where
59 A: Float,
60{
61 fences: (A, A, A, A),
62 sample: &'a Sample<A>,
63}
64
65impl<'a, A> LabeledSample<'a, A>
66where
67 A: Float,
68{
69 /// Returns the number of data points per label
70 ///
71 /// - Time: `O(length)`
72 #[allow(clippy::similar_names)]
73 pub fn count(&self) -> (usize, usize, usize, usize, usize) {
74 let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);
75
76 for (_, label) in self {
77 match label {
78 LowSevere => {
79 los += 1;
80 }
81 LowMild => {
82 lom += 1;
83 }
84 NotAnOutlier => {
85 noa += 1;
86 }
87 HighMild => {
88 him += 1;
89 }
90 HighSevere => {
91 his += 1;
92 }
93 }
94 }
95
96 (los, lom, noa, him, his)
97 }
98
99 /// Returns the fences used to classify the outliers
100 pub fn fences(&self) -> (A, A, A, A) {
101 self.fences
102 }
103
104 /// Returns an iterator over the labeled data
105 pub fn iter(&self) -> Iter<'a, A> {
106 Iter {
107 fences: self.fences,
108 iter: self.sample.iter(),
109 }
110 }
111}
112
113impl<'a, A> Deref for LabeledSample<'a, A>
114where
115 A: Float,
116{
117 type Target = Sample<A>;
118
119 fn deref(&self) -> &Sample<A> {
120 self.sample
121 }
122}
123
124// FIXME Use the `IndexGet` trait
125impl<'a, A> Index<usize> for LabeledSample<'a, A>
126where
127 A: Float,
128{
129 type Output = Label;
130
131 #[allow(clippy::similar_names)]
132 fn index(&self, i: usize) -> &Label {
133 static LOW_SEVERE: Label = LowSevere;
134 static LOW_MILD: Label = LowMild;
135 static HIGH_MILD: Label = HighMild;
136 static HIGH_SEVERE: Label = HighSevere;
137 static NOT_AN_OUTLIER: Label = NotAnOutlier;
138
139 let x = self.sample[i];
140 let (lost, lomt, himt, hist) = self.fences;
141
142 if x < lost {
143 &LOW_SEVERE
144 } else if x > hist {
145 &HIGH_SEVERE
146 } else if x < lomt {
147 &LOW_MILD
148 } else if x > himt {
149 &HIGH_MILD
150 } else {
151 &NOT_AN_OUTLIER
152 }
153 }
154}
155
156impl<'a, A> IntoIterator for &LabeledSample<'a, A>
157where
158 A: Float,
159{
160 type Item = (A, Label);
161 type IntoIter = Iter<'a, A>;
162
163 fn into_iter(self) -> Iter<'a, A> {
164 self.iter()
165 }
166}
167
168/// Iterator over the labeled data
169pub struct Iter<'a, A>
170where
171 A: Float,
172{
173 fences: (A, A, A, A),
174 iter: slice::Iter<'a, A>,
175}
176
177impl<'a, A> Iterator for Iter<'a, A>
178where
179 A: Float,
180{
181 type Item = (A, Label);
182
183 #[allow(clippy::similar_names)]
184 fn next(&mut self) -> Option<(A, Label)> {
185 self.iter.next().map(|&x| {
186 let (lost, lomt, himt, hist) = self.fences;
187
188 let label = if x < lost {
189 LowSevere
190 } else if x > hist {
191 HighSevere
192 } else if x < lomt {
193 LowMild
194 } else if x > himt {
195 HighMild
196 } else {
197 NotAnOutlier
198 };
199
200 (x, label)
201 })
202 }
203
204 fn size_hint(&self) -> (usize, Option<usize>) {
205 self.iter.size_hint()
206 }
207}
208
209/// Labels used to classify outliers
210pub enum Label {
211 /// A "mild" outlier in the "high" spectrum
212 HighMild,
213 /// A "severe" outlier in the "high" spectrum
214 HighSevere,
215 /// A "mild" outlier in the "low" spectrum
216 LowMild,
217 /// A "severe" outlier in the "low" spectrum
218 LowSevere,
219 /// A normal data point
220 NotAnOutlier,
221}
222
223impl Label {
224 /// Checks if the data point has an "unusually" high value
225 pub fn is_high(&self) -> bool {
226 matches!(*self, HighMild | HighSevere)
227 }
228
229 /// Checks if the data point is labeled as a "mild" outlier
230 pub fn is_mild(&self) -> bool {
231 matches!(*self, HighMild | LowMild)
232 }
233
234 /// Checks if the data point has an "unusually" low value
235 pub fn is_low(&self) -> bool {
236 matches!(*self, LowMild | LowSevere)
237 }
238
239 /// Checks if the data point is labeled as an outlier
240 pub fn is_outlier(&self) -> bool {
241 !matches!(*self, NotAnOutlier)
242 }
243
244 /// Checks if the data point is labeled as a "severe" outlier
245 pub fn is_severe(&self) -> bool {
246 matches!(*self, HighSevere | LowSevere)
247 }
248}
249
250/// Classifies the sample, and returns a labeled sample.
251///
252/// - Time: `O(N log N) where N = length`
253pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A>
254where
255 A: Float,
256 usize: cast::From<A, Output = Result<usize, cast::Error>>,
257{
258 let (q1, _, q3) = sample.percentiles().quartiles();
259 let iqr = q3 - q1;
260
261 // Mild
262 let k_m = A::cast(1.5_f32);
263 // Severe
264 let k_s = A::cast(3);
265
266 LabeledSample {
267 fences: (
268 q1 - k_s * iqr,
269 q1 - k_m * iqr,
270 q3 + k_m * iqr,
271 q3 + k_s * iqr,
272 ),
273 sample,
274 }
275}