differential_dataflow/algorithms/
identifiers.rs

1//! Assign unique identifiers to records.
2
3use timely::dataflow::Scope;
4
5use crate::{Collection, ExchangeData, Hashable};
6use crate::lattice::Lattice;
7use crate::operators::*;
8use crate::difference::Abelian;
9
10/// Assign unique identifiers to elements of a collection.
11pub trait Identifiers<G: Scope, D: ExchangeData, R: ExchangeData+Abelian> {
12    /// Assign unique identifiers to elements of a collection.
13    ///
14    /// # Example
15    /// ```
16    /// use differential_dataflow::input::Input;
17    /// use differential_dataflow::algorithms::identifiers::Identifiers;
18    /// use differential_dataflow::operators::Threshold;
19    ///
20    /// ::timely::example(|scope| {
21    ///
22    ///     let identifiers =
23    ///     scope.new_collection_from(1 .. 10).1
24    ///          .identifiers()
25    ///          // assert no conflicts
26    ///          .map(|(data, id)| id)
27    ///          .threshold(|_id,cnt| if cnt > &1 { *cnt } else { 0 })
28    ///          .assert_empty();
29    /// });
30    /// ```
31    fn identifiers(&self) -> Collection<G, (D, u64), R>;
32}
33
34impl<G, D, R> Identifiers<G, D, R> for Collection<G, D, R>
35where
36    G: Scope,
37    G::Timestamp: Lattice,
38    D: ExchangeData+::std::hash::Hash,
39    R: ExchangeData+Abelian,
40{
41    fn identifiers(&self) -> Collection<G, (D, u64), R> {
42
43        // The design here is that we iteratively develop a collection
44        // of pairs (round, record), where each pair is a proposal that
45        // the hash for record should be (round, record).hashed().
46        //
47        // Iteratively, any colliding pairs establish a winner (the one
48        // with the lower round, breaking ties by record), and indicate
49        // that the losers should increment their round and try again.
50        //
51        // Non-obviously, this happens via a `reduce` operator that yields
52        // additions and subtractions of losers, rather than reproducing
53        // the winners. This is done under the premise that losers are
54        // very rare, and maintaining winners in both the input and output
55        // of `reduce` is an unnecessary duplication.
56
57        use crate::collection::AsCollection;
58
59        let init = self.map(|record| (0, record));
60        timely::dataflow::operators::generic::operator::empty(&init.scope())
61            .as_collection()
62            .iterate(|diff|
63                init.enter(&diff.scope())
64                    .concat(diff)
65                    .map(|pair| (pair.hashed(), pair))
66                    .reduce(|_hash, input, output| {
67                        // keep round-positive records as changes.
68                        let ((round, record), count) = &input[0];
69                        if *round > 0 {
70                            let mut neg_count = count.clone();
71                            neg_count.negate();
72                            output.push(((0, record.clone()), neg_count));
73                            output.push(((*round, record.clone()), count.clone()));
74                        }
75                        // if any losers, increment their rounds.
76                        for ((round, record), count) in input[1..].iter() {
77                            let mut neg_count = count.clone();
78                            neg_count.negate();
79                            output.push(((0, record.clone()), neg_count));
80                            output.push(((*round+1, record.clone()), count.clone()));
81                        }
82                    })
83                    .map(|(_hash, pair)| pair)
84            )
85            .concat(&init)
86            .map(|pair| { let hash = pair.hashed(); (pair.1, hash) })
87    }
88}
89
#[cfg(test)]
mod tests {

    /// Runs the identifier-assignment loop with a hash that is guaranteed
    /// to collide and asserts that no identifier is handed out twice.
    #[test]
    fn are_unique() {

        // The real `identifiers` method only misbehaves under hash
        // collisions, which are impractical to provoke with a good hash.
        // This test therefore replays the same dataflow with a deliberately
        // weak hash, so collisions are certain, and checks that every
        // record still ends up with its own identifier.

        use crate::collection::AsCollection;
        use crate::input::Input;
        use crate::operators::iterate::Iterate;
        use crate::operators::{Threshold, Reduce};

        // Stand-in for `hashed()`: the inputs 1, 2 and 3 all map to zero at
        // round zero, so they are forced to collide.
        fn crippled_hash(round: i32, num: i32) -> i32 {
            (round + num) / 10
        }

        ::timely::example(|scope| {

            let input = scope.new_collection_from(1 .. 4).1;

            // Each record starts out proposing round zero.
            let proposals = input.map(|record| (0, record));

            let corrections =
                timely::dataflow::operators::generic::operator::empty(&proposals.scope())
                    .as_collection()
                    .iterate(|retries| {
                        let contenders = proposals.enter(&retries.scope()).concat(&retries);
                        let keyed = contenders
                            .map(|(round, num)| (crippled_hash(round, num), (round, num)));

                        let adjusted = keyed.reduce(|_hash, input, output| {
                            println!("Input: {:?}", input);
                            // The first entry wins; keep its correction if
                            // it is past round zero.
                            let ((round, record), count) = &input[0];
                            if *round > 0 {
                                output.push(((0, record.clone()), -*count));
                                output.push(((*round, record.clone()), *count));
                            }
                            // Every other entry lost and moves on one round.
                            for ((round, record), count) in input.iter().skip(1) {
                                output.push(((0, record.clone()), -*count));
                                output.push(((*round+1, record.clone()), *count));
                            }
                        });

                        adjusted
                            .inspect(|x| println!("{:?}", x))
                            .map(|(_hash, proposal)| proposal)
                    });

            // Any identifier that occurs more than once survives the
            // threshold and makes `assert_empty` fail.
            corrections
                .concat(&proposals)
                .map(|(round, num)| (num, crippled_hash(round, num)))
                .map(|(_data, id)| id)
                .threshold(|_id, cnt| if *cnt > 1 { *cnt } else { 0 })
                .assert_empty();
        });
    }
}