difflib/
sequencematcher.rs

1use std::cmp::{max, min};
2use std::collections::HashMap;
3use std::hash::Hash;
4use utils::calculate_ratio;
5
6#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
7pub struct Match {
8    pub first_start: usize,
9    pub second_start: usize,
10    pub size: usize,
11}
12
13impl Match {
14    fn new(first_start: usize, second_start: usize, size: usize) -> Match {
15        Match {
16            first_start,
17            second_start,
18            size,
19        }
20    }
21}
22
23#[derive(Debug, Clone, PartialEq)]
24pub struct Opcode {
25    pub tag: String,
26    pub first_start: usize,
27    pub first_end: usize,
28    pub second_start: usize,
29    pub second_end: usize,
30}
31
32impl Opcode {
33    fn new(
34        tag: String,
35        first_start: usize,
36        first_end: usize,
37        second_start: usize,
38        second_end: usize,
39    ) -> Opcode {
40        Opcode {
41            tag,
42            first_start,
43            first_end,
44            second_start,
45            second_end,
46        }
47    }
48}
49
50pub trait Sequence: Eq + Hash {}
51impl<T: Eq + Hash> Sequence for T {}
52
53pub struct SequenceMatcher<'a, T: 'a + Sequence> {
54    first_sequence: &'a [T],
55    second_sequence: &'a [T],
56    matching_blocks: Option<Vec<Match>>,
57    opcodes: Option<Vec<Opcode>>,
58    is_junk: Option<fn(&T) -> bool>,
59    second_sequence_elements: HashMap<&'a T, Vec<usize>>,
60}
61
62impl<'a, T: Sequence> SequenceMatcher<'a, T> {
63    pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T>
64    where
65        S: AsRef<[T]> + ?Sized,
66    {
67        let mut matcher = SequenceMatcher {
68            first_sequence: first_sequence.as_ref(),
69            second_sequence: second_sequence.as_ref(),
70            matching_blocks: None,
71            opcodes: None,
72            is_junk: None,
73            second_sequence_elements: HashMap::new(),
74        };
75        matcher.set_seqs(first_sequence, second_sequence);
76        matcher
77    }
78
79    pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) {
80        self.is_junk = is_junk;
81        self.matching_blocks = None;
82        self.opcodes = None;
83        self.chain_second_seq();
84    }
85
86    pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S)
87    where
88        S: AsRef<[T]> + ?Sized,
89    {
90        self.set_first_seq(first_sequence);
91        self.set_second_seq(second_sequence);
92    }
93
94    pub fn set_first_seq<S>(&mut self, sequence: &'a S)
95    where
96        S: AsRef<[T]> + ?Sized,
97    {
98        self.first_sequence = sequence.as_ref();
99        self.matching_blocks = None;
100        self.opcodes = None;
101    }
102
103    pub fn set_second_seq<S>(&mut self, sequence: &'a S)
104    where
105        S: AsRef<[T]> + ?Sized,
106    {
107        self.second_sequence = sequence.as_ref();
108        self.matching_blocks = None;
109        self.opcodes = None;
110        self.chain_second_seq();
111    }
112
113    fn chain_second_seq(&mut self) {
114        let second_sequence = self.second_sequence;
115        let mut second_sequence_elements = HashMap::new();
116        for (i, item) in second_sequence.iter().enumerate() {
117            let mut counter = second_sequence_elements
118                .entry(item)
119                .or_insert_with(Vec::new);
120            counter.push(i);
121        }
122        if let Some(junk_func) = self.is_junk {
123            second_sequence_elements = second_sequence_elements
124                .into_iter()
125                .filter(|&(element, _)| !junk_func(element))
126                .collect();
127        }
128        // Filter out popular elements
129        let len = second_sequence.len();
130        if len >= 200 {
131            let test_len = (len as f32 / 100.0).floor() as usize + 1;
132            second_sequence_elements = second_sequence_elements
133                .into_iter()
134                .filter(|&(_, ref indexes)| indexes.len() > test_len)
135                .collect();
136        }
137        self.second_sequence_elements = second_sequence_elements;
138    }
139
140    pub fn find_longest_match(
141        &self,
142        first_start: usize,
143        first_end: usize,
144        second_start: usize,
145        second_end: usize,
146    ) -> Match {
147        let first_sequence = &self.first_sequence;
148        let second_sequence = &self.second_sequence;
149        let second_sequence_elements = &self.second_sequence_elements;
150        let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0);
151        let mut j2len: HashMap<usize, usize> = HashMap::new();
152        for (i, item) in first_sequence
153            .iter()
154            .enumerate()
155            .take(first_end)
156            .skip(first_start)
157        {
158            let mut new_j2len: HashMap<usize, usize> = HashMap::new();
159            if let Some(indexes) = second_sequence_elements.get(item) {
160                for j in indexes {
161                    let j = *j;
162                    if j < second_start {
163                        continue;
164                    };
165                    if j >= second_end {
166                        break;
167                    };
168                    let mut size = 0;
169                    if j > 0 {
170                        if let Some(k) = j2len.get(&(j - 1)) {
171                            size = *k;
172                        }
173                    }
174                    size += 1;
175                    new_j2len.insert(j, size);
176                    if size > best_size {
177                        best_i = i + 1 - size;
178                        best_j = j + 1 - size;
179                        best_size = size;
180                    }
181                }
182            }
183            j2len = new_j2len;
184        }
185        for _ in 0..2 {
186            while best_i > first_start
187                && best_j > second_start
188                && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1)
189            {
190                best_i -= 1;
191                best_j -= 1;
192                best_size += 1;
193            }
194            while best_i + best_size < first_end
195                && best_j + best_size < second_end
196                && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size)
197            {
198                best_size += 1;
199            }
200        }
201        Match::new(best_i, best_j, best_size)
202    }
203
204    pub fn get_matching_blocks(&mut self) -> Vec<Match> {
205        if self.matching_blocks.as_ref().is_some() {
206            return self.matching_blocks.as_ref().unwrap().clone();
207        }
208        let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len());
209        let mut matches = Vec::new();
210        let mut queue = vec![(0, first_length, 0, second_length)];
211        while !queue.is_empty() {
212            let (first_start, first_end, second_start, second_end) = queue.pop().unwrap();
213            let m = self.find_longest_match(first_start, first_end, second_start, second_end);
214            match m.size {
215                0 => {}
216                _ => {
217                    if first_start < m.first_start && second_start < m.second_start {
218                        queue.push((first_start, m.first_start, second_start, m.second_start));
219                    }
220                    if m.first_start + m.size < first_end && m.second_start + m.size < second_end {
221                        queue.push((
222                            m.first_start + m.size,
223                            first_end,
224                            m.second_start + m.size,
225                            second_end,
226                        ));
227                    }
228                    matches.push(m);
229                }
230            }
231        }
232        matches.sort_by(|a, b| a.cmp(b));
233        let (mut first_start, mut second_start, mut size) = (0, 0, 0);
234        let mut non_adjacent = Vec::new();
235        for m in &matches {
236            if first_start + size == m.first_start && second_start + size == m.second_start {
237                size += m.size
238            } else {
239                if size != 0 {
240                    non_adjacent.push(Match::new(first_start, second_start, size));
241                }
242                first_start = m.first_start;
243                second_start = m.second_start;
244                size = m.size;
245            }
246        }
247        if size != 0 {
248            non_adjacent.push(Match::new(first_start, second_start, size));
249        }
250        non_adjacent.push(Match::new(first_length, second_length, 0));
251        self.matching_blocks = Some(non_adjacent);
252        self.matching_blocks.as_ref().unwrap().clone()
253    }
254
255    pub fn get_opcodes(&mut self) -> Vec<Opcode> {
256        if self.opcodes.as_ref().is_some() {
257            return self.opcodes.as_ref().unwrap().clone();
258        }
259        let mut opcodes = Vec::new();
260        let (mut i, mut j) = (0, 0);
261        for m in self.get_matching_blocks() {
262            let mut tag = String::new();
263            if i < m.first_start && j < m.second_start {
264                tag = String::from("replace");
265            } else if i < m.first_start {
266                tag = String::from("delete");
267            } else if j < m.second_start {
268                tag = String::from("insert");
269            }
270            if !tag.is_empty() {
271                opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start));
272            }
273            i = m.first_start + m.size;
274            j = m.second_start + m.size;
275            if m.size != 0 {
276                opcodes.push(Opcode::new(
277                    String::from("equal"),
278                    m.first_start,
279                    i,
280                    m.second_start,
281                    j,
282                ));
283            }
284        }
285        self.opcodes = Some(opcodes);
286        self.opcodes.as_ref().unwrap().clone()
287    }
288
289    pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> {
290        let mut res = Vec::new();
291        let mut codes = self.get_opcodes();
292        if codes.is_empty() {
293            codes.push(Opcode::new("equal".to_string(), 0, 1, 0, 1));
294        }
295
296        if codes.first().unwrap().tag == "equal" {
297            let opcode = codes.first_mut().unwrap();
298            opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n));
299            opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n));
300        }
301        if codes.last().unwrap().tag == "equal" {
302            let opcode = codes.last_mut().unwrap();
303            opcode.first_end = min(opcode.first_start + n, opcode.first_end);
304            opcode.second_end = min(opcode.second_start + n, opcode.second_end);
305        }
306        let nn = n + n;
307        let mut group = Vec::new();
308        for code in &codes {
309            let (mut first_start, mut second_start) = (code.first_start, code.second_start);
310            if code.tag == "equal" && code.first_end - code.first_start > nn {
311                group.push(Opcode::new(
312                    code.tag.clone(),
313                    code.first_start,
314                    min(code.first_end, code.first_start + n),
315                    code.second_start,
316                    min(code.second_end, code.second_start + n),
317                ));
318                res.push(group.clone());
319                group.clear();
320                first_start = max(first_start, code.first_end.saturating_sub(n));
321                second_start = max(second_start, code.second_end.saturating_sub(n));
322            }
323            group.push(Opcode::new(
324                code.tag.clone(),
325                first_start,
326                code.first_end,
327                second_start,
328                code.second_end,
329            ));
330        }
331        if !(group.len() == 1 && group.first().unwrap().tag == "equal") || group.is_empty() {
332            res.push(group.clone());
333        }
334        res
335    }
336
337    pub fn ratio(&mut self) -> f32 {
338        let matches = self.get_matching_blocks()
339            .iter()
340            .fold(0, |res, &m| res + m.size);
341        calculate_ratio(
342            matches,
343            self.first_sequence.len() + self.second_sequence.len(),
344        )
345    }
346}