infer/matchers/
doc.rs

1use std::convert::TryInto;
2
/// Kinds of document that the ZIP-based detection in `msooxml` can report.
///
/// Only the Open XML variants exist here. The legacy binary formats (DOC, XLS, PPT)
/// are left commented out; `is_doc`, `is_xls` and `is_ppt` test their magic bytes
/// directly and never go through this enum.
#[derive(Debug, Eq, PartialEq)]
enum DocType {
    // DOC,
    /// Reported when an inspected archive entry name starts with `word/`.
    DOCX,
    // XLS,
    /// Reported when an inspected archive entry name starts with `xl/`.
    XLSX,
    // PPT,
    /// Reported when an inspected archive entry name starts with `ppt/`.
    PPTX,
    /// Fallback returned by `msooxml` when the archive looks like an Open XML package
    /// but none of the inspected entries names a known sub-directory.
    ///
    /// NOTE(review): the spelling is presumably a typo for "OOXML"; renaming it
    /// requires updating every use in `msooxml` at the same time.
    OOXLM,
}
13
/// Returns whether a buffer is Microsoft Word Document (DOC) data.
///
/// Only the first eight bytes are inspected; shorter buffers never match.
/// `is_xls` and `is_ppt` test for the very same signature, so a positive result
/// does not tell these three formats apart.
pub fn is_doc(buf: &[u8]) -> bool {
    // Magic bytes expected at the very start of the buffer.
    const SIGNATURE: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];

    buf.starts_with(&SIGNATURE)
}
26
27/// Returns whether a buffer is Microsoft Word Open XML Format Document (DOCX) data.
28pub fn is_docx(buf: &[u8]) -> bool {
29    match msooxml(buf) {
30        Some(typ) => typ == DocType::DOCX,
31        None => false,
32    }
33}
34
/// Returns whether a buffer is Microsoft Excel 97-2003 Worksheet (XLS) data.
///
/// Only the first eight bytes are inspected; shorter buffers never match.
/// `is_doc` and `is_ppt` test for the very same signature, so a positive result
/// does not tell these three formats apart.
pub fn is_xls(buf: &[u8]) -> bool {
    // Magic bytes expected at the very start of the buffer.
    const SIGNATURE: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];

    buf.starts_with(&SIGNATURE)
}
47
48/// Returns whether a buffer is Microsoft Excel Open XML Format Spreadsheet (XLSX) data.
49pub fn is_xlsx(buf: &[u8]) -> bool {
50    match msooxml(buf) {
51        Some(typ) => typ == DocType::XLSX,
52        None => false,
53    }
54}
55
/// Returns whether a buffer is Microsoft PowerPoint 97-2003 Presentation (PPT) data.
///
/// Only the first eight bytes are inspected; shorter buffers never match.
/// `is_doc` and `is_xls` test for the very same signature, so a positive result
/// does not tell these three formats apart.
pub fn is_ppt(buf: &[u8]) -> bool {
    // Magic bytes expected at the very start of the buffer.
    const SIGNATURE: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];

    buf.starts_with(&SIGNATURE)
}
68
69/// Returns whether a buffer is Microsoft PowerPoint Open XML Presentation (PPTX) data.
70pub fn is_pptx(buf: &[u8]) -> bool {
71    match msooxml(buf) {
72        Some(typ) => typ == DocType::PPTX,
73        None => false,
74    }
75}
76
77fn msooxml(buf: &[u8]) -> Option<DocType> {
78    let signature = [b'P', b'K', 0x03, 0x04];
79
80    // start by checking for ZIP local file header signature
81    if !compare_bytes(buf, &signature, 0) {
82        return None;
83    }
84
85    let v = check_msooml(buf, 0x1E);
86    if v.is_some() {
87        return v;
88    }
89
90    if !compare_bytes(buf, b"[Content_Types].xml", 0x1E)
91        && !compare_bytes(buf, b"_rels/.rels", 0x1E)
92    {
93        return None;
94    }
95
96    // skip to the second local file header
97    // since some documents include a 520-byte extra field following the file
98    // header, we need to scan for the next header
99    let mut start_offset = (u32::from_le_bytes(buf[18..22].try_into().unwrap()) + 49) as usize;
100    let idx = search(buf, start_offset, 6000)?;
101
102    // now skip to the *third* local file header; again, we need to scan due to a
103    // 520-byte extra field following the file header
104    start_offset += idx + 4 + 26;
105    let idx = search(buf, start_offset, 6000)?;
106
107    // and check the subdirectory name to determine which type of OOXML
108    // file we have.  Correct the mimetype with the registered ones:
109    // http://technet.microsoft.com/en-us/library/cc179224.aspx
110    start_offset += idx + 4 + 26;
111    check_msooml(buf, start_offset)?;
112
113    // OpenOffice/Libreoffice orders ZIP entry differently, so check the 4th file
114    start_offset += 26;
115    let idx = search(buf, start_offset, 6000);
116    match idx {
117        Some(idx) => start_offset += idx + 4 + 26,
118        None => return Some(DocType::OOXLM),
119    };
120
121    let typo = check_msooml(buf, start_offset);
122    if typo.is_some() {
123        return typo;
124    }
125
126    Some(DocType::OOXLM)
127}
128
/// Returns whether `sub_slice` occurs in `slice` exactly at `start_offset`.
///
/// Yields `false` when `start_offset` lies past the end of `slice` or when fewer
/// than `sub_slice.len()` bytes remain from that offset. An empty `sub_slice`
/// matches at every offset up to and including `slice.len()`.
///
/// No offset arithmetic is performed, so arbitrarily large offsets are handled
/// without overflow.
fn compare_bytes(slice: &[u8], sub_slice: &[u8], start_offset: usize) -> bool {
    slice
        .get(start_offset..)
        .map_or(false, |tail| tail.starts_with(sub_slice))
}
146
147fn check_msooml(buf: &[u8], offset: usize) -> Option<DocType> {
148    if compare_bytes(buf, &[b'w', b'o', b'r', b'd', b'/'], offset) {
149        Some(DocType::DOCX)
150    } else if compare_bytes(buf, &[b'p', b'p', b't', b'/'], offset) {
151        Some(DocType::PPTX)
152    } else if compare_bytes(buf, &[b'x', b'l', b'/'], offset) {
153        Some(DocType::XLSX)
154    } else {
155        None
156    }
157}
158
/// Looks for the ZIP local file header signature (`PK\x03\x04`) in
/// `buf[start..start + range]`, with the window clamped to the buffer length.
///
/// Returns the position of the first match *relative to `start`*, or `None`
/// when the window is empty, lies outside the buffer, or holds no signature.
///
/// The window end saturates instead of overflowing, so any `start`/`range`
/// combination is safe to pass.
fn search(buf: &[u8], start: usize, range: usize) -> Option<usize> {
    const SIGNATURE: &[u8] = b"PK\x03\x04";

    let end = start.saturating_add(range).min(buf.len());

    // `get` yields `None` when `start > end`; an empty window has no matches.
    buf.get(start..end)?
        .windows(SIGNATURE.len())
        .position(|window| window == SIGNATURE)
}
173        .windows(signature.len())
174        .position(|window| window == signature)
175}