tiberius/tds/
collation.rs

//! Legacy implementation of collations (or rather codepages), needed to handle `varchar` data coming from legacy databases.
//! Reference [1] is a TDS driver published directly by Microsoft; it contains mappings taken from the
//! Katmai (SQL Server 2008) source code.
//! Reference [2] is helpful for mapping a codepage name such as `CP1234` to the appropriate encoding.
5//!
6//! [1] https://github.com/Microsoft/mssql-jdbc/blob/eb14f63077c47ef1fc1c690deb8cfab602baeb85/src/main/java/com/microsoft/sqlserver/jdbc/SQLCollation.java
7//! [2] https://github.com/lifthrasiir/rust-encoding/blob/496823171f15d9b9446b2ec3fb7765f22346256b/src/label.rs#L282
8
9use encoding_rs::Encoding;
10use std::fmt;
11
12use crate::error::Error;
13
/// A collation as handled by this module: a packed `info` word plus a sort ID.
///
/// The value is a small plain-data pair, so it is cheap to copy and compare.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Collation {
    /// Packed word holding the LCID, the ColFlags and the Version.
    /// In this file only its low 16 bits are interpreted (see `lcid`).
    info: u32,
    /// Sort ID; a value of zero makes `encoding` fall back to the locale lookup.
    sort_id: u8,
}
21
22impl Collation {
23    pub fn new(info: u32, sort_id: u8) -> Self {
24        Self { info, sort_id }
25    }
26
27    /// return the locale id part of the LCID (the specification here uses ambiguous terms)
28    pub fn lcid(&self) -> u16 {
29        (self.info & 0xffff) as u16
30    }
31
32    pub fn sort_id(&self) -> u8 {
33        self.sort_id
34    }
35
36    pub fn info(&self) -> u32 {
37        self.info
38    }
39
40    /// return an encoding for a given collation
41    pub fn encoding(&self) -> crate::Result<&'static Encoding> {
42        let res = if self.sort_id == 0 {
43            lcid_to_encoding(self.lcid())
44        } else {
45            sortid_to_encoding(self.sort_id)
46        };
47
48        res.ok_or_else(|| {
49            Error::Encoding(
50                format!(
51                    "encoding: unspported encoding (LCID: {:#02x}, sort ID: {})",
52                    self.lcid(),
53                    self.sort_id(),
54                )
55                .into(),
56            )
57        })
58    }
59}
60
61impl fmt::Display for Collation {
62    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
63        match self.encoding() {
64            Ok(encoding) => write!(f, "{}", encoding.name()),
65            _ => write!(f, "None"),
66        }
67    }
68}
69
70/// https://github.com/Microsoft/mssql-jdbc/blob/eb14f63077c47ef1fc1c690deb8cfab602baeb85/src/main/java/com/microsoft/sqlserver/jdbc/SQLCollation.java#L102-L310
71/// maps an LCID (it's locale part which is only 2 bytes) to a codepage
72///
73/// generate the code below from source code:
74/// 1. (regex)replace: (.*?)\((.*?),(.*?)\) with $2 => $3
75/// 2. replace: Encoding.CP(.*?) with encoding::all::WINDOWS_$1
76/// 3. replace: Encoding.UNICODE with encoding::all::UTF16_LE
77//
78/// the unimplemented!() one's are not supported by rust-encoding
79pub fn lcid_to_encoding(locale: u16) -> Option<&'static Encoding> {
80    match locale {
81        0x0401 => Some(encoding_rs::WINDOWS_1256),
82        0x0402 => Some(encoding_rs::WINDOWS_1251),
83        0x0403 => Some(encoding_rs::WINDOWS_1252),
84        // CP950
85        0x0404 | 0x0c04 | 0x1404 => Some(encoding_rs::BIG5),
86        0x0405 => Some(encoding_rs::WINDOWS_1250),
87        0x0406 => Some(encoding_rs::WINDOWS_1252),
88        0x0407 => Some(encoding_rs::WINDOWS_1252),
89        0x0408 => Some(encoding_rs::WINDOWS_1253),
90        0x0409 => Some(encoding_rs::WINDOWS_1252),
91        0x040a => Some(encoding_rs::WINDOWS_1252),
92        0x040b => Some(encoding_rs::WINDOWS_1252),
93        0x040c => Some(encoding_rs::WINDOWS_1252),
94        0x040d => Some(encoding_rs::WINDOWS_1255),
95        0x040e => Some(encoding_rs::WINDOWS_1250),
96        0x040f => Some(encoding_rs::WINDOWS_1252),
97        0x0410 => Some(encoding_rs::WINDOWS_1252),
98        // CP932
99        0x0411 => Some(encoding_rs::SHIFT_JIS),
100        0x0412 => Some(encoding_rs::EUC_KR),
101        0x0413 => Some(encoding_rs::WINDOWS_1252),
102        0x0414 => Some(encoding_rs::WINDOWS_1252),
103        0x0415 => Some(encoding_rs::WINDOWS_1250),
104        0x0416 => Some(encoding_rs::WINDOWS_1252),
105        0x0417 => Some(encoding_rs::WINDOWS_1252),
106        0x0418 => Some(encoding_rs::WINDOWS_1250),
107        0x0419 => Some(encoding_rs::WINDOWS_1251),
108        0x041a => Some(encoding_rs::WINDOWS_1250),
109        0x041b => Some(encoding_rs::WINDOWS_1250),
110        0x041c => Some(encoding_rs::WINDOWS_1250),
111        0x041d => Some(encoding_rs::WINDOWS_1252),
112        0x041e => Some(encoding_rs::WINDOWS_874),
113        0x041f => Some(encoding_rs::WINDOWS_1254),
114        0x0420 => Some(encoding_rs::WINDOWS_1256),
115        0x0421 => Some(encoding_rs::WINDOWS_1252),
116        0x0422 => Some(encoding_rs::WINDOWS_1251),
117        0x0423 => Some(encoding_rs::WINDOWS_1251),
118        0x0424 => Some(encoding_rs::WINDOWS_1250),
119        0x0425 => Some(encoding_rs::WINDOWS_1257),
120        0x0426 => Some(encoding_rs::WINDOWS_1257),
121        0x0427 => Some(encoding_rs::WINDOWS_1257),
122        0x0428 => Some(encoding_rs::WINDOWS_1251),
123        0x0429 => Some(encoding_rs::WINDOWS_1256),
124        0x042a => Some(encoding_rs::WINDOWS_1258),
125        0x042b => Some(encoding_rs::WINDOWS_1252),
126        0x042c => Some(encoding_rs::WINDOWS_1254),
127        0x042d => Some(encoding_rs::WINDOWS_1252),
128        0x042e => Some(encoding_rs::WINDOWS_1252),
129        0x042f => Some(encoding_rs::WINDOWS_1251),
130        0x0432 => Some(encoding_rs::WINDOWS_1252),
131        0x0434 => Some(encoding_rs::WINDOWS_1252),
132        0x0435 => Some(encoding_rs::WINDOWS_1252),
133        0x0436 => Some(encoding_rs::WINDOWS_1252),
134        0x0437 => Some(encoding_rs::WINDOWS_1252),
135        0x0438 => Some(encoding_rs::WINDOWS_1252),
136        0x0439 => Some(encoding_rs::UTF_16LE),
137        0x043a => Some(encoding_rs::UTF_16LE),
138        0x043b => Some(encoding_rs::WINDOWS_1252),
139        0x043e => Some(encoding_rs::WINDOWS_1252),
140        0x043f => Some(encoding_rs::WINDOWS_1251),
141        0x0440 => Some(encoding_rs::WINDOWS_1251),
142        0x0441 => Some(encoding_rs::WINDOWS_1252),
143        0x0442 => Some(encoding_rs::WINDOWS_1250),
144        0x0443 => Some(encoding_rs::WINDOWS_1254),
145        0x0444 => Some(encoding_rs::WINDOWS_1251),
146        0x0445 => Some(encoding_rs::UTF_16LE),
147        0x0446 => Some(encoding_rs::UTF_16LE),
148        0x0447 => Some(encoding_rs::UTF_16LE),
149        0x0448 => Some(encoding_rs::UTF_16LE),
150        0x0449 => Some(encoding_rs::UTF_16LE),
151        0x044a => Some(encoding_rs::UTF_16LE),
152        0x044b => Some(encoding_rs::UTF_16LE),
153        0x044c => Some(encoding_rs::UTF_16LE),
154        0x044d => Some(encoding_rs::UTF_16LE),
155        0x044e => Some(encoding_rs::UTF_16LE),
156        0x044f => Some(encoding_rs::UTF_16LE),
157        0x0450 => Some(encoding_rs::WINDOWS_1251),
158        0x0451 => Some(encoding_rs::UTF_16LE),
159        0x0452 => Some(encoding_rs::WINDOWS_1252),
160        0x0453 => Some(encoding_rs::UTF_16LE),
161        0x0454 => Some(encoding_rs::UTF_16LE),
162        0x0456 => Some(encoding_rs::WINDOWS_1252),
163        0x0457 => Some(encoding_rs::UTF_16LE),
164        0x045a => Some(encoding_rs::UTF_16LE),
165        0x045b => Some(encoding_rs::UTF_16LE),
166        0x045d => Some(encoding_rs::WINDOWS_1252),
167        0x045e => Some(encoding_rs::WINDOWS_1252),
168        0x0461 => Some(encoding_rs::UTF_16LE),
169        0x0462 => Some(encoding_rs::WINDOWS_1252),
170        0x0463 => Some(encoding_rs::UTF_16LE),
171        0x0464 => Some(encoding_rs::WINDOWS_1252),
172        0x0465 => Some(encoding_rs::UTF_16LE),
173        0x0468 => Some(encoding_rs::WINDOWS_1252),
174        0x046a => Some(encoding_rs::WINDOWS_1252),
175        0x046b => Some(encoding_rs::WINDOWS_1252),
176        0x046c => Some(encoding_rs::WINDOWS_1252),
177        0x046d => Some(encoding_rs::WINDOWS_1251),
178        0x046e => Some(encoding_rs::WINDOWS_1252),
179        0x046f => Some(encoding_rs::WINDOWS_1252),
180        0x0470 => Some(encoding_rs::WINDOWS_1252),
181        0x0478 => Some(encoding_rs::WINDOWS_1252),
182        0x047a => Some(encoding_rs::WINDOWS_1252),
183        0x047c => Some(encoding_rs::WINDOWS_1252),
184        0x047e => Some(encoding_rs::WINDOWS_1252),
185        0x0480 => Some(encoding_rs::WINDOWS_1256),
186        0x0481 => Some(encoding_rs::UTF_16LE),
187        0x0482 => Some(encoding_rs::WINDOWS_1252),
188        0x0483 => Some(encoding_rs::WINDOWS_1252),
189        0x0484 => Some(encoding_rs::WINDOWS_1252),
190        0x0485 => Some(encoding_rs::WINDOWS_1251),
191        0x0486 => Some(encoding_rs::WINDOWS_1252),
192        0x0487 => Some(encoding_rs::WINDOWS_1252),
193        0x0488 => Some(encoding_rs::WINDOWS_1252),
194        0x048c => Some(encoding_rs::WINDOWS_1256),
195        0x0801 => Some(encoding_rs::WINDOWS_1256),
196        // CP936
197        0x0804 | 0x1004 => Some(encoding_rs::GB18030),
198        0x0807 => Some(encoding_rs::WINDOWS_1252),
199        0x0809 => Some(encoding_rs::WINDOWS_1252),
200        0x080a => Some(encoding_rs::WINDOWS_1252),
201        0x080c => Some(encoding_rs::WINDOWS_1252),
202        0x0810 => Some(encoding_rs::WINDOWS_1252),
203        0x0813 => Some(encoding_rs::WINDOWS_1252),
204        0x0814 => Some(encoding_rs::WINDOWS_1252),
205        0x0816 => Some(encoding_rs::WINDOWS_1252),
206        0x081a => Some(encoding_rs::WINDOWS_1250),
207        0x081d => Some(encoding_rs::WINDOWS_1252),
208        0x0827 => Some(encoding_rs::WINDOWS_1257),
209        0x082c => Some(encoding_rs::WINDOWS_1251),
210        0x082e => Some(encoding_rs::WINDOWS_1252),
211        0x083b => Some(encoding_rs::WINDOWS_1252),
212        0x083c => Some(encoding_rs::WINDOWS_1252),
213        0x083e => Some(encoding_rs::WINDOWS_1252),
214        0x0843 => Some(encoding_rs::WINDOWS_1251),
215        0x0845 => Some(encoding_rs::UTF_16LE),
216        0x0850 => Some(encoding_rs::WINDOWS_1251),
217        0x085d => Some(encoding_rs::WINDOWS_1252),
218        0x085f => Some(encoding_rs::WINDOWS_1252),
219        0x086b => Some(encoding_rs::WINDOWS_1252),
220        0x0c01 => Some(encoding_rs::WINDOWS_1256),
221        0x0c07 => Some(encoding_rs::WINDOWS_1252),
222        0x0c09 => Some(encoding_rs::WINDOWS_1252),
223        0x0c0a => Some(encoding_rs::WINDOWS_1252),
224        0x0c0c => Some(encoding_rs::WINDOWS_1252),
225        0x0c1a => Some(encoding_rs::WINDOWS_1251),
226        0x0c3b => Some(encoding_rs::WINDOWS_1252),
227        0x0c6b => Some(encoding_rs::WINDOWS_1252),
228        0x1001 => Some(encoding_rs::WINDOWS_1256),
229        0x1007 => Some(encoding_rs::WINDOWS_1252),
230        0x1009 => Some(encoding_rs::WINDOWS_1252),
231        0x100a => Some(encoding_rs::WINDOWS_1252),
232        0x100c => Some(encoding_rs::WINDOWS_1252),
233        0x101a => Some(encoding_rs::WINDOWS_1250),
234        0x103b => Some(encoding_rs::WINDOWS_1252),
235        0x1401 => Some(encoding_rs::WINDOWS_1256),
236        0x1407 => Some(encoding_rs::WINDOWS_1252),
237        0x1409 => Some(encoding_rs::WINDOWS_1252),
238        0x140a => Some(encoding_rs::WINDOWS_1252),
239        0x140c => Some(encoding_rs::WINDOWS_1252),
240        0x141a => Some(encoding_rs::WINDOWS_1250),
241        0x143b => Some(encoding_rs::WINDOWS_1252),
242        0x1801 => Some(encoding_rs::WINDOWS_1256),
243        0x1809 => Some(encoding_rs::WINDOWS_1252),
244        0x180a => Some(encoding_rs::WINDOWS_1252),
245        0x180c => Some(encoding_rs::WINDOWS_1252),
246        0x181a => Some(encoding_rs::WINDOWS_1250),
247        0x183b => Some(encoding_rs::WINDOWS_1252),
248        0x1c01 => Some(encoding_rs::WINDOWS_1256),
249        0x1c09 => Some(encoding_rs::WINDOWS_1252),
250        0x1c0a => Some(encoding_rs::WINDOWS_1252),
251        0x1c1a => Some(encoding_rs::WINDOWS_1251),
252        0x1c3b => Some(encoding_rs::WINDOWS_1252),
253        0x2001 => Some(encoding_rs::WINDOWS_1256),
254        0x2009 => Some(encoding_rs::WINDOWS_1252),
255        0x200a => Some(encoding_rs::WINDOWS_1252),
256        0x201a => Some(encoding_rs::WINDOWS_1251),
257        0x203b => Some(encoding_rs::WINDOWS_1252),
258        0x2401 => Some(encoding_rs::WINDOWS_1256),
259        0x2409 => Some(encoding_rs::WINDOWS_1252),
260        0x240a => Some(encoding_rs::WINDOWS_1252),
261        0x243b => Some(encoding_rs::WINDOWS_1252),
262        0x2801 => Some(encoding_rs::WINDOWS_1256),
263        0x2809 => Some(encoding_rs::WINDOWS_1252),
264        0x280a => Some(encoding_rs::WINDOWS_1252),
265        0x2c01 => Some(encoding_rs::WINDOWS_1256),
266        0x2c09 => Some(encoding_rs::WINDOWS_1252),
267        0x2c0a => Some(encoding_rs::WINDOWS_1252),
268        0x3001 => Some(encoding_rs::WINDOWS_1256),
269        0x3009 => Some(encoding_rs::WINDOWS_1252),
270        0x300a => Some(encoding_rs::WINDOWS_1252),
271        0x3401 => Some(encoding_rs::WINDOWS_1256),
272        0x3409 => Some(encoding_rs::WINDOWS_1252),
273        0x340a => Some(encoding_rs::WINDOWS_1252),
274        0x3801 => Some(encoding_rs::WINDOWS_1256),
275        0x380a => Some(encoding_rs::WINDOWS_1252),
276        0x3c01 => Some(encoding_rs::WINDOWS_1256),
277        0x3c0a => Some(encoding_rs::WINDOWS_1252),
278        0x4001 => Some(encoding_rs::WINDOWS_1256),
279        0x4009 => Some(encoding_rs::WINDOWS_1252),
280        0x400a => Some(encoding_rs::WINDOWS_1252),
281        0x4409 => Some(encoding_rs::WINDOWS_1252),
282        0x440a => Some(encoding_rs::WINDOWS_1252),
283        0x4809 => Some(encoding_rs::WINDOWS_1252),
284        0x480a => Some(encoding_rs::WINDOWS_1252),
285        0x4c0a => Some(encoding_rs::WINDOWS_1252),
286        0x500a => Some(encoding_rs::WINDOWS_1252),
287        0x540a => Some(encoding_rs::WINDOWS_1252),
288        _ => None,
289    }
290}
291
292/// [1] https://github.com/Microsoft/mssql-jdbc/blob/eb14f63077c47ef1fc1c690deb8cfab602baeb85/src/main/java/com/microsoft/sqlserver/jdbc/SQLCollation.java#L362-L482
293/// [2] https://msdn.microsoft.com/de-de/library/ms144250(v=sql.105).aspx
294///
295/// [2] does only contain 3/4 of the content [1] contains, so the source code is again the better source of information
296///
297/// generate the code below from source code:
298/// 1. (regex)replace .*\((.*?),.*?,(.*?)\) with $1 => $2
299/// 2. see above/as above
300pub fn sortid_to_encoding(sort_id: u8) -> Option<&'static Encoding> {
301    match sort_id {
302        // 30 | 31 | 32 | 33 | 34 | 35 => Some(encoding_rs::WINDOWS_437),
303        // 40 | 41 | 42 | 43 | 44 | 45 | 49 => Some(encoding_rs::WINDOWS_850),
304        50 => Some(encoding_rs::WINDOWS_1252),
305        51 => Some(encoding_rs::WINDOWS_1252),
306        52 => Some(encoding_rs::WINDOWS_1252),
307        53 => Some(encoding_rs::WINDOWS_1252),
308        54 => Some(encoding_rs::WINDOWS_1252),
309        // 55 | 56 | 57 | 58 | 59 | 60 | 61 => Some(encoding_rs::WINDOWS_850),
310        71 => Some(encoding_rs::WINDOWS_1252),
311        72 => Some(encoding_rs::WINDOWS_1252),
312        73 => Some(encoding_rs::WINDOWS_1252),
313        74 => Some(encoding_rs::WINDOWS_1252),
314        75 => Some(encoding_rs::WINDOWS_1252),
315        80 => Some(encoding_rs::WINDOWS_1250),
316        81 => Some(encoding_rs::WINDOWS_1250),
317        82 => Some(encoding_rs::WINDOWS_1250),
318        83 => Some(encoding_rs::WINDOWS_1250),
319        84 => Some(encoding_rs::WINDOWS_1250),
320        85 => Some(encoding_rs::WINDOWS_1250),
321        86 => Some(encoding_rs::WINDOWS_1250),
322        87 => Some(encoding_rs::WINDOWS_1250),
323        88 => Some(encoding_rs::WINDOWS_1250),
324        89 => Some(encoding_rs::WINDOWS_1250),
325        90 => Some(encoding_rs::WINDOWS_1250),
326        91 => Some(encoding_rs::WINDOWS_1250),
327        92 => Some(encoding_rs::WINDOWS_1250),
328        93 => Some(encoding_rs::WINDOWS_1250),
329        94 => Some(encoding_rs::WINDOWS_1250),
330        95 => Some(encoding_rs::WINDOWS_1250),
331        96 => Some(encoding_rs::WINDOWS_1250),
332        97 => Some(encoding_rs::WINDOWS_1250),
333        98 => Some(encoding_rs::WINDOWS_1250),
334        104 => Some(encoding_rs::WINDOWS_1251),
335        105 => Some(encoding_rs::WINDOWS_1251),
336        106 => Some(encoding_rs::WINDOWS_1251),
337        107 => Some(encoding_rs::WINDOWS_1251),
338        108 => Some(encoding_rs::WINDOWS_1251),
339        112 => Some(encoding_rs::WINDOWS_1253),
340        113 => Some(encoding_rs::WINDOWS_1253),
341        114 => Some(encoding_rs::WINDOWS_1253),
342        120 => Some(encoding_rs::WINDOWS_1253),
343        121 => Some(encoding_rs::WINDOWS_1253),
344        122 => Some(encoding_rs::WINDOWS_1253),
345        124 => Some(encoding_rs::WINDOWS_1253),
346        128 => Some(encoding_rs::WINDOWS_1254),
347        129 => Some(encoding_rs::WINDOWS_1254),
348        130 => Some(encoding_rs::WINDOWS_1254),
349        136 => Some(encoding_rs::WINDOWS_1255),
350        137 => Some(encoding_rs::WINDOWS_1255),
351        138 => Some(encoding_rs::WINDOWS_1255),
352        144 => Some(encoding_rs::WINDOWS_1256),
353        145 => Some(encoding_rs::WINDOWS_1256),
354        146 => Some(encoding_rs::WINDOWS_1256),
355        152 => Some(encoding_rs::WINDOWS_1257),
356        153 => Some(encoding_rs::WINDOWS_1257),
357        154 => Some(encoding_rs::WINDOWS_1257),
358        155 => Some(encoding_rs::WINDOWS_1257),
359        156 => Some(encoding_rs::WINDOWS_1257),
360        157 => Some(encoding_rs::WINDOWS_1257),
361        158 => Some(encoding_rs::WINDOWS_1257),
362        159 => Some(encoding_rs::WINDOWS_1257),
363        160 => Some(encoding_rs::WINDOWS_1257),
364        183 => Some(encoding_rs::WINDOWS_1252),
365        184 => Some(encoding_rs::WINDOWS_1252),
366        185 => Some(encoding_rs::WINDOWS_1252),
367        186 => Some(encoding_rs::WINDOWS_1252),
368        // CP 932
369        192 | 193 | 200 => Some(encoding_rs::SHIFT_JIS),
370        194 => Some(encoding_rs::EUC_KR),
371        195 => Some(encoding_rs::EUC_KR),
372        // CP950
373        196 | 197 | 202 => Some(encoding_rs::BIG5),
374        // CP936 (GB18030 is an extension of it with more chars), should be backwards-compatible)
375        198 | 199 | 203 => Some(encoding_rs::GB18030),
376        201 => Some(encoding_rs::BIG5),
377        204 => Some(encoding_rs::WINDOWS_874),
378        205 => Some(encoding_rs::WINDOWS_874),
379        206 => Some(encoding_rs::WINDOWS_874),
380        210 => Some(encoding_rs::WINDOWS_1252),
381        211 => Some(encoding_rs::WINDOWS_1252),
382        212 => Some(encoding_rs::WINDOWS_1252),
383        213 => Some(encoding_rs::WINDOWS_1252),
384        214 => Some(encoding_rs::WINDOWS_1252),
385        215 => Some(encoding_rs::WINDOWS_1252),
386        216 => Some(encoding_rs::WINDOWS_1252),
387        217 => Some(encoding_rs::WINDOWS_1252),
388        _ => None,
389    }
390}
391
392/* TODO
393#[cfg(test)]
394mod tests {
395    use futures_state_stream::StateStream;
396    use tokio::executor::current_thread;
397    use crate::tests::new_connection;
398
399    #[test]
400    fn select_nvarchar_collation_test() {
401        let c1 = new_connection();
402        let query = c1.simple_query(
403            "select cast(cast(N'cześć' as nvarchar(5)) collate Polish_CI_AI as varchar(5))",
404        );
405        let mut i = 0;
406        {
407            let future = query.for_each(|x| {
408                let val: &str = x.get(0);
409                assert_eq!(val, "cześć");
410                i += 1;
411                Ok(())
412            });
413            current_thread::block_on_all(future).unwrap();
414        }
415        assert_eq!(i, 1);
416    }
417}
418*/