basic_toml/
tokens.rs

1use std::borrow::Cow;
2use std::char;
3use std::str;
4
/// A span, designating a range of bytes where a token is located.
///
/// Both fields are byte offsets; the range is half-open, i.e. `start..end`.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct Span {
    /// The start of the range.
    pub start: usize,
    /// The end of the range (exclusive).
    pub end: usize,
}
13
14impl From<Span> for (usize, usize) {
15    fn from(Span { start, end }: Span) -> (usize, usize) {
16        (start, end)
17    }
18}
19
/// A single lexical token produced by the tokenizer.
///
/// Variants that carry a `&'a str` borrow directly from the tokenizer input.
#[derive(Eq, PartialEq, Debug)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs.
    Whitespace(&'a str),
    /// A line ending (`\n`, or `\r\n` folded into one newline).
    Newline,
    /// A comment; the slice starts at the `#`.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string, either literal (`'`) or basic (`"`).
    String {
        /// The raw source text of the token, delimiters included.
        src: &'a str,
        /// The string's contents; owned only when they differ from the source text.
        val: Cow<'a, str>,
        /// Whether the string was written with triple delimiters.
        multiline: bool,
    },
}
43
/// Errors reported while tokenizing.
///
/// Every `usize` payload is a byte offset into the tokenizer input.
#[derive(Eq, PartialEq, Debug)]
pub enum Error {
    /// A character that is not allowed inside a string, and where it was found.
    InvalidCharInString(usize, char),
    /// An unrecognised character following a backslash in a basic string.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u`/`\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u`/`\U` escape whose numeric value is not a valid `char`.
    InvalidEscapeValue(usize, u32),
    /// A newline inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// End of input was reached inside a string; the offset is where the string started.
    UnterminatedString(usize),
    /// A newline inside a string used as a table key.
    NewlineInTableKey(usize),
    /// A multiline string used as a table key.
    MultilineStringKey(usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Where the unexpected token starts (the input length at end of input).
        at: usize,
        /// Description of what was required.
        expected: &'static str,
        /// Description of what was actually found (`"eof"` at end of input).
        found: &'static str,
    },
}
61
/// A tokenizer over a borrowed input string.
///
/// It is `Clone` so that lookahead can run on a throwaway copy of the cursor.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete input being tokenized.
    input: &'a str,
    /// The cursor: a character iterator that folds `\r\n` into `\n`.
    chars: CrlfFold<'a>,
}
67
/// A `(byte offset, char)` iterator that reports each `\r\n` pair as a single `'\n'`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying unfolded character iterator.
    chars: str::CharIndices<'a>,
}
72
/// The contents of a string token while it is being read.
#[derive(Debug)]
enum MaybeString {
    /// The contents are still a verbatim slice of the input starting at this byte
    /// offset; the end of the slice is supplied later.
    NotEscaped(usize),
    /// The contents differ from the source text and are accumulated here.
    Owned(String),
}
78
79impl<'a> Tokenizer<'a> {
80    pub fn new(input: &'a str) -> Tokenizer<'a> {
81        let mut t = Tokenizer {
82            input,
83            chars: CrlfFold {
84                chars: input.char_indices(),
85            },
86        };
87        // Eat utf-8 BOM
88        t.eatc('\u{feff}');
89        t
90    }
91
92    pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
93        let (start, token) = match self.one() {
94            Some((start, '\n')) => (start, Token::Newline),
95            Some((start, ' ' | '\t')) => (start, self.whitespace_token(start)),
96            Some((start, '#')) => (start, self.comment_token(start)),
97            Some((start, '=')) => (start, Token::Equals),
98            Some((start, '.')) => (start, Token::Period),
99            Some((start, ',')) => (start, Token::Comma),
100            Some((start, ':')) => (start, Token::Colon),
101            Some((start, '+')) => (start, Token::Plus),
102            Some((start, '{')) => (start, Token::LeftBrace),
103            Some((start, '}')) => (start, Token::RightBrace),
104            Some((start, '[')) => (start, Token::LeftBracket),
105            Some((start, ']')) => (start, Token::RightBracket),
106            Some((start, '\'')) => {
107                return self
108                    .literal_string(start)
109                    .map(|t| Some((self.step_span(start), t)))
110            }
111            Some((start, '"')) => {
112                return self
113                    .basic_string(start)
114                    .map(|t| Some((self.step_span(start), t)))
115            }
116            Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
117
118            Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
119            None => return Ok(None),
120        };
121
122        let span = self.step_span(start);
123        Ok(Some((span, token)))
124    }
125
126    pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
127        self.clone().next()
128    }
129
130    pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
131        self.eat_spanned(expected).map(|s| s.is_some())
132    }
133
134    /// Eat a value, returning it's span if it was consumed.
135    pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
136        let span = match self.peek()? {
137            Some((span, ref found)) if expected == *found => span,
138            Some(_) | None => return Ok(None),
139        };
140
141        drop(self.next());
142        Ok(Some(span))
143    }
144
145    pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
146        // ignore span
147        let _ = self.expect_spanned(expected)?;
148        Ok(())
149    }
150
151    /// Expect the given token returning its span.
152    pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
153        let current = self.current();
154        match self.next()? {
155            Some((span, found)) => {
156                if expected == found {
157                    Ok(span)
158                } else {
159                    Err(Error::Wanted {
160                        at: current,
161                        expected: expected.describe(),
162                        found: found.describe(),
163                    })
164                }
165            }
166            None => Err(Error::Wanted {
167                at: self.input.len(),
168                expected: expected.describe(),
169                found: "eof",
170            }),
171        }
172    }
173
174    pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
175        let current = self.current();
176        match self.next()? {
177            Some((span, Token::Keylike(k))) => Ok((span, k.into())),
178            Some((
179                span,
180                Token::String {
181                    src,
182                    val,
183                    multiline,
184                },
185            )) => {
186                let offset = self.substr_offset(src);
187                if multiline {
188                    return Err(Error::MultilineStringKey(offset));
189                }
190                match src.find('\n') {
191                    None => Ok((span, val)),
192                    Some(i) => Err(Error::NewlineInTableKey(offset + i)),
193                }
194            }
195            Some((_, other)) => Err(Error::Wanted {
196                at: current,
197                expected: "a table key",
198                found: other.describe(),
199            }),
200            None => Err(Error::Wanted {
201                at: self.input.len(),
202                expected: "a table key",
203                found: "eof",
204            }),
205        }
206    }
207
208    pub fn eat_whitespace(&mut self) {
209        while self.eatc(' ') || self.eatc('\t') {
210            // ...
211        }
212    }
213
214    pub fn eat_comment(&mut self) -> Result<bool, Error> {
215        if !self.eatc('#') {
216            return Ok(false);
217        }
218        drop(self.comment_token(0));
219        self.eat_newline_or_eof().map(|()| true)
220    }
221
222    pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
223        let current = self.current();
224        match self.next()? {
225            None | Some((_, Token::Newline)) => Ok(()),
226            Some((_, other)) => Err(Error::Wanted {
227                at: current,
228                expected: "newline",
229                found: other.describe(),
230            }),
231        }
232    }
233
234    pub fn skip_to_newline(&mut self) {
235        loop {
236            match self.one() {
237                Some((_, '\n')) | None => break,
238                _ => {}
239            }
240        }
241    }
242
243    fn eatc(&mut self, ch: char) -> bool {
244        match self.chars.clone().next() {
245            Some((_, ch2)) if ch == ch2 => {
246                self.one();
247                true
248            }
249            _ => false,
250        }
251    }
252
253    pub fn current(&mut self) -> usize {
254        match self.chars.clone().next() {
255            Some(i) => i.0,
256            None => self.input.len(),
257        }
258    }
259
260    fn whitespace_token(&mut self, start: usize) -> Token<'a> {
261        while self.eatc(' ') || self.eatc('\t') {
262            // ...
263        }
264        Token::Whitespace(&self.input[start..self.current()])
265    }
266
267    fn comment_token(&mut self, start: usize) -> Token<'a> {
268        while let Some((_, ch)) = self.chars.clone().next() {
269            if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
270                break;
271            }
272            self.one();
273        }
274        Token::Comment(&self.input[start..self.current()])
275    }
276
    /// Reads a string token whose opening delimiter (at byte offset `start`) has
    /// already been consumed.
    ///
    /// * `delim` is the quote character; both callers in this file pass a one-byte
    ///   ASCII quote, which the `start + 2` and `i += 1` arithmetic below relies on.
    /// * `new_ch` is invoked for every character that is neither a newline nor the
    ///   delimiter. It receives the tokenizer, the value accumulated so far, the
    ///   multiline flag, and the character with its byte offset.
    ///
    /// # Errors
    ///
    /// * `Error::NewlineInString` for a newline inside a single-line string.
    /// * `Error::UnterminatedString(start)` if the input ends before the closing
    ///   delimiter.
    /// * Any error returned by `new_ch`.
    fn read_string(
        &mut self,
        delim: char,
        start: usize,
        new_ch: &mut dyn FnMut(
            &mut Tokenizer,
            &mut MaybeString,
            bool,
            usize,
            char,
        ) -> Result<(), Error>,
    ) -> Result<Token<'a>, Error> {
        let mut multiline = false;
        // A second delimiter right after the first is either the end of an empty
        // string or, when followed by a third, the opening of a multiline string.
        if self.eatc(delim) {
            if self.eatc(delim) {
                multiline = true;
            } else {
                return Ok(Token::String {
                    src: &self.input[start..start + 2],
                    val: Cow::Borrowed(""),
                    multiline: false,
                });
            }
        }
        // Start out borrowing from the input; switch to an owned buffer only when
        // the value stops being a verbatim slice of the source.
        let mut val = MaybeString::NotEscaped(self.current());
        // Number of loop iterations so far; `n == 1` identifies the very first
        // character read after the opening delimiter(s).
        let mut n = 0;
        loop {
            n += 1;
            match self.one() {
                Some((i, '\n')) => {
                    if multiline {
                        // The newline was reported at a `\r` byte, so it was a folded
                        // `\r\n`: the value can no longer be borrowed verbatim.
                        if self.input.as_bytes()[i] == b'\r' {
                            val.make_owned(&self.input[..i]);
                        }
                        if n == 1 {
                            // A newline directly after the opening delimiter is not
                            // part of the value: restart the value after it.
                            val = MaybeString::NotEscaped(self.current());
                        } else {
                            val.push('\n');
                        }
                    } else {
                        return Err(Error::NewlineInString(i));
                    }
                }
                Some((mut i, ch)) if ch == delim => {
                    if multiline {
                        // Fewer than three delimiters in a row are ordinary content.
                        if !self.eatc(delim) {
                            val.push(delim);
                            continue;
                        }
                        if !self.eatc(delim) {
                            val.push(delim);
                            val.push(delim);
                            continue;
                        }
                        // Up to two further delimiters are accepted here and count as
                        // content; `i` is advanced so a borrowed value includes them.
                        if self.eatc(delim) {
                            val.push(delim);
                            i += 1;
                        }
                        if self.eatc(delim) {
                            val.push(delim);
                            i += 1;
                        }
                    }
                    return Ok(Token::String {
                        src: &self.input[start..self.current()],
                        val: val.into_cow(&self.input[..i]),
                        multiline,
                    });
                }
                Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
                None => return Err(Error::UnterminatedString(start)),
            }
        }
    }
351
352    fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
353        self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
354            if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') {
355                val.push(ch);
356                Ok(())
357            } else {
358                Err(Error::InvalidCharInString(i, ch))
359            }
360        })
361    }
362
    /// Reads a basic (double-quoted) string whose opening quote at `start` has
    /// already been consumed, processing backslash escapes.
    ///
    /// # Errors
    ///
    /// * `Error::InvalidEscape` for an unknown escape, or for a backslash followed
    ///   by whitespace that is not a line continuation.
    /// * `Error::InvalidHexEscape` / `Error::InvalidEscapeValue` for malformed
    ///   `\u` / `\U` escapes.
    /// * `Error::InvalidCharInString` for a disallowed raw character.
    /// * `Error::UnterminatedString(start)` if the input ends inside an escape.
    fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
        self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
            '\\' => {
                // An escape means the value differs from the source text: keep what
                // was read up to the backslash in an owned buffer from here on.
                val.make_owned(&me.input[..i]);
                match me.chars.next() {
                    Some((_, '"')) => val.push('"'),
                    Some((_, '\\')) => val.push('\\'),
                    Some((_, 'b')) => val.push('\u{8}'),
                    Some((_, 'f')) => val.push('\u{c}'),
                    Some((_, 'n')) => val.push('\n'),
                    Some((_, 'r')) => val.push('\r'),
                    Some((_, 't')) => val.push('\t'),
                    Some((i, c @ ('u' | 'U'))) => {
                        // `\u` takes four hex digits, `\U` takes eight.
                        let len = if c == 'u' { 4 } else { 8 };
                        val.push(me.hex(start, i, len)?);
                    }
                    // Multiline only: a backslash followed by whitespace or a newline.
                    Some((i, c @ (' ' | '\t' | '\n'))) if multi => {
                        if c != '\n' {
                            // Only spaces and tabs may sit between the backslash and
                            // the newline that ends the line.
                            while let Some((_, ch)) = me.chars.clone().next() {
                                match ch {
                                    ' ' | '\t' => {
                                        me.chars.next();
                                        continue;
                                    }
                                    '\n' => {
                                        me.chars.next();
                                        break;
                                    }
                                    _ => return Err(Error::InvalidEscape(i, c)),
                                }
                            }
                        }
                        // Then drop all whitespace and newlines up to the next
                        // other character; none of it becomes part of the value.
                        while let Some((_, ch)) = me.chars.clone().next() {
                            match ch {
                                ' ' | '\t' | '\n' => {
                                    me.chars.next();
                                }
                                _ => break,
                            }
                        }
                    }
                    Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
                    None => return Err(Error::UnterminatedString(start)),
                }
                Ok(())
            }
            // Raw characters: tab, or anything from U+0020 upwards except U+007F.
            ch if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') => {
                val.push(ch);
                Ok(())
            }
            _ => Err(Error::InvalidCharInString(i, ch)),
        })
    }
416
417    fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
418        let mut buf = String::with_capacity(len);
419        for _ in 0..len {
420            match self.one() {
421                Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => buf.push(ch),
422                Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
423                None => return Err(Error::UnterminatedString(start)),
424            }
425        }
426        let val = u32::from_str_radix(&buf, 16).unwrap();
427        match char::from_u32(val) {
428            Some(ch) => Ok(ch),
429            None => Err(Error::InvalidEscapeValue(i, val)),
430        }
431    }
432
433    fn keylike(&mut self, start: usize) -> Token<'a> {
434        while let Some((_, ch)) = self.peek_one() {
435            if !is_keylike(ch) {
436                break;
437            }
438            self.one();
439        }
440        Token::Keylike(&self.input[start..self.current()])
441    }
442
443    pub fn substr_offset(&self, s: &'a str) -> usize {
444        assert!(s.len() <= self.input.len());
445        let a = self.input.as_ptr() as usize;
446        let b = s.as_ptr() as usize;
447        assert!(a <= b);
448        b - a
449    }
450
451    /// Calculate the span of a single character.
452    fn step_span(&mut self, start: usize) -> Span {
453        let end = match self.peek_one() {
454            Some(t) => t.0,
455            None => self.input.len(),
456        };
457        Span { start, end }
458    }
459
460    /// Peek one char without consuming it.
461    fn peek_one(&mut self) -> Option<(usize, char)> {
462        self.chars.clone().next()
463    }
464
    /// Take one char.
    ///
    /// Advances the cursor and returns the consumed character with its byte offset,
    /// or `None` at end of input. A `\r\n` pair comes back as a single `'\n'`
    /// carrying the offset of the `'\r'`.
    pub fn one(&mut self) -> Option<(usize, char)> {
        self.chars.next()
    }
469}
470
471impl<'a> Iterator for CrlfFold<'a> {
472    type Item = (usize, char);
473
474    fn next(&mut self) -> Option<(usize, char)> {
475        self.chars.next().map(|(i, c)| {
476            if c == '\r' {
477                let mut attempt = self.chars.clone();
478                if let Some((_, '\n')) = attempt.next() {
479                    self.chars = attempt;
480                    return (i, '\n');
481                }
482            }
483            (i, c)
484        })
485    }
486}
487
488impl MaybeString {
489    fn push(&mut self, ch: char) {
490        match *self {
491            MaybeString::NotEscaped(..) => {}
492            MaybeString::Owned(ref mut s) => s.push(ch),
493        }
494    }
495
496    fn make_owned(&mut self, input: &str) {
497        match *self {
498            MaybeString::NotEscaped(start) => {
499                *self = MaybeString::Owned(input[start..].to_owned());
500            }
501            MaybeString::Owned(..) => {}
502        }
503    }
504
505    fn into_cow(self, input: &str) -> Cow<str> {
506        match self {
507            MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
508            MaybeString::Owned(s) => Cow::Owned(s),
509        }
510    }
511}
512
/// Returns whether `ch` may appear in a bare key-like token: an ASCII letter,
/// an ASCII digit, `-` or `_`.
fn is_keylike(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_')
}
520
521impl<'a> Token<'a> {
522    pub fn describe(&self) -> &'static str {
523        match *self {
524            Token::Keylike(_) => "an identifier",
525            Token::Equals => "an equals",
526            Token::Period => "a period",
527            Token::Comment(_) => "a comment",
528            Token::Newline => "a newline",
529            Token::Whitespace(_) => "whitespace",
530            Token::Comma => "a comma",
531            Token::RightBrace => "a right brace",
532            Token::LeftBrace => "a left brace",
533            Token::RightBracket => "a right bracket",
534            Token::LeftBracket => "a left bracket",
535            Token::String { multiline, .. } => {
536                if multiline {
537                    "a multiline string"
538                } else {
539                    "a string"
540                }
541            }
542            Token::Colon => "a colon",
543            Token::Plus => "a plus",
544        }
545    }
546}