extern crate alloc;
use std::error::Error;
use std::{char, fmt};
use mz_ore::lex::LexBuf;
use mz_ore::str::{MaxLenString, StrExt};
use serde::{Deserialize, Serialize};
use crate::keywords::Keyword;
/// The maximum length of an identifier, in bytes. Identifiers that exceed
/// this limit are rejected during lexing.
pub const MAX_IDENTIFIER_LENGTH: usize = 255;
/// A string type whose length is statically capped at
/// [`MAX_IDENTIFIER_LENGTH`] bytes.
pub type IdentString = MaxLenString<MAX_IDENTIFIER_LENGTH>;
/// An error that occurred while lexing an input string.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct LexerError {
    /// The error message.
    pub message: String,
    /// The byte offset into the input at which the error occurred.
    pub pos: usize,
}
impl fmt::Display for LexerError {
    /// Renders only the error message; the position is carried separately
    /// in [`LexerError::pos`].
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", self.message)
    }
}
// `LexerError` has no underlying source error, so the default trait methods
// suffice.
impl Error for LexerError {}
impl LexerError {
    /// Constructs a new `LexerError` at byte offset `pos` with the given
    /// message.
    pub(crate) fn new<S>(pos: usize, message: S) -> LexerError
    where
        S: Into<String>,
    {
        let message = message.into();
        LexerError { pos, message }
    }
}
/// A token produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A recognized SQL keyword.
    Keyword(Keyword),
    /// An identifier. Unquoted identifiers are lowercased during lexing;
    /// length is capped at `MAX_IDENTIFIER_LENGTH` bytes.
    Ident(IdentString),
    /// A string literal, with quote doubling and (for extended strings)
    /// backslash escapes already processed.
    String(String),
    /// A hex string literal (`x'...'`). The contents are not decoded here.
    HexString(String),
    /// A numeric literal, stored as written (exponents normalized to `E`).
    Number(String),
    /// A positional parameter, e.g. `$1`.
    Parameter(usize),
    /// An operator built from symbol characters, e.g. `<>` or `||`.
    Op(String),
    /// `*`
    Star,
    /// `=`
    Eq,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `;`
    Semicolon,
    /// `=>`
    Arrow,
}
impl fmt::Display for Token {
    /// Renders a human-readable description of the token, suitable for use
    /// in error messages.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Variants carrying data format themselves directly and return
        // early; unit variants map to a fixed description written below.
        let description = match self {
            Token::Keyword(kw) => return f.write_str(kw.as_str()),
            Token::Ident(id) => return write!(f, "identifier {}", id.quoted()),
            Token::String(s) => return write!(f, "string literal {}", s.quoted()),
            Token::HexString(s) => return write!(f, "hex string literal {}", s.quoted()),
            Token::Number(n) => return write!(f, "number \"{}\"", n),
            Token::Parameter(n) => return write!(f, "parameter \"${}\"", n),
            Token::Op(op) => return write!(f, "operator {}", op.quoted()),
            Token::Star => "star",
            Token::Eq => "equals sign",
            Token::LParen => "left parenthesis",
            Token::RParen => "right parenthesis",
            Token::LBracket => "left square bracket",
            Token::RBracket => "right square bracket",
            Token::Dot => "dot",
            Token::Comma => "comma",
            Token::Colon => "colon",
            Token::DoubleColon => "double colon",
            Token::Semicolon => "semicolon",
            Token::Arrow => "arrow",
        };
        f.write_str(description)
    }
}
/// A [`Token`] annotated with its position in the input.
pub struct PosToken {
    /// The token itself.
    pub kind: Token,
    /// The byte offset of the token's first character within the query;
    /// always falls on a `char` boundary.
    pub offset: usize,
}
/// Returns early from the enclosing function with a `LexerError` at `$pos`,
/// building the message from `format!`-style arguments.
macro_rules! bail {
    ($pos:expr, $($fmt:expr),*) => {
        return Err(LexerError::new($pos, format!($($fmt),*)))
    }
}
/// Lexes `query` into a sequence of tokens annotated with byte offsets.
///
/// Whitespace and comments are discarded. Returns a [`LexerError`] locating
/// the first invalid construct, if any.
pub fn lex(query: &str) -> Result<Vec<PosToken>, LexerError> {
    let buf = &mut LexBuf::new(query);
    let mut tokens = vec![];
    while let Some(ch) = buf.next() {
        // Byte offset of `ch` itself; `buf.pos()` has already advanced past it.
        let pos = buf.pos() - ch.len_utf8();
        // NOTE: arm order is load-bearing. Guarded arms (hex/extended string
        // prefixes, `.5`-style numbers, `$1` parameters, `::`) must be tried
        // before the more general ident, dot, dollar-string, and colon arms
        // below them.
        let token = match ch {
            _ if ch.is_ascii_whitespace() => continue,
            // `--` starts a line comment.
            '-' if buf.consume('-') => {
                lex_line_comment(buf);
                continue;
            }
            // `/*` starts a (possibly nested) multiline comment.
            '/' if buf.consume('*') => {
                lex_multiline_comment(buf)?;
                continue;
            }
            // Standard quoted string.
            '\'' => Token::String(lex_string(buf)?),
            // Hex string literal, e.g. `x'1f'`.
            'x' | 'X' if buf.consume('\'') => Token::HexString(lex_string(buf)?),
            // Extended string with backslash escapes, e.g. `e'\n'`.
            'e' | 'E' if buf.consume('\'') => lex_extended_string(buf)?,
            // Bare identifier or keyword; any char >= U+0080 may start one.
            'A'..='Z' | 'a'..='z' | '_' | '\u{80}'..=char::MAX => lex_ident(buf)?,
            '"' => lex_quoted_ident(buf)?,
            '0'..='9' => lex_number(buf)?,
            // Number with a leading dot, e.g. `.5`.
            '.' if matches!(buf.peek(), Some('0'..='9')) => lex_number(buf)?,
            // Positional parameter, e.g. `$1`.
            '$' if matches!(buf.peek(), Some('0'..='9')) => lex_parameter(buf)?,
            // Dollar-quoted string, e.g. `$tag$...$tag$`.
            '$' => lex_dollar_string(buf)?,
            '(' => Token::LParen,
            ')' => Token::RParen,
            ',' => Token::Comma,
            '.' => Token::Dot,
            ':' if buf.consume(':') => Token::DoubleColon,
            ':' => Token::Colon,
            ';' => Token::Semicolon,
            '[' => Token::LBracket,
            ']' => Token::RBracket,
            #[rustfmt::skip]
            '+'|'-'|'*'|'/'|'<'|'>'|'='|'~'|'!'|'@'|'#'|'%'|'^'|'&'|'|'|'`'|'?' => lex_op(buf),
            _ => bail!(pos, "unexpected character in input: {}", ch),
        };
        tokens.push(PosToken {
            kind: token,
            offset: pos,
        })
    }
    // In debug builds, verify every recorded offset lands on a char boundary
    // of `query`, so downstream slicing cannot panic.
    #[cfg(debug_assertions)]
    for token in &tokens {
        assert!(query.is_char_boundary(token.offset));
    }
    Ok(tokens)
}
/// Consumes the body of a `--` line comment (everything up to the next
/// newline).
fn lex_line_comment(buf: &mut LexBuf) {
    buf.take_while(|ch| !matches!(ch, '\n'));
}
/// Consumes a `/* ... */` comment, honoring nested comments. Assumes the
/// opening `/*` has already been consumed.
fn lex_multiline_comment(buf: &mut LexBuf) -> Result<(), LexerError> {
    // Offset of the `/*` that opened the outermost comment.
    let pos = buf.pos() - 2;
    // Number of comments currently open; starts at 1 for the one we're in.
    let mut depth = 1;
    while let Some(ch) = buf.next() {
        if ch == '*' && buf.consume('/') {
            depth -= 1;
            if depth == 0 {
                return Ok(());
            }
        } else if ch == '/' && buf.consume('*') {
            depth += 1;
        }
    }
    bail!(pos, "unterminated multiline comment")
}
/// Lexes a keyword or bare identifier. The character that started the word
/// has already been consumed, so the buffer is backed up one step first.
fn lex_ident(buf: &mut LexBuf) -> Result<Token, LexerError> {
    buf.prev();
    let pos = buf.pos();
    let word = buf.take_while(
        |ch| matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '$' | '_' | '\u{80}'..=char::MAX),
    );
    // Keywords take precedence over identifiers.
    if let Ok(kw) = word.parse() {
        return Ok(Token::Keyword(kw));
    }
    // Bare identifiers are case-insensitive, so normalize to lowercase.
    match IdentString::new(word.to_lowercase()) {
        Ok(id) => Ok(Token::Ident(id)),
        Err(_) => bail!(
            pos,
            "identifier length exceeds {MAX_IDENTIFIER_LENGTH} bytes"
        ),
    }
}
/// Lexes a double-quoted identifier. Assumes the opening `"` has already
/// been consumed.
fn lex_quoted_ident(buf: &mut LexBuf) -> Result<Token, LexerError> {
    // Offset of the opening double quote.
    let pos = buf.pos() - 1;
    let mut name = String::new();
    loop {
        let Some(ch) = buf.next() else {
            bail!(pos, "unterminated quoted identifier")
        };
        match ch {
            // `""` is an escaped double quote; a lone `"` ends the identifier.
            '"' if buf.consume('"') => name.push('"'),
            '"' => break,
            '\0' => bail!(pos, "null character in quoted identifier"),
            other => name.push(other),
        }
    }
    match IdentString::new(name) {
        Ok(id) => Ok(Token::Ident(id)),
        Err(_) => bail!(
            pos,
            "identifier length exceeds {MAX_IDENTIFIER_LENGTH} bytes"
        ),
    }
}
/// Lexes the body of a standard `'...'` string literal, concatenating any
/// adjacent string literals that are separated by a newline.
fn lex_string(buf: &mut LexBuf) -> Result<String, LexerError> {
    let mut out = String::new();
    loop {
        // Offset of this segment's opening quote.
        let start = buf.pos() - 1;
        loop {
            let Some(ch) = buf.next() else {
                bail!(start, "unterminated quoted string")
            };
            if ch == '\'' {
                if buf.consume('\'') {
                    // `''` escapes a single quote.
                    out.push('\'');
                } else {
                    break;
                }
            } else {
                out.push(ch);
            }
        }
        if !lex_to_adjacent_string(buf) {
            return Ok(out);
        }
    }
}
/// Lexes the body of an extended string literal (`e'...'`), processing
/// backslash escapes, and concatenating adjacent literals separated by a
/// newline.
fn lex_extended_string(buf: &mut LexBuf) -> Result<Token, LexerError> {
    // Parses `n` hex digits as a Unicode code point. At this point
    // `buf.pos()` is just past the `u`/`U`, so `- 2` points back at the
    // backslash that began the escape.
    fn lex_unicode_escape(buf: &mut LexBuf, n: usize) -> Result<char, LexerError> {
        let pos = buf.pos() - 2;
        buf.next_n(n)
            .and_then(|s| u32::from_str_radix(s, 16).ok())
            .and_then(|codepoint| char::try_from(codepoint).ok())
            .ok_or_else(|| LexerError::new(pos, "invalid unicode escape"))
    }
    // Consumes the remaining octal digits, then reports that octal escapes
    // are unsupported.
    fn lex_octal_escape(buf: &mut LexBuf) -> LexerError {
        let pos = buf.pos() - 2;
        buf.take_while(|ch| matches!(ch, '0'..='7'));
        LexerError::new(pos, "octal escapes are not supported")
    }
    // Consumes the remaining hex digits, then reports that hexadecimal
    // escapes are unsupported.
    fn lex_hexadecimal_escape(buf: &mut LexBuf) -> LexerError {
        let pos = buf.pos() - 2;
        buf.take_while(|ch| matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f'));
        LexerError::new(pos, "hexadecimal escapes are not supported")
    }
    let mut s = String::new();
    loop {
        // Offset of this segment's opening quote.
        let pos = buf.pos() - 1;
        loop {
            match buf.next() {
                // `''` is an escaped single quote.
                Some('\'') if buf.consume('\'') => s.push('\''),
                // An unescaped quote ends this segment.
                Some('\'') => break,
                Some('\\') => match buf.next() {
                    Some('b') => s.push('\x08'), // backspace
                    Some('f') => s.push('\x0c'), // form feed
                    Some('n') => s.push('\n'),
                    Some('r') => s.push('\r'),
                    Some('t') => s.push('\t'),
                    Some('u') => s.push(lex_unicode_escape(buf, 4)?),
                    Some('U') => s.push(lex_unicode_escape(buf, 8)?),
                    Some('0'..='7') => return Err(lex_octal_escape(buf)),
                    Some('x') => return Err(lex_hexadecimal_escape(buf)),
                    // Any other escaped character stands for itself.
                    Some(c) => s.push(c),
                    None => bail!(pos, "unterminated quoted string"),
                },
                Some(c) => s.push(c),
                None => bail!(pos, "unterminated quoted string"),
            }
        }
        // Keep lexing if an adjacent string literal follows on a new line.
        if !lex_to_adjacent_string(buf) {
            return Ok(Token::String(s));
        }
    }
}
/// After a string literal ends, reports whether an adjacent string literal
/// follows: the intervening whitespace must contain a newline and be
/// followed by a `'`, which is then consumed.
fn lex_to_adjacent_string(buf: &mut LexBuf) -> bool {
    let ws = buf.take_while(|ch| ch.is_ascii_whitespace());
    if ws.contains('\n') || ws.contains('\r') {
        buf.consume('\'')
    } else {
        false
    }
}
fn lex_dollar_string(buf: &mut LexBuf) -> Result<Token, LexerError> {
let pos = buf.pos() - 1;
let tag = format!("${}$", buf.take_while(|ch| ch != '$'));
let _ = buf.next();
if let Some(s) = buf.take_to_delimiter(&tag) {
Ok(Token::String(s.into()))
} else {
Err(LexerError::new(pos, "unterminated dollar-quoted string"))
}
}
fn lex_parameter(buf: &mut LexBuf) -> Result<Token, LexerError> {
let pos = buf.pos() - 1;
let n = buf
.take_while(|ch| matches!(ch, '0'..='9'))
.parse()
.map_err(|_| LexerError::new(pos, "invalid parameter number"))?;
Ok(Token::Parameter(n))
}
/// Lexes a numeric literal: digits, an optional fractional part, and an
/// optional exponent (normalized to an uppercase `E`).
fn lex_number(buf: &mut LexBuf) -> Result<Token, LexerError> {
    // Un-consume the first digit (or leading `.`) so it is captured below.
    buf.prev();
    // Integral part; empty if the literal began with `.`.
    let mut s = buf.take_while(|ch| matches!(ch, '0'..='9')).to_owned();
    // Fractional part.
    if buf.consume('.') {
        s.push('.');
        s.push_str(buf.take_while(|ch| matches!(ch, '0'..='9')));
    }
    // Exponent.
    if buf.consume('e') || buf.consume('E') {
        s.push('E');
        // An explicit sign makes exponent digits mandatory; `-` is kept in
        // the output while `+` is dropped (redundant).
        let require_exp = if buf.consume('-') {
            s.push('-');
            true
        } else {
            buf.consume('+')
        };
        let exp = buf.take_while(|ch| matches!(ch, '0'..='9'));
        if require_exp && exp.is_empty() {
            return Err(LexerError::new(buf.pos() - 1, "missing required exponent"));
        } else if exp.is_empty() {
            // A bare `e` with no sign or digits is not an exponent: back it
            // out of the buffer (it may start an identifier) and drop the
            // `E` we pushed.
            buf.prev();
            s.pop();
        } else {
            s.push_str(exp);
        }
    }
    Ok(Token::Number(s))
}
/// Lexes an operator made of symbol characters. The first character has
/// already been consumed, so the buffer is backed up one step first.
fn lex_op(buf: &mut LexBuf) -> Token {
    buf.prev();
    // `=>` is special-cased as the arrow token.
    if buf.consume_str("=>") {
        return Token::Arrow;
    }
    let mut s = String::new();
    while let Some(ch) = buf.next() {
        match ch {
            // Stop before a `--` line comment...
            '-' if buf.peek() == Some('-') => {
                buf.prev();
                break;
            }
            // ...or a `/*` block comment.
            '/' if buf.peek() == Some('*') => {
                buf.prev();
                break;
            }
            #[rustfmt::skip]
            '+'|'-'|'*'|'/'|'<'|'>'|'='|'~'|'!'|'@'|'#'|'%'|'^'|'&'|'|'|'`'|'?' => s.push(ch),
            // Any non-operator character ends the operator.
            _ => {
                buf.prev();
                break;
            }
        }
    }
    // A multi-character operator may not end in `+` or `-` unless it also
    // contains one of `~ ! @ # % ^ & | ` ?` (mirroring PostgreSQL's operator
    // rule), so that e.g. `1+-2` lexes as `1`, `+`, `-2`. Trim trailing sign
    // characters back out of the buffer.
    if s.len() > 1
        && s.ends_with(&['-', '+'][..])
        && !s.contains(&['~', '!', '@', '#', '%', '^', '&', '|', '`', '?'][..])
    {
        while s.len() > 1 && s.ends_with(&['-', '+'][..]) {
            buf.prev();
            s.pop();
        }
    }
    match s.as_str() {
        "*" => Token::Star,
        "=" => Token::Eq,
        // `!=` is normalized to the standard `<>` spelling.
        "!=" => Token::Op("<>".into()),
        _ => Token::Op(s),
    }
}