use std::iter; use nom::{ IResult, Parser, branch::alt, multi::many0, bytes::complete::{tag, take_while1}, character::complete::multispace1, }; use crate::sexp::{SExp, SExp::*, SLeaf::*, util::*}; use crate::r#type::{Type::*}; #[derive(Debug,PartialEq,Clone)] pub enum Token { ParOpen, ParClose, Num(i32), Sym(String), Whitespace(String), } use std::fmt; use std::fmt::Display; impl Display for Token { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", match self { ParOpen => "(".to_string(), ParClose => ")".to_string(), Num(x) => x.to_string(), Sym(s) => s.to_string(), Whitespace(s) => s.to_string(), }) } } use Token::*; use crate::parse::util::*; fn parse_token(s: &str) -> IResult<&str, Token> { alt(( tag("(").map(|_| ParOpen), tag(")").map(|_| ParClose), multispace1.map(whitespace), nom::character::complete::i32.map(Num), take_while1(|c| !(" \n\t()".contains(c))).map(sym), )).parse(s) } fn tokenize(s: &str) -> Result, String> { match many0(parse_token).parse(s) { Ok(("", res)) => Ok(res), Ok((rest, _)) => Err(format!("all data should be tokenizable, '{rest}' was not")), Err(e) => Err(e.to_string()), } } fn tokens_to_ast(tokens: Vec) -> Result { let tokens = iter::once(ParOpen) .chain(tokens.into_iter()) .chain(iter::once(ParClose)); let res = tokens_to_ast_inner(tokens.collect()); match res { Ok((rest, result)) if rest.len() == 0 => Ok(result), Ok((rest, _)) => Err(format!("couldn't parse all input: '{:?}' remained", rest)), Err(e) => Err(e), } } use std::collections::VecDeque; fn tokens_to_ast_inner( mut input: VecDeque ) -> Result<(Vec, SExp), String> { let mut current_token = input.pop_front(); match current_token { Some(ParOpen) => Ok(()), Some(t) => Err(format!( "expected opening parenthesis, found '{t}'" )), None => Err("unexpected input: expected opening parenthesis".to_string()), }?; current_token = input.pop_front(); if let Some(Whitespace(_)) = current_token { current_token = input.pop_front(); } let mut elements = vec![]; loop { elements.push(match current_token { Some(Num(x)) => Ok(Atom(Int(x))), Some(Sym(s)) if s == "+" => Ok(Atom(Add)), Some(Sym(s)) if s == "*" => Ok(Atom(Mul)), Some(Sym(s)) if s == "/" => Ok(Atom(Div)), Some(Sym(s)) if s == "-" => Ok(Atom(Sub)), Some(Sym(s)) if s == ">" => Ok(Atom(Gt)), Some(Sym(s)) if s == "<" => Ok(Atom(Lt)), Some(Sym(s)) if s == ">=" => Ok(Atom(Ge)), Some(Sym(s)) if s == "<=" => Ok(Atom(Le)), Some(Sym(s)) if s == "=" => Ok(Atom(Eq)), Some(Sym(s)) if s == "!=" => Ok(Atom(Neq)), Some(Sym(s)) if s == "and" => Ok(Atom(And)), Some(Sym(s)) if s == "or" => Ok(Atom(Or)), Some(Sym(s)) if s == "not" => Ok(Atom(Not)), Some(Sym(s)) if s == "xor" => Ok(Atom(Xor)), Some(Sym(s)) if s == "true" => Ok(Atom(True)), Some(Sym(s)) if s == "false" => Ok(Atom(False)), Some(Sym(s)) if s == "quote" => Ok(Atom(Quote)), Some(Sym(s)) if s == "vector" => Ok(Atom(Vector)), Some(Sym(s)) if s == "print" => Ok(Atom(Print)), Some(Sym(s)) if s == "let" => Ok(Atom(Let)), Some(Sym(s)) if s == "fn" => Ok(Atom(Fun)), Some(Sym(s)) if s == "case" => Ok(Atom(Case)), Some(Sym(s)) if s == "->" => Ok(Atom(Arr)), Some(Sym(s)) if s == "int" => Ok(Atom(Ty(Integer))), Some(Sym(s)) if s == "bool" => Ok(Atom(Ty(Boolean))), Some(Sym(s)) if s.starts_with("..") => Ok(Atom(RestPat(s.strip_prefix("..").unwrap().to_string()))), Some(Sym(s)) => Ok(Atom(Var(s))), Some(ParClose) => break, Some(ParOpen) => { let subexp = tokens_to_ast_inner( iter::once(ParOpen) .chain(input.clone()) .collect() )?; input = subexp.0.into(); Ok(subexp.1) }, Some(Whitespace(_)) => Err("unexpected whitespace".to_string()), None => Err("unexpected end of input".to_string()), }?); current_token = input.pop_front(); match current_token { Some(Whitespace(_)) => current_token = input.pop_front(), Some(ParClose) => break, Some(_) => return Err("expected whitespace or closing parenthesis".to_string()), None => return Err("unexpected end of input".to_string()) }; if let Some(ParClose) = current_token { break; } } let mut res = Atom(Nil); for el in elements.into_iter().rev() { res = scons(el, res); } Ok((input.into(), res)) } pub fn parse_to_ast(s: &str) -> Result { let tokens = match tokenize(s) { Ok(t) => Ok(t), Err(e) => Err(format!("error in tokenization: {}", e)), }?; tokens_to_ast(tokens.into()) } #[cfg(test)] mod private_parsing_tests { use super::{*, parse_token}; #[test] fn test_parse_token() { assert_eq!(parse_token("()"), Ok((")", ParOpen))); assert_eq!(parse_token(")"), Ok(("", ParClose))); assert_eq!(parse_token(" \t\n"), Ok(("", whitespace(" \t\n")))); assert_eq!(parse_token("1 23"), Ok((" 23", Num(1)))); assert_eq!(parse_token("23"), Ok(("", Num(23)))); assert_eq!(parse_token("Nil a"), Ok((" a", sym("Nil")))); assert_eq!(parse_token("a"), Ok(("", sym("a")))); assert!(parse_token("").is_err()) } #[test] fn test_tokenize() { assert_eq!( tokenize("(+ 1 2 (\t\n a)").unwrap(), vec![ ParOpen, sym("+"), whitespace(" "), Num(1), whitespace(" "), Num(2), whitespace(" "), ParOpen, whitespace("\t\n "), sym("a"), ParClose ] ); } #[test] fn test_tokens_to_ast() { // Syms are parsed to vars, // and everything is wrapped in an extra layer of scons // so that multi-expression programs can be returned // as one single SExp. assert_eq!( tokens_to_ast(vec![sym("a")]), Ok(scons(var("a"), Nil)) ); // Lists are pased to scons linked lists assert_eq!( tokens_to_ast(vec![ ParOpen, sym("a"), whitespace(" "), sym("b"), whitespace(" "), sym("c"), whitespace(" "), ParClose, ]), Ok(scons(scons(var("a"), scons(var("b"), scons(var("c"), Nil))), Nil)) ); // Nesting should work. assert_eq!( tokens_to_ast(vec![ ParOpen, sym("a"), whitespace(" "), ParOpen, sym("b"), whitespace(" "), sym("c"), whitespace(" "), ParClose, whitespace(" "), sym("d"), ParClose ]), Ok( scons(scons(var("a"), scons( scons(var("b"), scons(var("c"), Nil)) , scons(var("d"), Nil))), Nil) ) ); // Multiple expressions should be parseable assert_eq!( tokens_to_ast(vec![ ParOpen, sym("a"), ParClose, whitespace(" "), sym("b"), whitespace(" "), ParOpen, sym("c"), ParClose, ]), Ok(scons( scons(var("a"), Nil), scons(var("b"), scons(scons(var("c"), Nil), Nil)))) ); // Operators are parsed correctly assert_eq!( tokens_to_ast(vec![ ParOpen, sym("+"), whitespace(" "), sym("-"), whitespace(" "), sym("*"), whitespace(" "), sym("/"), ParClose ]), Ok(scons( scons(Add, scons(Sub, scons(Mul, scons(Div, Nil)))), Nil)) ); // Integers are parsed correctly assert_eq!( tokens_to_ast(vec![Num(5)]), Ok(scons(Int(5), Nil)) ); // Nil can be parsed assert_eq!( tokens_to_ast(vec![ParOpen,ParClose]), Ok(scons(Nil, Nil)) ); // Quote can be parsed assert_eq!( tokens_to_ast(vec![ sym("quote"), whitespace(" "), sym("a"), whitespace(" "), sym("b"), whitespace(" "), ]), Ok(scons(Quote, scons(var("a"), scons(var("b"), Nil)))) ); } #[test] fn test_tokens_to_ast_failing() { assert!( tokens_to_ast(vec![ParClose]).is_err(), "Invalid parentheses should fail" ); assert!( tokens_to_ast(vec![ParOpen]).is_err(), "Invalid parentheses should fail" ); assert!( tokens_to_ast(vec![ParClose, ParOpen]).is_err(), "Invalid parentheses should fail" ); assert!( tokens_to_ast(vec![ParOpen, ParOpen, ParClose]).is_err(), "Invalid parentheses should fail" ); assert!( tokens_to_ast(vec![Num(1), sym("a")]).is_err(), "Having a symbol starting with a number should fail." ); assert!( tokens_to_ast(vec![sym("quote"), sym("a")]).is_err(), "There should be whitespace between quote and other symbols" ); } #[test] fn test_parse_to_ast() { assert_eq!( parse_to_ast("(* 2 (+ 3 4))"), Ok(scons(scons( Mul, scons( 2, scons(scons(Add, scons(3, scons(4, Nil))), Nil) ) ), Nil)) ); } }