feat(tazjin/rlox): Implement single-character scanning

... still not that interesting, but at this point slightly divergent
from the book:

The book embraces mutability for interpreter state, initially for
tracking whether an error condition has occurred.

I avoid this by instead defining an error type and collecting the
error values, to be handled later on.

Notes: So far nothing special, but this is just the beginning of the
book. I like the style it is written in and it has pointed to some
interesting resources, such as a 1965 paper titled "The Next 700
Programming Languages".

Change-Id: I030b38438fec9eb55372bf547af225138908230a
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2144
Reviewed-by: tazjin <mail@tazj.in>
Tested-by: BuildkiteCI
This commit is contained in:
Vincent Ambo 2020-11-23 02:00:02 +01:00 committed by tazjin
parent 9d2b001c4c
commit 3d1b116f7f
3 changed files with 139 additions and 0 deletions

View file

@ -0,0 +1,14 @@
/// The different kinds of errors that can be reported.
///
/// `Clone`, `PartialEq` and `Eq` are derived in addition to `Debug` so that
/// collected error values can be duplicated and compared (e.g. in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ErrorKind {
    /// A character was encountered that does not start any known token.
    UnexpectedChar(char),
}

/// An error value together with the source line it was recorded on.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Error {
    /// Line number associated with the error.
    pub line: usize,
    /// What went wrong.
    pub kind: ErrorKind,
}
pub fn report(loc: &str, err: &Error) {
eprintln!("[line {}] Error {}: {:?}", err.line, loc, err.kind);
}

View file

@ -4,7 +4,9 @@ use std::io;
use std::io::Write;
use std::process;
mod errors;
mod interpreter;
mod scanner;
fn main() {
let mut args = env::args();

View file

@ -0,0 +1,123 @@
use crate::errors::{Error, ErrorKind};
/// Every kind of token the scanner knows about.
///
/// The enum carries no data, so it is cheap to copy; equality is derived so
/// that token kinds can be compared directly (e.g. in tests).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Single-character tokens.
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    Comma,
    Dot,
    Minus,
    Plus,
    Semicolon,
    Slash,
    Star,

    // One or two character tokens.
    Bang,
    BangEqual,
    Equal,
    EqualEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,

    // Literals.
    Identifier,
    String,
    Number,

    // Keywords.
    And,
    Class,
    Else,
    False,
    Fun,
    For,
    If,
    Nil,
    Or,
    Print,
    Return,
    Super,
    This,
    True,
    Var,
    While,

    // Special things
    Eof,
}
/// A single token, borrowing its text from the scanned source.
#[derive(Debug)]
pub struct Token<'a> {
    /// Which kind of token this is.
    kind: TokenKind,
    /// The piece of source text this token was created from.
    lexeme: &'a str,
    // literal: Object, // TODO(tazjin): Uhh?
    /// Line number recorded for this token.
    line: usize,
}
/// State carried while turning source text into tokens.
struct Scanner<'a> {
    /// The source text being scanned.
    source: &'a str,
    /// Tokens collected so far.
    tokens: Vec<Token<'a>>,
    /// Errors collected so far.
    errors: Vec<Error>,
    /// Offset of the first character in the current lexeme.
    start: usize,
    /// Current offset into `source`.
    current: usize,
    /// Current line in `source`.
    line: usize,
}
impl<'a> Scanner<'a> {
fn is_at_end(&self) -> bool {
return self.current >= self.source.len();
}
fn advance(&mut self) -> char {
self.current += 1;
// TODO(tazjin): Due to utf8-safety, this is a bit annoying.
// Since string iteration is not the point here I'm just
// leaving this as is for now.
self.source.chars().nth(self.current - 1).unwrap()
}
fn add_token(&mut self, kind: TokenKind) {
let lexeme = &self.source[self.start..self.current];
self.tokens.push(Token {
kind,
lexeme,
line: self.line,
})
}
fn scan_token(&mut self) {
match self.advance() {
'(' => self.add_token(TokenKind::LeftParen),
')' => self.add_token(TokenKind::RightParen),
'{' => self.add_token(TokenKind::LeftBrace),
'}' => self.add_token(TokenKind::RightBrace),
',' => self.add_token(TokenKind::Comma),
'.' => self.add_token(TokenKind::Dot),
'-' => self.add_token(TokenKind::Minus),
'+' => self.add_token(TokenKind::Plus),
';' => self.add_token(TokenKind::Semicolon),
'*' => self.add_token(TokenKind::Star),
unexpected => self.errors.push(Error {
line: self.line,
kind: ErrorKind::UnexpectedChar(unexpected),
}),
};
}
fn scan_tokens(mut self) -> Vec<Token<'a>> {
while !self.is_at_end() {
self.start = self.current;
self.scan_token();
}
return self.tokens;
}
}