Radiant Log #007
A Lexer in R’
Below is the full source code for a lexical scanner (lexer) for the R’ language, written in R’ itself. This is the first module I’ve ported from the original C source code to R’. The scanner can tokenize its own source code, marking an important milestone in the bootstrapping process.
//! Lexical scanner for the Radiance programming language.
//!
//! This module implements a hand-written scanner that tokenizes Radiance
//! source code into a stream of tokens for consumption by the parser.
use core::debug::assert;
/// Token kinds representing all lexical elements in Radiance.
///
/// This enum covers operators, keywords, literals, and structural
/// elements used by the parser to build the AST.
enum TokenKind: Debug {
/// Special end of file token generated when the input is exhausted.
Eof,
/// Special invalid token; the scanner stores an error message in the
/// token's `source` field instead of source text.
Invalid,
// Single- and double-character punctuation and operators.
LParen, // (
RParen, // )
LBrace, // {
RBrace, // }
LBracket, // [
RBracket, // ]
Comma, // ,
Dot, // .
DotDot, // ..
Minus, // -
Plus, // +
Colon, // :
ColonColon, // ::
Semicolon, // ;
Slash, // /
Star, // *
Percent, // %
Amp, // &
Pipe, // |
Caret, // ^
Tilde, // ~
Underscore, // _
At, // @
Question, // ?
Bang, // !
BangEqual, // !=
Equal, // =
EqualEqual, // ==
Gt, // >
GtEqual, // >=
Lt, // <
LtEqual, // <=
LtLt, // <<
GtGt, // >>
Arrow, // ->
FatArrow, // =>
// Boolean operators (spelled `not`, `and`, `or` in source).
Not, And, Or,
/// Eg. `input:`
Label,
/// Eg. `fnord`
Ident,
// Literals.
String, // "fnord"
Char, // 'f'
Number, // 42
True, // true
False, // false
Nil, // nil
Undefined, // undefined
// Control flow tokens.
If, Else, Return, Break,
Continue, While, For, In,
Loop, Switch, Case, Try, Catch,
Throw, Throws, Panic,
// Variable binding tokens.
Let, Mut, Const, Align,
// Module-related tokens.
Mod, Use, Super,
// Type or function attributes.
Pub, Default, Extern, Static, Test,
// Type-related tokens.
I8, I16, I32, U8, U16, U32,
Void, Fn, Bool, Enum, Struct, As
}
/// A reserved keyword.
///
/// One entry of the `KEYWORDS` lookup table, pairing a keyword's
/// spelling with the token kind it lexes to.
struct Keyword {
/// Keyword string as it appears in source.
name: *[u8],
/// Corresponding token kind.
tok: TokenKind,
}
/// Keyword lookup table.
///
/// Searched linearly by `keywordOrIdent`, so entry order carries no
/// meaning. The declared array length (47) must match the number of
/// entries below — keep both in sync when adding keywords.
const KEYWORDS: [Keyword; 47] = [
{ "fn", TokenKind::Fn },
{ "default", TokenKind::Default },
{ "pub", TokenKind::Pub },
{ "test", TokenKind::Test },
{ "return", TokenKind::Return },
{ "while", TokenKind::While },
{ "mut", TokenKind::Mut },
{ "let", TokenKind::Let },
{ "if", TokenKind::If },
{ "else", TokenKind::Else },
{ "i8", TokenKind::I8 },
{ "i16", TokenKind::I16 },
{ "i32", TokenKind::I32 },
{ "u8", TokenKind::U8 },
{ "u16", TokenKind::U16 },
{ "u32", TokenKind::U32 },
{ "bool", TokenKind::Bool },
{ "void", TokenKind::Void },
{ "true", TokenKind::True },
{ "false", TokenKind::False },
{ "nil", TokenKind::Nil },
{ "undefined", TokenKind::Undefined },
{ "loop", TokenKind::Loop },
{ "for", TokenKind::For },
{ "in", TokenKind::In },
{ "const", TokenKind::Const },
{ "break", TokenKind::Break },
{ "continue", TokenKind::Continue },
{ "enum", TokenKind::Enum },
{ "struct", TokenKind::Struct },
{ "and", TokenKind::And },
{ "or", TokenKind::Or },
{ "not", TokenKind::Not },
{ "switch", TokenKind::Switch },
{ "use", TokenKind::Use },
{ "super", TokenKind::Super },
{ "case", TokenKind::Case },
{ "try", TokenKind::Try },
{ "catch", TokenKind::Catch },
{ "extern", TokenKind::Extern },
{ "static", TokenKind::Static },
{ "mod", TokenKind::Mod },
{ "as", TokenKind::As },
{ "align", TokenKind::Align },
{ "throw", TokenKind::Throw },
{ "throws", TokenKind::Throws },
{ "panic", TokenKind::Panic },
];
/// Lexical scanner state for tokenizing Radiance source code.
///
/// Maintains position information and source buffer reference.
/// The bytes between `token` (inclusive) and `cursor` (exclusive)
/// form the token currently being scanned.
pub struct Scanner {
/// File path, used for diagnostics.
file: *[u8],
/// Source buffer being tokenized.
source: *[u8],
/// Byte offset of the start of the current token into `source`.
token: u32,
/// Byte offset of the current character being scanned.
cursor: u32,
}
/// Individual token with kind, source text, and position.
///
/// Represents a single lexical element extracted from source,
/// including its original text and byte offset for error reporting.
pub struct Token {
/// Token kind.
kind: TokenKind,
/// Token source string. For `TokenKind::Invalid` tokens this holds
/// an error message rather than a slice of the input.
source: *[u8],
/// Byte offset of `source` in input buffer.
offset: u32,
}
/// Source code location with file and line/column information.
///
/// Used for error reporting and debugging.
pub struct Location {
/// File path.
file: *[u8],
/// Line number (1-based, presumably — confirm against users of this type).
line: u16,
/// Column number.
col: u16,
}
/// Create a new scanner object.
///
/// `file` is the path used for diagnostics; `source` is the buffer to
/// tokenize. Token and cursor offsets start at zero.
///
/// NOTE(review): the assert rejects empty input, even though `isEof`
/// and `next` look like they would handle a zero-length buffer by
/// emitting `Eof` immediately — confirm whether this restriction is
/// intentional.
pub fn scanner(file: *[u8], source: *[u8]) -> Scanner {
assert(source.len > 0);
return Scanner { file, source, 0, 0 };
}
/// Check if we've reached the end of input.
fn isEof(s: *Scanner) -> bool {
return s.cursor >= s.source.len;
}
/// Get the current character, if any.
///
/// Yields the byte under the cursor, or `nil` once the input is
/// exhausted. Does not advance the scanner.
pub fn current(s: *Scanner) -> ?u8 {
if s.cursor < s.source.len {
return s.source[s.cursor];
}
return nil;
}
/// Peek at the next character without advancing the scanner.
///
/// Returns the byte one position ahead of the cursor, or `nil` when
/// fewer than two characters remain.
fn peek(s: *Scanner) -> ?u8 {
let ahead: u32 = s.cursor + 1;
if ahead < s.source.len {
return s.source[ahead];
}
return nil;
}
/// Advance scanner and return the character that was consumed.
///
/// Precondition: the caller has verified that input remains (e.g. via
/// `current` or `isEof`); calling this at end of input would index
/// past the buffer.
fn advance(s: *mut Scanner) -> u8 {
let c: u8 = s.source[s.cursor];
s.cursor = s.cursor + 1;
return c;
}
/// Consume the expected character if it matches the current position.
///
/// On a match the cursor moves past the character and `true` is
/// returned; otherwise the scanner is left untouched.
fn consume(s: *mut Scanner, expected: u8) -> bool {
if let ch = current(s); ch == expected {
s.cursor = s.cursor + 1;
return true;
}
return false;
}
/// Create a token from the current scanner state.
///
/// The token's text is the slice of `source` from the start of the
/// current token (`s.token`) up to, but not including, the cursor.
fn tok(s: *Scanner, kind: TokenKind) -> Token {
return Token {
kind: kind,
source: &s.source[s.token..s.cursor],
offset: s.token,
};
}
/// Create an invalid token with the given message.
///
/// Unlike `tok`, the token's `source` field carries the error message
/// itself rather than a slice of the input; `offset` is the position
/// (usually the token start) the caller wants reported.
fn invalid(offset: u32, message: *[u8]) -> Token {
return Token {
kind: TokenKind::Invalid,
source: message,
offset: offset,
};
}
/// Skip whitespace characters and line comments.
///
/// Stops at the first character that is neither whitespace nor the
/// start of a `//` line comment, leaving the cursor on it.
fn skipWhitespace(s: *mut Scanner) {
while let ch = current(s) {
switch ch {
// Plain whitespace: consume and keep looping.
case ' ', '\n', '\r', '\t' => advance(s),
case '/' => {
if let c = peek(s); c == '/' {
// Line comment: consume up to (but not including) the
// terminating newline, or to end of input. The newline is
// then eaten by the whitespace case on the next iteration.
while let ch = current(s); ch != '\n' {
advance(s);
}
} else {
// A lone '/' is a real token (Slash); leave it for `next`.
return;
}
}
default => return,
}
}
}
/// Check if character is an ASCII digit (0-9).
fn isDigit(c: u8) -> bool {
return c >= '0' and c <= '9';
}
/// Check if character is a hexadecimal digit (0-9, a-f, A-F).
fn isHexDigit(c: u8) -> bool {
return (c >= '0' and c <= '9')
or (c >= 'a' and c <= 'f')
or (c >= 'A' and c <= 'F');
}
/// Check if character is a binary digit (0 or 1).
fn isBinDigit(c: u8) -> bool {
if c == '0' {
return true;
}
return c == '1';
}
/// Check if character is alphabetic.
fn isAlpha(c: u8) -> bool {
return (c >= 'a' and c <= 'z')
or (c >= 'A' and c <= 'Z');
}
/// Check if character is printable ASCII.
fn isPrint(c: u8) -> bool {
return c >= ' ' and c <= '~';
}
/// Scan numeric literal (decimal, hex, or binary).
///
/// Precondition: the first character of the number has already been
/// consumed by the caller, so `s.source[s.cursor - 1]` is that digit
/// (or '-' when called from the negative-number path in `next`).
///
/// NOTE(review): when invoked from the '-' branch of `next`, the byte
/// at `s.cursor - 1` is '-', so negative hex/binary forms such as
/// `-0x10` are lexed as the number `-0` followed by an identifier —
/// confirm whether that is intended.
fn scanNumber(s: *mut Scanner) -> Token {
// Check for hex literal (0x or 0X prefix)
if s.source[s.cursor - 1] == '0' {
if let ch = current(s); ch == 'x' or ch == 'X' {
advance(s);
// Must have at least one hex digit after 0x
if let ch = current(s); not isHexDigit(ch) {
return invalid(s.token, "invalid hex literal");
}
while let ch = current(s); isHexDigit(ch) {
advance(s);
}
return tok(s, TokenKind::Number);
}
// Check for binary literal (0b or 0B prefix)
if let ch = current(s); ch == 'b' or ch == 'B' {
advance(s);
// Must have at least one binary digit after 0b
if let ch = current(s); not isBinDigit(ch) {
return invalid(s.token, "invalid binary literal");
}
while let ch = current(s); isBinDigit(ch) {
advance(s);
}
return tok(s, TokenKind::Number);
}
}
// Regular decimal number
while let ch = current(s); isDigit(ch) {
advance(s);
}
// Look for decimal part. Only consume the '.' when a digit follows,
// so `1..2` still lexes as Number DotDot Number.
if let ch = current(s); ch == '.' {
if let p = peek(s); isDigit(p) {
advance(s); // Consume the "."
while let ch = current(s); isDigit(ch) {
advance(s);
}
}
}
return tok(s, TokenKind::Number);
}
/// Scan a literal delimited by `delim` (the opening delimiter has
/// already been consumed by the caller).
///
/// Escape sequences (`\x`) are skipped without validation, so an
/// escaped delimiter does not terminate the literal. Returns the
/// finished token, an `Invalid` token when a non-printable character
/// is encountered, or `nil` when the input ends before the closing
/// delimiter (the caller reports "unterminated").
fn scanDelimited(s: *mut Scanner, delim: u8, kind: TokenKind) -> ?Token {
while let ch = current(s); ch != delim {
if not isPrint(ch) {
return invalid(s.token, "invalid character");
}
// Consume the backslash of an escape sequence. If that backslash
// was the last byte of the input, bail out as unterminated instead
// of advancing past the end of the buffer.
if consume(s, '\\') and isEof(s) {
return nil;
}
advance(s);
}
if not consume(s, delim) {
return nil;
}
return tok(s, kind);
}
/// Scan string literal enclosed in double quotes.
///
/// The opening quote has already been consumed by `next`.
fn scanString(s: *mut Scanner) -> Token {
if let t = scanDelimited(s, '"', TokenKind::String) {
return t;
}
// scanDelimited yielded nil: input ended before the closing quote.
return invalid(s.token, "unterminated string");
}
/// Scan character literal enclosed in single quotes.
///
/// The opening quote has already been consumed by `next`.
fn scanChar(s: *mut Scanner) -> Token {
if let t = scanDelimited(s, '\'', TokenKind::Char) {
return t;
}
// scanDelimited yielded nil: input ended before the closing quote.
return invalid(s.token, "unterminated character");
}
/// Scan a keyword or an identifier.
///
/// Linearly searches the `KEYWORDS` table for `src`; falls back to
/// `Ident` when the text is not a reserved word.
fn keywordOrIdent(src: *[u8]) -> TokenKind {
for kw in KEYWORDS {
if kw.name == src {
return kw.tok;
}
}
return TokenKind::Ident;
}
/// Scan an identifier, keyword, or label.
///
/// The first character has already been consumed by the caller.
/// Subsequent characters may be letters, digits, '_' or '#'.
///
/// NOTE(review): '#' is accepted inside identifiers — presumably for
/// compiler-reserved names; confirm against the parser. Also note that
/// `TokenKind::Label` is never produced here, so label recognition
/// (`input:`) must happen elsewhere — verify.
fn scanIdentifier(s: *mut Scanner) -> Token {
while let ch = current(s); isAlpha(ch) or isDigit(ch) or ch == '_' or ch == '#' {
advance(s);
}
return tok(s, keywordOrIdent(&s.source[s.token..s.cursor]));
}
/// Scan the next token.
///
/// Skips leading whitespace and comments, then dispatches on the first
/// character of the token. Returns an `Eof` token once the input is
/// exhausted and an `Invalid` token for unrecognized characters.
pub fn next(s: *mut Scanner) -> Token {
skipWhitespace(s); // Skip any whitespace between tokens.
s.token = s.cursor; // Token starts at current position.
if isEof(s) {
return tok(s, TokenKind::Eof);
}
let c: u8 = advance(s);
// Numbers and identifiers are dispatched before the punctuation
// switch; both scanners expect their first character already consumed.
if isDigit(c) {
return scanNumber(s);
}
if isAlpha(c) {
return scanIdentifier(s);
}
switch c {
case '\'' => return scanChar(s),
case '"' => return scanString(s),
case '(' => return tok(s, TokenKind::LParen),
case ')' => return tok(s, TokenKind::RParen),
case '{' => return tok(s, TokenKind::LBrace),
case '}' => return tok(s, TokenKind::RBrace),
case '[' => return tok(s, TokenKind::LBracket),
case ']' => return tok(s, TokenKind::RBracket),
case ';' => return tok(s, TokenKind::Semicolon),
case ',' => return tok(s, TokenKind::Comma),
case '.' => {
if consume(s, '.') {
return tok(s, TokenKind::DotDot);
}
return tok(s, TokenKind::Dot);
}
case ':' => {
if consume(s, ':') {
return tok(s, TokenKind::ColonColon);
}
return tok(s, TokenKind::Colon);
}
case '-' => {
if consume(s, '>') {
return tok(s, TokenKind::Arrow);
}
// If followed by a digit, scan as negative number. The digit is
// not consumed here; scanNumber sees '-' at cursor-1.
if let ch = current(s); isDigit(ch) {
return scanNumber(s);
}
return tok(s, TokenKind::Minus);
}
case '+' => return tok(s, TokenKind::Plus),
case '/' => return tok(s, TokenKind::Slash),
case '*' => return tok(s, TokenKind::Star),
case '%' => return tok(s, TokenKind::Percent),
case '&' => return tok(s, TokenKind::Amp),
case '?' => return tok(s, TokenKind::Question),
case '|' => return tok(s, TokenKind::Pipe),
case '^' => return tok(s, TokenKind::Caret),
case '~' => return tok(s, TokenKind::Tilde),
case '!' => {
if consume(s, '=') {
return tok(s, TokenKind::BangEqual);
}
return tok(s, TokenKind::Bang);
}
// For multi-character operators the longer form must be tested
// before the shorter prefix.
case '=' => {
if consume(s, '>') {
return tok(s, TokenKind::FatArrow);
}
if consume(s, '=') {
return tok(s, TokenKind::EqualEqual);
}
return tok(s, TokenKind::Equal);
}
case '<' => {
if consume(s, '<') {
return tok(s, TokenKind::LtLt);
}
if consume(s, '=') {
return tok(s, TokenKind::LtEqual);
}
return tok(s, TokenKind::Lt);
}
case '>' => {
if consume(s, '>') {
return tok(s, TokenKind::GtGt);
}
if consume(s, '=') {
return tok(s, TokenKind::GtEqual);
}
return tok(s, TokenKind::Gt);
}
case '@' => return tok(s, TokenKind::At),
case '_' => {
if let ch = current(s); isAlpha(ch) or isDigit(ch) {
// This is part of an identifier like `_foo` or `_123`
return scanIdentifier(s);
}
return tok(s, TokenKind::Underscore);
}
}
// No case matched: report the unrecognized character.
return invalid(s.token, "unexpected character");
}
As you’ll notice, R’ borrows heavily from Rust and Zig, with a couple of syntactic forms taken from Swift and Haskell.