Radiant Log #007
A Lexer in R’
Below is the full source code for a lexical scanner (lexer) for the R’ language, written in R’ itself. This is the first module I’ve ported from the original C source code to R’. The scanner can tokenize its own source code, marking an important milestone in the bootstrapping process.
//! Lexical scanner for the Radiance programming language.
//!
//! This module implements a hand-written scanner that tokenizes Radiance
//! source code into a stream of tokens for consumption by the parser.
use core::debug::assert;
/// Token kinds representing all lexical elements in Radiance.
///
/// This enum covers operators, keywords, literals, and structural
/// elements used by the parser to build the AST.
enum TokenKind: Debug {
/// Special end of file token generated when the input is exhausted.
Eof,
/// Special invalid token; the scanner stores an error message in the
/// token's `source` field instead of source text.
Invalid,
// Single- and double-character punctuation and operators.
LParen, // (
RParen, // )
LBrace, // {
RBrace, // }
LBracket, // [
RBracket, // ]
Comma, // ,
Dot, // .
DotDot, // ..
Minus, // -
Plus, // +
Colon, // :
ColonColon, // ::
Semicolon, // ;
Slash, // /
Star, // *
Percent, // %
Amp, // &
Pipe, // |
Caret, // ^
Tilde, // ~
Underscore, // _
At, // @
Question, // ?
Bang, // !
BangEqual, // !=
Equal, // =
EqualEqual, // ==
Gt, // >
GtEqual, // >=
Lt, // <
LtEqual, // <=
LtLt, // <<
GtGt, // >>
Arrow, // ->
FatArrow, // =>
// Boolean operators (spelled `not`, `and`, `or` in source).
Not, And, Or,
/// Eg. `input:`
Label,
/// Eg. `fnord`
Ident,
// Literals.
String, // "fnord"
Char, // 'f'
Number, // 42
True, // true
False, // false
Nil, // nil
Undefined, // undefined
// Control flow tokens.
If, Else, Return, Break,
Continue, While, For, In,
Loop, Switch, Case, Try, Catch,
Throw, Throws, Panic,
// Variable binding tokens.
Let, Mut, Const, Align,
// Module-related tokens.
Mod, Use, Super,
// Type or function attributes.
Pub, Default, Extern, Static, Test,
// Type-related tokens.
I8, I16, I32, U8, U16, U32,
Void, Fn, Bool, Enum, Struct, As
}
/// A reserved keyword.
///
/// One entry of the `KEYWORDS` lookup table, pairing a keyword's
/// spelling with the token kind it lexes to.
struct Keyword {
/// Keyword string as it appears in source.
name: *[u8],
/// Corresponding token kind.
tok: TokenKind,
}
/// Keyword lookup table.
///
/// Searched linearly by `keywordOrIdent`, so entry order carries no
/// meaning. The declared array length (47) must match the number of
/// entries below — keep both in sync when adding keywords.
const KEYWORDS: [Keyword; 47] = [
{ "fn", TokenKind::Fn },
{ "default", TokenKind::Default },
{ "pub", TokenKind::Pub },
{ "test", TokenKind::Test },
{ "return", TokenKind::Return },
{ "while", TokenKind::While },
{ "mut", TokenKind::Mut },
{ "let", TokenKind::Let },
{ "if", TokenKind::If },
{ "else", TokenKind::Else },
{ "i8", TokenKind::I8 },
{ "i16", TokenKind::I16 },
{ "i32", TokenKind::I32 },
{ "u8", TokenKind::U8 },
{ "u16", TokenKind::U16 },
{ "u32", TokenKind::U32 },
{ "bool", TokenKind::Bool },
{ "void", TokenKind::Void },
{ "true", TokenKind::True },
{ "false", TokenKind::False },
{ "nil", TokenKind::Nil },
{ "undefined", TokenKind::Undefined },
{ "loop", TokenKind::Loop },
{ "for", TokenKind::For },
{ "in", TokenKind::In },
{ "const", TokenKind::Const },
{ "break", TokenKind::Break },
{ "continue", TokenKind::Continue },
{ "enum", TokenKind::Enum },
{ "struct", TokenKind::Struct },
{ "and", TokenKind::And },
{ "or", TokenKind::Or },
{ "not", TokenKind::Not },
{ "switch", TokenKind::Switch },
{ "use", TokenKind::Use },
{ "super", TokenKind::Super },
{ "case", TokenKind::Case },
{ "try", TokenKind::Try },
{ "catch", TokenKind::Catch },
{ "extern", TokenKind::Extern },
{ "static", TokenKind::Static },
{ "mod", TokenKind::Mod },
{ "as", TokenKind::As },
{ "align", TokenKind::Align },
{ "throw", TokenKind::Throw },
{ "throws", TokenKind::Throws },
{ "panic", TokenKind::Panic },
];
/// Lexical scanner state for tokenizing Radiance source code.
///
/// Maintains position information and source buffer reference.
/// The bytes between `token` (inclusive) and `cursor` (exclusive)
/// form the token currently being scanned.
pub struct Scanner {
/// File path, used for diagnostics.
file: *[u8],
/// Source buffer being tokenized.
source: *[u8],
/// Byte offset of the start of the current token into `source`.
token: u32,
/// Byte offset of the current character being scanned.
cursor: u32,
}
/// Individual token with kind, source text, and position.
///
/// Represents a single lexical element extracted from source,
/// including its original text and byte offset for error reporting.
pub struct Token {
/// Token kind.
kind: TokenKind,
/// Token source string. For `TokenKind::Invalid` tokens this holds
/// an error message rather than a slice of the input.
source: *[u8],
/// Byte offset of `source` in input buffer.
offset: u32,
}
/// Source code location with file and line/column information.
///
/// Used for error reporting and debugging.
pub struct Location {
/// File path.
file: *[u8],
/// Line number (1-based, presumably — confirm against users of this type).
line: u16,
/// Column number.
col: u16,
}
/// Create a new scanner object.
///
/// `file` is the path used for diagnostics; `source` is the buffer to
/// tokenize. Token and cursor offsets start at zero.
///
/// NOTE(review): the assert rejects empty input, even though `isEof`
/// and `next` look like they would handle a zero-length buffer by
/// emitting `Eof` immediately — confirm whether this restriction is
/// intentional.
pub fn scanner(file: *[u8], source: *[u8]) -> Scanner {
assert(source.len > 0);
return Scanner { file, source, 0, 0 };
}
/// Check if we've reached the end of input.
fn isEof(s: *Scanner) -> bool {
return s.cursor >= s.source.len;
}
/// Get the current character, if any.
///
/// Yields the byte under the cursor, or `nil` once the input is
/// exhausted. Does not advance the scanner.
pub fn current(s: *Scanner) -> ?u8 {
if s.cursor < s.source.len {
return s.source[s.cursor];
}
return nil;
}
/// Peek at the next character without advancing the scanner.
///
/// Returns the byte one position ahead of the cursor, or `nil` when
/// fewer than two characters remain.
fn peek(s: *Scanner) -> ?u8 {
let ahead: u32 = s.cursor + 1;
if ahead < s.source.len {
return s.source[ahead];
}
return nil;
}
/// Advance scanner and return the character that was consumed.
///
/// Precondition: the caller has verified that input remains (e.g. via
/// `current` or `isEof`); calling this at end of input would index
/// past the buffer.
fn advance(s: *mut Scanner) -> u8 {
let c: u8 = s.source[s.cursor];
s.cursor = s.cursor + 1;
return c;
}
/// Consume the expected character if it matches the current position.
///
/// On a match the cursor moves past the character and `true` is
/// returned; otherwise the scanner is left untouched.
fn consume(s: *mut Scanner, expected: u8) -> bool {
if let ch = current(s); ch == expected {
s.cursor = s.cursor + 1;
return true;
}
return false;
}
/// Create a token from the current scanner state.
///
/// The token's text is the slice of `source` from the start of the
/// current token (`s.token`) up to, but not including, the cursor.
fn tok(s: *Scanner, kind: TokenKind) -> Token {
return Token {
kind: kind,
source: &s.source[s.token..s.cursor],
offset: s.token,
};
}
/// Create an invalid token with the given message.
///
/// Unlike `tok`, the token's `source` field carries the error message
/// itself rather than a slice of the input; `offset` is the position
/// (usually the token start) the caller wants reported.
fn invalid(offset: u32, message: *[u8]) -> Token {
return Token {
kind: TokenKind::Invalid,
source: message,
offset: offset,
};
}
/// Skip whitespace characters and line comments.
///
/// Stops at the first character that is neither whitespace nor the
/// start of a `//` line comment, leaving the cursor on it.
fn skipWhitespace(s: *mut Scanner) {
while let ch = current(s) {
switch ch {
// Plain whitespace: consume and keep looping.
case ' ', '\n', '\r', '\t' => advance(s),
case '/' => {
if let c = peek(s); c == '/' {
// Line comment: consume up to (but not including) the
// terminating newline, or to end of input. The newline is
// then eaten by the whitespace case on the next iteration.
while let ch = current(s); ch != '\n' {
advance(s);
}
} else {
// A lone '/' is a real token (Slash); leave it for `next`.
return;
}
}
default => return,
}
}
}
/// Check if character is an ASCII digit (0-9).
fn isDigit(c: u8) -> bool {
return c >= '0' and c <= '9';
}
/// Check if character is a hexadecimal digit (0-9, a-f, A-F).
fn isHexDigit(c: u8) -> bool {
return (c >= '0' and c <= '9')
or (c >= 'a' and c <= 'f')
or (c >= 'A' and c <= 'F');
}
/// Check if character is a binary digit (0 or 1).
fn isBinDigit(c: u8) -> bool {
if c == '0' {
return true;
}
return c == '1';
}
/// Check if character is alphabetic.
fn isAlpha(c: u8) -> bool {
return (c >= 'a' and c <= 'z')
or (c >= 'A' and c <= 'Z');
}
/// Check if character is printable ASCII.
fn isPrint(c: u8) -> bool {
return c >= ' ' and c <= '~';
}
/// Scan numeric literal (decimal, hex, or binary).
///
/// Precondition: the first character of the number has already been
/// consumed by the caller, so `s.source[s.cursor - 1]` is that digit
/// (or '-' when called from the negative-number path in `next`).
///
/// NOTE(review): when invoked from the '-' branch of `next`, the byte
/// at `s.cursor - 1` is '-', so negative hex/binary forms such as
/// `-0x10` are lexed as the number `-0` followed by an identifier —
/// confirm whether that is intended.
fn scanNumber(s: *mut Scanner) -> Token {
// Check for hex literal (0x or 0X prefix)
if s.source[s.cursor - 1] == '0' {
if let ch = current(s); ch == 'x' or ch == 'X' {
advance(s);
// Must have at least one hex digit after 0x
if let ch = current(s); not isHexDigit(ch) {
return invalid(s.token, "invalid hex literal");
}
while let ch = current(s); isHexDigit(ch) {
advance(s);
}
return tok(s, TokenKind::Number);
}
// Check for binary literal (0b or 0B prefix)
if let ch = current(s); ch == 'b' or ch == 'B' {
advance(s);
// Must have at least one binary digit after 0b
if let ch = current(s); not isBinDigit(ch) {
return invalid(s.token, "invalid binary literal");
}
while let ch = current(s); isBinDigit(ch) {
advance(s);
}
return tok(s, TokenKind::Number);
}
}
// Regular decimal number
while let ch = current(s); isDigit(ch) {
advance(s);
}
// Look for decimal part. Only consume the '.' when a digit follows,
// so `1..2` still lexes as Number DotDot Number.
if let ch = current(s); ch == '.' {
if let p = peek(s); isDigit(p) {
advance(s); // Consume the "."
while let ch = current(s); isDigit(ch) {
advance(s);
}
}
}
return tok(s, TokenKind::Number);
}
/// Scan a literal delimited by `delim` (the opening delimiter has
/// already been consumed by the caller).
///
/// Escape sequences (`\x`) are skipped without validation, so an
/// escaped delimiter does not terminate the literal. Returns the
/// finished token, an `Invalid` token when a non-printable character
/// is encountered, or `nil` when the input ends before the closing
/// delimiter (the caller reports "unterminated").
fn scanDelimited(s: *mut Scanner, delim: u8, kind: TokenKind) -> ?Token {
while let ch = current(s); ch != delim {
if not isPrint(ch) {
return invalid(s.token, "invalid character");
}
// Consume the backslash of an escape sequence. If that backslash
// was the last byte of the input, bail out as unterminated instead
// of advancing past the end of the buffer.
if consume(s, '\\') and isEof(s) {
return nil;
}
advance(s);
}
if not consume(s, delim) {
return nil;
}
return tok(s, kind);
}
/// Scan string literal enclosed in double quotes.
///
/// The opening quote has already been consumed by `next`.
fn scanString(s: *mut Scanner) -> Token {
if let t = scanDelimited(s, '"', TokenKind::String) {
return t;
}
// scanDelimited yielded nil: input ended before the closing quote.
return invalid(s.token, "unterminated string");
}
/// Scan character literal enclosed in single quotes.
///
/// The opening quote has already been consumed by `next`.
fn scanChar(s: *mut Scanner) -> Token {
if let t = scanDelimited(s, '\'', TokenKind::Char) {
return t;
}
// scanDelimited yielded nil: input ended before the closing quote.
return invalid(s.token, "unterminated character");
}
/// Scan a keyword or an identifier.
///
/// Linearly searches the `KEYWORDS` table for `src`; falls back to
/// `Ident` when the text is not a reserved word.
fn keywordOrIdent(src: *[u8]) -> TokenKind {
for kw in KEYWORDS {
if kw.name == src {
return kw.tok;
}
}
return TokenKind::Ident;
}
/// Scan an identifier, keyword, or label.
///
/// The first character has already been consumed by the caller.
/// Subsequent characters may be letters, digits, '_' or '#'.
///
/// NOTE(review): '#' is accepted inside identifiers — presumably for
/// compiler-reserved names; confirm against the parser. Also note that
/// `TokenKind::Label` is never produced here, so label recognition
/// (`input:`) must happen elsewhere — verify.
fn scanIdentifier(s: *mut Scanner) -> Token {
while let ch = current(s); isAlpha(ch) or isDigit(ch) or ch == '_' or ch == '#' {
advance(s);
}
return tok(s, keywordOrIdent(&s.source[s.token..s.cursor]));
}
/// Scan the next token.
///
/// Skips leading whitespace and comments, then dispatches on the first
/// character of the token. Returns an `Eof` token once the input is
/// exhausted and an `Invalid` token for unrecognized characters.
pub fn next(s: *mut Scanner) -> Token {
skipWhitespace(s); // Skip any whitespace between tokens.
s.token = s.cursor; // Token starts at current position.
if isEof(s) {
return tok(s, TokenKind::Eof);
}
let c: u8 = advance(s);
// Numbers and identifiers are dispatched before the punctuation
// switch; both scanners expect their first character already consumed.
if isDigit(c) {
return scanNumber(s);
}
if isAlpha(c) {
return scanIdentifier(s);
}
switch c {
case '\'' => return scanChar(s),
case '"' => return scanString(s),
case '(' => return tok(s, TokenKind::LParen),
case ')' => return tok(s, TokenKind::RParen),
case '{' => return tok(s, TokenKind::LBrace),
case '}' => return tok(s, TokenKind::RBrace),
case '[' => return tok(s, TokenKind::LBracket),
case ']' => return tok(s, TokenKind::RBracket),
case ';' => return tok(s, TokenKind::Semicolon),
case ',' => return tok(s, TokenKind::Comma),
case '.' => {
if consume(s, '.') {
return tok(s, TokenKind::DotDot);
}
return tok(s, TokenKind::Dot);
}
case ':' => {
if consume(s, ':') {
return tok(s, TokenKind::ColonColon);
}
return tok(s, TokenKind::Colon);
}
case '-' => {
if consume(s, '>') {
return tok(s, TokenKind::Arrow);
}
// If followed by a digit, scan as negative number. The digit is
// not consumed here; scanNumber sees '-' at cursor-1.
if let ch = current(s); isDigit(ch) {
return scanNumber(s);
}
return tok(s, TokenKind::Minus);
}
case '+' => return tok(s, TokenKind::Plus),
case '/' => return tok(s, TokenKind::Slash),
case '*' => return tok(s, TokenKind::Star),
case '%' => return tok(s, TokenKind::Percent),
case '&' => return tok(s, TokenKind::Amp),
case '?' => return tok(s, TokenKind::Question),
case '|' => return tok(s, TokenKind::Pipe),
case '^' => return tok(s, TokenKind::Caret),
case '~' => return tok(s, TokenKind::Tilde),
case '!' => {
if consume(s, '=') {
return tok(s, TokenKind::BangEqual);
}
return tok(s, TokenKind::Bang);
}
// For multi-character operators the longer form must be tested
// before the shorter prefix.
case '=' => {
if consume(s, '>') {
return tok(s, TokenKind::FatArrow);
}
if consume(s, '=') {
return tok(s, TokenKind::EqualEqual);
}
return tok(s, TokenKind::Equal);
}
case '<' => {
if consume(s, '<') {
return tok(s, TokenKind::LtLt);
}
if consume(s, '=') {
return tok(s, TokenKind::LtEqual);
}
return tok(s, TokenKind::Lt);
}
case '>' => {
if consume(s, '>') {
return tok(s, TokenKind::GtGt);
}
if consume(s, '=') {
return tok(s, TokenKind::GtEqual);
}
return tok(s, TokenKind::Gt);
}
case '@' => return tok(s, TokenKind::At),
case '_' => {
if let ch = current(s); isAlpha(ch) or isDigit(ch) {
// This is part of an identifier like `_foo` or `_123`
return scanIdentifier(s);
}
return tok(s, TokenKind::Underscore);
}
}
// No case matched: report the unrecognized character.
return invalid(s.token, "unexpected character");
}
As you’ll notice, R’ borrows heavily from Rust and Zig, with a couple of syntactic forms taken from Swift and Haskell.