#include "lexer.h" #include "vector.h" #include #include typedef struct { uint32_t start; uint32_t current; Source *src; Location loc; Location start_loc; TokenVec tokens; LexingErrorVec errors; } Lexer; static inline __attribute__((always_inline)) Token token(Source *src, TokenType type, const char *lexeme, uint32_t len, uint64_t lit, Location loc) { IF_DEBUG(src->ref_count++); return (Token){ .src = src, .lit = lit, .span.loc = loc, .span.len = len, .type = type, .lexeme = lexeme, }; } static inline __attribute__((always_inline)) LexingError lexing_error(Source *src, LexingErrorType type, Location loc, uint32_t len) { IF_DEBUG(src->ref_count++); return (LexingError){ .src = src, .type = type, .span.loc = loc, .span.len = len, }; } void token_drop(Token t) { IF_DEBUG(t.src->ref_count--); } void lexing_error_drop(LexingError e) { IF_DEBUG(e.src->ref_count--); } void lexing_result_drop(LexingResult res) { vec_drop(res.tokens); vec_drop(res.errors); } static Lexer lexer_init(Source *src) { TokenVec tokens = vec_init(); vec_grow(&tokens, 256); return (Lexer){ .start = 0, .current = 0, .src = src, .loc = location(1, 0, 0), .start_loc = location(1, 1, 0), .tokens = tokens, .errors = vec_init(), }; } static void lexer_add_token(Lexer *lex, TokenType type, uint32_t len, uint64_t lit) { vec_push(&lex->tokens, token(lex->src, type, &lex->src->str[lex->start], len, lit, lex->start_loc)); } static void lexer_add_error(Lexer *lex, LexingErrorType type, uint32_t len) { vec_push(&lex->errors, lexing_error(lex->src, type, lex->start_loc, len)); } static char lexer_advance(Lexer *lex) { char c = lex->src->str[lex->current++]; lex->loc.offset = lex->current; lex->loc.column++; if (c == '\n') { lex->loc.line++; lex->loc.column = 0; } return c; } static bool lexer_match(Lexer *lex, char exp) { if (lex->current >= lex->src->len) return false; if (lex->src->str[lex->current] != exp) return false; lexer_advance(lex); return true; } static bool lexer_match_not(Lexer *lex, char unexp) { if (lex->current >= lex->src->len) return false; if (lex->src->str[lex->current] == unexp) return false; lexer_advance(lex); return true; } static char lexer_peek(Lexer *lex) { return lex->src->str[lex->current]; } inline static bool is_digit(char c) { return c >= '0' && c <= '9'; } inline static uint64_t to_digit(char c) { return c - '0'; } inline static bool is_ident_start(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } inline static bool is_ident(char c) { return is_ident_start(c) || is_digit(c) || c == '_'; } static void lexer_scan_number(Lexer *lex) { // Get first digit (the one we already passed) uint64_t lit = to_digit(lex->src->str[lex->start]); uint32_t len = 1; char c = lexer_peek(lex); bool overflow = false; while (is_digit(c)) { uint64_t nlit = lit * 10 + to_digit(c); if (nlit < lit) { // overflow overflow = true; } lit = nlit; lexer_advance(lex); c = lexer_peek(lex); len++; } if (overflow) { lexer_add_error(lex, LexingErrorNumberLiteralOverflow, len); } lexer_add_token(lex, Number, len, lit); } static uint32_t u32max(uint32_t a, uint32_t b) { return a > b ? a : b; } static inline __attribute__((always_inline)) void lexer_scan_ident(Lexer *lex) { uint32_t len = 1; while (is_ident(lexer_peek(lex))) { lexer_advance(lex); len++; } const char *s = &lex->src->str[lex->start]; #define handle(x, str) else if (strncmp(str, s, u32max(sizeof(str) - 1, len)) == 0) lexer_add_token(lex, x, len, 0) if (false) ; handle(Messages, "messages"); handle(Struct, "struct"); handle(Version, "version"); handle(Const, "const"); handle(Type, "type"); else lexer_add_token(lex, Ident, len, 0); #undef handle } static void lexer_scan(Lexer *lex) { char c = lexer_advance(lex); switch (c) { case '(': lexer_add_token(lex, LeftParen, 1, 0); break; case ')': lexer_add_token(lex, RightParen, 1, 0); break; case '{': lexer_add_token(lex, LeftBrace, 1, 0); break; case '}': lexer_add_token(lex, RightBrace, 1, 0); break; case '[': lexer_add_token(lex, LeftBracket, 1, 0); break; case ']': lexer_add_token(lex, RightBracket, 1, 0); break; case ',': lexer_add_token(lex, Comma, 1, 0); break; case ';': lexer_add_token(lex, Semicolon, 1, 0); break; case '&': lexer_add_token(lex, Ampersand, 1, 0); break; case '^': lexer_add_token(lex, Caret, 1, 0); break; case ':': lexer_add_token(lex, Colon, 1, 0); break; case '=': lexer_add_token(lex, Equal, 1, 0); break; case '#': lexer_add_token(lex, Hash, 1, 0); case '/': if (lexer_match(lex, '/')) { while (lexer_match_not(lex, '\n')) ; } break; case ' ': case '\t': case '\n': case '\r': break; default: if (is_digit(c)) { lexer_scan_number(lex); } else if (is_ident_start(c)) { lexer_scan_ident(lex); } else { // Try to merge with the last error if possible if (lex->errors.len > 0) { LexingError *last_err = &lex->errors.data[lex->errors.len - 1]; if (last_err->span.loc.line == lex->loc.line && last_err->type == LexingErrorUnexpectedCharacter && last_err->span.loc.column + last_err->span.len == lex->start_loc.column) { last_err->span.len++; break; } } lexer_add_error(lex, LexingErrorUnexpectedCharacter, 1); } } } static void lexer_lex(Lexer *lex) { while (lex->current < lex->src->len) { lex->start = lex->current; lex->start_loc = lex->loc; lexer_scan(lex); } lex->start = lex->current; lex->start_loc = lex->loc; lexer_add_token(lex, Eof, 0, 0); } static LexingResult lexer_finish(Lexer lex) { return (LexingResult){ .errors = lex.errors, .tokens = lex.tokens, }; } LexingResult lex(Source *src) { Lexer lex = lexer_init(src); lexer_lex(&lex); return lexer_finish(lex); } void lexing_error_report(LexingError *le) { ReportSpan span = {.span = le->span, .sev = ReportSeverityError}; #define report(fmt, ...) source_report(le->src, le->span.loc, ReportSeverityError, &span, 1, NULL, fmt, __VA_ARGS__); switch (le->type) { case LexingErrorUnexpectedCharacter: report("Unexpected character%s '%.*s'", le->span.len > 1 ? "s" : "", le->span.len, &le->src->str[le->span.loc.offset]); break; case LexingErrorNumberLiteralOverflow: report("number literal '%.*s' overflows max value of %lu", le->span.len, &le->src->str[le->span.loc.offset], UINT64_MAX); break; default: break; } #undef report } char *token_type_string(TokenType t) { TokenType types[TOKEN_TYPE_COUNT]; size_t count = 0; #define handle(type) \ if (t & type) \ types[count++] = type handle(LeftParen); handle(RightParen); handle(LeftBrace); handle(RightBrace); handle(LeftBracket); handle(RightBracket); handle(Comma); handle(Semicolon); handle(Ampersand); handle(Caret); handle(Colon); handle(Equal); handle(Ident); handle(Number); handle(Messages); handle(Struct); handle(Version); handle(Const); handle(Type); handle(Eof); #undef handle CharVec str = vec_init(); for (size_t i = 0; i < count; i++) { if (i == 0) { } else if (i == count - 1) { vec_push_array(&str, " or ", 4); } else { vec_push_array(&str, ", ", 2); } switch (types[i]) { case LeftParen: vec_push_array(&str, "'('", 3); break; case RightParen: vec_push_array(&str, "')'", 3); break; case LeftBrace: vec_push_array(&str, "'{'", 3); break; case RightBrace: vec_push_array(&str, "'}'", 3); break; case LeftBracket: vec_push_array(&str, "'['", 3); break; case RightBracket: vec_push_array(&str, "']'", 3); break; case Comma: vec_push_array(&str, "','", 3); break; case Semicolon: vec_push_array(&str, "';'", 3); break; case Ampersand: vec_push_array(&str, "'&'", 3); break; case Caret: vec_push_array(&str, "'^'", 3); break; case Colon: vec_push_array(&str, "':'", 3); break; case Equal: vec_push_array(&str, "'='", 3); break; case Hash: vec_push_array(&str, "'#'", 3); break; case Ident: vec_push_array(&str, "identifier", 10); break; case Number: vec_push_array(&str, "number literal", 15); break; case Messages: vec_push_array(&str, "keyword messages", 16); break; case Struct: vec_push_array(&str, "keyword struct", 14); break; case Version: vec_push_array(&str, "keyword version", 15); break; case Const: vec_push_array(&str, "keyword const", 13); break; case Type: vec_push_array(&str, "keyword type", 12); break; case Eof: vec_push_array(&str, "end of file", 11); break; } } vec_push(&str, '\0'); return str.data; }