From 07b0a3b2583e7b17ef477c1e57a6c35a60711847 Mon Sep 17 00:00:00 2001
From: "Chloe M."
Date: Sat, 23 May 2026 10:53:41 +0000
Subject: lexer: Add scanning of identifiers

Signed-off-by: Chloe M.
---
Review notes (not part of the commit message):
 - The header names of the four angle-bracket #include lines in the
   first hunk are missing from this copy and are left as found;
   restore them from the original commit before applying.
 - A putback character that is not whitespace is now handed back
   instead of being dropped.
 - lexer_scan_ident() rejects a bad first character, no longer leaks
   its buffer when realloc() fails, passes unsigned char values to
   <ctype.h> and checks the result of ptrbox_strdup().

 core/lexer.c           | 132 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/cescal/state.h |   2 ++
 include/cescal/token.h |   1 +
 3 files changed, 135 insertions(+)

diff --git a/core/lexer.c b/core/lexer.c
index a11a630..51e4d40 100644
--- a/core/lexer.c
+++ b/core/lexer.c
@@ -5,8 +5,44 @@
 
 #include
 #include
+#include
+#include
 #include "cescal/lexer.h"
 #include "cescal/log.h"
+#include "cescal/ptrbox.h"
+
+/*
+ * Stash one character for lexer_consume_single() to hand back
+ * before it reads more input.  Any earlier stash is overwritten;
+ * a NULL state is ignored.
+ */
+static inline void
+lexer_putback(struct cescal_state *state, char c)
+{
+    if (state == NULL) {
+        return;
+    }
+
+    state->lex_putback = c;
+}
+
+/*
+ * Take the stashed character and empty the putback slot.
+ * Returns '\0' when nothing is stashed or state is NULL.
+ */
+static inline char
+lexer_putback_pop(struct cescal_state *state)
+{
+    char retc;
+
+    if (state == NULL) {
+        return '\0';
+    }
+
+    retc = state->lex_putback;
+    state->lex_putback = '\0';
+    return retc;
+}
 
 /*
  * Returns true if the given character is a whitespace
@@ -44,6 +80,20 @@ lexer_consume_single(struct cescal_state *state, bool skip_ws)
         return '\0';
     }
 
+    /*
+     * Hand back a pending putback character first.  Only pending
+     * whitespace is dropped, and only when the caller asked for
+     * whitespace to be skipped.
+     *
+     * NOTE(review): the read loop below skips whitespace without
+     * consulting skip_ws -- confirm that is intended.
+     */
+    if ((c = lexer_putback_pop(state)) != '\0') {
+        if (!lexer_is_ws(c) || !skip_ws) {
+            return c;
+        }
+    }
+
     while ((c = readbuf_read(&state->rb, state->in_fd)) != '\0') {
         if (lexer_is_ws(c)) {
             continue;
@@ -55,6 +105,82 @@ lexer_consume_single(struct cescal_state *state, bool skip_ws)
     return '\0';
 }
 
+/*
+ * Scan an identifier: a letter or underscore followed by any
+ * number of letters, digits and underscores.
+ *
+ * @state: State to read further input from
+ * @lc:    First character of the identifier (already consumed)
+ * @res:   Token to fill in
+ *
+ * The character that ends the identifier is put back so that the
+ * next token starts with it.
+ *
+ * Returns zero on success, otherwise -1 (errno is EINVAL when an
+ * argument is NULL).
+ */
+static int
+lexer_scan_ident(struct cescal_state *state, char lc, struct token *res)
+{
+    char *buf, *tmp, c;
+    size_t bufsz, bufcap;
+
+    if (state == NULL || res == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    /*
+     * Not an identifier start: fail without consuming anything
+     * so the caller can report the unknown token.  The casts keep
+     * <ctype.h> away from negative char values.
+     */
+    if (lc != '_' && !isalpha((unsigned char)lc)) {
+        return -1;
+    }
+
+    bufsz = 0;
+    bufcap = 8;
+    if ((buf = malloc(bufcap)) == NULL) {
+        return -1;
+    }
+
+    buf[bufsz++] = lc;
+    for (;;) {
+        c = lexer_consume_single(state, false);
+        if (c != '_' && !isalnum((unsigned char)c)) {
+            lexer_putback(state, c);
+            buf[bufsz] = '\0';
+            break;
+        }
+
+        buf[bufsz++] = c;
+        if (bufsz >= bufcap) {
+            /* Keep the old block reachable if realloc() fails */
+            bufcap += 8;
+            if ((tmp = realloc(buf, bufcap)) == NULL) {
+                free(buf);
+                return -1;
+            }
+
+            buf = tmp;
+        }
+    }
+
+    /*
+     * The scratch buffer is released either way; ptrbox_strdup()
+     * presumably returns NULL on failure -- TODO confirm.
+     */
+    res->s = ptrbox_strdup(&state->ptrbox, buf);
+    free(buf);
+    if (res->s == NULL) {
+        return -1;
+    }
+
+    res->type = TT_IDENT;
+    return 0;
+}
+
 int
 lexer_nom(struct cescal_state *state, struct token *res)
 {
@@ -82,6 +208,12 @@ lexer_nom(struct cescal_state *state, struct token *res)
         res->type = TT_COMMA;
         res->c = c;
         return 0;
+    default:
+        /* Anything else is an identifier or an unknown token */
+        if (lexer_scan_ident(state, c, res) == 0) {
+            return 0;
+        }
+        break;
     }
 
     cc_error("got unknown token '%c'\n", c);
diff --git a/include/cescal/state.h b/include/cescal/state.h
index 566b5a2..3c1ad86 100644
--- a/include/cescal/state.h
+++ b/include/cescal/state.h
@@ -18,12 +18,14 @@
  * @rb: Read buffer
  * @tokbuf: Token buffer
  * @ptrbox: Global pointer box
+ * @lex_putback: Lexer putback character ('\0' when empty)
  */
 struct cescal_state {
     int in_fd;
     struct readbuf rb;
     struct tokbuf tokbuf;
     struct ptrbox ptrbox;
+    char lex_putback;
 };
 
 /*
diff --git a/include/cescal/token.h b/include/cescal/token.h
index 0be8ff0..990df77 100644
--- a/include/cescal/token.h
+++ b/include/cescal/token.h
@@ -32,6 +32,7 @@ struct token {
     tt_t type;
     union {
         char c;
+        char *s;
     };
 };
 
-- 
cgit v1.2.3