From 74e2e8c772d0f88da6684918f782b37156f10fb3 Mon Sep 17 00:00:00 2001 From: Ian Moffett Date: Sat, 23 May 2026 02:21:09 -0400 Subject: core: Add lexer + parser groundwork Signed-off-by: Ian Moffett --- core/cescal.c | 5 +++ core/lexer.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ core/parser.c | 61 +++++++++++++++++++++++++++++++++++ include/cescal/lexer.h | 24 ++++++++++++++ include/cescal/parser.h | 20 ++++++++++++ include/cescal/token.h | 38 ++++++++++++++++++++++ 6 files changed, 232 insertions(+) create mode 100644 core/lexer.c create mode 100644 core/parser.c create mode 100644 include/cescal/lexer.h create mode 100644 include/cescal/parser.h create mode 100644 include/cescal/token.h diff --git a/core/cescal.c b/core/cescal.c index 6c828e1..0121389 100644 --- a/core/cescal.c +++ b/core/cescal.c @@ -6,6 +6,7 @@ #include #include #include "cescal/state.h" +#include "cescal/parser.h" #include "cescal/log.h" static void @@ -28,6 +29,10 @@ compile(const char *pathname) return -1; } + if (parser_parse(&st) < 0) { + return -1; + } + state_close(&st); return 0; } diff --git a/core/lexer.c b/core/lexer.c new file mode 100644 index 0000000..d6e5e66 --- /dev/null +++ b/core/lexer.c @@ -0,0 +1,84 @@ +#include +#include +#include "cescal/lexer.h" +#include "cescal/log.h" + +/* + * Returns true if the given character is whitespace (tab, newline, space, form feed or carriage return) + * + * @c: Character to check + */ +static inline bool +lexer_is_ws(char c) +{ + switch (c) { + case '\t': + case '\n': + case ' ': + case '\f': + case '\r': + return true; + } + + return false; +} + +/* + * Consume a single character from the input source file, skipping + * whitespace; yields NUL if state is NULL or readbuf_read() returns NUL + * + * @state: Compiler state + * @skip_ws: If true skip whitespace (NOTE(review): never read; whitespace is always skipped) + */ +static char +lexer_consume_single(struct cescal_state *state, bool skip_ws) +{ + char c; + + if (state == NULL) { + return '\0'; + } + + while ((c = readbuf_read(&state->rb, state->in_fd)) != '\0') { + if (lexer_is_ws(c)) { + continue; + } + + return c; + } + + 
return '\0'; +} + +int +lexer_nom(struct cescal_state *state, struct token *res) +{ + char c; + + if (state == NULL || res == NULL) { + errno = EINVAL; + return -1; + } + + if ((c = lexer_consume_single(state, true)) == '\0') { + return -1; + } + + switch (c) { + case '(': + res->type = TT_LPAREN; + res->c = c; + return 0; + case ')': + res->type = TT_RPAREN; + res->c = c; + return 0; + case ',': + res->type = TT_COMMA; + res->c = c; + return 0; + } + + cc_error("got unknown token '%c'\n", c); + return -1; +} diff --git a/core/parser.c b/core/parser.c new file mode 100644 index 0000000..a584a56 --- /dev/null +++ b/core/parser.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2026, Chloe M. + * Provided under the BSD-3 clause + */ + +#include +#include +#include "cescal/log.h" +#include "cescal/parser.h" +#include "cescal/state.h" +#include "cescal/lexer.h" + +/* Symbolic token: wraps the string literal in square brackets */ +#define symtok(tok) \ + "[" tok "]" + +/* Quoted token: wraps the string literal in single quotes */ +#define qtok(tok) \ + "'" tok "'" + +/* Convert a token type (tt_t value) to its toktab string */ +#define tokstr1(tt) \ + toktab[(tt)] + +/* Convert a token (pointer to struct token) to its toktab string */ +#define tokstr(tok) \ + toktab[(tok)->type] + +/* + * Maps token types (tt_t) to human readable strings, indexed by type + */ +static const char *toktab[] = { + [TT_NONE] = symtok("none"), + [TT_IDENT] = symtok("ident"), + [TT_INTLIT] = symtok("number"), + [TT_LPAREN] = qtok("("), + [TT_RPAREN] = qtok(")"), + [TT_COMMA] = qtok(","), + [TT_RETURN] = qtok("return"), + [TT_PUB] = qtok("pub"), + [TT_PROC] = qtok("proc"), + [TT_BEGIN] = qtok("begin"), + [TT_END] = qtok("end") +}; + +int +parser_parse(struct cescal_state *state) +{ + struct token tok; + + if (state == NULL) { + errno = EINVAL; + return -1; + } + + while (lexer_nom(state, &tok) == 0) { + cc_trace("got token %s\n", tokstr(&tok)); + } + + return 0; +} diff --git a/include/cescal/lexer.h b/include/cescal/lexer.h new file mode 100644 index 0000000..7d4a9b4 --- /dev/null +++ b/include/cescal/lexer.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2026, Chloe M. 
+ * Provided under the BSD-3 clause + */ + +#ifndef CESCAL_LEXER_H +#define CESCAL_LEXER_H 1 + +#include +#include +#include "cescal/token.h" +#include "cescal/state.h" + +/* + * Consume a single token from the input source file + * + * @state: Compiler state + * @res: Token result is written here + * + * Returns zero on success and -1 on failure (NULL argument with errno set to EINVAL, no character could be consumed, or unknown token) + */ +int lexer_nom(struct cescal_state *state, struct token *res); + +#endif /* !CESCAL_LEXER_H */ diff --git a/include/cescal/parser.h b/include/cescal/parser.h new file mode 100644 index 0000000..c97006b --- /dev/null +++ b/include/cescal/parser.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2026, Chloe M. + * Provided under the BSD-3 clause + */ + +#ifndef CESCAL_PARSER_H +#define CESCAL_PARSER_H 1 + +#include "cescal/state.h" + +/* + * Begin parsing the input source file + * + * @state: Compiler state + * + * Returns zero on success and -1 with errno set to EINVAL if state is NULL (NOTE(review): a lexer failure ends the token loop and still returns zero) + */ +int parser_parse(struct cescal_state *state); + +#endif /* !CESCAL_PARSER_H */ diff --git a/include/cescal/token.h b/include/cescal/token.h new file mode 100644 index 0000000..0be8ff0 --- /dev/null +++ b/include/cescal/token.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2026, Chloe M. + * Provided under the BSD-3 clause + */ + +#ifndef CESCAL_TOKEN_H +#define CESCAL_TOKEN_H 1 + +/* + * Represents valid source file token types + */ +typedef enum { + TT_NONE, /* [none] */ + TT_IDENT, /* [identifier] */ + TT_INTLIT, /* [0-9]+ */ + TT_LPAREN, /* '(' */ + TT_RPAREN, /* ')' */ + TT_COMMA, /* ',' */ + TT_RETURN, /* 'return' */ + TT_PUB, /* 'pub' */ + TT_PROC, /* 'proc' */ + TT_BEGIN, /* 'begin' */ + TT_END, /* 'end' */ +} tt_t; + +/* + * Represents a source file token + * + * @type: Token type (the anonymous union member c holds the character of a single-character token) + */ +struct token { + tt_t type; + union { + char c; + }; +}; + +#endif /* !CESCAL_TOKEN_H */ -- cgit v1.2.3