From 74e2e8c772d0f88da6684918f782b37156f10fb3 Mon Sep 17 00:00:00 2001
From: Ian Moffett <ian@mirocom.org>
Date: Sat, 23 May 2026 02:21:09 -0400
Subject: core: Add lexer + parser groundwork

Signed-off-by: Ian Moffett <ian@mirocom.org>
---
 core/cescal.c           |  5 +++
 core/lexer.c            | 84 +++++++++++++++++++++++++++++++++++++++++++++++++
 core/parser.c           | 61 +++++++++++++++++++++++++++++++++++
 include/cescal/lexer.h  | 24 ++++++++++++++
 include/cescal/parser.h | 20 ++++++++++++
 include/cescal/token.h  | 38 ++++++++++++++++++++++
 6 files changed, 232 insertions(+)
 create mode 100644 core/lexer.c
 create mode 100644 core/parser.c
 create mode 100644 include/cescal/lexer.h
 create mode 100644 include/cescal/parser.h
 create mode 100644 include/cescal/token.h

diff --git a/core/cescal.c b/core/cescal.c
index 6c828e1..0121389 100644
--- a/core/cescal.c
+++ b/core/cescal.c
@@ -6,6 +6,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include "cescal/state.h"
+#include "cescal/parser.h"
 #include "cescal/log.h"
 
 static void
@@ -28,6 +29,10 @@ compile(const char *pathname)
         return -1;
     }
 
+    if (parser_parse(&st) < 0) {
+        return -1;
+    }
+
     state_close(&st);
     return 0;
 }
diff --git a/core/lexer.c b/core/lexer.c
new file mode 100644
index 0000000..d6e5e66
--- /dev/null
+++ b/core/lexer.c
@@ -0,0 +1,84 @@
+#include <errno.h>
+#include <stdbool.h>
+#include "cescal/lexer.h"
+#include "cescal/log.h"
+
+/*
+ * Tells whether a character is treated as whitespace by the lexer
+ *
+ * @c: Character to check
+ *
+ * Returns true when @c is one of the characters listed below and
+ * false for everything else:
+ *   '\t' (tab), '\n' (newline), ' ' (space),
+ *   '\f' (form feed), '\r' (carriage return)
+ */
+static inline bool
+lexer_is_ws(char c)
+{
+    return c == '\t'
+        || c == '\n'
+        || c == ' '
+        || c == '\f'
+        || c == '\r';
+}
+
+/*
+ * Consume a single character from the input source file
+ *
+ * @state:   Compiler state
+ * @skip_ws: If true, whitespace is discarded; if false, it is returned
+ *
+ * Returns the character read, or '\0' if @state is NULL or the read yields '\0'
+ */
+static char
+lexer_consume_single(struct cescal_state *state, bool skip_ws)
+{
+    char c;
+
+    if (state == NULL) {
+        return '\0';
+    }
+
+    while ((c = readbuf_read(&state->rb, state->in_fd)) != '\0') {
+        /* Only drop whitespace when the caller asked for it */
+        if (!skip_ws || !lexer_is_ws(c)) {
+            return c;
+        }
+    }
+
+    return '\0';
+}
+
+int
+lexer_nom(struct cescal_state *state, struct token *res)
+{
+    char ch;
+
+    if (state == NULL || res == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    if ((ch = lexer_consume_single(state, true)) == '\0') {
+        return -1;
+    }
+
+    switch (ch) {
+    case '(':
+        res->type = TT_LPAREN;
+        break;
+    case ')':
+        res->type = TT_RPAREN;
+        break;
+    case ',':
+        res->type = TT_COMMA;
+        break;
+    default:
+        cc_error("got unknown token '%c'\n", ch);
+        return -1;
+    }
+
+    res->c = ch;
+    return 0;
+}
diff --git a/core/parser.c b/core/parser.c
new file mode 100644
index 0000000..a584a56
--- /dev/null
+++ b/core/parser.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2026, Chloe M.
+ * Provided under the BSD-3 clause
+ */
+
+#include <stdint.h>
+#include <errno.h>
+#include "cescal/log.h"
+#include "cescal/parser.h"
+#include "cescal/state.h"
+#include "cescal/lexer.h"
+
+/* Symbolic token: wraps the string literal @tok in brackets, e.g. "[ident]" */
+#define symtok(tok) \
+    "[" tok "]"
+
+/* Quoted token: wraps the string literal @tok in single quotes, e.g. "'('" */
+#define qtok(tok) \
+    "'" tok "'"
+
+/* Convert a token type (tt_t) to its string via toktab; no bounds check */
+#define tokstr1(tt) \
+    toktab[(tt)]
+
+/* Convert a token (pointer to struct token) to its string via its type */
+#define tokstr(tok) \
+    toktab[(tok)->type]
+
+/*
+ * Read-only table mapping each tt_t token type to its human readable string
+ */
+static const char *const toktab[] = {
+    [TT_NONE]   = symtok("none"),
+    [TT_IDENT]  = symtok("ident"),
+    [TT_INTLIT] = symtok("number"),
+    [TT_LPAREN] = qtok("("),
+    [TT_RPAREN] = qtok(")"),
+    [TT_COMMA]  = qtok(","),
+    [TT_RETURN] = qtok("return"),
+    [TT_PUB]    = qtok("pub"),
+    [TT_PROC]   = qtok("proc"),
+    [TT_BEGIN]  = qtok("begin"),
+    [TT_END]    = qtok("end")
+};
+
+int
+parser_parse(struct cescal_state *state)
+{
+    struct token cur;
+
+    if (state == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+    for (;;) {
+        if (lexer_nom(state, &cur) != 0) {
+            return 0;   /* lexer returned non-zero: stop */
+        }
+        cc_trace("got token %s\n", tokstr(&cur));
+    }
+}
diff --git a/include/cescal/lexer.h b/include/cescal/lexer.h
new file mode 100644
index 0000000..7d4a9b4
--- /dev/null
+++ b/include/cescal/lexer.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2026, Chloe M.
+ * Provided under the BSD-3 clause
+ */
+
+#ifndef CESCAL_LEXER_H
+#define CESCAL_LEXER_H 1
+
+#include <stdint.h>
+#include <stddef.h>
+#include "cescal/token.h"
+#include "cescal/state.h"
+
+/*
+ * Consume a single token from the input source file
+ *
+ * @state: Compiler state
+ * @res:   Token result is written here
+ *
+ * Returns zero on success and -1 on failure
+ */
+int lexer_nom(struct cescal_state *state, struct token *res);
+
+#endif  /* !CESCAL_LEXER_H */
diff --git a/include/cescal/parser.h b/include/cescal/parser.h
new file mode 100644
index 0000000..c97006b
--- /dev/null
+++ b/include/cescal/parser.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2026, Chloe M.
+ * Provided under the BSD-3 clause
+ */
+
+#ifndef CESCAL_PARSER_H
+#define CESCAL_PARSER_H 1
+
+#include "cescal/state.h"
+
+/*
+ * Begin parsing the input source file
+ *
+ * @state:  Compiler state
+ *
+ * Returns zero on success, or -1 with errno set to EINVAL if @state is NULL
+ */
+int parser_parse(struct cescal_state *state);
+
+#endif  /* !CESCAL_PARSER_H */
diff --git a/include/cescal/token.h b/include/cescal/token.h
new file mode 100644
index 0000000..0be8ff0
--- /dev/null
+++ b/include/cescal/token.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2026, Chloe M.
+ * Provided under the BSD-3 clause
+ */
+
+#ifndef CESCAL_TOKEN_H
+#define CESCAL_TOKEN_H 1
+
+/*
+ * Represents valid source file token types
+ */
+typedef enum {
+    TT_NONE,        /* [none] */
+    TT_IDENT,       /* [identifier] */
+    TT_INTLIT,      /* [0-9]+ */
+    TT_LPAREN,      /* '(' */
+    TT_RPAREN,      /* ')' */
+    TT_COMMA,       /* ',' */
+    TT_RETURN,      /* 'return' */
+    TT_PUB,         /* 'pub' */
+    TT_PROC,        /* 'proc' */
+    TT_BEGIN,       /* 'begin' */
+    TT_END,         /* 'end' */
+} tt_t;
+
+/*
+ * Represents a single token produced by the lexer
+ *
+ * @type: Token type (one of the tt_t values)
+ */
+struct token {
+    tt_t type;
+    union {
+        char c;     /* Character the token was lexed from */
+    };
+};
+
+#endif  /* !CESCAL_TOKEN_H */
-- 
cgit v1.2.3