Compare commits

...

10 Commits

Author SHA1 Message Date
6377515558 Starting with the lexer, i'm starting to comprehend better what pratt parsing is and how to apply it. For now just declaring basic structure and functions 2026-03-13 07:58:38 -06:00
903fdbd6ff I think i'm done, lexer works fine and errors work fine too, amazing actually, should be moving on to the parser so that i can construct the expression tree 2026-03-10 07:27:35 -06:00
73451fcca9 Damn, it works, the lexer actually works, that's amazing, need to test the bad cases but at least i'm sure it can detect and process correct math expressions 2026-03-10 07:08:12 -06:00
0de6cf5024 Modified the structure of the lexer, now is more easy to add types of numbers like fractions, like i could enev consider roots, irrationals, complex or imaginary, that would be dope. For now only support for integer, we need to get this shit running 2026-03-09 11:58:55 -06:00
afae8fbe3a Made the arrangements for the mentioned changes in the last commit, for now just integers but IT WILL be capable of handling doubles as fractions 2026-03-09 09:23:06 -06:00
771069455d First version for string_to_number, just one test, is working fine, i'm considering swithching to handling only integers for in the future to manage in special struct that manages doubles as fractions, obviously this will mean changing nodes for general numbers to integers/fractions and shit 2026-03-09 09:06:06 -06:00
194f1dd80f Second test, just pop, almost identical to first put important for asserting pop works because it is very needed 2026-03-05 10:20:44 -06:00
79f7e327ff First test added, changed signature for using pointers beacause i forgot you can't actually change a parameter because c copies everything, stupid from me to forget that 2026-03-05 08:27:36 -06:00
3126be5782 Added functionality for the basic array functionality, i'm going to make tests even thoug they are tedius as fuck, i'm way more interested in making tests for the lexer itself 2026-03-04 19:30:56 -06:00
adaf5c012f Redesigned everithing so that everything is cleaner and not making everything all messy to fix later, still, ther may be redesigns and shit but should be fine 2026-03-04 18:54:46 -06:00
7 changed files with 577 additions and 3 deletions

View File

@@ -0,0 +1,78 @@
#ifndef LEXER_H
#define LEXER_H
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
// Tag identifying which variant of ASTNode.data is active.
typedef enum {
NODE_INTEGER,
NODE_BINARY_OP,
} ASTNodeType;
// Classifies the four supported binary operators.
typedef enum {
OP_ADD,
OP_SUB,
OP_MUL,
OP_DIV
} Operator;
// Status codes returned by the ASTNodeArray_* functions.
typedef enum {
ARRAY_OK = 0,
ARRAY_NULL,
ARRAY_EMPTY,
ARRAY_OUT_OF_BOUNDS,
ARRAY_NULL_ARG,
ARRAY_ALLOC,
} ASTNodeArrayErr;
// Status codes returned by the lexer functions.
typedef enum {
LEXER_OK = 0,
LEXER_INT_OVERFLOW,
LEXER_FAILED_NUMBER_CONVERSION,
LEXER_NOT_RECOGNIZED_SYMBOL,
LEXER_EMPTY_INPUT,
LEXER_NULL_ARG,
LEXER_WRONG_SYNTAX,
LEXER_BUF_OVERFLOW,
} LexerErr;
// Can be thought of as tokens; they will be consumed by the parser.
// `type` selects the valid union member: NODE_INTEGER -> data.integer,
// NODE_BINARY_OP -> data.binary (left/right are filled in by the parser,
// the lexer leaves them NULL).
typedef struct ASTNode {
ASTNodeType type;
union {
int64_t integer;
struct {
struct ASTNode *left;
struct ASTNode *right;
Operator op;
} binary;
} data;
} ASTNode;
// Dynamic array used to store the token stream.
// Invariant: len <= cap; `data` holds `cap` ASTNode slots.
typedef struct {
size_t len;
size_t cap;
ASTNode *data;
} ASTNodeArray;
// Basic array functionality.
// init: `size` is the initial capacity; 0 selects a default.
ASTNodeArray ASTNodeArray_init(size_t size);
void ASTNodeArray_free(ASTNodeArray *arr);
ASTNodeArrayErr ASTNodeArray_push(ASTNodeArray *arr, ASTNode node);
ASTNodeArrayErr ASTNodeArray_get(const ASTNodeArray *arr, size_t index, ASTNode *out);
// Out in pop can be NULL so the removed value is simply discarded.
ASTNodeArrayErr ASTNodeArray_pop(ASTNodeArray *arr, size_t index, ASTNode *out);
size_t ASTNodeArray_len(ASTNodeArray *arr);
// Lexer functions and small helpers.
// tokenize: scans `input` into *out (caller frees via ASTNodeArray_free).
LexerErr tokenize(const char* input, ASTNodeArray *out);
// tokenize_number: reads a digit run at *offset; advances *offset past it.
LexerErr tokenize_number(const char* input, size_t *offset, ASTNode *out);
// string_to_integer: decimal digit string -> int64_t with overflow detection.
LexerErr string_to_integer(const char buf[], int64_t *number);
bool isoperator(int c);
// char_to_operator assumes the caller validated c with isoperator().
Operator char_to_operator(int c);
#endif // !LEXER_H

View File

@@ -0,0 +1,14 @@
#include "lexer.h"
// Parse tree produced by parse(); `head` is the root node.
// NOTE(review): ownership/freeing contract for `head` is not visible
// here — confirm who releases the tree.
typedef struct {
ASTNode *head;
} AST;
// Pratt-parser roles for the token currently being examined.
typedef enum {
PARSER_NUD, // Null Denotation: token starts an expression (e.g. a number)
PARSER_LED, // Left Denotation: token extends one (e.g. a binary operator)
} ParserState;
// Left / right binding power of an operator (Pratt precedence).
size_t node_lbp(Operator op);
size_t node_rbp(Operator op);
// Builds an AST from the token array produced by tokenize().
AST parse(ASTNodeArray arr);

View File

@@ -0,0 +1,250 @@
#include "lexer.h"
#include <ctype.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <strings.h>
#include <limits.h>
// Initial capacity used when ASTNodeArray_init is passed 0.
#define NODE_ARRAY_DEFAULT_SIZE 64
// Two-state machine for the lexer: a well-formed expression strictly
// alternates number, operator, number, ...
typedef enum {
WAIT_FOR_NUMBER,
WAIT_FOR_OPERATOR,
} LexerState;
// Create an empty array with `size` initial capacity (0 selects
// NODE_ARRAY_DEFAULT_SIZE). On allocation failure the result has
// data == NULL and cap == 0; callers should check `data` before use.
ASTNodeArray ASTNodeArray_init(size_t size) {
ASTNodeArray new;
new.len = 0;
new.cap = size == 0 ? NODE_ARRAY_DEFAULT_SIZE : size;
new.data = malloc(new.cap * sizeof(ASTNode));
if (new.data == NULL) {
// Bug fix: malloc was unchecked; report failure via a NULL/0 state
// instead of pretending capacity exists.
new.cap = 0;
}
return new;
}
// Release the backing storage and reset the array to a safe empty state.
// NULL `arr` is a no-op; `data` is NULLed to guard against double-free
// and use-after-free.
void ASTNodeArray_free(ASTNodeArray *arr) {
if (arr == NULL) {
return;
}
free(arr->data); // free(NULL) is a no-op, so no guard needed
arr->data = NULL;
arr->cap = 0;
arr->len = 0;
}
// Copy the element at `index` into *out without modifying the array.
// Errors are reported in the same priority order as the other array
// functions: NULL array, NULL out, empty array, index out of range.
ASTNodeArrayErr ASTNodeArray_get(const ASTNodeArray *arr, size_t index, ASTNode *out) {
if (!arr)
return ARRAY_NULL;
if (!out)
return ARRAY_NULL_ARG;
if (arr->len == 0)
return ARRAY_EMPTY;
if (index >= arr->len)
return ARRAY_OUT_OF_BOUNDS;
*out = arr->data[index];
return ARRAY_OK;
}
// Append `node`, doubling capacity when full. On realloc failure the
// original buffer is kept valid and ARRAY_ALLOC is returned.
ASTNodeArrayErr ASTNodeArray_push(ASTNodeArray *arr, ASTNode node) {
if (arr == NULL) {
return ARRAY_NULL;
}
if (arr->len >= arr->cap) {
// Bug fix: growing from cap == 0 (e.g. after a failed init) used to
// compute new_cap = 0 and request a zero-byte buffer; fall back to
// the default capacity instead.
size_t new_cap = arr->cap == 0 ? NODE_ARRAY_DEFAULT_SIZE : arr->cap * 2;
ASTNode *tmp = realloc(arr->data, new_cap * sizeof(ASTNode));
if (tmp == NULL) {
return ARRAY_ALLOC; // arr->data is still the old, valid buffer
}
arr->data = tmp;
arr->cap = new_cap;
}
arr->data[arr->len] = node;
arr->len = arr->len + 1;
return ARRAY_OK;
}
// Remove the element at `index`, optionally copying it into *out (out
// may be NULL to discard it). Remaining elements are shifted left and
// `len` is decremented; the buffer is halved once the array is sparse
// enough (len < cap/4). A failed shrink is non-fatal: the pop still
// succeeds and the larger buffer is kept.
ASTNodeArrayErr ASTNodeArray_pop(ASTNodeArray *arr, size_t index, ASTNode *out) {
if (arr == NULL) {
return ARRAY_NULL;
}
if (arr->len == 0) {
return ARRAY_EMPTY;
}
if (index >= arr->len) {
return ARRAY_OUT_OF_BOUNDS;
}
if (out != NULL) {
*out = arr->data[index];
}
// Bug fix: the old loop wrote arr->data[index] = arr->data[index + 1]
// on every iteration instead of using the loop variable, so the tail
// was never actually compacted.
for (size_t i = index; i < arr->len - 1; i++) {
arr->data[i] = arr->data[i + 1];
}
// Bug fix: the element count was never decremented, so the "removed"
// element remained visible.
arr->len = arr->len - 1;
if (arr->cap / 4 > arr->len) {
size_t new_cap = arr->cap / 2;
ASTNode *tmp = realloc(arr->data, new_cap * sizeof(ASTNode));
if (tmp != NULL) {
arr->data = tmp;
arr->cap = new_cap;
}
// If the shrink fails we simply keep the old buffer; previously this
// path returned ARRAY_ALLOC without removing anything.
}
return ARRAY_OK;
}
// Number of stored elements; a NULL array is treated as empty.
size_t ASTNodeArray_len(ASTNodeArray *arr) {
return arr ? arr->len : 0;
}
// Scan `input` (up to '\n' or '\0') into a flat array of integer and
// operator nodes. A valid expression strictly alternates
// number, operator, number, ... On any error the partially built array
// is freed and *out is left untouched; on success the caller owns *out
// and must release it with ASTNodeArray_free.
LexerErr tokenize(const char *input, ASTNodeArray *out) {
if (input == NULL || out == NULL) {
return LEXER_NULL_ARG;
}
size_t offset = 0;
LexerState state = WAIT_FOR_NUMBER;
ASTNodeArray arr = ASTNodeArray_init(0); // 0 selects the default capacity
while (input[offset] != '\n' && input[offset] != '\0') {
// Cast to unsigned char: passing a negative plain char to the
// <ctype.h> classifiers is undefined behavior.
int current = (unsigned char)input[offset];
if (isdigit(current)) {
if (state != WAIT_FOR_NUMBER) {
ASTNodeArray_free(&arr);
return LEXER_WRONG_SYNTAX;
}
ASTNode new_node;
LexerErr result = tokenize_number(input, &offset, &new_node);
if (result != LEXER_OK) {
ASTNodeArray_free(&arr);
return result;
}
// NOTE(review): push can fail on allocation but LexerErr has no
// matching code; result is currently ignored, as before.
ASTNodeArray_push(&arr, new_node);
state = WAIT_FOR_OPERATOR;
// Bug fix: tokenize_number already advanced `offset` to the first
// non-digit character; the shared offset++ below used to skip it,
// so "2+3" silently lost the '+'.
continue;
} else if (isoperator(current)) {
if (state != WAIT_FOR_OPERATOR) {
// Bug fix: this error path leaked the partially built array.
ASTNodeArray_free(&arr);
return LEXER_WRONG_SYNTAX;
}
ASTNode new_node = {
.type = NODE_BINARY_OP,
.data.binary.op = char_to_operator(current),
.data.binary.right = NULL,
.data.binary.left = NULL,
};
ASTNodeArray_push(&arr, new_node);
state = WAIT_FOR_NUMBER;
} else if (isspace(current)) {
// Whitespace separates tokens; nothing to emit.
} else {
ASTNodeArray_free(&arr);
return LEXER_NOT_RECOGNIZED_SYMBOL;
}
offset++;
}
if (arr.len < 1) {
// Bug fix: the empty-input path leaked the (empty) array buffer.
ASTNodeArray_free(&arr);
return LEXER_EMPTY_INPUT;
}
*out = arr;
return LEXER_OK;
}
// Read a run of decimal digits starting at *offset into an integer node.
// On success *out holds a NODE_INTEGER node and *offset is advanced to
// the first non-digit character. CURRENTLY only integers are supported;
// `is_integer` is a placeholder for future fraction/float handling.
LexerErr tokenize_number(const char *input, size_t *offset, ASTNode *out) {
if (input == NULL || offset == NULL || out == NULL) {
return LEXER_NULL_ARG;
}
char buf[128] = { '\0' };
size_t buf_pos = 0;
bool is_integer = true; // Will later be used to differentiate fractions
size_t current = *offset;
while (isdigit((unsigned char)input[current])) {
// Bug fix: bound-check BEFORE writing (the old code stored first and
// checked afterwards, allowing a one-byte out-of-bounds write), and
// reserve the last byte for the terminating '\0'.
if (buf_pos >= sizeof(buf) - 1) {
return LEXER_BUF_OVERFLOW;
}
buf[buf_pos] = input[current];
current++;
buf_pos++;
}
ASTNode new_node;
if (is_integer) {
new_node.type = NODE_INTEGER;
LexerErr status = string_to_integer(buf, &new_node.data.integer);
if (status == LEXER_OK) {
*out = new_node;
}
*offset = current;
return status;
}
return LEXER_FAILED_NUMBER_CONVERSION;
}
LexerErr string_to_integer(const char *buf, int64_t *number) {
int c = 0;
int64_t count = 0;
while (buf[c] != '\0') {
int digit = buf[c] - '0';
if (count > (INT64_MAX - digit) / 10) {
return LEXER_INT_OVERFLOW;
}
count = count * 10;
count += digit;
c++;
}
*number = count;
return LEXER_OK;
}
// True when c is one of the four binary operator characters the lexer
// accepts: + - * /.
bool isoperator(int c) {
return c == '+' || c == '-' || c == '*' || c == '/';
}
// Map an operator character to its Operator value. The caller is
// expected to have validated c with isoperator() first; any other
// character yields -1 (kept from the original contract).
Operator char_to_operator(int c) {
if (c == '+') {
return OP_ADD;
}
if (c == '-') {
return OP_SUB;
}
if (c == '*') {
return OP_MUL;
}
if (c == '/') {
return OP_DIV;
}
return -1; // unreachable for validated input
}

View File

@@ -0,0 +1,27 @@
#include "parser.h"
#include "lexer.h"
#include <stdlib.h>
// Left binding power (Pratt precedence): + and - bind at 10, * and /
// at 20, so multiplication/division group tighter than addition.
size_t node_lbp(Operator op) {
switch (op) {
case OP_ADD:
case OP_SUB:
return 10;
case OP_DIV:
case OP_MUL:
return 20;
default:
// Bug fix: without a default, an unknown op fell off the end of a
// non-void function — undefined behavior.
return 0;
}
}
// Right binding power: identical to node_lbp for these left-associative
// operators (+,- = 10; *,/ = 20).
size_t node_rbp(Operator op) {
switch (op) {
case OP_ADD:
case OP_SUB:
return 10;
case OP_DIV:
case OP_MUL:
return 20;
default:
// Bug fix: without a default, an unknown op fell off the end of a
// non-void function — undefined behavior.
return 0;
}
}

View File

@@ -1,10 +1,17 @@
# NOTE(review): this hunk appears garbled by the diff extraction — the
# line "target_link_libraries(test_parser" looks like a removed (-) line
# shown without its marker, and the test_parser executable/add_test lines
# may likewise be old lines. Verify the final file against the repository;
# as rendered here the parentheses are unbalanced.
find_package(cmocka REQUIRED)
add_executable(test_parser test_parser.c)
add_executable(test_nodeArray test_ASTNodeArray.c)
add_executable(test_lexer test_lexer.c)
target_link_libraries(test_parser
target_link_libraries(test_nodeArray
calculator_lib
cmocka::cmocka
)
add_test(NAME parser_tests COMMAND test_parser)
target_link_libraries(test_lexer
calculator_lib
cmocka::cmocka
)
add_test(NAME nodeArray_tests COMMAND test_nodeArray)
add_test(NAME lexer_tests COMMAND test_lexer)

86
test/test_ASTNodeArray.c Normal file
View File

@@ -0,0 +1,86 @@
#include "lexer.h"
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <setjmp.h>
#include <cmocka.h>
static void test_array_push(void **state) {
(void) state;
// We use 2 to force resize and checking anything wrong with malloc
ASTNodeArray arr = ASTNodeArray_init(2);
ASTNode node1 = {
.type = NODE_INTEGER,
.data = { .integer = 90 }
};
ASTNode node2 = {
.type = NODE_INTEGER,
.data = { .integer = 80 }
};
ASTNode node3 = {
.type = NODE_INTEGER,
.data = { .integer = 70 }
};
assert_int_equal(ASTNodeArray_push(&arr, node1), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 1);
assert_int_equal(ASTNodeArray_push(&arr, node2), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 2);
assert_int_equal(ASTNodeArray_push(&arr, node3), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 3);
ASTNodeArray_free(&arr);
}
// Popping the middle element must return it, shrink the length, and
// shift the tail left. The old test only checked the popped value, so
// it missed that pop neither decremented len nor compacted the array.
static void test_array_pop(void **state) {
(void) state;
// Oversized capacity so the shrink path is also reachable.
ASTNodeArray arr = ASTNodeArray_init(16);
ASTNode node1 = {
.type = NODE_INTEGER,
.data = { .integer = 90 }
};
ASTNode node2 = {
.type = NODE_INTEGER,
.data = { .integer = 80 }
};
ASTNode node3 = {
.type = NODE_INTEGER,
.data = { .integer = 70 }
};
assert_int_equal(ASTNodeArray_push(&arr, node1), ARRAY_OK);
assert_int_equal(ASTNodeArray_push(&arr, node2), ARRAY_OK);
assert_int_equal(ASTNodeArray_push(&arr, node3), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 3);
ASTNode node4;
assert_int_equal(ASTNodeArray_pop(&arr, 1, &node4), ARRAY_OK);
assert_int_equal(node4.type, NODE_INTEGER);
assert_int_equal(node4.data.integer, 80);
// Improvement: the removal itself must be observable.
assert_int_equal(ASTNodeArray_len(&arr), 2);
ASTNode check;
assert_int_equal(ASTNodeArray_get(&arr, 0, &check), ARRAY_OK);
assert_int_equal(check.data.integer, 90);
assert_int_equal(ASTNodeArray_get(&arr, 1, &check), ARRAY_OK);
assert_int_equal(check.data.integer, 70);
ASTNodeArray_free(&arr);
}
// Entry point: registers and runs the ASTNodeArray test group under
// cmocka.
int main(void) {
const struct CMUnitTest tests[] = {
cmocka_unit_test(test_array_push),
cmocka_unit_test(test_array_pop),
};
return cmocka_run_group_tests(tests, NULL, NULL);
}

112
test/test_lexer.c Normal file
View File

@@ -0,0 +1,112 @@
#include "lexer.h"
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <setjmp.h>
#include <cmocka.h>
static void test_tokenize_normal_expresion(void **state) {
(void) state;
char expr[256] = "2 + 3 / 66 * 789";
ASTNodeArray tokens;
ASTNode node;
assert_int_equal(tokenize(expr, &tokens), LEXER_OK);
assert_int_equal(tokens.len, 7);
ASTNodeArray_get(&tokens, 0, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 2);
ASTNodeArray_get(&tokens, 1, &node);
assert_int_equal(node.type, NODE_BINARY_OP);
assert_int_equal(node.data.binary.op, OP_ADD);
ASTNodeArray_get(&tokens, 2, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 3);
ASTNodeArray_get(&tokens, 3, &node);
assert_int_equal(node.type, NODE_BINARY_OP);
assert_int_equal(node.data.binary.op, OP_DIV);
ASTNodeArray_get(&tokens, 4, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 66);
ASTNodeArray_get(&tokens, 5, &node);
assert_int_equal(node.type, NODE_BINARY_OP);
assert_int_equal(node.data.binary.op, OP_MUL);
ASTNodeArray_get(&tokens, 6, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 789);
}
// Characters that are neither digits, operators, nor whitespace must be
// rejected, and a failed tokenize must leave *out untouched.
static void test_tokenize_unrecognized_symbol(void **state) {
(void) state;
const char expr[256] = " 2 j 3 / 66 } 789"; // 'j' and '}' are invalid
ASTNodeArray tokens = {
.len = 0,
.cap = 0,
};
assert_int_equal(tokenize(expr, &tokens), LEXER_NOT_RECOGNIZED_SYMBOL);
// tokens still holds its initializer values: the out-param was not written.
assert_int_equal(tokens.len, 0);
assert_int_equal(tokens.cap, 0);
}
// Two numbers with no operator between them violate the alternation
// rule; tokenize must fail and leave *out untouched.
static void test_tokenize_wrong_sintax(void **state) {
(void) state;
const char expr[256] = "2 3 / 66 789"; // "2 3" lacks an operator
ASTNodeArray tokens = {
.len = 0,
.cap = 0,
};
assert_int_equal(tokenize(expr, &tokens), LEXER_WRONG_SYNTAX);
// tokens still holds its initializer values: the out-param was not written.
assert_int_equal(tokens.len, 0);
assert_int_equal(tokens.cap, 0);
}
// tokenize_number must consume exactly the leading digit run ("2333")
// and stop at the first non-digit character.
static void test_string_to_number_normal(void **state) {
(void) state;
const char num[16] = "2333t55";
size_t offset = 0;
ASTNode result;
assert_int_equal(tokenize_number(num, &offset, &result), LEXER_OK);
assert_int_equal(offset, 4); // index of 't', the first non-digit
assert_int_equal(result.type, NODE_INTEGER);
assert_int_equal(result.data.integer, 2333);
}
// A digit run larger than INT64_MAX must be rejected with an integer
// overflow error rather than wrapping.
static void test_string_to_number_overflow(void **state) {
(void) state;
// INT64_MAX (9223372036854775807) with an extra trailing '8'.
const char num[32] = "92233720368547758078yy7";
size_t offset = 0;
ASTNode result;
assert_int_equal(tokenize_number(num, &offset, &result), LEXER_INT_OVERFLOW);
// A buffer-overflow error is also conceivable for long inputs, but the
// integer overflow is detected first for this string.
}
// Entry point: registers and runs the lexer test group under cmocka.
int main(void) {
const struct CMUnitTest tests[] = {
cmocka_unit_test(test_string_to_number_normal),
cmocka_unit_test(test_string_to_number_overflow),
cmocka_unit_test(test_tokenize_normal_expresion),
cmocka_unit_test(test_tokenize_unrecognized_symbol),
cmocka_unit_test(test_tokenize_wrong_sintax),
};
return cmocka_run_group_tests(tests, NULL, NULL);
}