Compare commits

...

10 Commits

Author SHA1 Message Date
6377515558 Starting with the lexer, i'm starting to comprehend better what pratt parsing is and how to apply it. For now just declaring basic structure and functions 2026-03-13 07:58:38 -06:00
903fdbd6ff I think i'm done, lexer works fine and errors work fine too, amazing actually, should be moving on to the parser so that i can construct the expression tree 2026-03-10 07:27:35 -06:00
73451fcca9 Damn, it works, the lexer actually works, that's amazing, need to test the bad cases but at least i'm sure it can detect and process correct math expressions 2026-03-10 07:08:12 -06:00
0de6cf5024 Modified the structure of the lexer, now is more easy to add types of numbers like fractions, like i could enev consider roots, irrationals, complex or imaginary, that would be dope. For now only support for integer, we need to get this shit running 2026-03-09 11:58:55 -06:00
afae8fbe3a Made the arrangements for the mentioned changes in the last commit, for now just integers but IT WILL be capable of handling doubles as fractions 2026-03-09 09:23:06 -06:00
771069455d First version for string_to_number, just one test, is working fine, i'm considering swithching to handling only integers for in the future to manage in special struct that manages doubles as fractions, obviously this will mean changing nodes for general numbers to integers/fractions and shit 2026-03-09 09:06:06 -06:00
194f1dd80f Second test, just pop, almost identical to first put important for asserting pop works because it is very needed 2026-03-05 10:20:44 -06:00
79f7e327ff First test added, changed signature for using pointers beacause i forgot you can't actually change a parameter because c copies everything, stupid from me to forget that 2026-03-05 08:27:36 -06:00
3126be5782 Added functionality for the basic array functionality, i'm going to make tests even thoug they are tedius as fuck, i'm way more interested in making tests for the lexer itself 2026-03-04 19:30:56 -06:00
adaf5c012f Redesigned everithing so that everything is cleaner and not making everything all messy to fix later, still, ther may be redesigns and shit but should be fine 2026-03-04 18:54:46 -06:00
7 changed files with 577 additions and 3 deletions

View File

@@ -0,0 +1,78 @@
#ifndef LEXER_H
#define LEXER_H
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
// Tag identifying which variant of ASTNode.data is active.
typedef enum {
NODE_INTEGER,
NODE_BINARY_OP,
} ASTNodeType;
// Classifies the four supported binary operators.
typedef enum {
OP_ADD,
OP_SUB,
OP_MUL,
OP_DIV
} Operator;
// Status codes returned by the ASTNodeArray_* functions.
typedef enum {
ARRAY_OK = 0,
ARRAY_NULL,
ARRAY_EMPTY,
ARRAY_OUT_OF_BOUNDS,
ARRAY_NULL_ARG,
ARRAY_ALLOC,
} ASTNodeArrayErr;
// Status codes returned by the lexer functions.
typedef enum {
LEXER_OK = 0,
LEXER_INT_OVERFLOW,
LEXER_FAILED_NUMBER_CONVERSION,
LEXER_NOT_RECOGNIZED_SYMBOL,
LEXER_EMPTY_INPUT,
LEXER_NULL_ARG,
LEXER_WRONG_SYNTAX,
LEXER_BUF_OVERFLOW,
} LexerErr;
// Can be thought of as tokens; they will be consumed by the parser.
// `type` selects the valid union member: NODE_INTEGER -> data.integer,
// NODE_BINARY_OP -> data.binary (left/right are filled in by the parser,
// the lexer leaves them NULL).
typedef struct ASTNode {
ASTNodeType type;
union {
int64_t integer;
struct {
struct ASTNode *left;
struct ASTNode *right;
Operator op;
} binary;
} data;
} ASTNode;
// Dynamic array used to store the token stream.
// Invariant: len <= cap; `data` holds `cap` ASTNode slots.
typedef struct {
size_t len;
size_t cap;
ASTNode *data;
} ASTNodeArray;
// Basic array functionality.
// init: `size` is the initial capacity; 0 selects a default.
ASTNodeArray ASTNodeArray_init(size_t size);
void ASTNodeArray_free(ASTNodeArray *arr);
ASTNodeArrayErr ASTNodeArray_push(ASTNodeArray *arr, ASTNode node);
ASTNodeArrayErr ASTNodeArray_get(const ASTNodeArray *arr, size_t index, ASTNode *out);
// Out in pop can be NULL so the removed value is simply discarded.
ASTNodeArrayErr ASTNodeArray_pop(ASTNodeArray *arr, size_t index, ASTNode *out);
size_t ASTNodeArray_len(ASTNodeArray *arr);
// Lexer functions and small helpers.
// tokenize: scans `input` into *out (caller frees via ASTNodeArray_free).
LexerErr tokenize(const char* input, ASTNodeArray *out);
// tokenize_number: reads a digit run at *offset; advances *offset past it.
LexerErr tokenize_number(const char* input, size_t *offset, ASTNode *out);
// string_to_integer: decimal digit string -> int64_t with overflow detection.
LexerErr string_to_integer(const char buf[], int64_t *number);
bool isoperator(int c);
// char_to_operator assumes the caller validated c with isoperator().
Operator char_to_operator(int c);
#endif // !LEXER_H

View File

@@ -0,0 +1,14 @@
#include "lexer.h"
// Parse tree produced by parse(); `head` is the root node.
// NOTE(review): ownership/freeing contract for `head` is not visible
// here — confirm who releases the tree.
typedef struct {
ASTNode *head;
} AST;
// Pratt-parser roles for the token currently being examined.
typedef enum {
PARSER_NUD, // Null Denotation: token starts an expression (e.g. a number)
PARSER_LED, // Left Denotation: token extends one (e.g. a binary operator)
} ParserState;
// Left / right binding power of an operator (Pratt precedence).
size_t node_lbp(Operator op);
size_t node_rbp(Operator op);
// Builds an AST from the token array produced by tokenize().
AST parse(ASTNodeArray arr);

View File

@@ -0,0 +1,250 @@
#include "lexer.h"
#include <ctype.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <strings.h>
#include <limits.h>
// Initial capacity used when ASTNodeArray_init is passed 0.
#define NODE_ARRAY_DEFAULT_SIZE 64
// Two-state machine for the lexer: a well-formed expression strictly
// alternates number, operator, number, ...
typedef enum {
WAIT_FOR_NUMBER,
WAIT_FOR_OPERATOR,
} LexerState;
// Create an empty array with `size` initial capacity (0 selects
// NODE_ARRAY_DEFAULT_SIZE). On allocation failure the result has
// data == NULL and cap == 0; callers should check `data` before use.
ASTNodeArray ASTNodeArray_init(size_t size) {
ASTNodeArray new;
new.len = 0;
new.cap = size == 0 ? NODE_ARRAY_DEFAULT_SIZE : size;
new.data = malloc(new.cap * sizeof(ASTNode));
if (new.data == NULL) {
// Bug fix: malloc was unchecked; report failure via a NULL/0 state
// instead of pretending capacity exists.
new.cap = 0;
}
return new;
}
// Release the backing storage and reset the array to a safe empty state.
// NULL `arr` is a no-op; `data` is NULLed to guard against double-free
// and use-after-free.
void ASTNodeArray_free(ASTNodeArray *arr) {
if (arr == NULL) {
return;
}
free(arr->data); // free(NULL) is a no-op, so no guard needed
arr->data = NULL;
arr->cap = 0;
arr->len = 0;
}
// Copy the element at `index` into *out without modifying the array.
// Errors are reported in the same priority order as the other array
// functions: NULL array, NULL out, empty array, index out of range.
ASTNodeArrayErr ASTNodeArray_get(const ASTNodeArray *arr, size_t index, ASTNode *out) {
if (!arr)
return ARRAY_NULL;
if (!out)
return ARRAY_NULL_ARG;
if (arr->len == 0)
return ARRAY_EMPTY;
if (index >= arr->len)
return ARRAY_OUT_OF_BOUNDS;
*out = arr->data[index];
return ARRAY_OK;
}
// Append `node`, doubling capacity when full. On realloc failure the
// original buffer is kept valid and ARRAY_ALLOC is returned.
ASTNodeArrayErr ASTNodeArray_push(ASTNodeArray *arr, ASTNode node) {
if (arr == NULL) {
return ARRAY_NULL;
}
if (arr->len >= arr->cap) {
// Bug fix: growing from cap == 0 (e.g. after a failed init) used to
// compute new_cap = 0 and request a zero-byte buffer; fall back to
// the default capacity instead.
size_t new_cap = arr->cap == 0 ? NODE_ARRAY_DEFAULT_SIZE : arr->cap * 2;
ASTNode *tmp = realloc(arr->data, new_cap * sizeof(ASTNode));
if (tmp == NULL) {
return ARRAY_ALLOC; // arr->data is still the old, valid buffer
}
arr->data = tmp;
arr->cap = new_cap;
}
arr->data[arr->len] = node;
arr->len = arr->len + 1;
return ARRAY_OK;
}
// Remove the element at `index`, optionally copying it into *out (out
// may be NULL to discard it). Remaining elements are shifted left and
// `len` is decremented; the buffer is halved once the array is sparse
// enough (len < cap/4). A failed shrink is non-fatal: the pop still
// succeeds and the larger buffer is kept.
ASTNodeArrayErr ASTNodeArray_pop(ASTNodeArray *arr, size_t index, ASTNode *out) {
if (arr == NULL) {
return ARRAY_NULL;
}
if (arr->len == 0) {
return ARRAY_EMPTY;
}
if (index >= arr->len) {
return ARRAY_OUT_OF_BOUNDS;
}
if (out != NULL) {
*out = arr->data[index];
}
// Bug fix: the old loop wrote arr->data[index] = arr->data[index + 1]
// on every iteration instead of using the loop variable, so the tail
// was never actually compacted.
for (size_t i = index; i < arr->len - 1; i++) {
arr->data[i] = arr->data[i + 1];
}
// Bug fix: the element count was never decremented, so the "removed"
// element remained visible.
arr->len = arr->len - 1;
if (arr->cap / 4 > arr->len) {
size_t new_cap = arr->cap / 2;
ASTNode *tmp = realloc(arr->data, new_cap * sizeof(ASTNode));
if (tmp != NULL) {
arr->data = tmp;
arr->cap = new_cap;
}
// If the shrink fails we simply keep the old buffer; previously this
// path returned ARRAY_ALLOC without removing anything.
}
return ARRAY_OK;
}
// Number of stored elements; a NULL array is treated as empty.
size_t ASTNodeArray_len(ASTNodeArray *arr) {
return arr ? arr->len : 0;
}
// Scan `input` (up to '\n' or '\0') into a flat array of integer and
// operator nodes. A valid expression strictly alternates
// number, operator, number, ... On any error the partially built array
// is freed and *out is left untouched; on success the caller owns *out
// and must release it with ASTNodeArray_free.
LexerErr tokenize(const char *input, ASTNodeArray *out) {
if (input == NULL || out == NULL) {
return LEXER_NULL_ARG;
}
size_t offset = 0;
LexerState state = WAIT_FOR_NUMBER;
ASTNodeArray arr = ASTNodeArray_init(0); // 0 selects the default capacity
while (input[offset] != '\n' && input[offset] != '\0') {
// Cast to unsigned char: passing a negative plain char to the
// <ctype.h> classifiers is undefined behavior.
int current = (unsigned char)input[offset];
if (isdigit(current)) {
if (state != WAIT_FOR_NUMBER) {
ASTNodeArray_free(&arr);
return LEXER_WRONG_SYNTAX;
}
ASTNode new_node;
LexerErr result = tokenize_number(input, &offset, &new_node);
if (result != LEXER_OK) {
ASTNodeArray_free(&arr);
return result;
}
// NOTE(review): push can fail on allocation but LexerErr has no
// matching code; result is currently ignored, as before.
ASTNodeArray_push(&arr, new_node);
state = WAIT_FOR_OPERATOR;
// Bug fix: tokenize_number already advanced `offset` to the first
// non-digit character; the shared offset++ below used to skip it,
// so "2+3" silently lost the '+'.
continue;
} else if (isoperator(current)) {
if (state != WAIT_FOR_OPERATOR) {
// Bug fix: this error path leaked the partially built array.
ASTNodeArray_free(&arr);
return LEXER_WRONG_SYNTAX;
}
ASTNode new_node = {
.type = NODE_BINARY_OP,
.data.binary.op = char_to_operator(current),
.data.binary.right = NULL,
.data.binary.left = NULL,
};
ASTNodeArray_push(&arr, new_node);
state = WAIT_FOR_NUMBER;
} else if (isspace(current)) {
// Whitespace separates tokens; nothing to emit.
} else {
ASTNodeArray_free(&arr);
return LEXER_NOT_RECOGNIZED_SYMBOL;
}
offset++;
}
if (arr.len < 1) {
// Bug fix: the empty-input path leaked the (empty) array buffer.
ASTNodeArray_free(&arr);
return LEXER_EMPTY_INPUT;
}
*out = arr;
return LEXER_OK;
}
// Read a run of decimal digits starting at *offset into an integer node.
// On success *out holds a NODE_INTEGER node and *offset is advanced to
// the first non-digit character. CURRENTLY only integers are supported;
// `is_integer` is a placeholder for future fraction/float handling.
LexerErr tokenize_number(const char *input, size_t *offset, ASTNode *out) {
if (input == NULL || offset == NULL || out == NULL) {
return LEXER_NULL_ARG;
}
char buf[128] = { '\0' };
size_t buf_pos = 0;
bool is_integer = true; // Will later be used to differentiate fractions
size_t current = *offset;
while (isdigit((unsigned char)input[current])) {
// Bug fix: bound-check BEFORE writing (the old code stored first and
// checked afterwards, allowing a one-byte out-of-bounds write), and
// reserve the last byte for the terminating '\0'.
if (buf_pos >= sizeof(buf) - 1) {
return LEXER_BUF_OVERFLOW;
}
buf[buf_pos] = input[current];
current++;
buf_pos++;
}
ASTNode new_node;
if (is_integer) {
new_node.type = NODE_INTEGER;
LexerErr status = string_to_integer(buf, &new_node.data.integer);
if (status == LEXER_OK) {
*out = new_node;
}
*offset = current;
return status;
}
return LEXER_FAILED_NUMBER_CONVERSION;
}
LexerErr string_to_integer(const char *buf, int64_t *number) {
int c = 0;
int64_t count = 0;
while (buf[c] != '\0') {
int digit = buf[c] - '0';
if (count > (INT64_MAX - digit) / 10) {
return LEXER_INT_OVERFLOW;
}
count = count * 10;
count += digit;
c++;
}
*number = count;
return LEXER_OK;
}
// True when c is one of the four binary operator characters the lexer
// accepts: + - * /.
bool isoperator(int c) {
return c == '+' || c == '-' || c == '*' || c == '/';
}
// Map an operator character to its Operator value. The caller is
// expected to have validated c with isoperator() first; any other
// character yields -1 (kept from the original contract).
Operator char_to_operator(int c) {
if (c == '+') {
return OP_ADD;
}
if (c == '-') {
return OP_SUB;
}
if (c == '*') {
return OP_MUL;
}
if (c == '/') {
return OP_DIV;
}
return -1; // unreachable for validated input
}

View File

@@ -0,0 +1,27 @@
#include "parser.h"
#include "lexer.h"
#include <stdlib.h>
// Left binding power (Pratt precedence): + and - bind at 10, * and /
// at 20, so multiplication/division group tighter than addition.
size_t node_lbp(Operator op) {
switch (op) {
case OP_ADD:
case OP_SUB:
return 10;
case OP_DIV:
case OP_MUL:
return 20;
default:
// Bug fix: without a default, an unknown op fell off the end of a
// non-void function — undefined behavior.
return 0;
}
}
// Right binding power: identical to node_lbp for these left-associative
// operators (+,- = 10; *,/ = 20).
size_t node_rbp(Operator op) {
switch (op) {
case OP_ADD:
case OP_SUB:
return 10;
case OP_DIV:
case OP_MUL:
return 20;
default:
// Bug fix: without a default, an unknown op fell off the end of a
// non-void function — undefined behavior.
return 0;
}
}

View File

@@ -1,10 +1,17 @@
# NOTE(review): this hunk appears garbled by the diff extraction — the
# line "target_link_libraries(test_parser" looks like a removed (-) line
# shown without its marker, and the test_parser executable/add_test lines
# may likewise be old lines. Verify the final file against the repository;
# as rendered here the parentheses are unbalanced.
find_package(cmocka REQUIRED)
add_executable(test_parser test_parser.c)
add_executable(test_nodeArray test_ASTNodeArray.c)
add_executable(test_lexer test_lexer.c)
target_link_libraries(test_parser
target_link_libraries(test_nodeArray
calculator_lib
cmocka::cmocka
)
add_test(NAME parser_tests COMMAND test_parser)
target_link_libraries(test_lexer
calculator_lib
cmocka::cmocka
)
add_test(NAME nodeArray_tests COMMAND test_nodeArray)
add_test(NAME lexer_tests COMMAND test_lexer)

86
test/test_ASTNodeArray.c Normal file
View File

@@ -0,0 +1,86 @@
#include "lexer.h"
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <setjmp.h>
#include <cmocka.h>
static void test_array_push(void **state) {
(void) state;
// We use 2 to force resize and checking anything wrong with malloc
ASTNodeArray arr = ASTNodeArray_init(2);
ASTNode node1 = {
.type = NODE_INTEGER,
.data = { .integer = 90 }
};
ASTNode node2 = {
.type = NODE_INTEGER,
.data = { .integer = 80 }
};
ASTNode node3 = {
.type = NODE_INTEGER,
.data = { .integer = 70 }
};
assert_int_equal(ASTNodeArray_push(&arr, node1), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 1);
assert_int_equal(ASTNodeArray_push(&arr, node2), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 2);
assert_int_equal(ASTNodeArray_push(&arr, node3), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 3);
ASTNodeArray_free(&arr);
}
// Popping the middle element must return it, shrink the length, and
// shift the tail left. The old test only checked the popped value, so
// it missed that pop neither decremented len nor compacted the array.
static void test_array_pop(void **state) {
(void) state;
// Oversized capacity so the shrink path is also reachable.
ASTNodeArray arr = ASTNodeArray_init(16);
ASTNode node1 = {
.type = NODE_INTEGER,
.data = { .integer = 90 }
};
ASTNode node2 = {
.type = NODE_INTEGER,
.data = { .integer = 80 }
};
ASTNode node3 = {
.type = NODE_INTEGER,
.data = { .integer = 70 }
};
assert_int_equal(ASTNodeArray_push(&arr, node1), ARRAY_OK);
assert_int_equal(ASTNodeArray_push(&arr, node2), ARRAY_OK);
assert_int_equal(ASTNodeArray_push(&arr, node3), ARRAY_OK);
assert_int_equal(ASTNodeArray_len(&arr), 3);
ASTNode node4;
assert_int_equal(ASTNodeArray_pop(&arr, 1, &node4), ARRAY_OK);
assert_int_equal(node4.type, NODE_INTEGER);
assert_int_equal(node4.data.integer, 80);
// Improvement: the removal itself must be observable.
assert_int_equal(ASTNodeArray_len(&arr), 2);
ASTNode check;
assert_int_equal(ASTNodeArray_get(&arr, 0, &check), ARRAY_OK);
assert_int_equal(check.data.integer, 90);
assert_int_equal(ASTNodeArray_get(&arr, 1, &check), ARRAY_OK);
assert_int_equal(check.data.integer, 70);
ASTNodeArray_free(&arr);
}
// Entry point: registers and runs the ASTNodeArray test group under
// cmocka.
int main(void) {
const struct CMUnitTest tests[] = {
cmocka_unit_test(test_array_push),
cmocka_unit_test(test_array_pop),
};
return cmocka_run_group_tests(tests, NULL, NULL);
}

112
test/test_lexer.c Normal file
View File

@@ -0,0 +1,112 @@
#include "lexer.h"
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <setjmp.h>
#include <cmocka.h>
static void test_tokenize_normal_expresion(void **state) {
(void) state;
char expr[256] = "2 + 3 / 66 * 789";
ASTNodeArray tokens;
ASTNode node;
assert_int_equal(tokenize(expr, &tokens), LEXER_OK);
assert_int_equal(tokens.len, 7);
ASTNodeArray_get(&tokens, 0, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 2);
ASTNodeArray_get(&tokens, 1, &node);
assert_int_equal(node.type, NODE_BINARY_OP);
assert_int_equal(node.data.binary.op, OP_ADD);
ASTNodeArray_get(&tokens, 2, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 3);
ASTNodeArray_get(&tokens, 3, &node);
assert_int_equal(node.type, NODE_BINARY_OP);
assert_int_equal(node.data.binary.op, OP_DIV);
ASTNodeArray_get(&tokens, 4, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 66);
ASTNodeArray_get(&tokens, 5, &node);
assert_int_equal(node.type, NODE_BINARY_OP);
assert_int_equal(node.data.binary.op, OP_MUL);
ASTNodeArray_get(&tokens, 6, &node);
assert_int_equal(node.type, NODE_INTEGER);
assert_int_equal(node.data.integer, 789);
}
// Characters that are neither digits, operators, nor whitespace must be
// rejected, and a failed tokenize must leave *out untouched.
static void test_tokenize_unrecognized_symbol(void **state) {
(void) state;
const char expr[256] = " 2 j 3 / 66 } 789"; // 'j' and '}' are invalid
ASTNodeArray tokens = {
.len = 0,
.cap = 0,
};
assert_int_equal(tokenize(expr, &tokens), LEXER_NOT_RECOGNIZED_SYMBOL);
// tokens still holds its initializer values: the out-param was not written.
assert_int_equal(tokens.len, 0);
assert_int_equal(tokens.cap, 0);
}
// Two numbers with no operator between them violate the alternation
// rule; tokenize must fail and leave *out untouched.
static void test_tokenize_wrong_sintax(void **state) {
(void) state;
const char expr[256] = "2 3 / 66 789"; // "2 3" lacks an operator
ASTNodeArray tokens = {
.len = 0,
.cap = 0,
};
assert_int_equal(tokenize(expr, &tokens), LEXER_WRONG_SYNTAX);
// tokens still holds its initializer values: the out-param was not written.
assert_int_equal(tokens.len, 0);
assert_int_equal(tokens.cap, 0);
}
// tokenize_number must consume exactly the leading digit run ("2333")
// and stop at the first non-digit character.
static void test_string_to_number_normal(void **state) {
(void) state;
const char num[16] = "2333t55";
size_t offset = 0;
ASTNode result;
assert_int_equal(tokenize_number(num, &offset, &result), LEXER_OK);
assert_int_equal(offset, 4); // index of 't', the first non-digit
assert_int_equal(result.type, NODE_INTEGER);
assert_int_equal(result.data.integer, 2333);
}
// A digit run larger than INT64_MAX must be rejected with an integer
// overflow error rather than wrapping.
static void test_string_to_number_overflow(void **state) {
(void) state;
// INT64_MAX (9223372036854775807) with an extra trailing '8'.
const char num[32] = "92233720368547758078yy7";
size_t offset = 0;
ASTNode result;
assert_int_equal(tokenize_number(num, &offset, &result), LEXER_INT_OVERFLOW);
// A buffer-overflow error is also conceivable for long inputs, but the
// integer overflow is detected first for this string.
}
// Entry point: registers and runs the lexer test group under cmocka.
int main(void) {
const struct CMUnitTest tests[] = {
cmocka_unit_test(test_string_to_number_normal),
cmocka_unit_test(test_string_to_number_overflow),
cmocka_unit_test(test_tokenize_normal_expresion),
cmocka_unit_test(test_tokenize_unrecognized_symbol),
cmocka_unit_test(test_tokenize_wrong_sintax),
};
return cmocka_run_group_tests(tests, NULL, NULL);
}