2026-03-04 18:54:46 -06:00
|
|
|
#include "lexer.h"
|
2026-03-09 09:06:06 -06:00
|
|
|
#include <ctype.h>
|
|
|
|
|
#include <math.h>
|
|
|
|
|
#include <stdint.h>
|
2026-03-04 19:30:56 -06:00
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <strings.h>
|
2026-03-09 09:06:06 -06:00
|
|
|
#include <limits.h>
|
2026-03-04 18:54:46 -06:00
|
|
|
|
2026-03-04 19:30:56 -06:00
|
|
|
#define NODE_ARRAY_DEFAULT_SIZE 64
|
2026-03-04 18:54:46 -06:00
|
|
|
// State of the tokenizer's two-state machine: it records which kind of
// token is legal at the current position of the input.
typedef enum {
    WAIT_FOR_NUMBER = 0,   // the next token must be a number
    WAIT_FOR_OPERATOR = 1, // the next token must be an operator
} LexerState;
|
|
|
|
|
|
2026-03-04 19:30:56 -06:00
|
|
|
ASTNodeArray ASTNodeArray_init(size_t size) {
|
|
|
|
|
ASTNodeArray new;
|
|
|
|
|
new.len = 0; // if 0 then use default
|
|
|
|
|
new.cap = size == 0 ? NODE_ARRAY_DEFAULT_SIZE : size;
|
|
|
|
|
new.data = malloc(new.cap * sizeof(ASTNode));
|
|
|
|
|
return new;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Releases the storage owned by `arr` and resets it to an empty state.
//
// arr: array to release; NULL is accepted and ignored.
//
// After the call data is NULL and len/cap are 0, so calling this function
// again on the same array is harmless (free(NULL) is a no-op).
void ASTNodeArray_free(ASTNodeArray *arr) {
    if (arr == NULL) {
        return;
    }

    free(arr->data);
    arr->data = NULL; // avoid a dangling pointer / double free
    arr->cap = 0;
    arr->len = 0;
}
|
|
|
|
|
|
2026-03-05 08:27:36 -06:00
|
|
|
// Copies the node stored at `index` into `*out` without modifying the array.
//
// Returns:
//   ARRAY_NULL           when `arr` is NULL
//   ARRAY_NULL_ARG       when `out` is NULL
//   ARRAY_EMPTY          when the array holds no nodes
//   ARRAY_OUT_OF_BOUNDS  when `index` is not below the current length
//   ARRAY_OK             on success (`*out` is written only in this case)
ASTNodeArrayErr ASTNodeArray_get(const ASTNodeArray *arr, size_t index, ASTNode *out) {
    ASTNodeArrayErr status = ARRAY_OK;

    // The checks are ordered; the first one that fails decides the result.
    if (!arr) {
        status = ARRAY_NULL;
    } else if (!out) {
        status = ARRAY_NULL_ARG;
    } else if (arr->len == 0) {
        status = ARRAY_EMPTY;
    } else if (index >= arr->len) {
        status = ARRAY_OUT_OF_BOUNDS;
    } else {
        *out = arr->data[index];
    }

    return status;
}
|
|
|
|
|
|
2026-03-05 08:27:36 -06:00
|
|
|
// Appends `node` at the end of the array, growing the storage when it is full.
//
// Growth doubles the capacity. An array without storage (cap == 0 or
// data == NULL, e.g. after ASTNodeArray_free or a failed allocation) is grown
// to NODE_ARRAY_DEFAULT_SIZE instead, because doubling 0 would stay at 0.
//
// Returns:
//   ARRAY_NULL   when `arr` is NULL
//   ARRAY_ALLOC  when the storage cannot be grown (allocation failure or the
//                new size would overflow size_t); the array is left unchanged
//   ARRAY_OK     on success
ASTNodeArrayErr ASTNodeArray_push(ASTNodeArray *arr, ASTNode node) {
    if (arr == NULL) {
        return ARRAY_NULL;
    }

    // Without storage the usable capacity is zero, whatever `cap` says.
    size_t usable_cap = (arr->data == NULL) ? 0 : arr->cap;

    if (arr->len >= usable_cap) {
        size_t new_cap;

        if (usable_cap == 0) {
            new_cap = NODE_ARRAY_DEFAULT_SIZE;
        } else if (usable_cap > SIZE_MAX / 2) {
            return ARRAY_ALLOC; // doubling would wrap around
        } else {
            new_cap = usable_cap * 2;
        }

        if (new_cap > SIZE_MAX / sizeof(ASTNode)) {
            return ARRAY_ALLOC; // byte count would wrap around
        }

        // Keep the old pointer until realloc succeeds so nothing leaks.
        ASTNode *tmp = realloc(arr->data, new_cap * sizeof(ASTNode));
        if (tmp == NULL) {
            return ARRAY_ALLOC;
        }
        arr->data = tmp;
        arr->cap = new_cap;
    }

    arr->data[arr->len] = node;
    arr->len = arr->len + 1;

    return ARRAY_OK;
}
|
|
|
|
|
|
2026-03-05 08:27:36 -06:00
|
|
|
// Removes the node at `index`, shifting every later node one slot down.
//
// arr:   array to modify
// index: position of the node to remove
// out:   optional; when not NULL it receives a copy of the removed node
//
// After the removal the storage is halved when less than a quarter of the
// capacity is in use. A failed shrink is not an error: the node has already
// been removed and the array simply keeps its larger storage.
//
// Returns:
//   ARRAY_NULL           when `arr` is NULL
//   ARRAY_EMPTY          when the array holds no nodes
//   ARRAY_OUT_OF_BOUNDS  when `index` is not below the current length
//   ARRAY_OK             on success
ASTNodeArrayErr ASTNodeArray_pop(ASTNodeArray *arr, size_t index, ASTNode *out) {
    if (arr == NULL) {
        return ARRAY_NULL;
    }

    if (arr->len == 0) {
        return ARRAY_EMPTY;
    }

    if (index >= arr->len) {
        return ARRAY_OUT_OF_BOUNDS;
    }

    if (out != NULL) {
        *out = arr->data[index];
    }

    // Close the gap: each node after `index` moves one position down.
    for (size_t i = index; i + 1 < arr->len; i++) {
        arr->data[i] = arr->data[i + 1];
    }
    arr->len = arr->len - 1;

    // cap / 4 > len implies cap >= 4, so new_cap is at least 2 and still
    // larger than len; realloc is never asked for zero bytes.
    if (arr->cap / 4 > arr->len) {
        size_t new_cap = arr->cap / 2;
        ASTNode *tmp = realloc(arr->data, new_cap * sizeof(ASTNode));
        if (tmp != NULL) {
            arr->data = tmp;
            arr->cap = new_cap;
        }
    }

    return ARRAY_OK;
}
|
2026-03-05 08:27:36 -06:00
|
|
|
|
|
|
|
|
// Reports how many nodes are stored in `arr`; a NULL array counts as empty.
size_t ASTNodeArray_len(ASTNodeArray *arr) {
    return arr ? arr->len : 0;
}
|
2026-03-09 09:06:06 -06:00
|
|
|
|
2026-03-10 07:08:12 -06:00
|
|
|
// Splits one line of `input` into number and operator nodes.
//
// Scanning stops at the first '\n' or at the terminating '\0'. Tokens must
// alternate, starting with a number; whitespace between tokens is skipped.
//
// input: NUL-terminated text to scan
// out:   receives the token array on success only; the caller then owns it
//        and releases it with ASTNodeArray_free()
//
// Returns:
//   LEXER_OK                     on success
//   LEXER_WRONG_SYNTAX           when a number or operator appears where the
//                                other kind of token is expected
//   LEXER_NOT_RECOGNIZED_SYMBOL  for any other non-whitespace character
//   LEXER_EMPTY_INPUT            when no token was produced
//   any error reported by tokenize_number()
// On every error path the temporary array is freed and `*out` is untouched.
LexerErr tokenize(const char *input, ASTNodeArray *out) {
    size_t offset = 0;
    LexerState state = WAIT_FOR_NUMBER;
    LexerErr status = LEXER_OK;
    ASTNodeArray arr = ASTNodeArray_init(0); // 0 selects the default capacity

    while (input[offset] != '\n' && input[offset] != '\0') {
        // <ctype.h> functions require a value representable as unsigned char.
        unsigned char current = (unsigned char)input[offset];

        if (isdigit(current)) {
            if (state != WAIT_FOR_NUMBER) {
                status = LEXER_WRONG_SYNTAX;
                goto fail;
            }

            ASTNode new_node;
            status = tokenize_number(input, &offset, &new_node);
            if (status != LEXER_OK) {
                goto fail;
            }

            // NOTE(review): the push result is ignored because no LexerErr
            // value for allocation failure is visible in this file.
            ASTNodeArray_push(&arr, new_node);
            state = WAIT_FOR_OPERATOR;

            // tokenize_number() already left `offset` on the first character
            // after the number; advancing again would skip that character.
            continue;
        }

        if (isoperator(current)) {
            if (state != WAIT_FOR_OPERATOR) {
                status = LEXER_WRONG_SYNTAX;
                goto fail;
            }

            ASTNode new_node = {
                .type = NODE_BINARY_OP,
                .data.binary.op = char_to_operator(current),
                .data.binary.right = NULL,
                .data.binary.left = NULL,
            };

            // NOTE(review): push result ignored, see above.
            ASTNodeArray_push(&arr, new_node);
            state = WAIT_FOR_NUMBER;
        } else if (!isspace(current)) {
            status = LEXER_NOT_RECOGNIZED_SYMBOL;
            goto fail;
        }

        offset++;
    }

    if (arr.len < 1) {
        status = LEXER_EMPTY_INPUT;
        goto fail;
    }

    // NOTE(review): input ending in an operator (state == WAIT_FOR_NUMBER
    // here) is still accepted, as before; confirm the parser rejects it.
    *out = arr;
    return LEXER_OK;

fail:
    ASTNodeArray_free(&arr);
    return status;
}
|
2026-03-09 11:58:55 -06:00
|
|
|
|
2026-03-09 09:06:06 -06:00
|
|
|
// CURRENTLY, it only supports ints, not clear how floating
|
|
|
|
|
// point is implemented but i'll figure it out
|
2026-03-09 11:58:55 -06:00
|
|
|
LexerErr tokenize_number(const char *input, size_t *offset, ASTNode *out) {
|
2026-03-09 09:06:06 -06:00
|
|
|
char buf[128] = { '\0' };
|
|
|
|
|
size_t buf_pos = 0;
|
2026-03-09 11:58:55 -06:00
|
|
|
bool is_integer = true; // Will later be used to differentiate fractions
|
2026-03-09 09:06:06 -06:00
|
|
|
|
|
|
|
|
size_t current = *offset;
|
|
|
|
|
while (isdigit(input[current])) {
|
|
|
|
|
buf[buf_pos] = input[current];
|
|
|
|
|
|
|
|
|
|
if (buf_pos >= sizeof(buf)) {
|
|
|
|
|
return LEXER_BUF_OVERFLOW;
|
|
|
|
|
}
|
|
|
|
|
current++;
|
|
|
|
|
buf_pos++;
|
|
|
|
|
}
|
2026-03-09 11:58:55 -06:00
|
|
|
|
|
|
|
|
ASTNode new_node;
|
|
|
|
|
if (is_integer) {
|
|
|
|
|
new_node.type = NODE_INTEGER;
|
|
|
|
|
LexerErr status = string_to_integer(buf, &new_node.data.integer);
|
|
|
|
|
if (status == LEXER_OK) {
|
|
|
|
|
*out = new_node;
|
|
|
|
|
}
|
|
|
|
|
*offset = current;
|
|
|
|
|
return status;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return LEXER_FAILED_NUMBER_CONVERSION;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
LexerErr string_to_integer(const char *buf, int64_t *number) {
|
2026-03-09 09:06:06 -06:00
|
|
|
int c = 0;
|
2026-03-09 09:23:06 -06:00
|
|
|
int64_t count = 0;
|
2026-03-09 09:06:06 -06:00
|
|
|
while (buf[c] != '\0') {
|
|
|
|
|
|
|
|
|
|
int digit = buf[c] - '0';
|
|
|
|
|
|
2026-03-09 11:58:55 -06:00
|
|
|
if (count > (INT64_MAX - digit) / 10) {
|
|
|
|
|
return LEXER_INT_OVERFLOW;
|
2026-03-09 09:06:06 -06:00
|
|
|
}
|
|
|
|
|
count = count * 10;
|
|
|
|
|
count += digit;
|
|
|
|
|
|
|
|
|
|
c++;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-09 09:23:06 -06:00
|
|
|
*number = count;
|
2026-03-09 09:06:06 -06:00
|
|
|
return LEXER_OK;
|
|
|
|
|
}
|
2026-03-10 07:08:12 -06:00
|
|
|
|
|
|
|
|
// Tells whether `c` is one of the four arithmetic operator characters
// '+', '-', '/' or '*'.
bool isoperator(int c) {
    return c == '+' || c == '-' || c == '/' || c == '*';
}
|
2026-03-13 07:58:38 -06:00
|
|
|
|
|
|
|
|
Operator char_to_operator(int c) {
|
|
|
|
|
switch (c) {
|
|
|
|
|
case '+':
|
|
|
|
|
return OP_ADD;
|
|
|
|
|
break;
|
|
|
|
|
case '-':
|
|
|
|
|
return OP_SUB;
|
|
|
|
|
break;
|
|
|
|
|
case '*':
|
|
|
|
|
return OP_MUL;
|
|
|
|
|
break;
|
|
|
|
|
case '/':
|
|
|
|
|
return OP_DIV;
|
|
|
|
|
break;
|
|
|
|
|
default: // I mean shouldn't be used, we assume
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
}
|