refactor-lexer #11

Merged
laentropia merged 6 commits from refactor-lexer into main 2026-05-12 20:08:39 -06:00
5 changed files with 223 additions and 69 deletions

View File

@@ -10,6 +10,8 @@
typedef enum { typedef enum {
NODE_INTEGER, NODE_INTEGER,
NODE_BINARY_OP, NODE_BINARY_OP,
NODE_UNARY_OP,
NODE_PARENTHESIS,
} ASTNodeType; } ASTNodeType;
// For classify operators // For classify operators
@@ -17,7 +19,11 @@ typedef enum {
OP_ADD, OP_ADD,
OP_SUB, OP_SUB,
OP_MUL, OP_MUL,
OP_DIV OP_DIV,
OP_POW,
OP_FACTORIAL,
OP_START_PAR,
OP_END_PAR,
} Operator; } Operator;
typedef enum { typedef enum {
@@ -26,8 +32,6 @@ typedef enum {
LEXER_FAILED_NUMBER_CONVERSION, LEXER_FAILED_NUMBER_CONVERSION,
LEXER_NOT_RECOGNIZED_SYMBOL, LEXER_NOT_RECOGNIZED_SYMBOL,
LEXER_EMPTY_INPUT, LEXER_EMPTY_INPUT,
LEXER_NULL_ARG,
LEXER_WRONG_SYNTAX,
LEXER_BUF_OVERFLOW, LEXER_BUF_OVERFLOW,
} LexerErr; } LexerErr;
@@ -41,6 +45,14 @@ typedef struct ASTNode {
struct ASTNode *right; struct ASTNode *right;
Operator op; Operator op;
} binary; } binary;
struct {
struct ASTNode *val;
Operator op;
} unary;
struct {
struct ASTNode *val;
Operator op;
} parenthesis;
} data; } data;
} ASTNode; } ASTNode;
@@ -66,12 +78,12 @@ typedef struct {
LexerErr err; LexerErr err;
int64_t number; int64_t number;
}; };
} I64Result; } LexerI64Result;
// Lexer functions as well as a few utilities // Lexer functions as well as a few utilities
TokenizeResult tokenize(const char* input); TokenizeResult tokenize(const char* input);
ASTNodeResult tokenize_number(const char* input, size_t *offset); ASTNodeResult tokenize_number(const char* input, size_t *offset);
I64Result string_to_integer(const char buf[]); LexerI64Result string_to_integer(const char buf[]);
bool isoperator(int c); bool isoperator(int c);
Operator char_to_operator(int c); Operator char_to_operator(int c);
char operator_to_char(Operator op); char operator_to_char(Operator op);

View File

@@ -32,8 +32,10 @@ typedef struct {
ASTNode *nud(ArraySlice *slice); ASTNode *nud(ArraySlice *slice);
ASTNode *led(ArraySlice *slice, size_t right_precedence); ASTNode *led(ArraySlice *slice, size_t right_precedence);
uint8_t node_lbp(ASTNode node); uint8_t prefix_rbp(ASTNode node);
uint8_t node_rbp(ASTNode node); uint8_t postfix_lbp(ASTNode node);
uint8_t infix_lbp(ASTNode node);
uint8_t infix_rbp(ASTNode node);
ParseResult parse(TokenizeResult tokens); ParseResult parse(TokenizeResult tokens);
ASTNode *parse_expr(ArraySlice *slice, Arena *arena, uint8_t min_bp); ASTNode *parse_expr(ArraySlice *slice, Arena *arena, uint8_t min_bp);

View File

@@ -3,6 +3,7 @@
#include "lexer.h" #include "lexer.h"
#include "parser.h" #include "parser.h"
#include <stdint.h> #include <stdint.h>
#include <math.h>
int64_t evaluate_tree(ASTNode *tree) { int64_t evaluate_tree(ASTNode *tree) {
@@ -20,7 +21,8 @@ int64_t evaluate_tree(ASTNode *tree) {
return evaluate_tree(left) * evaluate_tree(right); return evaluate_tree(left) * evaluate_tree(right);
case OP_DIV: case OP_DIV:
return evaluate_tree(left) / evaluate_tree(right); return evaluate_tree(left) / evaluate_tree(right);
case OP_POW:
return pow(evaluate_tree(left), evaluate_tree(right));
} }
} }

View File

@@ -3,6 +3,7 @@
#include <ctype.h> #include <ctype.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <strings.h> #include <strings.h>
#include <limits.h> #include <limits.h>
@@ -14,45 +15,37 @@ typedef enum {
TokenizeResult tokenize(const char *input) { TokenizeResult tokenize(const char *input) {
size_t offset = 0;
LexerState state = WAIT_FOR_NUMBER;
ArrayList *arr = arraylist_init(64, sizeof(ASTNode)); ArrayList *arr = arraylist_init(64, sizeof(ASTNode));
size_t offset = 0;
while (input[offset] != '\n' && input[offset] != '\0') { while (
int current = input[offset]; input[offset] != '\n' ||
input[offset] != EOF ||
input[offset] != '\0') {
if (isdigit(current)) { if (isdigit(input[offset])) {
if (state != WAIT_FOR_NUMBER) {
arraylist_destroy(&arr);
return (TokenizeResult) {.is_valid = false, .err = LEXER_WRONG_SYNTAX};
}
ASTNodeResult result = tokenize_number(input, &offset); ASTNodeResult result = tokenize_number(input, &offset);
if (!result.is_valid) { if (!result.is_valid) {
arraylist_destroy(&arr);
return (TokenizeResult) {.is_valid = false, .err = result.err}; return (TokenizeResult) {.is_valid = false, .err = result.err};
} }
arraylist_push_back(arr, &result.node); arraylist_push_back(arr, &result.node);
state = WAIT_FOR_OPERATOR; } else if (isoperator(input[offset])) {
} else if (isoperator(current)) { ASTNode op_node = {
if (state != WAIT_FOR_OPERATOR) {
return (TokenizeResult) {.is_valid = false, .err =LEXER_WRONG_SYNTAX};
}
ASTNode new_node = {
.type = NODE_BINARY_OP, .type = NODE_BINARY_OP,
.data.binary.op = char_to_operator(current), .data.binary.op = char_to_operator(input[offset]),
.data.binary.right = NULL,
.data.binary.left = NULL, .data.binary.left = NULL,
.data.binary.right = NULL,
}; };
arraylist_push_back(arr, &new_node); arraylist_push_back(arr, &op_node);
state = WAIT_FOR_NUMBER; } else if (isspace(input[offset])) {
} else if (isspace(current)) {
// Nothing... // Nothing...
} else { } else {
arraylist_destroy(&arr); return (TokenizeResult) {
return (TokenizeResult) {.is_valid = false, .err = LEXER_NOT_RECOGNIZED_SYMBOL}; .is_valid = false,
.err = LEXER_NOT_RECOGNIZED_SYMBOL};
} }
offset++; offset++;
@@ -68,17 +61,21 @@ TokenizeResult tokenize(const char *input) {
// CURRENTLY, it only supports ints, not clear how floating // CURRENTLY, it only supports ints, not clear how floating
// point is implemented but i'll figure it out // point is implemented but i'll figure it out
ASTNodeResult tokenize_number(const char *input, size_t *offset) { ASTNodeResult tokenize_number(const char *input, size_t *offset) {
char buf[128] = { '\0' }; char buf[64] = { '\0' };
size_t buf_pos = 0; size_t buf_pos = 0;
bool is_integer = true; // Will later be used to differentiate fractions bool is_integer = true; // Will later be used to differentiate fractions
// read number
size_t current = *offset; size_t current = *offset;
while (isdigit(input[current])) { while (isdigit(input[current])) {
buf[buf_pos] = input[current]; buf[buf_pos] = input[current];
if (buf_pos >= sizeof(buf)) { if (buf_pos >= sizeof(buf)) {
return (ASTNodeResult) {.is_valid = false, .err = LEXER_BUF_OVERFLOW}; return (ASTNodeResult) {
.is_valid = false,
.err = LEXER_BUF_OVERFLOW};
} }
current++; current++;
buf_pos++; buf_pos++;
} }
@@ -86,35 +83,46 @@ ASTNodeResult tokenize_number(const char *input, size_t *offset) {
ASTNode new_node; ASTNode new_node;
if (is_integer) { if (is_integer) {
new_node.type = NODE_INTEGER; new_node.type = NODE_INTEGER;
I64Result status = string_to_integer(buf); LexerI64Result status = string_to_integer(buf);
if (!status.is_valid) { if (!status.is_valid) {
return (ASTNodeResult) {.is_valid = false, .err = status.err}; return (ASTNodeResult) {.is_valid = false, .err = status.err};
} }
new_node.data.integer = status.number; new_node.data.integer = status.number;
*offset = current; *offset = current;
return (ASTNodeResult) {.is_valid = true, .node = new_node}; return (ASTNodeResult) {.is_valid = true, .node = new_node};
} }
return (ASTNodeResult) {.is_valid = false, .err = LEXER_FAILED_NUMBER_CONVERSION}; return (ASTNodeResult) {
.is_valid = false,
.err = LEXER_FAILED_NUMBER_CONVERSION};
} }
I64Result string_to_integer(const char *buf) { LexerI64Result string_to_integer(const char *buf) {
int c = 0; int c = 0;
int64_t count = 0; int64_t count = 0;
while (buf[c] != '\0') { while (buf[c] != '\0') {
// Extracts number from char
int digit = buf[c] - '0'; int digit = buf[c] - '0';
if (count > (INT64_MAX - digit) / 10) { if (count > (INT64_MAX - digit) / 10) {
return (I64Result) {.is_valid = false, .err = LEXER_INT_OVERFLOW}; return (LexerI64Result) {
.is_valid = false,
.err = LEXER_INT_OVERFLOW};
} }
count = count * 10; count = count * 10;
count += digit; count += digit;
c++; c++;
} }
return (I64Result) {.is_valid = true, .number = count}; return (LexerI64Result) {.is_valid = true, .number = count};
} }
bool isoperator(int c) { bool isoperator(int c) {
@@ -123,6 +131,10 @@ bool isoperator(int c) {
case '-': case '-':
case '/': case '/':
case '*': case '*':
case '^':
case '!':
case '(':
case ')':
return true; return true;
default: default:
return false; return false;
@@ -143,6 +155,18 @@ Operator char_to_operator(int c) {
case '/': case '/':
return OP_DIV; return OP_DIV;
break; break;
case '^':
return OP_POW;
break;
case '!':
return OP_FACTORIAL;
break;
case '(':
return OP_START_PAR;
break;
case ')':
return OP_END_PAR;
break;
default: // I mean shouldn't be used, we assume default: // I mean shouldn't be used, we assume
return -1; return -1;
} }
@@ -158,5 +182,15 @@ char operator_to_char(Operator op) {
return '*'; return '*';
case OP_DIV: case OP_DIV:
return '/'; return '/';
case OP_POW:
return '^';
case OP_FACTORIAL:
return '!';
case OP_START_PAR:
return '(';
case OP_END_PAR:
return ')';
default:
return EOF;
} }
} }

View File

@@ -6,7 +6,35 @@
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
uint8_t node_lbp(ASTNode node) { uint8_t prefix_rbp(ASTNode node) {
if (node.type == NODE_INTEGER) {
return 0;
}
switch (node.data.unary.op) {
case OP_SUB:
case OP_ADD:
return 30;
default:
return -1;
}
}
uint8_t postfix_lbp(ASTNode node) {
if (node.type == NODE_INTEGER) {
return 0;
}
switch (node.data.unary.op) {
case OP_FACTORIAL:
return 40;
default:
// 255 is a temporary "not a postfix operator" sentinel; this should be reported through a result type instead
return 255;
}
}
uint8_t infix_lbp(ASTNode node) {
if (node.type == NODE_INTEGER) { if (node.type == NODE_INTEGER) {
return 0; return 0;
} }
@@ -19,12 +47,14 @@ uint8_t node_lbp(ASTNode node) {
case OP_DIV: case OP_DIV:
case OP_MUL: case OP_MUL:
return 20; return 20;
case OP_POW:
return 51;
default: default:
return 0; return 0;
} }
} }
uint8_t node_rbp(ASTNode node) { uint8_t infix_rbp(ASTNode node) {
if (node.type == NODE_INTEGER) { if (node.type == NODE_INTEGER) {
return 0; return 0;
} }
@@ -37,6 +67,8 @@ uint8_t node_rbp(ASTNode node) {
case OP_DIV: case OP_DIV:
case OP_MUL: case OP_MUL:
return 21; return 21;
case OP_POW:
return 50;
default: default:
return 0; return 0;
} }
@@ -53,12 +85,14 @@ ParseResult parse(TokenizeResult tokens) {
} }
ASTNode *parse_expr(ArraySlice *slice, Arena *arena, uint8_t min_bp) { ASTNode *parse_expr(ArraySlice *slice, Arena *arena, uint8_t min_bp) {
// First: Consume a first number
arena_ensure_capacity( arena_ensure_capacity(
arena, arena,
sizeof(ASTNode), sizeof(ASTNode),
alignof(ASTNode) alignof(ASTNode)
); ); // shouldn't fail but if it does then what a shame
// Get pointer in the arena
ASTNode *left_side = arena_unwrap_pointer( ASTNode *left_side = arena_unwrap_pointer(
arena_alloc( arena_alloc(
arena, arena,
@@ -69,20 +103,87 @@ ASTNode *parse_expr(ArraySlice *slice, Arena *arena, uint8_t min_bp) {
arrayslice_next(slice, left_side); arrayslice_next(slice, left_side);
if (left_side->type == NODE_PARENTHESIS &&
left_side->data.parenthesis.op == OP_START_PAR) {
left_side = parse_expr(slice, arena, 0);
// TODO: check here that the next token in the slice is ')'
ASTNode *end_par;
arrayslice_next(slice, &end_par);
if (end_par->type != NODE_PARENTHESIS ||
end_par->data.parenthesis.op != OP_END_PAR) {
// todo
}
return left_side;
}
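// If the token is a unary (prefix) operator, take its prefix binding
// power and parse the operand to its right. There is no need to allocate
// left_side again (it was just allocated above), and the recursive call
// is expected to return a valid allocated pointer for the operand.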
if (left_side->type == NODE_UNARY_OP) {
uint8_t rbp = prefix_rbp(*left_side);
ASTNode *righ_side = parse_expr(slice, arena, rbp);
left_side->data.unary.val = righ_side;
}
while (true) { while (true) {
// Second: get the next token and check its binding power
if (!arrayslice_is_valid(slice)) { if (!arrayslice_is_valid(slice)) {
break; break;
} }
ASTNode operator; // Here check if not OP error
arrayslice_peek(slice, &operator);
uint8_t rbp = node_rbp(operator);
uint8_t lbp = node_lbp(operator);
ASTNode operator;
// Should check here that this is actually an operator and not something else
// Third, get operator and binding powers
arrayslice_peek(slice, &operator);
// temporary for bad error handling
if (postfix_lbp(operator) != 255) {
if (postfix_lbp(operator) < min_bp) {
break;
}
// allocate operator
arrayslice_next(slice, NULL);
arena_ensure_capacity(
arena,
sizeof(ASTNode),
alignof(ASTNode));
ASTNode *new_node = arena_unwrap_pointer(
arena_alloc(
arena,
sizeof(ASTNode),
alignof(ASTNode)
)
);
*new_node = operator;
new_node->data.unary.val = left_side;
left_side = new_node;
continue;
}
// check if it has infix or not, if not then error
uint8_t rbp = infix_rbp(operator);
uint8_t lbp = infix_lbp(operator);
if (rbp != 255 && lbp != 255) {
// If lbp is lower than min_bp, stop looping: this operator
// binds less tightly than the one we are currently parsing for,
// so it is left for the caller to handle.
if (lbp < min_bp) { if (lbp < min_bp) {
break; break;
} }
// Otherwise, consume the operator and parse its right-hand
// side recursively, using this operator's rbp as the new
// minimum binding power.
arrayslice_next(slice, NULL); arrayslice_next(slice, NULL);
ASTNode *right_side = parse_expr(slice, arena, rbp); ASTNode *right_side = parse_expr(slice, arena, rbp);
@@ -103,11 +204,14 @@ ASTNode *parse_expr(ArraySlice *slice, Arena *arena, uint8_t min_bp) {
new_node->data.binary.right = right_side; new_node->data.binary.right = right_side;
left_side = new_node; left_side = new_node;
continue;
}
break;
} }
// Final: return left side
return left_side; return left_side;
} }