update lexer to just return the number and then do the other logic somewhere else
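In practice: scanner rules stop building Token structs themselves and just return the token's numeric type, while a single driver loop owns allocation, line/column bookkeeping, and appending to the token list. A minimal sketch of the split, using the names from the diff below (the real driver is in the lexer.c hunk):

/* Rule side: report what matched, nothing else. */
"+" { return TOKEN_PLUS; }

/* Driver side: the one place that allocates and tracks position. */
int token;
while ((token = yylex(scanner)) != 0) {
    append(state.tokens, create_token(token,
        yyget_lineno(scanner),
        state.current_column,
        yyget_text(scanner)));
    if (token == TOKEN_NEW_LINE)
        state.current_column = 0;
    else
        state.current_column += yyget_leng(scanner);
}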

2025-05-30 02:12:51 +01:00
parent 626445a906
commit 68341db0b0
13 changed files with 165 additions and 350 deletions

View File

@@ -1,12 +1,10 @@
%option reentrant
%option yylineno
%{
#include "token.h"
#include "lexer.h"
#define GET_STATE LexerState *state = (LexerState *)yyget_extra(yyscanner);
#define ADD_TO_COLUMN COLUMN_NO += yyleng;
#define LINE_NO yylineno+1
#define TOKENS state->tokens
#define COLUMN_NO state->current_column
int yywrap(void *) {
@@ -17,326 +15,87 @@ int yywrap(void *) {
%%
\"((\\([\"\\\/bfnrt]|u[0-9a-fA-F]{4}))|[^\\\"\n])*\" {
GET_STATE
Token * token = create_token(
TOKEN_STRING,
LINE_NO,
COLUMN_NO,
yytext
);
append(TOKENS, token);
ADD_TO_COLUMN
return TOKEN_STRING;
}
\'((\\([\'\\\/bfnrt]|u[0-9a-fA-F]{4}))|[^\\\'\n])*\' {
GET_STATE
append(TOKENS, create_token(
TOKEN_STRING,
LINE_NO,
COLUMN_NO,
yytext
));
ADD_TO_COLUMN
return TOKEN_STRING;
}
((([0-9]+(\.[0-9]+)?)|(\.[0-9]+))(e((\-|\+)?([0-9]+(\.[0-9]+)?)))?) {
GET_STATE
append(TOKENS, create_token(
TOKEN_NUMBER,
LINE_NO,
COLUMN_NO,
yytext
));
ADD_TO_COLUMN
return TOKEN_NUMBER;
}
([0-9]+\/[0-9]+) {
GET_STATE
append(TOKENS, create_token(
TOKEN_FRACTION,
LINE_NO,
COLUMN_NO,
yytext
));
ADD_TO_COLUMN
return TOKEN_FRACTION;
}
"not"[ \t]+"in" { GET_STATE; append(TOKENS, create_token(TOKEN_NOT_IN,
LINE_NO,
COLUMN_NO,
yytext
));
append(TOKENS, create_token(
TOKEN_NOT_IN,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"&&" { GET_STATE; append(TOKENS, create_token(TOKEN_AND,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"||" { GET_STATE; append(TOKENS, create_token(TOKEN_OR,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"<=" { GET_STATE; append(TOKENS, create_token(TOKEN_LE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
">=" { GET_STATE; append(TOKENS, create_token(TOKEN_GE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"!=" { GET_STATE; append(TOKENS, create_token(TOKEN_NE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"==" { GET_STATE; append(TOKENS, create_token(TOKEN_EQ,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"=" { GET_STATE; append(TOKENS, create_token(TOKEN_ASSIGN,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"//" { GET_STATE; append(TOKENS, create_token(TOKEN_FLOORDIV,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"<" { GET_STATE; append(TOKENS, create_token(TOKEN_LT,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
">" { GET_STATE; append(TOKENS, create_token(TOKEN_GT,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"+" { GET_STATE; append(TOKENS, create_token(TOKEN_PLUS,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"-" { GET_STATE; append(TOKENS, create_token(TOKEN_MINUS,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"%" { GET_STATE; append(TOKENS, create_token(TOKEN_MODULO,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"*" { GET_STATE; append(TOKENS, create_token(TOKEN_STAR,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"/" { GET_STATE; append(TOKENS, create_token(TOKEN_SLASH,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"^" { GET_STATE; append(TOKENS, create_token(TOKEN_CARET,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"not"[ \t]+"in" { return TOKEN_NOT_IN; }
"&&" { return TOKEN_AND; }
"||" { return TOKEN_OR; }
"<=" { return TOKEN_LE; }
">=" { return TOKEN_GE; }
"!=" { return TOKEN_NE; }
"==" { return TOKEN_EQ; }
"=" { return TOKEN_ASSIGN; }
"//" { return TOKEN_FLOORDIV; }
"<" { return TOKEN_LT; }
">" { return TOKEN_GT; }
"+" { return TOKEN_PLUS; }
"-" { return TOKEN_MINUS; }
"%" { return TOKEN_MODULO; }
"*" { return TOKEN_STAR; }
"/" { return TOKEN_SLASH; }
"^" { return TOKEN_CARET; }
"if" { GET_STATE; append(TOKENS, create_token(TOKEN_IF,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"else" { GET_STATE; append(TOKENS, create_token(TOKEN_ELSE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"while" { GET_STATE; append(TOKENS, create_token(TOKEN_WHILE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"forever" { GET_STATE; append(TOKENS, create_token(TOKEN_FOREVER,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"for" { GET_STATE; append(TOKENS, create_token(TOKEN_FOR,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"break" { GET_STATE; append(TOKENS, create_token(TOKEN_BREAK,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"continue" { GET_STATE; append(TOKENS, create_token(TOKEN_CONTINUE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"return" { GET_STATE; append(TOKENS, create_token(TOKEN_RETURN,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"let" { GET_STATE; append(TOKENS, create_token(TOKEN_LET,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"import" { GET_STATE; append(TOKENS, create_token(TOKEN_IMPORT,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"from" { GET_STATE; append(TOKENS, create_token(TOKEN_FROM,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"do" { GET_STATE; append(TOKENS, create_token(TOKEN_DO,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"true" { GET_STATE; append(TOKENS, create_token(TOKEN_TRUE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"false" { GET_STATE; append(TOKENS, create_token(TOKEN_FALSE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"null" { GET_STATE; append(TOKENS, create_token(TOKEN_NULL,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"delete" { GET_STATE; append(TOKENS, create_token(TOKEN_DELETE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"not" { GET_STATE; append(TOKENS, create_token(TOKEN_NOT,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"in" { GET_STATE; append(TOKENS, create_token(TOKEN_IN,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"try" { GET_STATE; append(TOKENS, create_token(TOKEN_TRY,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"catch" { GET_STATE; append(TOKENS, create_token(TOKEN_CATCH,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"if" { return TOKEN_IF; }
"else" { return TOKEN_ELSE; }
"while" { return TOKEN_WHILE; }
"forever" { return TOKEN_FOREVER; }
"for" { return TOKEN_FOR; }
"break" { return TOKEN_BREAK; }
"continue" { return TOKEN_CONTINUE; }
"return" { return TOKEN_RETURN; }
"let" { return TOKEN_LET; }
"import" { return TOKEN_IMPORT; }
"from" { return TOKEN_FROM; }
"do" { return TOKEN_DO; }
"true" { return TOKEN_TRUE; }
"false" { return TOKEN_FALSE; }
"null" { return TOKEN_NULL; }
"delete" { return TOKEN_DELETE; }
"not" { return TOKEN_NOT; }
"in" { return TOKEN_IN; }
"try" { return TOKEN_TRY; }
"catch" { return TOKEN_CATCH; }
"(" { GET_STATE; append(TOKENS, create_token(TOKEN_LPAREN,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
")" { GET_STATE; append(TOKENS, create_token(TOKEN_RPAREN,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"[" { GET_STATE; append(TOKENS, create_token(TOKEN_LBRACKET,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"]" { GET_STATE; append(TOKENS, create_token(TOKEN_RBRACKET,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"{" { GET_STATE; append(TOKENS, create_token(TOKEN_LBRACE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"}" { GET_STATE; append(TOKENS, create_token(TOKEN_RBRACE,
LINE_NO,
COLUMN_NO,
yytext
)); ADD_TO_COLUMN; }
"(" { return TOKEN_LPAREN; }
")" { return TOKEN_RPAREN; }
"[" { return TOKEN_LBRACKET; }
"]" { return TOKEN_RBRACKET; }
"{" { return TOKEN_LBRACE; }
"}" { return TOKEN_RBRACE; }
[a-zA-Z_][a-zA-Z0-9_]* {
GET_STATE
append(TOKENS, create_token(TOKEN_IDENTIFIER,
LINE_NO,
COLUMN_NO,
yytext
));
ADD_TO_COLUMN
}
[a-zA-Z_][a-zA-Z0-9_]* { return TOKEN_IDENTIFIER; }
"." {GET_STATE;append(TOKENS, create_token(TOKEN_DOT,
LINE_NO,
COLUMN_NO,
yytext
));ADD_TO_COLUMN}
"," {GET_STATE;append(TOKENS, create_token(TOKEN_COMMA,
LINE_NO,
COLUMN_NO,
yytext
));ADD_TO_COLUMN}
":" {GET_STATE;append(TOKENS, create_token(TOKEN_COLON,
LINE_NO,
COLUMN_NO,
yytext
));ADD_TO_COLUMN}
"." { return TOKEN_DOT; }
"," {return TOKEN_COMMA; }
":" {return TOKEN_COLON; }
\n {
GET_STATE
append(TOKENS, create_token(TOKEN_NEW_LINE,
LINE_NO,
COLUMN_NO,
yytext
));
COLUMN_NO = 1;
}
\n { return TOKEN_NEW_LINE; }
[ \t]+ {
GET_STATE
if (COLUMN_NO == 1){
append(TOKENS, create_token(TOKEN_INDENT,
LINE_NO,
COLUMN_NO,
yytext
));
}
ADD_TO_COLUMN // Advance column for whitespace
}
[ \t]+ {
GET_STATE
if (COLUMN_NO == 0){
return TOKEN_INDENT;
}
COLUMN_NO += yyleng;
}
. {
GET_STATE
fprintf(stderr, "%s: line %d column %d: unexpected character '%s'\n", state->path, LINE_NO, COLUMN_NO, yytext);
fprintf(stderr, "%s:%u:%u: unexpected character '%s'\n", state->path, yylineno+1, COLUMN_NO+1, yytext);
exit(1);
}
%%

View File

@@ -11,7 +11,21 @@ void lexer(LexerState state) {
void* buffer = yy_scan_string(state.content, scanner);
yy_switch_to_buffer(buffer, scanner);
yylex(scanner);
int token;
while ((token = yylex(scanner)) != 0) {
Token * token_struct = create_token(
token,
yyget_lineno(scanner),
state.current_column,
yyget_text(scanner)
);
append(state.tokens, token_struct);
if (token == TOKEN_NEW_LINE) {
state.current_column = 0;
} else {
state.current_column += yyget_leng(scanner);
}
}
yy_delete_buffer(buffer, scanner);
yylex_destroy(scanner);

View File

@@ -1,14 +1,17 @@
#include "token.h"
#include <stdio.h>
#include <stdlib.h>
#include "../string/string.h"
Token *create_token(TokenType type, int line, int column, char *value) {
Token * token = malloc(sizeof(Token));
printf("%s\n", value);
token->type = type;
token->line=line;
token->column=column;
token->value=cloneString(value);
return token;
}
void free_token(void * ptr) {
Token* token = ptr;
free(token->value);
free(token); // free_list only frees the node, so release the Token itself here
}

View File

@@ -2,7 +2,7 @@
#define TOKEN_H
typedef enum {
TOKEN_STRING,
TOKEN_STRING = 256,
TOKEN_NUMBER,
TOKEN_FRACTION,
TOKEN_IDENTIFIER,
@@ -72,4 +72,5 @@ typedef struct {
} Token;
Token *create_token(TokenType type, int line, int column, char *value);
void free_token(void * ptr);
#endif
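Why 256: yylex's int return doubles as a channel for raw input, with 0 conventionally meaning end of input and 1-255 available for rules that return the matched character itself, so starting named tokens at 256 keeps the ranges disjoint and makes the driver's != 0 termination test unambiguous. A small sketch of the invariant (scanner setup omitted):

int token = yylex(scanner);
if (token == 0) { /* end of input */ }
else if (token < 256) { /* a rule returned a literal character */ }
else { /* a named token: TOKEN_STRING, TOKEN_NUMBER, ... */ }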

View File

@@ -82,11 +82,14 @@ void print_list(LinkedList *list, void (*print_func)(void *)) {
}
}
void free_list(LinkedList *list) {
void free_list(LinkedList *list, void (*free_data)(void *)) {
Node *current = list->head;
while (current) {
Node *next = current->next;
free(current->data);
if (free_data) // Safe to pass NULL if you don't need it
free_data(current->data);
free(current);
current = next;
}

View File

@@ -37,6 +37,6 @@ size_t list_length(LinkedList *list);
void print_list(LinkedList *list, void (*print_func)(void *));
// Free all memory used by the list
void free_list(LinkedList *list);
void free_list(LinkedList *list, void (*free_data)(void *));
#endif // LINKEDLIST_H
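Each call site now states how element data is released, and passing NULL skips element cleanup entirely (the old version called plain free on every data pointer, leaking anything the element itself owned, such as Token's value string). A usage sketch; borrowed_items is hypothetical:

free_list(tokens, free_token);          /* each Token owns a heap-allocated value string */
free_list(parsed, free_tagged_value);   /* each TaggedValue owns its payload */
free_list(borrowed_items, NULL);        /* data owned elsewhere: free only the nodes */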

View File

@@ -6,6 +6,7 @@
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
char* read_file_as_text(const char* filename) {
FILE *file = fopen(filename, "r");
@@ -46,7 +47,7 @@ int main() {
LexerState state = {
path,
content,
1,
0,
tokens
};
lexer(state);
@@ -55,11 +56,9 @@ int main() {
parser(parsed, tokens, false);
Node *current = parsed->head;
while (current) {
printf("%s\n", (char*)((TaggedValue*)current->data)->data);
current = current->next;
}
free_list(tokens, free_token);
free_list(parsed,free_tagged_value);
return 0;
}

View File

@@ -1,15 +1,24 @@
#include "parser.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include "../lexer/token.h"
#include "../list/list.h"
#include "string/string.h"
TaggedValue parse_token(LinkedList * tokens, size_t *index) {
TaggedValue * parse_token(LinkedList * tokens, size_t *index) {
Token * token = get_element_at(tokens, *index);
switch (token->type) {
case TOKEN_STRING:
(*index)++;
return parse_string(*token);
case TOKEN_NEW_LINE:
(*index)++;
return NULL;
default:
perror("unreachable");
exit(0);
fprintf(stderr, "Panic: %s\n", "unreachable"); \
exit(EXIT_FAILURE); \
}
}
@@ -17,7 +26,19 @@ void parser(LinkedList * parsed, LinkedList * tokens, bool inline_flag) {
size_t index = 0;
size_t length = list_length(tokens);
while (index < length) {
TaggedValue parsed_code = parse_token(tokens, &index);
append(parsed,&parsed_code);
TaggedValue * parsed_code = parse_token(tokens, &index);
if (parsed_code)
append(parsed,parsed_code);
}
}
void free_tagged_value(void *ptr) {
TaggedValue *tagged = ptr;
switch (tagged->type) {
case AST_STRING:
free(tagged->data);
break;
// Add cases if needed
}
free(tagged); // Always free the TaggedValue itself
}

View File

@@ -1,9 +1,27 @@
#include "../lexer/token.h"
#include "string/string.h"
#ifndef PARSER_H
#define PARSER_H
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
void parser(LinkedList * parsed, LinkedList * tokens, bool inline_flag);
TaggedValue parse_token(LinkedList * tokens, size_t *index);
typedef struct LinkedList LinkedList;
typedef enum {
AST_STRING,
} ValueType;
typedef struct {
ValueType type;
void *data;
} TaggedValue;
void parser(LinkedList *parsed, LinkedList *tokens, bool inline_flag);
TaggedValue *parse_token(LinkedList *tokens, size_t *index);
void free_tagged_value(void *ptr);
#endif // PARSER_H

View File

@@ -7,7 +7,7 @@
#include <stdlib.h>
#include <string.h>
char *swap_quotes(char *input) {
char *swap_quotes(char *input, char quote) {
size_t len = strlen(input);
char *result = malloc(len + 1);
if (!result)
@@ -15,8 +15,8 @@ char *swap_quotes(char *input) {
for (size_t i = 0; i < len; ++i) {
if (input[i] == '"')
result[i] = '\'';
else if (input[i] == '\'')
result[i] = quote;
else if (input[i] == quote)
result[i] = '"';
else
result[i] = input[i];
@@ -33,8 +33,8 @@ char *unquote(char *str) {
char *swapped = NULL;
char *unescaped = NULL;
if (quote == '\'') {
swapped = swap_quotes(str);
if (quote != '"') {
swapped = swap_quotes(str, quote);
if (!swapped)
return NULL;
str = swapped;
@@ -62,8 +62,8 @@ char *unquote(char *str) {
free(swapped);
// If input was single-quoted, swap quotes back in the output
if (quote == '\'') {
char *final = swap_quotes(unescaped);
if (quote != '"') {
char *final = swap_quotes(unescaped, quote);
free(unescaped);
return final;
}
@@ -71,9 +71,10 @@ char *unquote(char *str) {
return unescaped;
}
TaggedValue parse_string(Token token) {
return (TaggedValue){
AST_STRING,
unquote(token.value),
};
TaggedValue * parse_string(Token token) {
TaggedValue * taggedValue = malloc(sizeof(TaggedValue));
taggedValue->type = AST_STRING;
taggedValue->data = unquote(token.value);
return taggedValue;
}
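unquote now normalises any outer quote by swapping it with double quotes, unescaping, then swapping back, so single- and double-quoted literals decode to the same payload; that is what lets the test input at the end of this diff change from "hello world" to 'hello world' without changing the expected output. A sketch of the expected behaviour, assuming inputs keep their surrounding quotes as yytext does:

char *a = unquote("\"hello world\"");  /* -> hello world */
char *b = unquote("'hello world'");    /* -> hello world, via the swap_quotes round trip */
free(a);
free(b);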

View File

@@ -1,8 +1,15 @@
#include "../../lexer/token.h"
#include "../taggedValue.h"
#ifndef STRING_UTILS_H
#define STRING_UTILS_H
char *swap_quotes(char *input);
#include "../../lexer/token.h"
#include "../parser.h"
// Declare functions related to string processing in parser
char *swap_quotes(char *input, char quote);
char *unquote(char *str);
TaggedValue parse_string(Token token);
TaggedValue *parse_string(Token token);
#endif // STRING_UTILS_H

View File

@@ -1,11 +0,0 @@
#include "../list/list.h"
typedef enum {
AST_STRING,
} ValueType;
typedef struct {
ValueType type;
void *data;
} TaggedValue;

View File

@@ -1 +1 @@
"hello world"
'hello world'