From 20621944e65de0ea84b987f43c5da5d68ff2d860 Mon Sep 17 00:00:00 2001 From: William Bell Date: Sat, 14 Jun 2025 01:16:28 +0100 Subject: [PATCH] change string literals to be length terminated instead of null terminated, so null characters can be embedded --- src/dynamic_array/darray.h | 2 +- src/lexer/lex.l | 5 + src/lexer/lexer.c | 33 +++- src/lexer/token.c | 11 -- src/lexer/token.h | 2 +- src/main.c | 11 +- src/parser/parser.c | 2 +- src/parser/string/string.c | 214 +++++++++++++++++++++-- src/parser/string/string.h | 9 +- src/translator/delcaration/delcaration.c | 0 src/translator/string/string.c | 9 +- src/translator/translator.c | 21 +-- src/translator/translator.h | 2 - test.ar | 11 +- test.json | 1 + 15 files changed, 262 insertions(+), 71 deletions(-) create mode 100644 src/translator/delcaration/delcaration.c create mode 100644 test.json diff --git a/src/dynamic_array/darray.h b/src/dynamic_array/darray.h index a5554aa..f04aa65 100644 --- a/src/dynamic_array/darray.h +++ b/src/dynamic_array/darray.h @@ -4,7 +4,7 @@ #include #include // for size_t -#define CHUNK_SIZE 16 +#define CHUNK_SIZE 1024 typedef struct { void *data; diff --git a/src/lexer/lex.l b/src/lexer/lex.l index 5ca1d61..3800f3a 100644 --- a/src/lexer/lex.l +++ b/src/lexer/lex.l @@ -15,6 +15,11 @@ int yywrap(void * unused_param) { %% +"\0" { + fprintf(stderr, "Error: Null character encountered at line %d\n", yylineno); + exit(1); +} + "." { return TOKEN_DOT; } "!" 
{ return TOKEN_EXCLAMATION; } "," { return TOKEN_COMMA; } diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index e1603c6..4e9e3fc 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -1,7 +1,27 @@ #include "lexer.h" #include "lex.yy.h" +#include "../string/string.h" void lexer(LexerState state) { + size_t line = 1; + size_t column = 1; + int ch; + while ((ch = fgetc(state.file)) != EOF) { + if (ch == 0 || (ch < 0x20 && ch != '\n' && ch != '\r' && ch != '\t')) { + fprintf(stderr, "%s:%zu:%zu error: disallowed character\n", state.path, + line, column); + exit(1); + } + + if (ch == '\n') { + line++; + column = 1; + } else { + column++; + } + } + rewind(state.file); + yyscan_t scanner; yylex_init(&scanner); @@ -12,11 +32,14 @@ void lexer(LexerState state) { int token; while ((token = yylex(scanner)) != 0) { - Token *token_struct = - create_token(token, state.current_line+1, state.current_column + 1, - yyget_text(scanner)); - darray_push(state.tokens, token_struct); - free(token_struct); + Token token_struct = (Token){ + token, + state.current_line+1, + state.current_column+1, + yyget_leng(scanner), + cloneString(yyget_text(scanner)) + }; + darray_push(state.tokens, &token_struct); if (token == TOKEN_NEW_LINE) { state.current_line += 1; state.current_column = 0; diff --git a/src/lexer/token.c b/src/lexer/token.c index 68742c1..31b72a1 100644 --- a/src/lexer/token.c +++ b/src/lexer/token.c @@ -1,16 +1,5 @@ #include "token.h" -#include "../string/string.h" #include -#include "../memory.h" - -Token *create_token(TokenType type, int line, int column, char *value) { - Token *token = checked_malloc(sizeof(Token)); - token->type = type; - token->line = line; - token->column = column; - token->value = cloneString(value); - return token; -} void free_token(void *ptr) { Token *token = ptr; diff --git a/src/lexer/token.h b/src/lexer/token.h index fc4ea8a..8658ece 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -78,9 +78,9 @@ typedef struct { TokenType type; size_t 
line; size_t column; + size_t length; char *value; } Token; -Token *create_token(TokenType type, int line, int column, char *value); void free_token(void *ptr); #endif \ No newline at end of file diff --git a/src/main.c b/src/main.c index 499c941..5056342 100644 --- a/src/main.c +++ b/src/main.c @@ -5,12 +5,14 @@ #include "parser/parser.h" #include "translator/translator.h" +#include #include #include #include #include int main(int argc, char *argv[]) { + setlocale(LC_ALL, ""); if (argc <= 1) return -1; ar_memory_init(); @@ -19,7 +21,7 @@ int main(int argc, char *argv[]) { darray_init(&tokens, sizeof(Token)); - FILE * file = fopen(path, "r"); + FILE *file = fopen(path, "r"); if (!file) { return -1; @@ -43,13 +45,14 @@ int main(int argc, char *argv[]) { darray_free(&ast, free_parsed); file = fopen("out.car", "wb"); - + fwrite(&translated.registerCount, sizeof(size_t), 1, file); fwrite(&translated.constants.size, sizeof(size_t), 1, file); fwrite(&translated.bytecode.size, sizeof(size_t), 1, file); fwrite(translated.constants.data, 1, translated.constants.size, file); - fwrite(translated.bytecode.data, translated.bytecode.element_size, translated.bytecode.size, file); - + fwrite(translated.bytecode.data, translated.bytecode.element_size, + translated.bytecode.size, file); + fclose(file); free_translator(&translated); diff --git a/src/parser/parser.c b/src/parser/parser.c index c3cd8fb..c709788 100644 --- a/src/parser/parser.c +++ b/src/parser/parser.c @@ -82,7 +82,7 @@ ParsedValue *parse_token_full(char *file, DArray *tokens, size_t *index, break; case TOKEN_STRING: (*index)++; - output = parse_string(*token); + output = parse_string(file,token); break; case TOKEN_NEW_LINE: (*index)++; diff --git a/src/parser/string/string.c b/src/parser/string/string.c index 594e137..3119480 100644 --- a/src/parser/string/string.c +++ b/src/parser/string/string.c @@ -1,12 +1,197 @@ #include "string.h" #include "../../lexer/token.h" +#include "../../memory.h" #include #include 
#include #include #include -#include "../../memory.h" +#include +#include +#include +#include + +// Helper: Convert 4 hex digits from input to a uint16_t value +static int parse_hex4(const char *in, uint16_t *out) { + uint16_t val = 0; + for (int i = 0; i < 4; i++) { + char c = in[i]; + val <<= 4; + if (c >= '0' && c <= '9') + val |= (c - '0'); + else if (c >= 'a' && c <= 'f') + val |= (c - 'a' + 10); + else if (c >= 'A' && c <= 'F') + val |= (c - 'A' + 10); + else + return 0; // invalid hex digit + } + *out = val; + return 1; +} + +// Helper: Encode a Unicode codepoint as UTF-8, write to *out_ptr, return bytes written +static int utf8_encode(uint32_t codepoint, char **out_ptr) { + char *p = *out_ptr; + if (codepoint <= 0x7F) { + *p++ = (char)codepoint; + *out_ptr = p; + return 1; + } + else if (codepoint <= 0x7FF) { + *p++ = (char)(0xC0 | (codepoint >> 6)); + *p++ = (char)(0x80 | (codepoint & 0x3F)); + *out_ptr = p; + return 2; + } + else if (codepoint <= 0xFFFF) { + *p++ = (char)(0xE0 | (codepoint >> 12)); + *p++ = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + *p++ = (char)(0x80 | (codepoint & 0x3F)); + *out_ptr = p; + return 3; + } + else if (codepoint <= 0x10FFFF) { + *p++ = (char)(0xF0 | (codepoint >> 18)); + *p++ = (char)(0x80 | ((codepoint >> 12) & 0x3F)); + *p++ = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + *p++ = (char)(0x80 | (codepoint & 0x3F)); + *out_ptr = p; + return 4; + } + return 0; // invalid codepoint +} + +/** + * unquote_json_string: + * Parses and unescapes a JSON string literal including quotes, + * returning a malloc'ed buffer with the decoded string and its length (including embedded nulls). + * + * Parameters: + * input: const char* JSON string literal (must start and end with quotes) + * out_len: pointer to size_t to receive decoded length + * + * Returns: + * malloc'ed buffer with decoded string (not necessarily null-terminated) + * NULL on error (invalid input) + * + * Caller must free() returned buffer. 
+ */ +char *unquote_json_string(const char *input, size_t *out_len) { + if (!input || input[0] != '"') return NULL; + + // Find the closing quote + const char *p = input + 1; + const char *end = NULL; + while (*p) { + if (*p == '"') { + end = p; + break; + } + // Skip escaped quotes and escapes + if (*p == '\\') { + p++; + if (*p == '\0') return NULL; // invalid escape at end + } + p++; + } + if (!end) return NULL; // no closing quote + + size_t input_len = end - (input + 1); // length inside quotes + const char *src = input + 1; + // Allocate max output size = input_len, decoded string cannot be longer than input_len + char *outbuf = (char *)malloc(input_len + 1); + if (!outbuf) return NULL; + + char *dst = outbuf; + const char *src_end = src + input_len; + + while (src < src_end) { + if (*src != '\\') { + *dst++ = *src++; + } else { + // Escape sequence + src++; + if (src >= src_end) { + free(outbuf); + return NULL; // invalid escape at end + } + switch (*src) { + case '"': *dst++ = '"'; src++; break; + case '\\': *dst++ = '\\'; src++; break; + case '/': *dst++ = '/'; src++; break; + case 'b': *dst++ = '\b'; src++; break; + case 'f': *dst++ = '\f'; src++; break; + case 'n': *dst++ = '\n'; src++; break; + case 'r': *dst++ = '\r'; src++; break; + case 't': *dst++ = '\t'; src++; break; + + case 'u': { + // Unicode escape \uXXXX + if (src + 5 > src_end) { + free(outbuf); + return NULL; // not enough chars for \uXXXX + } + uint16_t code_unit1 = 0; + if (!parse_hex4(src + 1, &code_unit1)) { + free(outbuf); + return NULL; // invalid hex digits + } + src += 5; // consume uXXXX + + // Check for surrogate pair + if (code_unit1 >= 0xD800 && code_unit1 <= 0xDBFF) { + // high surrogate, expect another \uXXXX + if (src + 6 <= src_end && src[0] == '\\' && src[1] == 'u') { + uint16_t code_unit2 = 0; + if (!parse_hex4(src + 2, &code_unit2)) { + free(outbuf); + return NULL; + } + if (code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF) { + // valid low surrogate, combine to codepoint + 
uint32_t codepoint = 0x10000 + (((code_unit1 - 0xD800) << 10) | (code_unit2 - 0xDC00)); + utf8_encode(codepoint, &dst); + src += 6; // consume \uXXXX low surrogate + break; + } else { + free(outbuf); + return NULL; // invalid low surrogate + } + } else { + free(outbuf); + return NULL; // expected low surrogate missing + } + } else if (code_unit1 >= 0xDC00 && code_unit1 <= 0xDFFF) { + free(outbuf); + return NULL; // unexpected low surrogate without high surrogate + } else { + // normal BMP codepoint + utf8_encode(code_unit1, &dst); + } + break; + } + + default: + free(outbuf); + return NULL; // invalid escape char + } + } + } + // decoded length: + size_t decoded_len = dst - outbuf; + + // Optionally null terminate (not required) + *dst = '\0'; + + if (out_len) + *out_len = decoded_len; + + return outbuf; +} + + char *swap_quotes(char *input, char quote) { size_t len = strlen(input); @@ -26,7 +211,8 @@ char *swap_quotes(char *input, char quote) { return result; } -char *unquote(char *str) { + +char *unquote(char *str, size_t *decoded_len) { if (*str == '\0') return NULL; @@ -41,29 +227,16 @@ char *unquote(char *str) { str = swapped; } - cJSON *json = cJSON_Parse(str); - if (!json || !cJSON_IsString(json)) { - cJSON_Delete(json); + unescaped = unquote_json_string(str, decoded_len); + if (!unescaped) { if (swapped) free(swapped); return NULL; } - // Copy unescaped string before freeing JSON object - const char *decoded = cJSON_GetStringValue(json); - if (!decoded) { - cJSON_Delete(json); - if (swapped) - free(swapped); - return NULL; - } - - unescaped = strdup(decoded); - cJSON_Delete(json); if (swapped) free(swapped); - // If input was single-quoted, swap quotes back in the output if (quote != '"') { char *final = swap_quotes(unescaped, quote); free(unescaped); @@ -73,9 +246,12 @@ char *unquote(char *str) { return unescaped; } -ParsedValue *parse_string(Token token) { +ParsedValue *parse_string(char*file,Token* token) { ParsedValue *parsedValue = 
checked_malloc(sizeof(ParsedValue)); parsedValue->type = AST_STRING; - parsedValue->data = unquote(token.value); + ParsedString *parsedString = checked_malloc(sizeof(ParsedString)); + parsedValue->data = parsedString; + parsedString->length = 0; + parsedString->string = unquote(token->value, &parsedString->length); return parsedValue; } \ No newline at end of file diff --git a/src/parser/string/string.h b/src/parser/string/string.h index 6a366fe..470688b 100644 --- a/src/parser/string/string.h +++ b/src/parser/string/string.h @@ -6,10 +6,15 @@ // Declare functions related to string processing in parser +typedef struct { + size_t length; + char *string; +} ParsedString; + char *swap_quotes(char *input, char quote); -char *unquote(char *str); +char *unquote(char *str, size_t *decoded_len); -ParsedValue *parse_string(Token token); +ParsedValue *parse_string(char*file,Token* token); #endif // STRING_UTILS_H \ No newline at end of file diff --git a/src/translator/delcaration/delcaration.c b/src/translator/delcaration/delcaration.c new file mode 100644 index 0000000..e69de29 diff --git a/src/translator/string/string.c b/src/translator/string/string.c index 44fe5e8..9c2bd01 100644 --- a/src/translator/string/string.c +++ b/src/translator/string/string.c @@ -1,13 +1,18 @@ #include "../translator.h" +#include "../../parser/string/string.h" #include +#include #include void translate_parsed_string(Translated *translated, ParsedValue *parsedValue) { - size_t string_pos = arena_push_string(&translated->constants, (char*)parsedValue->data); + ParsedString *parsedString = (ParsedString*)parsedValue->data; + size_t string_pos = arena_push(&translated->constants, parsedString->string, parsedString->length); set_registers(translated, 1); push_instruction_code(translated, OP_LOAD_CONST); push_instruction_code(translated, 0); push_instruction_code(translated, OP_TYPE_STRING); - push_instruction_code(translated,strlen(parsedValue->data)+1); + 
push_instruction_code(translated,parsedString->length); push_instruction_code(translated, string_pos); + fwrite(parsedString->string, 1, parsedString->length, stdout); + putchar('\n'); } \ No newline at end of file diff --git a/src/translator/translator.c b/src/translator/translator.c index f76b945..90904e3 100644 --- a/src/translator/translator.c +++ b/src/translator/translator.c @@ -13,15 +13,15 @@ void arena_init(ConstantArena *arena) { } void arena_resize(ConstantArena *arena, size_t new_size) { - new_size = ((new_size / CHUNK_SIZE) + 1)*CHUNK_SIZE; - if (new_size == arena->capacity) + size_t new_capacity = ((new_size / CHUNK_SIZE) + 1)*CHUNK_SIZE; + if (new_capacity == arena->capacity) return; - arena->data = realloc(arena->data, new_size); + arena->data = realloc(arena->data, new_capacity); if (!arena->data) { - fprintf(stderr, "error: failed to resize arena from %zu to %zu\n", new_size, arena->capacity); + fprintf(stderr, "error: failed to resize arena from %zu to %zu\n", new_capacity, arena->capacity); exit(EXIT_FAILURE); } - arena->capacity = new_size; + arena->capacity = new_capacity; } @@ -35,16 +35,6 @@ void * arena_get(ConstantArena *arena, size_t offset) { return arena->data + offset; } - -size_t arena_push_string(ConstantArena *arena, const char *string) { - size_t length = strlen(string)+1; - arena_resize(arena, arena->size+length); - size_t offset = arena->size; - strcpy(arena->data + arena->size, string); - arena->size += length; - return offset; -} - size_t arena_push(ConstantArena *arena, const void *data, size_t length) { arena_resize(arena, arena->size+length); size_t offset = arena->size; @@ -55,6 +45,7 @@ size_t arena_push(ConstantArena *arena, const void *data, size_t length) { Translated init_translator() { Translated translated; + translated.registerCount = 0; darray_init(&translated.bytecode, sizeof(size_t)); arena_init(&translated.constants); return translated; diff --git a/src/translator/translator.h b/src/translator/translator.h index 
fd22523..5ddd767 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -24,8 +24,6 @@ typedef struct { void * arena_get(ConstantArena *arena, size_t offset); -size_t arena_push_string(ConstantArena *arena, const char *string); - size_t arena_push(ConstantArena *arena, const void *data, size_t length); size_t push_instruction_code(Translated * translator, size_t code); diff --git a/test.ar b/test.ar index 5c23a21..5651530 100644 --- a/test.ar +++ b/test.ar @@ -13,14 +13,9 @@ "hello world" "hello world" "hello world" -"hello world" -"hello world" -"hello world" -"hello world" -"hello world" -"hello world" -"hello world" - +"hello\u0000world" +"🇬🇧" +"hello\u0000world" let a, b = 1, diff --git a/test.json b/test.json new file mode 100644 index 0000000..ed6780d --- /dev/null +++ b/test.json @@ -0,0 +1 @@ +"\u0000" \ No newline at end of file