Change string literals to be length-delimited instead of null-terminated, so null characters can be embedded
This commit is contained in:
@@ -4,7 +4,7 @@
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h> // for size_t
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
#define CHUNK_SIZE 1024
|
||||
|
||||
typedef struct {
|
||||
void *data;
|
||||
|
||||
@@ -15,6 +15,11 @@ int yywrap(void * unused_param) {
|
||||
|
||||
%%
|
||||
|
||||
"\0" {
|
||||
fprintf(stderr, "Error: Null character encountered at line %d\n", yylineno);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
"." { return TOKEN_DOT; }
|
||||
"!" { return TOKEN_EXCLAMATION; }
|
||||
"," { return TOKEN_COMMA; }
|
||||
|
||||
@@ -1,7 +1,27 @@
|
||||
#include "lexer.h"
|
||||
#include "lex.yy.h"
|
||||
#include "../string/string.h"
|
||||
|
||||
void lexer(LexerState state) {
|
||||
size_t line = 1;
|
||||
size_t column = 1;
|
||||
int ch;
|
||||
while ((ch = fgetc(state.file)) != EOF) {
|
||||
if (ch == 0 || (ch < 0x20 && ch != '\n' && ch != '\r' && ch != '\t')) {
|
||||
fprintf(stderr, "%s:%zu:%zu error: disallowed character\n", state.path,
|
||||
line, column);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (ch == '\n') {
|
||||
line++;
|
||||
column = 1;
|
||||
} else {
|
||||
column++;
|
||||
}
|
||||
}
|
||||
rewind(state.file);
|
||||
|
||||
yyscan_t scanner;
|
||||
|
||||
yylex_init(&scanner);
|
||||
@@ -12,11 +32,14 @@ void lexer(LexerState state) {
|
||||
|
||||
int token;
|
||||
while ((token = yylex(scanner)) != 0) {
|
||||
Token *token_struct =
|
||||
create_token(token, state.current_line+1, state.current_column + 1,
|
||||
yyget_text(scanner));
|
||||
darray_push(state.tokens, token_struct);
|
||||
free(token_struct);
|
||||
Token token_struct = (Token){
|
||||
token,
|
||||
state.current_line+1,
|
||||
state.current_column+1,
|
||||
yyget_leng(scanner),
|
||||
cloneString(yyget_text(scanner))
|
||||
};
|
||||
darray_push(state.tokens, &token_struct);
|
||||
if (token == TOKEN_NEW_LINE) {
|
||||
state.current_line += 1;
|
||||
state.current_column = 0;
|
||||
|
||||
@@ -1,16 +1,5 @@
|
||||
#include "token.h"
|
||||
#include "../string/string.h"
|
||||
#include <stdlib.h>
|
||||
#include "../memory.h"
|
||||
|
||||
Token *create_token(TokenType type, int line, int column, char *value) {
|
||||
Token *token = checked_malloc(sizeof(Token));
|
||||
token->type = type;
|
||||
token->line = line;
|
||||
token->column = column;
|
||||
token->value = cloneString(value);
|
||||
return token;
|
||||
}
|
||||
|
||||
void free_token(void *ptr) {
|
||||
Token *token = ptr;
|
||||
|
||||
@@ -78,9 +78,9 @@ typedef struct {
|
||||
TokenType type;
|
||||
size_t line;
|
||||
size_t column;
|
||||
size_t length;
|
||||
char *value;
|
||||
} Token;
|
||||
|
||||
Token *create_token(TokenType type, int line, int column, char *value);
|
||||
void free_token(void *ptr);
|
||||
#endif
|
||||
11
src/main.c
11
src/main.c
@@ -5,12 +5,14 @@
|
||||
#include "parser/parser.h"
|
||||
#include "translator/translator.h"
|
||||
|
||||
#include <locale.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
setlocale(LC_ALL, "");
|
||||
if (argc <= 1)
|
||||
return -1;
|
||||
ar_memory_init();
|
||||
@@ -19,7 +21,7 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
darray_init(&tokens, sizeof(Token));
|
||||
|
||||
FILE * file = fopen(path, "r");
|
||||
FILE *file = fopen(path, "r");
|
||||
|
||||
if (!file) {
|
||||
return -1;
|
||||
@@ -43,13 +45,14 @@ int main(int argc, char *argv[]) {
|
||||
darray_free(&ast, free_parsed);
|
||||
|
||||
file = fopen("out.car", "wb");
|
||||
|
||||
|
||||
fwrite(&translated.registerCount, sizeof(size_t), 1, file);
|
||||
fwrite(&translated.constants.size, sizeof(size_t), 1, file);
|
||||
fwrite(&translated.bytecode.size, sizeof(size_t), 1, file);
|
||||
fwrite(translated.constants.data, 1, translated.constants.size, file);
|
||||
fwrite(translated.bytecode.data, translated.bytecode.element_size, translated.bytecode.size, file);
|
||||
|
||||
fwrite(translated.bytecode.data, translated.bytecode.element_size,
|
||||
translated.bytecode.size, file);
|
||||
|
||||
fclose(file);
|
||||
|
||||
free_translator(&translated);
|
||||
|
||||
@@ -82,7 +82,7 @@ ParsedValue *parse_token_full(char *file, DArray *tokens, size_t *index,
|
||||
break;
|
||||
case TOKEN_STRING:
|
||||
(*index)++;
|
||||
output = parse_string(*token);
|
||||
output = parse_string(file,token);
|
||||
break;
|
||||
case TOKEN_NEW_LINE:
|
||||
(*index)++;
|
||||
|
||||
@@ -1,12 +1,197 @@
|
||||
#include "string.h"
|
||||
#include "../../lexer/token.h"
|
||||
|
||||
#include "../../memory.h"
|
||||
#include <cjson/cJSON.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "../../memory.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Helper: decode exactly four hexadecimal digits (either case) starting at
// `in` into a 16-bit value.
// Returns 1 and stores the value in *out on success. Returns 0 as soon as a
// non-hex character is met, in which case *out is left untouched.
static int parse_hex4(const char *in, uint16_t *out) {
  uint16_t accumulated = 0;

  for (int pos = 0; pos != 4; ++pos) {
    const char digit_char = in[pos];
    int nibble;

    if (digit_char >= '0' && digit_char <= '9') {
      nibble = digit_char - '0';
    } else if (digit_char >= 'a' && digit_char <= 'f') {
      nibble = digit_char - 'a' + 10;
    } else if (digit_char >= 'A' && digit_char <= 'F') {
      nibble = digit_char - 'A' + 10;
    } else {
      return 0; // not a hex digit
    }

    // Make room for the next nibble, then append it.
    accumulated = (uint16_t)((accumulated << 4) | nibble);
  }

  *out = accumulated;
  return 1;
}
|
||||
|
||||
// Helper: encode one Unicode scalar value as UTF-8.
//
// Writes 1-4 bytes at *out_ptr, advances *out_ptr past them and returns the
// number of bytes written. The caller must guarantee room for up to 4 bytes.
//
// Returns 0 and leaves *out_ptr untouched when `codepoint` is not a Unicode
// scalar value: above U+10FFFF, or inside the UTF-16 surrogate range
// U+D800..U+DFFF (surrogates have no valid UTF-8 encoding).
static int utf8_encode(uint32_t codepoint, char **out_ptr) {
  if (codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
    return 0; // invalid codepoint
  }

  char *p = *out_ptr;
  int written;

  if (codepoint <= 0x7F) {
    // 0xxxxxxx
    p[0] = (char)codepoint;
    written = 1;
  } else if (codepoint <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    p[0] = (char)(0xC0 | (codepoint >> 6));
    p[1] = (char)(0x80 | (codepoint & 0x3F));
    written = 2;
  } else if (codepoint <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    p[0] = (char)(0xE0 | (codepoint >> 12));
    p[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
    p[2] = (char)(0x80 | (codepoint & 0x3F));
    written = 3;
  } else {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    p[0] = (char)(0xF0 | (codepoint >> 18));
    p[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
    p[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
    p[3] = (char)(0x80 | (codepoint & 0x3F));
    written = 4;
  }

  *out_ptr = p + written;
  return written;
}
|
||||
|
||||
/*
 * Decodes one JSON "\uXXXX" escape, including a following low surrogate when
 * the first unit is a high surrogate.
 *
 * On entry *src_ptr points at the 'u' (the backslash is already consumed).
 * On success the UTF-8 encoding is written at *dst_ptr, both cursors are
 * advanced and 1 is returned. On malformed input 0 is returned and the
 * cursors must be considered unspecified.
 */
static int decode_unicode_escape(const char **src_ptr, const char *src_end,
                                 char **dst_ptr) {
  const char *src = *src_ptr;

  // Need the 'u' plus four hex digits.
  if (src_end - src < 5) {
    return 0;
  }
  uint16_t unit1 = 0;
  if (!parse_hex4(src + 1, &unit1)) {
    return 0;
  }
  src += 5; // consume uXXXX

  uint32_t codepoint = unit1;

  if (unit1 >= 0xDC00 && unit1 <= 0xDFFF) {
    return 0; // low surrogate without a preceding high surrogate
  }
  if (unit1 >= 0xD800 && unit1 <= 0xDBFF) {
    // High surrogate: a "\uXXXX" low surrogate must follow immediately.
    if (src_end - src < 6 || src[0] != '\\' || src[1] != 'u') {
      return 0;
    }
    uint16_t unit2 = 0;
    if (!parse_hex4(src + 2, &unit2)) {
      return 0;
    }
    if (unit2 < 0xDC00 || unit2 > 0xDFFF) {
      return 0; // second unit is not a low surrogate
    }
    codepoint = 0x10000 + ((((uint32_t)unit1 - 0xD800) << 10) |
                           ((uint32_t)unit2 - 0xDC00));
    src += 6; // consume \uXXXX
  }

  if (!utf8_encode(codepoint, dst_ptr)) {
    return 0;
  }
  *src_ptr = src;
  return 1;
}

/**
 * unquote_json_string:
 * Parses and unescapes a double-quoted, JSON-style string literal.
 *
 * Parameters:
 *   input:   literal starting with '"'. Scanning stops at the first
 *            unescaped '"'; anything after that closing quote is ignored.
 *            A raw NUL byte before the closing quote is treated as
 *            "unterminated" and rejected.
 *   out_len: if non-NULL, receives the decoded length in bytes (not counting
 *            the terminator). Only written on success.
 *
 * Supported escapes: \" \\ \/ \b \f \n \r \t and \uXXXX (surrogate pairs are
 * combined; the result is emitted as UTF-8). Any other escape is an error.
 * Bytes that are not part of an escape are copied through unchanged.
 *
 * Returns:
 *   malloc'ed buffer holding the decoded bytes, always followed by a '\0'.
 *   Because "\u0000" decodes to an embedded NUL, callers must use *out_len
 *   rather than strlen() to learn the real length.
 *   NULL on invalid input or allocation failure.
 *
 * Caller must free() the returned buffer.
 */
char *unquote_json_string(const char *input, size_t *out_len) {
  if (!input || input[0] != '"') {
    return NULL;
  }

  // Locate the closing quote, stepping over escaped characters so that an
  // escaped quote (\") does not end the literal.
  const char *body = input + 1;
  const char *end = body;
  while (*end != '"') {
    if (*end == '\\') {
      end++;
    }
    if (*end == '\0') {
      return NULL; // unterminated literal or escape at end of input
    }
    end++;
  }

  // Every escape decodes to no more bytes than it occupies in the source, so
  // the body length (plus the terminator) is a sufficient upper bound.
  size_t input_len = (size_t)(end - body);
  char *outbuf = malloc(input_len + 1);
  if (!outbuf) {
    return NULL;
  }

  const char *src = body;
  char *dst = outbuf;

  while (src < end) {
    char c = *src++;
    if (c != '\\') {
      *dst++ = c;
      continue;
    }

    // Escape sequence: src now points at the character after the backslash.
    if (src >= end) {
      goto fail; // invalid escape at end
    }
    switch (*src) {
    case '"':
    case '\\':
    case '/':
      *dst++ = *src++;
      break;
    case 'b':
      *dst++ = '\b';
      src++;
      break;
    case 'f':
      *dst++ = '\f';
      src++;
      break;
    case 'n':
      *dst++ = '\n';
      src++;
      break;
    case 'r':
      *dst++ = '\r';
      src++;
      break;
    case 't':
      *dst++ = '\t';
      src++;
      break;
    case 'u':
      if (!decode_unicode_escape(&src, end, &dst)) {
        goto fail;
      }
      break;
    default:
      goto fail; // invalid escape char
    }
  }

  // Terminate for convenience; the authoritative length is *out_len.
  *dst = '\0';
  if (out_len) {
    *out_len = (size_t)(dst - outbuf);
  }
  return outbuf;

fail:
  free(outbuf);
  return NULL;
}
|
||||
|
||||
|
||||
|
||||
char *swap_quotes(char *input, char quote) {
|
||||
size_t len = strlen(input);
|
||||
@@ -26,7 +211,8 @@ char *swap_quotes(char *input, char quote) {
|
||||
return result;
|
||||
}
|
||||
|
||||
char *unquote(char *str) {
|
||||
|
||||
char *unquote(char *str, size_t *decoded_len) {
|
||||
if (*str == '\0')
|
||||
return NULL;
|
||||
|
||||
@@ -41,29 +227,16 @@ char *unquote(char *str) {
|
||||
str = swapped;
|
||||
}
|
||||
|
||||
cJSON *json = cJSON_Parse(str);
|
||||
if (!json || !cJSON_IsString(json)) {
|
||||
cJSON_Delete(json);
|
||||
unescaped = unquote_json_string(str, decoded_len);
|
||||
if (!unescaped) {
|
||||
if (swapped)
|
||||
free(swapped);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Copy unescaped string before freeing JSON object
|
||||
const char *decoded = cJSON_GetStringValue(json);
|
||||
if (!decoded) {
|
||||
cJSON_Delete(json);
|
||||
if (swapped)
|
||||
free(swapped);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
unescaped = strdup(decoded);
|
||||
cJSON_Delete(json);
|
||||
if (swapped)
|
||||
free(swapped);
|
||||
|
||||
// If input was single-quoted, swap quotes back in the output
|
||||
if (quote != '"') {
|
||||
char *final = swap_quotes(unescaped, quote);
|
||||
free(unescaped);
|
||||
@@ -73,9 +246,12 @@ char *unquote(char *str) {
|
||||
return unescaped;
|
||||
}
|
||||
|
||||
/**
 * Builds an AST_STRING node from a string-literal token.
 *
 * The node's data is a freshly allocated ParsedString whose bytes and length
 * come from unquote(token->value, ...), so the decoded text is carried with an
 * explicit length instead of relying on a terminating NUL.
 *
 * file:  currently unused by this function.
 * token: string token; only token->value is read.
 *
 * NOTE(review): unquote() returns NULL for input it rejects; in that case
 * `string` is NULL and `length` stays 0. This function does not report the
 * failure itself — confirm callers cope with a NULL string.
 */
ParsedValue *parse_string(char *file, Token *token) {
  ParsedValue *parsedValue = checked_malloc(sizeof(ParsedValue));
  parsedValue->type = AST_STRING;

  ParsedString *parsedString = checked_malloc(sizeof(ParsedString));
  parsedValue->data = parsedString;
  parsedString->length = 0;
  parsedString->string = unquote(token->value, &parsedString->length);

  return parsedValue;
}
|
||||
@@ -6,10 +6,15 @@
|
||||
|
||||
// Declare functions related to string processing in parser
|
||||
|
||||
// Decoded string literal carried with an explicit byte count, so the text may
// contain embedded NUL characters.
typedef struct {
  size_t length; // number of decoded bytes in `string`
  char *string;  // decoded bytes as returned by unquote(); do not use strlen()
} ParsedString;
|
||||
|
||||
char *swap_quotes(char *input, char quote);
|
||||
|
||||
char *unquote(char *str);
|
||||
char *unquote(char *str, size_t *decoded_len);
|
||||
|
||||
ParsedValue *parse_string(Token token);
|
||||
ParsedValue *parse_string(char*file,Token* token);
|
||||
|
||||
#endif // STRING_UTILS_H
|
||||
0
src/translator/delcaration/delcaration.c
Normal file
0
src/translator/delcaration/delcaration.c
Normal file
@@ -1,13 +1,18 @@
|
||||
#include "../translator.h"
|
||||
#include "../../parser/string/string.h"
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/**
 * Emits the bytecode that loads a string constant.
 *
 * parsedValue->data is read as a ParsedString. Exactly `length` bytes of the
 * string are copied into the constants arena (no terminator is appended), and
 * an OP_LOAD_CONST instruction is pushed with, in order: the operand 0,
 * OP_TYPE_STRING, the byte length, and the arena offset of the bytes.
 */
void translate_parsed_string(Translated *translated, ParsedValue *parsedValue) {
  ParsedString *parsedString = (ParsedString *)parsedValue->data;
  size_t string_pos = arena_push(&translated->constants, parsedString->string,
                                 parsedString->length);

  set_registers(translated, 1);
  push_instruction_code(translated, OP_LOAD_CONST);
  push_instruction_code(translated, 0);
  push_instruction_code(translated, OP_TYPE_STRING);
  push_instruction_code(translated, parsedString->length);
  push_instruction_code(translated, string_pos);

  // NOTE(review): echoes every translated string to stdout; this looks like
  // leftover debugging output — confirm before relying on or removing it.
  fwrite(parsedString->string, 1, parsedString->length, stdout);
  putchar('\n');
}
|
||||
@@ -13,15 +13,15 @@ void arena_init(ConstantArena *arena) {
|
||||
}
|
||||
|
||||
/**
 * Adjusts the arena's backing buffer so it can hold at least `new_size` bytes.
 *
 * The capacity is rounded to the next multiple of CHUNK_SIZE strictly greater
 * than `new_size`; if that equals the current capacity nothing happens.
 * On allocation failure an error is printed to stderr and the process exits
 * with EXIT_FAILURE.
 */
void arena_resize(ConstantArena *arena, size_t new_size) {
  size_t new_capacity = ((new_size / CHUNK_SIZE) + 1) * CHUNK_SIZE;
  if (new_capacity == arena->capacity) {
    return;
  }

  // Keep the old pointer until realloc is known to have succeeded.
  void *resized = realloc(arena->data, new_capacity);
  if (!resized) {
    // Old capacity first, requested capacity second, matching the message.
    fprintf(stderr, "error: failed to resize arena from %zu to %zu\n",
            arena->capacity, new_capacity);
    exit(EXIT_FAILURE);
  }

  arena->data = resized;
  arena->capacity = new_capacity;
}
|
||||
|
||||
|
||||
@@ -35,16 +35,6 @@ void * arena_get(ConstantArena *arena, size_t offset) {
|
||||
return arena->data + offset;
|
||||
}
|
||||
|
||||
|
||||
size_t arena_push_string(ConstantArena *arena, const char *string) {
|
||||
size_t length = strlen(string)+1;
|
||||
arena_resize(arena, arena->size+length);
|
||||
size_t offset = arena->size;
|
||||
strcpy(arena->data + arena->size, string);
|
||||
arena->size += length;
|
||||
return offset;
|
||||
}
|
||||
|
||||
size_t arena_push(ConstantArena *arena, const void *data, size_t length) {
|
||||
arena_resize(arena, arena->size+length);
|
||||
size_t offset = arena->size;
|
||||
@@ -55,6 +45,7 @@ size_t arena_push(ConstantArena *arena, const void *data, size_t length) {
|
||||
|
||||
Translated init_translator() {
|
||||
Translated translated;
|
||||
translated.registerCount = 0;
|
||||
darray_init(&translated.bytecode, sizeof(size_t));
|
||||
arena_init(&translated.constants);
|
||||
return translated;
|
||||
|
||||
@@ -24,8 +24,6 @@ typedef struct {
|
||||
|
||||
void * arena_get(ConstantArena *arena, size_t offset);
|
||||
|
||||
size_t arena_push_string(ConstantArena *arena, const char *string);
|
||||
|
||||
size_t arena_push(ConstantArena *arena, const void *data, size_t length);
|
||||
|
||||
size_t push_instruction_code(Translated * translator, size_t code);
|
||||
|
||||
Reference in New Issue
Block a user