Skip to content

Commit f398974

Browse files
committed
c2cat: prevent lexing errors in raw_mode to allow full module parse
* fix compilation with `DumpTokens` feature
* extend c2cat to support multiple files
* use a context to avoid global variables
* rename `Feat_invalid` as `Invalid` for invalid character tokens
* produce `Kind.Invalid` tokens for invalid characters in `raw_mode`
* use style names instead of hard-coded colors
1 parent 59c838a commit f398974

File tree

4 files changed

+185
-80
lines changed

4 files changed

+185
-80
lines changed

compiler/compiler.c2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,9 @@ fn void Compiler.build(Compiler* c,
374374
console.log_time("parsing", t1_end - t1_start);
375375
if (!c.diags.isOk()) return;
376376
#if DumpTokens
377-
return;
377+
u32 dump_tokens = 1;
378+
if (dump_tokens)
379+
return;
378380
#endif
379381

380382
if (opts.print_ast_early) {

parser/c2_tokenizer.c2

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ public type Tokenizer struct {
268268
Feature[constants.MaxFeatureDepth+1] feature_stack;
269269
u32 feature_count;
270270
const string_list.List* features;
271-
bool raw_mode; // also emit comments
271+
bool raw_mode; // also emit comments and invalid characters
272272
bool stop_at_eol; // restrict lexing to single line for preprocessor
273273

274274
char[256] error_msg;
@@ -327,9 +327,24 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
327327
const char *endp = nil;
328328
if ((*t.cur & 0x80) && decode_utf8(t.cur, &endp) >= 0) {
329329
// FIXME: should accept BOM \uFEFF (EF BB BF) at start of file?
330+
if (t.raw_mode) {
331+
usize len = cast<usize>(endp - t.cur);
332+
result.kind = Kind.Invalid;
333+
string.memcpy(result.invalid, t.cur, len);
334+
result.invalid[len] = '\0';
335+
t.cur += len;
336+
return;
337+
}
330338
t.error(result, "Unicode (UTF-8) is only allowed inside string literals or comments");
331339
return;
332340
}
341+
if (t.raw_mode) {
342+
result.kind = Kind.Invalid;
343+
result.invalid[0] = *t.cur;
344+
result.invalid[1] = '\0';
345+
t.cur += 1;
346+
return;
347+
}
333348
if (*t.cur >= ' ' && *t.cur < 0x7F)
334349
t.error(result, "invalid char '%c'", *t.cur);
335350
else
@@ -435,7 +450,7 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
435450
result.kind = Kind.MinusEqual;
436451
return;
437452
}
438-
if (*t.cur == '>') {
453+
if (*t.cur == '>' && !t.raw_mode) {
439454
t.cur--;
440455
t.error(result, "use the dot operators instead of '->'");
441456
return;
@@ -602,7 +617,7 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
602617
return;
603618
case CR:
604619
t.cur++;
605-
if (*t.cur != '\n') {
620+
if (*t.cur != '\n' && !t.raw_mode) {
606621
t.error(result, "unexpected character 0x%02X after CR", *t.cur & 0xFF);
607622
return;
608623
}
@@ -703,7 +718,7 @@ fn void Tokenizer.lex_identifier(Tokenizer* t, Token* result) {
703718
while (Identifier_char[cast<u8>(*end)]) end++;
704719

705720
usize len = cast<usize>(end - start);
706-
if (len > constants.MaxIdentifierLen) {
721+
if (len > constants.MaxIdentifierLen && !t.raw_mode) {
707722
t.error(result, "identifier too long (max %d chars)", constants.MaxIdentifierLen);
708723
return;
709724
}
@@ -1155,7 +1170,7 @@ fn bool Tokenizer.lex_block_comment(Tokenizer* t, Token* result) {
11551170
t.error(result, "un-terminated block comment");
11561171
return true;
11571172
case '/':
1158-
if (t.cur[1] == '*') {
1173+
if (t.cur[1] == '*' && !t.raw_mode) {
11591174
t.error(result, "'/*' within block comment");
11601175
return true;
11611176
}
@@ -1199,16 +1214,23 @@ fn bool Tokenizer.lex_feature_cmd(Tokenizer* t, Token* result) {
11991214
t.cur = skip_blanks(t.cur + 1);
12001215

12011216
Kind kind;
1202-
for (kind = Kind.Feat_if; kind < Kind.Feat_invalid; kind++) {
1217+
for (kind = Kind.Feat_if; kind < Kind.Invalid; kind++) {
12031218
const char *word = kind.str() + 1;
12041219
if (compare_word(t.cur, word)) {
12051220
t.cur += string.strlen(word);
12061221
break;
12071222
}
12081223
}
12091224
result.kind = kind;
1210-
if (t.raw_mode)
1225+
1226+
if (t.raw_mode) {
1227+
if (kind == Kind.Invalid) {
1228+
result.invalid[0] = '#';
1229+
result.invalid[1] = '\0';
1230+
t.cur = start + 1;
1231+
}
12111232
return true;
1233+
}
12121234

12131235
t.cur = skip_blanks(t.cur);
12141236
switch (kind) {

parser/token.c2

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ public type Kind enum u8 {
143143
Feat_endif,
144144
Feat_error,
145145
Feat_warning,
146-
Feat_invalid,
146+
Invalid,
147147
LineComment,
148148
BlockComment,
149149
// Special Tokens
@@ -277,7 +277,7 @@ const char*[] token_names = {
277277
[Kind.Feat_endif] = "#endif",
278278
[Kind.Feat_error] = "#error",
279279
[Kind.Feat_warning] = "#warning",
280-
[Kind.Feat_invalid] = "#",
280+
[Kind.Invalid] = "invalid",
281281
[Kind.LineComment] = "l-comment",
282282
[Kind.BlockComment] = "b-comment",
283283
[Kind.Eof] = "eof",
@@ -313,6 +313,7 @@ public type Token struct {
313313
u64 int_value; // IntegerLiteral
314314
f64 float_value; // FloatLiteral
315315
u8 char_value; // CharLiteral
316+
char[8] invalid; // Invalid
316317
}
317318
}
318319
static_assert(16, sizeof(Token));

0 commit comments

Comments
 (0)