Skip to content

Commit f398974

Browse files
committed
c2cat: prevent lexing errors in raw_mode to allow full module parse
* fix compilation with `DumpTokens` feature
* extend c2cat to support multiple files
* use a context to avoid global variables
* rename `Feat_invalid` as `Invalid` for invalid character tokens
* produce `Kind.Invalid` tokens for invalid characters in `raw_mode`
* use style names instead of hard-coded colors
1 parent 59c838a commit f398974

File tree

4 files changed

+185
-80
lines changed

4 files changed

+185
-80
lines changed

compiler/compiler.c2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,9 @@ fn void Compiler.build(Compiler* c,
374374
console.log_time("parsing", t1_end - t1_start);
375375
if (!c.diags.isOk()) return;
376376
#if DumpTokens
377-
return;
377+
u32 dump_tokens = 1;
378+
if (dump_tokens)
379+
return;
378380
#endif
379381

380382
if (opts.print_ast_early) {

parser/c2_tokenizer.c2

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ public type Tokenizer struct {
268268
Feature[constants.MaxFeatureDepth+1] feature_stack;
269269
u32 feature_count;
270270
const string_list.List* features;
271-
bool raw_mode; // also emit comments
271+
bool raw_mode; // also emit comments and invalid characters
272272
bool stop_at_eol; // restrict lexing to single line for preprocessor
273273

274274
char[256] error_msg;
@@ -327,9 +327,24 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
327327
const char *endp = nil;
328328
if ((*t.cur & 0x80) && decode_utf8(t.cur, &endp) >= 0) {
329329
// FIXME: should accept BOM \uFEFF (EF BB BF) at start of file?
330+
if (t.raw_mode) {
331+
usize len = cast<usize>(endp - t.cur);
332+
result.kind = Kind.Invalid;
333+
string.memcpy(result.invalid, t.cur, len);
334+
result.invalid[len] = '\0';
335+
t.cur += len;
336+
return;
337+
}
330338
t.error(result, "Unicode (UTF-8) is only allowed inside string literals or comments");
331339
return;
332340
}
341+
if (t.raw_mode) {
342+
result.kind = Kind.Invalid;
343+
result.invalid[0] = *t.cur;
344+
result.invalid[1] = '\0';
345+
t.cur += 1;
346+
return;
347+
}
333348
if (*t.cur >= ' ' && *t.cur < 0x7F)
334349
t.error(result, "invalid char '%c'", *t.cur);
335350
else
@@ -435,7 +450,7 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
435450
result.kind = Kind.MinusEqual;
436451
return;
437452
}
438-
if (*t.cur == '>') {
453+
if (*t.cur == '>' && !t.raw_mode) {
439454
t.cur--;
440455
t.error(result, "use the dot operators instead of '->'");
441456
return;
@@ -602,7 +617,7 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
602617
return;
603618
case CR:
604619
t.cur++;
605-
if (*t.cur != '\n') {
620+
if (*t.cur != '\n' && !t.raw_mode) {
606621
t.error(result, "unexpected character 0x%02X after CR", *t.cur & 0xFF);
607622
return;
608623
}
@@ -703,7 +718,7 @@ fn void Tokenizer.lex_identifier(Tokenizer* t, Token* result) {
703718
while (Identifier_char[cast<u8>(*end)]) end++;
704719

705720
usize len = cast<usize>(end - start);
706-
if (len > constants.MaxIdentifierLen) {
721+
if (len > constants.MaxIdentifierLen && !t.raw_mode) {
707722
t.error(result, "identifier too long (max %d chars)", constants.MaxIdentifierLen);
708723
return;
709724
}
@@ -1155,7 +1170,7 @@ fn bool Tokenizer.lex_block_comment(Tokenizer* t, Token* result) {
11551170
t.error(result, "un-terminated block comment");
11561171
return true;
11571172
case '/':
1158-
if (t.cur[1] == '*') {
1173+
if (t.cur[1] == '*' && !t.raw_mode) {
11591174
t.error(result, "'/*' within block comment");
11601175
return true;
11611176
}
@@ -1199,16 +1214,23 @@ fn bool Tokenizer.lex_feature_cmd(Tokenizer* t, Token* result) {
11991214
t.cur = skip_blanks(t.cur + 1);
12001215

12011216
Kind kind;
1202-
for (kind = Kind.Feat_if; kind < Kind.Feat_invalid; kind++) {
1217+
for (kind = Kind.Feat_if; kind < Kind.Invalid; kind++) {
12031218
const char *word = kind.str() + 1;
12041219
if (compare_word(t.cur, word)) {
12051220
t.cur += string.strlen(word);
12061221
break;
12071222
}
12081223
}
12091224
result.kind = kind;
1210-
if (t.raw_mode)
1225+
1226+
if (t.raw_mode) {
1227+
if (kind == Kind.Invalid) {
1228+
result.invalid[0] = '#';
1229+
result.invalid[1] = '\0';
1230+
t.cur = start + 1;
1231+
}
12111232
return true;
1233+
}
12121234

12131235
t.cur = skip_blanks(t.cur);
12141236
switch (kind) {

parser/token.c2

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ public type Kind enum u8 {
143143
Feat_endif,
144144
Feat_error,
145145
Feat_warning,
146-
Feat_invalid,
146+
Invalid,
147147
LineComment,
148148
BlockComment,
149149
// Special Tokens
@@ -277,7 +277,7 @@ const char*[] token_names = {
277277
[Kind.Feat_endif] = "#endif",
278278
[Kind.Feat_error] = "#error",
279279
[Kind.Feat_warning] = "#warning",
280-
[Kind.Feat_invalid] = "#",
280+
[Kind.Invalid] = "invalid",
281281
[Kind.LineComment] = "l-comment",
282282
[Kind.BlockComment] = "b-comment",
283283
[Kind.Eof] = "eof",
@@ -313,6 +313,7 @@ public type Token struct {
313313
u64 int_value; // IntegerLiteral
314314
f64 float_value; // FloatLiteral
315315
u8 char_value; // CharLiteral
316+
char[8] invalid; // Invalid
316317
}
317318
}
318319
static_assert(16, sizeof(Token));

0 commit comments

Comments
 (0)