@@ -268,7 +268,7 @@ public type Tokenizer struct {
268
268
Feature[constants.MaxFeatureDepth+1] feature_stack;
269
269
u32 feature_count;
270
270
const string_list.List* features;
271
- bool raw_mode; // also emit comments
271
+ bool raw_mode; // also emit comments and invalid characters
272
272
bool stop_at_eol; // restrict lexing to single line for preprocessor
273
273
274
274
char[256] error_msg;
@@ -327,9 +327,24 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
327
327
const char *endp = nil;
328
328
if ((*t.cur & 0x80) && decode_utf8(t.cur, &endp) >= 0) {
329
329
// FIXME: should accept BOM \uFEFF (EF BB BF) at start of file?
330
+ if (t.raw_mode) {
331
+ usize len = cast<usize>(endp - t.cur);
332
+ result.kind = Kind.Invalid;
333
+ string.memcpy(result.invalid, t.cur, len);
334
+ result.invalid[len] = '\0';
335
+ t.cur += len;
336
+ return;
337
+ }
330
338
t.error(result, "Unicode (UTF-8) is only allowed inside string literals or comments");
331
339
return;
332
340
}
341
+ if (t.raw_mode) {
342
+ result.kind = Kind.Invalid;
343
+ result.invalid[0] = *t.cur;
344
+ result.invalid[1] = '\0';
345
+ t.cur += 1;
346
+ return;
347
+ }
333
348
if (*t.cur >= ' ' && *t.cur < 0x7F)
334
349
t.error(result, "invalid char '%c'", *t.cur);
335
350
else
@@ -435,7 +450,7 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
435
450
result.kind = Kind.MinusEqual;
436
451
return;
437
452
}
438
- if (*t.cur == '>') {
453
+ if (*t.cur == '>' && !t.raw_mode ) {
439
454
t.cur--;
440
455
t.error(result, "use the dot operators instead of '->'");
441
456
return;
@@ -602,7 +617,7 @@ fn void Tokenizer.lex_internal(Tokenizer* t, Token* result) {
602
617
return;
603
618
case CR:
604
619
t.cur++;
605
- if (*t.cur != '\n') {
620
+ if (*t.cur != '\n' && !t.raw_mode ) {
606
621
t.error(result, "unexpected character 0x%02X after CR", *t.cur & 0xFF);
607
622
return;
608
623
}
@@ -703,7 +718,7 @@ fn void Tokenizer.lex_identifier(Tokenizer* t, Token* result) {
703
718
while (Identifier_char[cast<u8>(*end)]) end++;
704
719
705
720
usize len = cast<usize>(end - start);
706
- if (len > constants.MaxIdentifierLen) {
721
+ if (len > constants.MaxIdentifierLen && !t.raw_mode ) {
707
722
t.error(result, "identifier too long (max %d chars)", constants.MaxIdentifierLen);
708
723
return;
709
724
}
@@ -1155,7 +1170,7 @@ fn bool Tokenizer.lex_block_comment(Tokenizer* t, Token* result) {
1155
1170
t.error(result, "un-terminated block comment");
1156
1171
return true;
1157
1172
case '/':
1158
- if (t.cur[1] == '*') {
1173
+ if (t.cur[1] == '*' && !t.raw_mode ) {
1159
1174
t.error(result, "'/*' within block comment");
1160
1175
return true;
1161
1176
}
@@ -1199,16 +1214,23 @@ fn bool Tokenizer.lex_feature_cmd(Tokenizer* t, Token* result) {
1199
1214
t.cur = skip_blanks(t.cur + 1);
1200
1215
1201
1216
Kind kind;
1202
- for (kind = Kind.Feat_if; kind < Kind.Feat_invalid ; kind++) {
1217
+ for (kind = Kind.Feat_if; kind < Kind.Invalid ; kind++) {
1203
1218
const char *word = kind.str() + 1;
1204
1219
if (compare_word(t.cur, word)) {
1205
1220
t.cur += string.strlen(word);
1206
1221
break;
1207
1222
}
1208
1223
}
1209
1224
result.kind = kind;
1210
- if (t.raw_mode)
1225
+
1226
+ if (t.raw_mode) {
1227
+ if (kind == Kind.Invalid) {
1228
+ result.invalid[0] = '#';
1229
+ result.invalid[1] = '\0';
1230
+ t.cur = start + 1;
1231
+ }
1211
1232
return true;
1233
+ }
1212
1234
1213
1235
t.cur = skip_blanks(t.cur);
1214
1236
switch (kind) {
0 commit comments