@@ -667,18 +667,24 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
667
667
{ " \\ p{N}" , unicode_cpt_flags::NUMBER },
668
668
{ " \\ p{L}" , unicode_cpt_flags::LETTER },
669
669
{ " \\ p{P}" , unicode_cpt_flags::PUNCTUATION },
670
+ { " \\ p{M}" , unicode_cpt_flags::ACCENT_MARK },
671
+ { " \\ p{S}" , unicode_cpt_flags::SYMBOL },
670
672
};
671
673
672
674
// Lookup table keyed by unicode_cpt_flags category, giving one integer value per
// category: NUMBER, LETTER, PUNCTUATION, ACCENT_MARK and SYMBOL map to the
// consecutive values 0xD1 through 0xD5.
// NOTE(review): presumably these are the stand-in codepoints used when text is
// "collapsed" by category (a later comment mentions collapsed codepoints) -
// confirm against the code that reads k_ucat_cpt.
// NOTE(review): the bare numbers and the leading '+' on some lines look like
// diff-view markers rather than C++ tokens - confirm against the upstream file.
static const std::map<int , int > k_ucat_cpt = {
673
675
{ unicode_cpt_flags::NUMBER, 0xD1 },
674
676
{ unicode_cpt_flags::LETTER, 0xD2 },
675
677
{ unicode_cpt_flags::PUNCTUATION, 0xD3 },
678
+ { unicode_cpt_flags::ACCENT_MARK, 0xD4 },
679
+ { unicode_cpt_flags::SYMBOL, 0xD5 },
676
680
};
677
681
678
682
// Lookup table keyed by unicode_cpt_flags category, giving a string of
// characters and character ranges written with hex escapes. Per the trailing
// comments on each entry, the strings spell out 0-9 for NUMBER, A-Za-z for
// LETTER, the listed ASCII punctuation for PUNCTUATION and the listed ASCII
// symbols for SYMBOL; ACCENT_MARK is noted as having no sub-128 codepoints.
// Several members are preceded by a literal backslash (written as two
// backslashes in the source) - presumably so they stay escaped when the text is
// spliced into a regex character class; confirm against the consuming code.
// NOTE(review): '~' (0x7E) is Unicode general category Sm (math symbol) but is
// not listed in the SYMBOL entry - confirm whether the omission is intentional.
// NOTE(review): the spaces inside the literals, the bare numbers and the leading
// '+' on some lines look like extraction/diff-view artifacts - confirm against
// the upstream file before relying on the exact literal contents.
static const std::map<int , std::string> k_ucat_map = {
679
683
{ unicode_cpt_flags::NUMBER, " \x30 -\x39 " }, // 0-9
680
684
{ unicode_cpt_flags::LETTER, " \x41 -\x5A\x61 -\x7A " }, // A-Za-z
681
685
{ unicode_cpt_flags::PUNCTUATION, " \x21 -\x23\x25 -\x2A\x2C -\x2F\x3A -\x3B\x3F -\x40\\\x5B -\\\x5D\x5F\\\x7B\\\x7D " }, // !-#%-*,-/:-;?-@\[-\]_\{\}
686
+ { unicode_cpt_flags::ACCENT_MARK, " " }, // no sub-128 codepoints
687
+ { unicode_cpt_flags::SYMBOL, " \\\x24\\\x2B\x3C -\x3E\x5E\x60\\\x7C " }, // $+<=>^`|
682
688
};
683
689
684
690
// compute collapsed codepoints only if needed by at least one regex
0 commit comments