Skip to content

Commit 04b70c2

Browse files
authored
Simplifying JSON read code (#1449)
* Simplifying JSON code Added escape_unicode.hpp * Fix variant reset logic * variant handling of tags * Removing now unused key_stats * Update string_literal.hpp * explicit constexpr on lambda to perhaps fix GCC error * Remove now unused read_json_visitor * make_static for GCC 12
1 parent bbc11fa commit 04b70c2

File tree

7 files changed

+322
-384
lines changed

7 files changed

+322
-384
lines changed

include/glaze/core/common.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ struct glz::meta<glz::error_code>
600600
"invalid_distribution_elements",
601601
"hostname_failure",
602602
"includer_error"};
603-
static constexpr auto value = std::array{none, //
603+
static constexpr std::array value{none, //
604604
version_mismatch, //
605605
invalid_header, //
606606
invalid_query, //

include/glaze/core/opts.hpp

-5
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,6 @@ namespace glz
8080

8181
bool_t bools_as_numbers = false; // Read and write booleans with 1's and 0's
8282

83-
bool_t escaped_unicode_key_conversion =
84-
false; // JSON does not require escaped unicode keys to match with unescaped UTF-8
85-
// This enables automatic escaped unicode unescaping and matching for keys in glz::object, but it comes at a
86-
// performance cost.
87-
8883
bool_t quoted_num = false; // treat numbers as quoted or array-like types as having quoted numbers
8984
bool_t number = false; // read numbers as strings and write these string as numbers
9085
bool_t raw = false; // write out string like values without quotes

include/glaze/json.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#pragma once
55

6+
#include "glaze/json/escape_unicode.hpp"
67
#include "glaze/json/invoke.hpp"
78
#include "glaze/json/json_concepts.hpp"
89
#include "glaze/json/json_ptr.hpp"

include/glaze/json/escape_unicode.hpp

+222
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
// Glaze Library
2+
// For the license information refer to glaze.hpp
3+
4+
#pragma once
5+
6+
#include <string>
7+
#include <cstdint>
8+
9+
#include "glaze/util/string_literal.hpp"
10+
11+
// JSON does not require escaped unicode keys to match with unescaped UTF-8
12+
// In order to match with escaped unicode you can register your fields with
13+
// the escaped unicode value.
14+
// glz::escape_unicode<"😀"> will generate a compile time escaped unicode version
15+
// of your key.
16+
17+
namespace glz::detail
18+
{
19+
// Helper function to append a Unicode escape sequence to the output string.
20+
inline constexpr void append_unicode_escape(std::string& output, uint16_t code_unit) {
21+
output += '\\';
22+
output += 'u';
23+
for (int shift = 12; shift >= 0; shift -= 4) {
24+
uint8_t digit = (code_unit >> shift) & 0xF;
25+
output += (digit < 10) ? ('0' + digit) : ('A' + (digit - 10));
26+
}
27+
}
28+
29+
// Function to calculate the length of the escaped JSON string.
30+
inline constexpr size_t escaped_length(const std::string_view input) {
31+
size_t length = 0;
32+
size_t i = 0;
33+
size_t len = input.size();
34+
35+
while (i < len) {
36+
unsigned char c = static_cast<unsigned char>(input[i++]);
37+
38+
if (c <= 0x7F) {
39+
// ASCII character
40+
switch (c) {
41+
case '\"': case '\\': case '\b': case '\f': case '\n': case '\r': case '\t':
42+
length += 2; // Escaped as two characters
43+
break;
44+
default:
45+
if (c <= 0x1F) {
46+
length += 6; // Control character, escaped as \u00XX
47+
} else {
48+
length += 1; // Regular character
49+
}
50+
break;
51+
}
52+
} else {
53+
// Multibyte UTF-8 character
54+
uint32_t codepoint = 0;
55+
int bytes = 0;
56+
57+
if ((c & 0xE0) == 0xC0) {
58+
// 2-byte sequence
59+
codepoint = c & 0x1F;
60+
bytes = 1;
61+
} else if ((c & 0xF0) == 0xE0) {
62+
// 3-byte sequence
63+
codepoint = c & 0x0F;
64+
bytes = 2;
65+
} else if ((c & 0xF8) == 0xF0) {
66+
// 4-byte sequence
67+
codepoint = c & 0x07;
68+
bytes = 3;
69+
} else {
70+
// Invalid UTF-8 start byte
71+
codepoint = 0xFFFD;
72+
bytes = 0;
73+
}
74+
75+
bool invalid_sequence = false;
76+
77+
for (int j = 0; j < bytes; ++j) {
78+
if (i == len) {
79+
invalid_sequence = true;
80+
break;
81+
}
82+
unsigned char c2 = static_cast<unsigned char>(input[i]);
83+
if ((c2 & 0xC0) != 0x80) {
84+
invalid_sequence = true;
85+
break;
86+
}
87+
codepoint = (codepoint << 6) | (c2 & 0x3F);
88+
++i;
89+
}
90+
91+
if (invalid_sequence) {
92+
// Invalid UTF-8 sequence, replace with U+FFFD
93+
codepoint = 0xFFFD;
94+
}
95+
96+
if (codepoint <= 0xFFFF) {
97+
length += 6; // Escaped as \uXXXX
98+
} else {
99+
length += 12; // Surrogate pair, escaped as \uXXXX\uXXXX
100+
}
101+
}
102+
}
103+
104+
return length;
105+
}
106+
107+
// Main function to escape the JSON string.
108+
inline constexpr std::string escape_json_string(const std::string_view input, const size_t output_length) {
109+
110+
std::string output;
111+
output.reserve(output_length);
112+
113+
size_t i = 0;
114+
size_t len = input.size();
115+
116+
while (i < len) {
117+
unsigned char c = static_cast<unsigned char>(input[i++]);
118+
119+
if (c <= 0x7F) {
120+
// ASCII character
121+
switch (c) {
122+
case '\"': output += "\\\""; break;
123+
case '\\': output += "\\\\"; break;
124+
case '\b': output += "\\b"; break;
125+
case '\f': output += "\\f"; break;
126+
case '\n': output += "\\n"; break;
127+
case '\r': output += "\\r"; break;
128+
case '\t': output += "\\t"; break;
129+
default:
130+
if (c <= 0x1F) {
131+
// Control character, escape using \u00XX
132+
output += "\\u00";
133+
uint8_t high_nibble = (c >> 4) & 0xF;
134+
uint8_t low_nibble = c & 0xF;
135+
output += (high_nibble < 10) ? ('0' + high_nibble) : ('A' + high_nibble - 10);
136+
output += (low_nibble < 10) ? ('0' + low_nibble) : ('A' + low_nibble - 10);
137+
} else {
138+
output += c;
139+
}
140+
break;
141+
}
142+
} else {
143+
// Multibyte UTF-8 character
144+
uint32_t codepoint = 0;
145+
int bytes = 0;
146+
147+
if ((c & 0xE0) == 0xC0) {
148+
// 2-byte sequence
149+
codepoint = c & 0x1F;
150+
bytes = 1;
151+
} else if ((c & 0xF0) == 0xE0) {
152+
// 3-byte sequence
153+
codepoint = c & 0x0F;
154+
bytes = 2;
155+
} else if ((c & 0xF8) == 0xF0) {
156+
// 4-byte sequence
157+
codepoint = c & 0x07;
158+
bytes = 3;
159+
} else {
160+
// Invalid UTF-8 start byte, replace with U+FFFD
161+
codepoint = 0xFFFD;
162+
bytes = 0;
163+
}
164+
165+
bool invalid_sequence = false;
166+
167+
for (int j = 0; j < bytes; ++j) {
168+
if (i == len) {
169+
invalid_sequence = true;
170+
break;
171+
}
172+
unsigned char c2 = static_cast<unsigned char>(input[i]);
173+
if ((c2 & 0xC0) != 0x80) {
174+
invalid_sequence = true;
175+
break;
176+
}
177+
codepoint = (codepoint << 6) | (c2 & 0x3F);
178+
++i;
179+
}
180+
181+
if (invalid_sequence) {
182+
// Invalid UTF-8 sequence, replace with U+FFFD
183+
codepoint = 0xFFFD;
184+
}
185+
186+
if (codepoint <= 0xFFFF) {
187+
// BMP character
188+
append_unicode_escape(output, static_cast<uint16_t>(codepoint));
189+
} else {
190+
// Supplementary character (needs surrogate pair)
191+
codepoint -= 0x10000;
192+
uint16_t high_surrogate = 0xD800 + (codepoint >> 10);
193+
uint16_t low_surrogate = 0xDC00 + (codepoint & 0x3FF);
194+
append_unicode_escape(output, high_surrogate);
195+
append_unicode_escape(output, low_surrogate);
196+
}
197+
}
198+
}
199+
200+
return output;
201+
}
202+
}
203+
204+
namespace glz
205+
{
206+
template <string_literal Str>
207+
inline constexpr auto escape_unicode = []() constexpr -> std::string_view {
208+
constexpr auto escaped = []() constexpr {
209+
constexpr auto output_length = detail::escaped_length(Str.sv());
210+
std::array<char, output_length + 1> result{}; // + 1 for null character
211+
const auto escaped = detail::escape_json_string(Str.sv(), output_length);
212+
for (size_t i = 0; i < output_length; ++i) {
213+
result[i] = escaped[i];
214+
}
215+
return result;
216+
}();
217+
218+
// make_static here required for GCC 12, in the future just make escaped static
219+
auto& arr = detail::make_static<escaped>::value;
220+
return {arr.data(), arr.size() - 1};
221+
}();
222+
}

0 commit comments

Comments
 (0)