diff --git a/README.md b/README.md index c2936fce..b513ef4a 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,19 @@ HOCON spec for reference: https://lightbend.github.io/config/ * `key={a: 1}\n{b: 2}` * `key={a=1, b=2}` - `url()/file()/classpath()` includes are not supported +- Quotes next to triple-quotes need to be escaped, otherwise they are discarded. + This means `"""a""""` is parsed as `a`, not `a"`. To correctly express `a"`, use one of the following: + * Escape the last `"`: `"""a\""""`; + * Or add `~` around the string value: `"""~a"~"""` (see below). +- Multiline strings allow indentation (spaces, not tabs). + If `~\n` (or `~\r\n`) are the only characters following the opening triple-quote, then it's a multiline string with indentation: + * The first line `~\n` is ignored; + * The indentation spaces of the following lines are trimmed; + * Indentation is allowed but not required for empty lines; + * Indentation level is determined by the least number of leading spaces among the non-empty lines; + * Backslashes are treated as escape characters, i.e. should be escaped with another backslash; + * There is no need to escape quotes in multiline strings, but it's allowed; + * The closing triple-quote can be either `"""` or `~"""` (`~` allows the string to end with `"` without escaping). 
## Schema diff --git a/etc/unescape.conf b/etc/unescape.conf index a7ef540b..78e7e3a1 100644 --- a/etc/unescape.conf +++ b/etc/unescape.conf @@ -5,6 +5,6 @@ sql_laitin1_with_escape_1 = "SELECT * FROM \"t/1\"" sql_laitin1_with_escape_2 = "SELECT * FROM \\\"t/1\\\"" sql_unicode_with_escape_1 = "SELECT * FROM \"t/1\" WHERE clientid = \"-测试专用-\"" sql_unicode_with_escape_2 = "SELECT * FROM \\\"t/1\\\" WHERE clientid = \"-测试专用-\"" -sql_unicode_with_escape_3 = "SELECT * FROM \\\"t/1\\\" WHERE clientid = \"-测试\\\n\r\t专用-\"" +sql_unicode_with_escape_3 = "SELECT * FROM \\\"t/1\\\" WHERE clientid = \"-测试\\\r\n\t专用-\"" z = 1 z1 = "1" diff --git a/src/hocon_pp.erl b/src/hocon_pp.erl index dfb9ec69..19b5f63a 100644 --- a/src/hocon_pp.erl +++ b/src/hocon_pp.erl @@ -21,6 +21,7 @@ -include("hocon_private.hrl"). -define(INDENT, " "). +-define(TRIPLE_QUOTE, <<"\"\"\"">>). %% @doc Pretty print HOCON value. %% Options are: @@ -99,10 +100,10 @@ gen(Bin, Opts) when is_binary(Bin) -> gen(S, Opts) when is_list(S) -> case io_lib:printable_latin1_list(S) of true -> - maybe_quote_latin1_str(S); + gen_str(S, latin1); false -> case io_lib:printable_unicode_list(S) of - true -> <<"\"", (format_escape_sequences(S))/binary, "\"">>; + true -> gen_str(S, unicode); false -> gen_list(S, Opts) end end; @@ -124,26 +125,100 @@ gen(Value, Opts) -> options => Opts }). +gen_str(S, Codec) -> + case is_triple_quote_str(S) of + true -> + gen_triple_quote_str(S); + false -> + gen_single_quote_str(S, Codec) + end. + +%% If a string requires escaping, it is a triple quote string +%% with one exception: if the string itself contains triple-quote +is_triple_quote_str(Chars) -> + case has_triple_quotes(Chars) of + true -> + false; + false -> + lists:any(fun(C) -> esc(C) =/= C end, Chars) + end. + +%% Return 'true' if there are three consecutive quotes in a string. +has_triple_quotes(Chars) -> + nomatch =/= string:find(Chars, "\"\"\""). + +%% If a string has '\n' in it, it's a multiline. 
+%% If it has leading or trailing quotes, +%% it's a multiline -- so that there is no need to escape the quotes. +is_multiline([]) -> + false; +is_multiline(Chars) -> + lists:member($\n, Chars) orelse is_leading_quote(Chars) orelse is_trailling_quote(Chars). + +is_leading_quote([$" | _]) -> true; +is_leading_quote(_) -> false. + +is_trailling_quote(Chars) -> + is_leading_quote(lists:reverse(Chars)). + +gen_single_quote_str(S, latin1) -> + maybe_quote_latin1_str(S); +gen_single_quote_str(S, unicode) -> + <<"\"", (format_escape_sequences(S))/binary, "\"">>. + +gen_triple_quote_str(Str) -> + [ + ?TRIPLE_QUOTE, + maybe_indent(esc_backslashes(Str)), + ?TRIPLE_QUOTE + ]. + +maybe_indent(Chars) -> + case is_multiline(Chars) of + true -> + ["~", indent_multiline_str(Chars), "~"]; + false -> + Chars + end. + +indent_multiline_str(Chars) -> + Lines = hocon_scanner:split_lines(Chars), + indent_str_value_lines(Lines). + +%% mark each line for indentation with 'indent' +%% except for empty lines in the middle of the string +indent_str_value_lines([[]]) -> + %% last line being empty + [?NL]; +indent_str_value_lines([LastLine]) -> + %% last line is not empty + [{indent, bin(LastLine)}]; +indent_str_value_lines([[] | Lines]) -> + %% do not indent empty line + [<<"\n">> | indent_str_value_lines(Lines)]; +indent_str_value_lines([Line | Lines]) -> + [{indent, bin(Line)} | indent_str_value_lines(Lines)]. + gen_list(L, Opts) -> case is_oneliner(L) of true -> %% one line ["[", infix([gen(I, Opts) || I <- L], ", "), "]"]; false -> - do_gen_list(L, Opts) + gen_multiline_list(L, Opts) end. -do_gen_list([_ | _] = L, Opts) -> +gen_multiline_list([_ | _] = L, Opts) -> [ - ["[", ?NL], - do_gen_list_loop(L, Opts#{no_obj_nl => true}), + ["["], + gen_multiline_list_loop(L, Opts#{no_obj_nl => true}), ["]", ?NL] ]. 
-do_gen_list_loop([I], Opts) -> +gen_multiline_list_loop([I], Opts) -> [{indent, gen(I, Opts)}]; -do_gen_list_loop([H | T], Opts) -> - [{indent, [gen(H, Opts), ","]} | do_gen_list_loop(T, Opts)]. +gen_multiline_list_loop([H | T], Opts) -> + [{indent, [gen(H, Opts), ","]} | gen_multiline_list_loop(T, Opts)]. is_oneliner(L) when is_list(L) -> lists:all(fun(X) -> is_number(X) orelse is_binary(X) orelse is_atom(X) end, L); @@ -153,7 +228,7 @@ is_oneliner(M) when is_map(M) -> gen_map(M, Opts) -> case is_oneliner(M) of true -> ["{", infix(gen_map_fields(M, Opts, ""), ", "), "}"]; - false -> [["{", ?NL], {indent, gen_map_fields(M, Opts, ?NL)}, "}"] + false -> ["{", {indent, gen_map_fields(M, Opts, ?NL)}, [?NL, "}"]] end. gen_map_fields(M, Opts, NL) -> @@ -224,7 +299,7 @@ fmt(L) when is_list(L) -> bin(lists:map(fun fmt/1, L)); fmt({indent, Block}) -> FormattedBlock = fmt(Block), - bin([[?INDENT, Line, ?NL] || Line <- split(FormattedBlock)]). + bin([[?NL, ?INDENT, Line] || Line <- split(FormattedBlock)]). split(Bin) -> [Line || Line <- binary:split(Bin, ?NL, [global]), Line =/= <<>>]. @@ -256,3 +331,9 @@ esc($\") -> "\\\""; % \ esc($\\) -> "\\\\"; esc(Char) -> Char. + +esc_backslashes(Str) -> + lists:map(fun esc_backslash/1, Str). + +esc_backslash($\\) -> "\\\\"; +esc_backslash(Char) -> Char. diff --git a/src/hocon_scanner.xrl b/src/hocon_scanner.xrl index 8f157521..476f54f4 100644 --- a/src/hocon_scanner.xrl +++ b/src/hocon_scanner.xrl @@ -81,7 +81,7 @@ Rules. {Integer} : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. {Float} : {token, {float, TokenLine, to_float(TokenChars)}}. {String} : {token, {string, TokenLine, unquote(TokenChars, force_escape)}}. -{MultilineString} : {token, {string, TokenLine, unquote(TokenChars, allow_unescaped)}}. +{MultilineString} : {token, {string, TokenLine, unindent(unquote(TokenChars, allow_unescaped))}}. {Bytesize} : {token, {string, TokenLine, TokenChars}}. {Percent} : {token, {string, TokenLine, TokenChars}}. 
{Duration} : {token, {string, TokenLine, TokenChars}}. @@ -92,6 +92,8 @@ Rules. Erlang code. +-export([split_lines/1]). + maybe_include("include", TokenLine) -> {include, TokenLine}; maybe_include(TokenChars, TokenLine) -> {unqstr, TokenLine, TokenChars}. @@ -111,6 +113,68 @@ strip_surrounded_quotes([$" | Rem]) -> strip_surrounded_quotes(Str) -> Str. +unindent([$~, $\r, $\n | Chars]) -> + do_unindent(Chars); +unindent([$~, $\n | Chars]) -> + do_unindent(Chars); +unindent(Chars) -> + Chars. + +do_unindent(Chars) -> + Lines = split_lines(Chars), + Indent = min_indent(Lines), + NewLines = lists:map(fun(Line) -> trim_indents(Line, Indent) end, Lines), + lists:flatten(lists:join($\n, NewLines)). + +split_lines(Chars) -> + split_lines(Chars, "", []). + +%% Split multiline strings like +%% """~ +%% line1 +%% line2 +%% ~""" +%% into one string per line with the '\n' separators removed, e.g. ["line1", "line2", ""] (leading spaces, if any, are kept) +split_lines([], LastLineR, Lines) -> + %% if the last line ends with '~', drop that trailing '~' + LastLine = case LastLineR of + [$~ | Rest] -> + lists:reverse(Rest); + _ -> + lists:reverse(LastLineR) + end, + lists:reverse([LastLine | Lines]); +split_lines([$\n | Chars], Line, Lines) -> + split_lines(Chars, [], [lists:reverse(Line) | Lines]); +split_lines([Char | Chars], Line, Lines) -> + split_lines(Chars, [Char | Line], Lines). + +min_indent(Lines) -> + Indents0 = lists:map(fun indent_level/1, Lines), + case lists:filter(fun erlang:is_integer/1, Indents0) of + [] -> + 0; + Indents -> + lists:min(Indents) + end. + +indent_level("") -> + ignore; +indent_level(Line) -> + indent_level(Line, 0). + +indent_level([$\s | Chars], Count) -> + indent_level(Chars, Count + 1); +indent_level(_, Count) -> + Count. + +trim_indents([], _Indent) -> + []; +trim_indents(Chars, 0) -> + Chars; +trim_indents([$\s | Chars], Indent) when Indent > 0 -> + trim_indents(Chars, Indent - 1). 
+ % the first clause is commented out on purpose % meaning below two escape sequence (in a hocon file) % key="\\"" diff --git a/test/hocon_pp_tests.erl b/test/hocon_pp_tests.erl index 18b24661..47c554c1 100644 --- a/test/hocon_pp_tests.erl +++ b/test/hocon_pp_tests.erl @@ -65,16 +65,6 @@ do(File) -> ?assertEqual(Conf, Conf3), file:delete(TmpFile). -pp_escape_to_file_test() -> - File = "etc/unescape.conf", - {ok, Conf} = hocon:load(File), - PP = hocon_pp:do(Conf, #{}), - TmpFile = File ++ ".pp", - file:write_file(TmpFile, [PP]), - ?assertEqual(file:read_file(File), file:read_file(TmpFile)), - file:delete(TmpFile), - ok. - pp_quote_test() -> Fun = fun(Map, ExpectBin) -> Bin = iolist_to_binary(hocon_pp:do(Map, #{})), @@ -100,14 +90,15 @@ pp_quote_test() -> Fun(#{<<"$d_dfdk2f">> => <<"12">>}, <<"\"$d_dfdk2f\" = \"12\"\n">>), %% backslash - Fun(#{<<"test_backslash">> => <<"\\emqx">>}, <<"test_backslash = \"\\\\emqx\"\n">>), - Fun(#{<<"test_backslash">> => <<"emqx\\emqx">>}, <<"test_backslash = \"emqx\\\\emqx\"\n">>), - Fun(#{<<"test_backslash">> => <<"emqx\\">>}, <<"test_backslash = \"emqx\\\\\"\n">>), + Fun(#{<<"a">> => <<"\\emqx">>}, <<"a = \"\"\"\\\\emqx\"\"\"\n">>), + Fun(#{<<"b">> => <<"emqx\\emqx">>}, <<"b = \"\"\"emqx\\\\emqx\"\"\"\n">>), + Fun(#{<<"c">> => <<"emqx\\">>}, <<"c = \"\"\"emqx\\\\\"\"\"\n">>), %% quote - Fun(#{<<"test_quote">> => <<"\"emqx">>}, <<"test_quote = \"\\\"emqx\"\n">>), - Fun(#{<<"test_quote">> => <<"emqx\"emqx">>}, <<"test_quote = \"emqx\\\"emqx\"\n">>), - Fun(#{<<"test_quote">> => <<"emqx\"">>}, <<"test_quote = \"emqx\\\"\"\n">>), + Fun(#{<<"A">> => <<"\"emqx">>}, <<"A = \"\"\"~\n \"emqx~\"\"\"\n">>), + Fun(#{<<"B">> => <<"emqx\"emqx">>}, <<"B = \"\"\"emqx\"emqx\"\"\"\n">>), + Fun(#{<<"C">> => <<"emqx\"">>}, <<"C = \"\"\"~\n emqx\"~\"\"\"\n">>), + Fun(#{<<"D">> => <<"emqx\"\"\"">>}, <<"D = \"emqx\\\"\\\"\\\"\"\n">>), %% '${}[]:=,+#`^?!@*& ' should quote lists:foreach( @@ -115,14 +106,32 @@ pp_quote_test() -> Header = list_to_binary([Char 
| "emqx"]), Tail = list_to_binary("emqx" ++ [Char]), Middle = <>, - Fun(#{<<"test_key">> => Header}, <<"test_key = \"", Header/binary, "\"\n">>), - Fun(#{<<"test_key">> => Tail}, <<"test_key = \"", Tail/binary, "\"\n">>), - Fun(#{<<"test_key">> => Middle}, <<"test_key = \"", Middle/binary, "\"\n">>) + Fun(#{<<"D">> => Header}, <<"D = \"", Header/binary, "\"\n">>), + Fun(#{<<"E">> => Tail}, <<"E = \"", Tail/binary, "\"\n">>), + Fun(#{<<"F">> => Middle}, <<"F = \"", Middle/binary, "\"\n">>) end, "'${}[]:=,+#`^?!@*& " ), ok. +multi_line_str_indent_test() -> + Struct = #{<<"a">> => #{<<"b">> => #{<<"c">> => "line1\n\nline2\n\nline3\n"}}}, + Expected = << + "a {\n" + " b {\n" + " c = \"\"\"~\n" + " line1\n" + "\n" + " line2\n" + "\n" + " line3\n" + " ~\"\"\"\n" + " }\n" + "}\n" + >>, + ?assertEqual(Expected, iolist_to_binary(hocon_pp:do(Struct, #{}))), + ok. + load_file_pp_test() -> TmpF = "/tmp/load_file_pp_test", F = fun(Raw, Format) -> diff --git a/test/hocon_tests.erl b/test/hocon_tests.erl index d5d97e2e..bb1cf154 100644 --- a/test/hocon_tests.erl +++ b/test/hocon_tests.erl @@ -268,8 +268,33 @@ escape_test_() -> ) ]. -multiline_string_test_() -> - []. 
+triple_quote_string_test_() -> + Parse = fun(Str) -> maps:get(<<"a">>, binary(<<"a = \"\"\"", Str/binary, "\"\"\"">>)) end, + [ + ?_assertEqual(<<"1">>, Parse(<<"1">>)), + ?_assertEqual(<<"1">>, Parse(<<"~\n1~">>)), + ?_assertEqual(<<"1\n">>, Parse(<<"~\n1\n~">>)), + ?_assertEqual(<<"1\r\n">>, Parse(<<"~\r\n1\r\n">>)), + ?_assertEqual(<<"1\n\n2">>, Parse(<<"~\n1\n\n2">>)), + ?_assertEqual(<<"1\n\n2">>, Parse(<<"~\n 1\n\n 2">>)), + ?_assertEqual(<<"1\n\n2">>, Parse(<<"~\n 1\n \n 2">>)), + ?_assertEqual(<<" 1\n\n2">>, Parse(<<"~\n 1\n \n 2">>)), + ?_assertEqual(<<" 1\n\n2\n">>, Parse(<<"~\n 1\n \n 2\n">>)), + ?_assertEqual(<<" 1\n\n2\n">>, Parse(<<"~\n 1\n \n 2\n ">>)), + ?_assertEqual(<<" 1\n\n2\n">>, Parse(<<"~\n 1\n \n 2\n ~">>)), + ?_assertEqual(<<" 1\n\n2\n ">>, Parse(<<"~\n 1\n \n 2\n ~">>)), + ?_assertEqual(<<"1\"\"\n2">>, Parse(<<"~\n 1\"\"\n 2">>)), + %% must escape quotes if it's next to """ + ?_assertEqual(<<"1\"">>, Parse(<<"1\\\"">>)), + %% must escape quotes if it's next to """ + ?_assertEqual(<<"\"1">>, Parse(<<"\\\"1">>)), + %% no need to escape quotes unless it's next to """ + ?_assertEqual(<<"1\"2">>, Parse(<<"1\"2">>)), + %% empty string with closing quote in the next line + ?_assertEqual(<<"">>, Parse(<<"~\n">>)), + %% empty string with indented closing quote in the next line + ?_assertEqual(<<"">>, Parse(<<"~\n ~">>)) + ]. obj_inside_array_test_() -> [