Skip to content

Commit a332ca6

Browse files
authored
Support escaping in mro string literals.
Use json syntax for encoding strings. Allow decoding from golang escaping format, which is more permissive.
1 parent 847dc18 commit a332ca6

File tree

7 files changed

+311
-9
lines changed

7 files changed

+311
-9
lines changed

Diff for: martian/syntax/formatter.go

+92-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"path/filepath"
1515
"sort"
1616
"strings"
17+
"unicode/utf8"
1718
)
1819

1920
const (
@@ -109,6 +110,91 @@ func (self *printer) String() string {
109110
return self.buf.String()
110111
}
111112

113+
// QuoteString writes a string, quoted and escaped as json.
114+
//
115+
// The reason we don't just use json.Marshal here is because the default
116+
// encoder html-escapes strings, and disabling that by using json.Encoder
117+
// puts carriage returns at the end of the string, which is also bad for
118+
// this use case. Plus this way we can bypass a lot of reflection junk.
119+
//
120+
// This method is mostly copy/pasted from unexported go standard library
121+
// json encoder implementation (see
122+
// https://github.com/golang/go/blob/release-branch.go1.11/src/encoding/json/encode.go#L884)
123+
func quoteString(w stringWriter, s string) {
124+
w.WriteByte('"')
125+
const hex = "0123456789abcdef"
126+
start := 0
127+
for i := 0; i < len(s); {
128+
// Single-byte code points.
129+
if b := s[i]; b < utf8.RuneSelf {
130+
if b >= ' ' && b != '"' && b != '\\' {
131+
i++
132+
continue
133+
}
134+
if start < i {
135+
w.WriteString(s[start:i])
136+
}
137+
switch b {
138+
case '\\', '"':
139+
w.WriteByte('\\')
140+
w.WriteByte(b)
141+
case '\n':
142+
w.WriteByte('\\')
143+
w.WriteByte('n')
144+
case '\r':
145+
w.WriteByte('\\')
146+
w.WriteByte('r')
147+
case '\t':
148+
w.WriteByte('\\')
149+
w.WriteByte('t')
150+
default:
151+
// This encodes bytes < 0x20 except for \t, \n and \r.
152+
w.WriteString(`\u00`)
153+
w.WriteByte(hex[b>>4])
154+
w.WriteByte(hex[b&0xF])
155+
}
156+
i++
157+
start = i
158+
continue
159+
}
160+
// Multi-byte code points.
161+
c, size := utf8.DecodeRuneInString(s[i:])
162+
if c == utf8.RuneError && size == 1 {
163+
// Transform invalid code points into unicode
164+
// "replacement character".
165+
if start < i {
166+
w.WriteString(s[start:i])
167+
}
168+
w.WriteString(`\ufffd`)
169+
i += size
170+
start = i
171+
continue
172+
}
173+
// U+2028 is LINE SEPARATOR.
174+
// U+2029 is PARAGRAPH SEPARATOR.
175+
// They are both technically valid characters in JSON strings,
176+
// but don't work in JSONP, which has to be evaluated as JavaScript,
177+
// and can lead to security holes there. It is valid JSON to
178+
// escape them, so we do so unconditionally.
179+
// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
180+
if c == '\u2028' || c == '\u2029' {
181+
if start < i {
182+
w.WriteString(s[start:i])
183+
}
184+
w.WriteString(`\u202`)
185+
w.WriteByte(hex[c&0xF])
186+
i += size
187+
start = i
188+
continue
189+
}
190+
i += size
191+
}
192+
if start < len(s) {
193+
w.WriteString(s[start:])
194+
}
195+
w.WriteByte('"')
196+
}
197+
112198
//
113199
// Expression
114200
//
@@ -120,7 +206,12 @@ func (self *ValExp) format(w stringWriter, prefix string) {
120206
} else if self.Kind == KindFloat {
121207
fmt.Fprintf(w, "%g", self.Value)
122208
} else if self.Kind == KindString {
123-
fmt.Fprintf(w, "\"%s\"", self.Value)
209+
switch s := self.Value.(type) {
210+
case string:
211+
quoteString(w, s)
212+
default:
213+
fmt.Fprintf(w, "%q", self.Value)
214+
}
124215
} else if self.Kind == KindMap {
125216
self.formatMap(w, prefix)
126217
} else if self.Kind == KindArray {

Diff for: martian/syntax/formatter_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,9 @@ func TestFormatValueExpression(t *testing.T) {
9292
Equal(t, buff.String(), "\"blah\"", "Double quote a string.")
9393
buff.Reset()
9494

95-
ve.Value = "\"blah\""
95+
ve.Value = `"blah"`
9696
ve.format(&buff, "")
97-
Equal(t, buff.String(), "\"\"blah\"\"", "Double quote a double-quoted string.")
97+
Equal(t, buff.String(), `"\"blah\""`, "Double quote a double-quoted string.")
9898
buff.Reset()
9999

100100
//
@@ -218,7 +218,7 @@ pipeline AWESOME(
218218
call ADD_KEY1(
219219
key = self.key1,
220220
value = self.value1,
221-
failfile = "fail1",
221+
failfile = "fail \n\"1\"",
222222
start = null,
223223
) using (
224224
local = true,

Diff for: martian/syntax/parsenum.go

+17
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,20 @@ func parseFloat(s []byte) float64 {
6060
}
6161
return f
6262
}
63+
64+
// unhex returns the numeric value (0-15) of the hexadecimal digit c.
// Both upper and lower case letters are accepted.
//
// It panics if c is not a hexadecimal digit.
func unhex(c byte) byte {
	if c >= '0' && c <= '9' {
		return c - '0'
	}
	if c >= 'a' && c <= 'f' {
		return c - 'a' + 10
	}
	if c >= 'A' && c <= 'F' {
		return c - 'A' + 10
	}
	msg := []byte("Invalid character ")
	panic(string(append(msg, c)))
}
76+
77+
func parseHexByte(c0, c1 byte) byte {
78+
return (unhex(c0) << 4) + unhex(c1)
79+
}

Diff for: martian/syntax/string_intern.go

+105-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44

55
package syntax
66

7-
import "bytes"
7+
import (
8+
"bytes"
9+
"unicode/utf8"
10+
)
811

912
type stringIntern struct {
1013
internSet map[string]string
@@ -65,12 +68,110 @@ func (store *stringIntern) Get(value []byte) string {
6568
}
6669
}
6770

68-
var quoteBytes = []byte(`"`)
71+
// runeError returns a freshly allocated slice holding the UTF-8 encoding
// of utf8.RuneError (U+FFFD, the unicode replacement character).
func runeError() []byte {
	encoded := make([]byte, utf8.RuneLen(utf8.RuneError))
	utf8.EncodeRune(encoded, utf8.RuneError)
	return encoded
}
76+
77+
func unquoteBytes(value []byte) []byte {
78+
n := len(value)
79+
if n < 2 || value[0] != '"' || value[n-1] != '"' {
80+
// Should be prevented by the tokenizer.
81+
panic("string was not quoted: " + string(value))
82+
}
83+
value = value[1 : n-1]
84+
if !bytes.ContainsAny(value, `\"`) {
85+
// Trivial value, avoid allocation.
86+
return value
87+
}
88+
89+
buf := make([]byte, 0, len(value)+2*utf8.UTFMax)
90+
for len(value) > 0 {
91+
switch c := value[0]; {
92+
case c >= utf8.RuneSelf:
93+
// Multibyte character.
94+
_, size := utf8.DecodeRune(value)
95+
buf = append(buf, value[:size]...)
96+
value = value[size:]
97+
case c != '\\':
98+
buf = append(buf, value[0])
99+
value = value[1:]
100+
default:
101+
// Escape
102+
c2 := value[1]
103+
value = value[2:]
104+
switch c2 {
105+
// easy cases
106+
case 'a':
107+
buf = append(buf, '\a')
108+
case 'b':
109+
buf = append(buf, '\b')
110+
case 'f':
111+
buf = append(buf, '\f')
112+
case 'n':
113+
buf = append(buf, '\n')
114+
case 'r':
115+
buf = append(buf, '\r')
116+
case 't':
117+
buf = append(buf, '\t')
118+
case 'v':
119+
buf = append(buf, '\v')
120+
// Harder cases
121+
case 'x':
122+
// one-byte hex-encoded unicode.
123+
buf = append(buf, parseHexByte(value[0], value[1]))
124+
value = value[2:]
125+
case 'u':
126+
// two-byte hex-encoded unicode.
127+
if len(value) < 4 {
128+
buf = append(buf, runeError()...)
129+
value = value[len(value):]
130+
} else {
131+
var enc [2]byte
132+
n := utf8.EncodeRune(enc[:],
133+
rune(parseHexByte(value[2], value[3]))+
134+
(rune(parseHexByte(value[0], value[1]))<<8))
135+
buf = append(buf, enc[:n]...)
136+
value = value[4:]
137+
}
138+
case 'U':
139+
// four-byte hex-encoded unicode.
140+
if len(value) < 8 {
141+
buf = append(buf, runeError()...)
142+
value = value[len(value):]
143+
} else {
144+
var enc [4]byte
145+
n := utf8.EncodeRune(enc[:],
146+
rune(parseHexByte(value[6], value[7]))+
147+
(rune(parseHexByte(value[4], value[5]))<<8)+
148+
(rune(parseHexByte(value[2], value[3]))<<16)+
149+
(rune(parseHexByte(value[0], value[1]))<<24))
150+
buf = append(buf, enc[:n]...)
151+
value = value[8:]
152+
}
153+
case '0', '1', '2', '3', '4', '5', '6', '7':
154+
// one-byte octal unicode
155+
if value[1] < '0' || value[1] > '7' || value[0] < '0' || value[0] > '7' {
156+
buf = append(buf, runeError()...)
157+
value = value[len(value):]
158+
} else {
159+
buf = append(buf, ((c2-'0')<<6)+((value[0]-'0')<<3)+(value[1]-'0'))
160+
value = value[2:]
161+
}
162+
default:
163+
// \, ", etc.
164+
buf = append(buf, c2)
165+
}
166+
}
167+
}
168+
return buf
169+
}
69170

70171
func (store *stringIntern) unquote(value []byte) string {
71-
return store.Get(bytes.Replace(value, quoteBytes, nil, -1))
172+
return store.Get(unquoteBytes(value))
72173
}
73174

74175
func unquote(qs []byte) string {
75-
return string(bytes.Replace(qs, quoteBytes, nil, -1))
176+
return string(unquoteBytes(qs))
76177
}

Diff for: martian/syntax/string_intern_test.go

+79
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
package syntax
44

55
import (
6+
"bytes"
7+
"encoding/json"
68
"testing"
9+
"testing/quick"
710
)
811

912
func TestStringIntern(t *testing.T) {
@@ -27,3 +30,79 @@ func TestStringIntern(t *testing.T) {
2730
t.Errorf("Bytes key lookup AllocsPerRun = %f, want 0", n)
2831
}
2932
}
33+
34+
func TestUnquote(t *testing.T) {
35+
check := func(t *testing.T, input, expect string) {
36+
t.Helper()
37+
if s := unquote([]byte(input)); s != expect {
38+
t.Errorf("Expected: %q, got %q",
39+
expect, s)
40+
}
41+
}
42+
check(t,
43+
`"\"hey\" is\\\n\tfor \U0001f40es"`,
44+
"\"hey\" is\\\n\tfor \U0001f40es")
45+
check(t,
46+
`"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd"`,
47+
"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd")
48+
check(t, `"multibyte \"ဤ\" character"`, "multibyte \"\xe1\x80\xa4\" character")
49+
check(t, `"Octal is \167eird"`, "Octal is weird")
50+
check(t, `"Hex is \x6eormal"`, "Hex is normal")
51+
check(t, `"Hex is \x6Eormal"`, "Hex is normal")
52+
check(t, `"Hex is \u0146ormal"`, "Hex is \u0146ormal")
53+
check(t, `"We căn use anỿ valid utf-8 ☺"`, "We căn use anỿ valid utf-8 ☺")
54+
check(t, `"Case sensitivity is \U0001f4A9"`, "Case sensitivity is \U0001f4A9")
55+
check(t, `"Control\a\b\f\n\r\t\v characters"`, "Control\a\b\f\n\r\t\v characters")
56+
check(t, `"Invalid \u123"`, "Invalid \ufffd")
57+
}
58+
59+
// Fuzz test for unquote.
60+
func TestUnquoteFuzz(t *testing.T) {
61+
t.Parallel()
62+
if err := quick.CheckEqual(func(s string) string {
63+
return s
64+
}, func(s string) string {
65+
var buf bytes.Buffer
66+
enc := json.NewEncoder(&buf)
67+
enc.SetEscapeHTML(false)
68+
enc.Encode(s)
69+
return unquote(buf.Bytes()[:buf.Len()-1])
70+
}, nil); err != nil {
71+
t.Error(err)
72+
}
73+
}
74+
75+
// Fuzzer test for format/decode round trip.
76+
func TestUnquoteFormat(t *testing.T) {
77+
t.Parallel()
78+
enc := func(s string) []byte {
79+
var buf bytes.Buffer
80+
quoteString(&buf, s)
81+
return buf.Bytes()
82+
}
83+
roundTrip := func(s string) []byte {
84+
return enc(unquote(enc(s)))
85+
}
86+
if err := quick.CheckEqual(enc, roundTrip, nil); err != nil {
87+
t.Error(err)
88+
}
89+
jsonEnc := func(s string) []byte {
90+
var buf bytes.Buffer
91+
enc := json.NewEncoder(&buf)
92+
enc.SetEscapeHTML(false)
93+
enc.Encode(s)
94+
return buf.Bytes()[:buf.Len()-1]
95+
}
96+
if err := quick.CheckEqual(enc, jsonEnc, nil); err != nil {
97+
t.Error(err)
98+
}
99+
check := func(t *testing.T, s string) {
100+
t.Helper()
101+
if e, a := jsonEnc(s), enc(s); !bytes.Equal(e, a) {
102+
t.Errorf("Expected %q -> %q, got %q", s, e, a)
103+
}
104+
}
105+
check(t, "\"hey\" is\\\n\tfor \U0001f40es")
106+
check(t, "Control\a\b\f\n\r\t\v \u2029 characters")
107+
check(t, "Invalid character \x88\xee")
108+
}

Diff for: martian/syntax/tokenizer.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ var rules = [...]rule{
3232
{regexp.MustCompile(`^;`), SEMICOLON},
3333
{regexp.MustCompile(`^,`), COMMA},
3434
{regexp.MustCompile(`^\.`), DOT},
35-
{regexp.MustCompile(`^"[^\"]*"`), LITSTRING}, // double-quoted strings. escapes not supported
35+
// double-quoted strings with escaping.
36+
{regexp.MustCompile(`^"(?:[^\\"]|\\[abfnrtv\\"]|\\[0-7]{3}|\\x[0-9a-fA-f]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"`), LITSTRING},
3637
{regexp.MustCompile(`^filetype\b`), FILETYPE},
3738
{regexp.MustCompile(`^stage\b`), STAGE},
3839
{regexp.MustCompile(`^pipeline\b`), PIPELINE},

0 commit comments

Comments
 (0)