Skip to content

Commit

Permalink
Implement allowUnsafeSymbols encoding option
Browse files Browse the repository at this point in the history
Although `he.escape` allows for escaping ‘unsafe’ markup characters, there is currently no way to escape the inverse set – non-ASCII characters only. This missing functionality is useful in contexts where markup is allowed, but non-ASCII characters might be otherwise mangled (as in the `iframe` `srcdoc` attribute, for example).

Introduce a new option to `he.encode` which disables the escaping of unsafe markup characters. Preserve API compatibility by disabling this behavior by default.

Closes #16 and #23.
  • Loading branch information
jugglinmike authored and mathiasbynens committed Aug 24, 2014
1 parent f48a096 commit 3054a1d
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 13 deletions.
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ he.encode('foo © bar ≠ baz 𝌆 qux', {

#### `encodeEverything`

The default value for the `encodeEverything` option is `false`. This means that `encode()` will not use any character references for printable ASCII symbols that don’t need escaping. Set it to `true` to encode every symbol in the input string.
The default value for the `encodeEverything` option is `false`. This means that `encode()` will not use any character references for printable ASCII symbols that don’t need escaping. Set it to `true` to encode every symbol in the input string. When set to `true`, this option takes precedence over `allowUnsafeSymbols` (i.e. setting the latter to `true` in such a case has no effect).

```js
// Using the global default setting (defaults to `false`):
Expand Down Expand Up @@ -149,6 +149,15 @@ he.encode('\x01', {
// → Parse error
```

#### `allowUnsafeSymbols`

The default value for the `allowUnsafeSymbols` option is `false`. This means that characters that are unsafe for use in HTML content (`&`, `<`, `>`, `"`, `'`, and `` ` ``) will be encoded. When set to `true`, only non-ASCII characters will be encoded. If the `encodeEverything` option is set to `true`, this option will be ignored.

```js
he.encode('foo © and & ampersand', { 'allowUnsafeSymbols': true });
// → 'foo &#xA9; and & ampersand'
```

#### Overriding default `encode` options globally

The global default setting can be overridden by modifying the `he.encode.options` object. This saves you from passing in an `options` object for every call to `encode` if you want to use the non-default setting.
Expand Down
7 changes: 6 additions & 1 deletion bin/he
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
log([
'\nUsage:\n',
'\the [--escape] string',
'\the [--encode] [--use-named-refs] [--everything] string',
'\the [--encode] [--use-named-refs] [--everything] [--allow-unsafe] string',
'\the [--decode] [--attribute] [--strict] string',
'\the [-v | --version]',
'\the [-h | --help]',
Expand Down Expand Up @@ -59,6 +59,11 @@
options.encodeEverything = true;
return;
}
if (string == '--allow-unsafe') {
action = 'encode';
options.allowUnsafeSymbols = true;
return;
}
if (string == '--decode') {
action = 'decode';
return;
Expand Down
14 changes: 9 additions & 5 deletions he.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/he.1
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
.br
.Op Fl -encode Ar string
.br
.Op Fl -encode Fl -use-named-refs Fl -everything Ar string
.Op Fl -encode Fl -use-named-refs Fl -everything Fl -allow-unsafe Ar string
.br
.Op Fl -decode Ar string
.br
Expand All @@ -33,6 +33,8 @@ Take a string of text and encode any symbols that aren't printable ASCII symbols
Enable the use of named character references (like `&copy;`) in the output. If compatibility with older browsers is a concern, don't use this option.
.It Sy "--encode --everything"
Encode every symbol in the input string, even safe printable ASCII symbols.
.It Sy "--encode --allow-unsafe"
Encode non-ASCII characters only. This leaves unsafe HTML/XML symbols like `&`, `<`, `>`, `"`, and `'` intact.
.It Sy "--decode"
Takes a string of HTML and decode any named and numerical character references in it using the algorithm described in the HTML spec.
.It Sy "--decode --attribute"
Expand Down
14 changes: 9 additions & 5 deletions src/he.js
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@
}
var encodeEverything = options.encodeEverything;
var useNamedReferences = options.useNamedReferences;
var allowUnsafeSymbols = options.allowUnsafeSymbols;
if (encodeEverything) {
// Encode ASCII symbols.
string = string.replace(regexAsciiWhitelist, function(symbol) {
Expand Down Expand Up @@ -185,9 +186,11 @@
} else if (useNamedReferences) {
// Apply named character references.
// Encode `<>"'&` using named character references.
string = string.replace(regexEscape, function(string) {
return '&' + encodeMap[string] + ';'; // no need to check `has()` here
});
if (!allowUnsafeSymbols) {
string = string.replace(regexEscape, function(string) {
return '&' + encodeMap[string] + ';'; // no need to check `has()` here
});
}
// Shorten escapes that represent two symbols, of which at least one is
// `<>"'&`.
string = string
Expand All @@ -198,7 +201,7 @@
// Note: there is no need to check `has(encodeMap, string)` here.
return '&' + encodeMap[string] + ';';
});
} else {
} else if (!allowUnsafeSymbols) {
// Encode `<>"'&` using hexadecimal escapes, now that they’re not handled
// using named character references.
string = string.replace(regexEscape, hexEscape);
Expand All @@ -220,7 +223,8 @@
encode.options = {
'encodeEverything': false,
'strict': false,
'useNamedReferences': false
'useNamedReferences': false,
'allowUnsafeSymbols': false
};

var decode = function(html, options) {
Expand Down
22 changes: 22 additions & 0 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -6598,6 +6598,28 @@
'\0\x89',
'Does not encode invalid code points whose character references would refer to another code point, even when `encodeEverything: true` is used'
);
equal(
he.encode('foo\xA9<bar\uD834\uDF06>baz\u2603"qux', { 'allowUnsafeSymbols': true }),
'foo&#xA9;<bar&#x1D306;>baz&#x2603;"qux',
'Markup characters pass through when `allowUnsafeSymbols: true`'
);
equal(
he.encode('a<b', { 'allowUnsafeSymbols': true, 'encodeEverything': true }),
'&#x61;&#x3C;&#x62;',
'`encodeEverything` takes precedence over `allowUnsafeSymbols`'
);
equal(
he.encode('a<\u223E>', { 'allowUnsafeSymbols': true, 'useNamedReferences': true }),
'a<&ac;>',
'`useNamedReferences` only affects non-ASCII symbols when `allowUnsafeSymbols: true`'
)
raises(
function() {
he.encode('\0\x01\x02\x03\x04\x05\x06\x07\b\x0B\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6\uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF\uFFFE\uFFFF\uD83F\uDFFE\uD83F\uDFFF\uD87F\uDFFE\uD87F\uDFFF\uD8BF\uDFFE\uD8BF\uDFFF\uD8FF\uDFFE\uD8FF\uDFFF\uD93F\uDFFE\uD93F\uDFFF\uD97F\uDFFE\uD97F\uDFFF\uD9BF\uDFFE\uD9BF\uDFFF\uD9FF\uDFFE\uD9FF\uDFFF\uDA3F\uDFFE\uDA3F\uDFFF\uDA7F\uDFFE\uDA7F\uDFFF\uDABF\uDFFE\uDABF\uDFFF\uDAFF\uDFFE\uDAFF\uDFFF\uDB3F\uDFFE\uDB3F\uDFFF\uDB7F\uDFFE\uDB7F\uDFFF\uDBBF\uDFFE\uDBBF\uDFFF\uDBFF\uDFFE\uDBFF\uDFFF', { 'allowUnsafeSymbols': true, 'strict': true });
},
Error,
'Parse error: forbidden code point when `allowUnsafeSymbols: true` and `strict: true`'
);
});
test('escape', function() {
equal(
Expand Down
22 changes: 22 additions & 0 deletions tests/tests.src.js
Original file line number Diff line number Diff line change
Expand Up @@ -6598,6 +6598,28 @@
'\0\x89',
'Does not encode invalid code points whose character references would refer to another code point, even when `encodeEverything: true` is used'
);
equal(
he.encode('foo\xA9<bar\uD834\uDF06>baz\u2603"qux', { 'allowUnsafeSymbols': true }),
'foo&#xA9;<bar&#x1D306;>baz&#x2603;"qux',
'Markup characters pass through when `allowUnsafeSymbols: true`'
);
equal(
he.encode('a<b', { 'allowUnsafeSymbols': true, 'encodeEverything': true }),
'&#x61;&#x3C;&#x62;',
'`encodeEverything` takes precedence over `allowUnsafeSymbols`'
);
equal(
he.encode('a<\u223E>', { 'allowUnsafeSymbols': true, 'useNamedReferences': true }),
'a<&ac;>',
'`useNamedReferences` only affects non-ASCII symbols when `allowUnsafeSymbols: true`'
)
raises(
function() {
he.encode(<%= stringInvalidCodePoints %>, { 'allowUnsafeSymbols': true, 'strict': true });
},
Error,
'Parse error: forbidden code point when `allowUnsafeSymbols: true` and `strict: true`'
);
});
test('escape', function() {
equal(
Expand Down

0 comments on commit 3054a1d

Please sign in to comment.