From 37f0e29c95ef1508f6a1a258f536f71df8152d24 Mon Sep 17 00:00:00 2001 From: Mike Pennisi Date: Thu, 7 Aug 2014 08:54:23 -0400 Subject: [PATCH] Implement `allowUnsafeSymbols` encoding option Although `he.escape` allows for escaping "unsafe" markup characters, there is currently no way to escape the inverse set--non-ASCII characters only. This missing functionality is useful in contexts where markup is allowed, but non-ASCII characters might be otherwise mangled (as in the iframe `srcdoc` attribute, for example). Introduce a new option to `he.encode` which disables the escaping of unsafe markup characters. Preserve API compatibility by disabling this behavior by default. --- README.md | 11 ++++++++++- bin/he | 7 ++++++- he.js | 14 +++++++++----- src/he.js | 14 +++++++++----- tests/tests.js | 23 +++++++++++++++++++++++ tests/tests.src.js | 23 +++++++++++++++++++++++ 6 files changed, 80 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 38a843a..c704db1 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ he.encode('foo © bar ≠ baz 𝌆 qux', { #### `encodeEverything` -The default value for the `encodeEverything` option is `false`. This means that `encode()` will not use any character references for printable ASCII symbols that don’t need escaping. Set it to `true` to encode every symbol in the input string. +The default value for the `encodeEverything` option is `false`. This means that `encode()` will not use any character references for printable ASCII symbols that don’t need escaping. Set it to `true` to encode every symbol in the input string. When set to `true`, this option will take precedence over `allowUnsafeSymbols` (setting the latter to `true` will have no effect). ```js // Using the global default setting (defaults to `false`): @@ -149,6 +149,15 @@ he.encode('\x01', { // → Parse error ``` +#### `allowUnsafeSymbols` + +The default value for the `allowUnsafeSymbols` option is `false`. 
This means that characters that are unsafe for use in HTML content (`&`, `<`, `>`, `"`, `'`, and `` ` ``) will be encoded. When set to `true`, only non-ASCII characters will be encoded. If the `encodeEverything` option is set to `true`, this option will be ignored. + +```js +he.encode('foo © and & ampersand', { 'allowUnsafeSymbols': true }); +// → 'foo &#xA9; and & ampersand' +``` + #### Overriding default `encode` options globally The global default setting can be overridden by modifying the `he.encode.options` object. This saves you from passing in an `options` object for every call to `encode` if you want to use the non-default setting. diff --git a/bin/he b/bin/he index 5e4e779..234710c 100755 --- a/bin/he +++ b/bin/he @@ -23,7 +23,7 @@ log([ '\nUsage:\n', '\the [--escape] string', - '\the [--encode] [--use-named-refs] [--everything] string', + '\the [--encode] [--use-named-refs] [--everything] [--allow-unsafe] string', '\the [--decode] [--attribute] [--strict] string', '\the [-v | --version]', '\the [-h | --help]', @@ -59,6 +59,11 @@ options.encodeEverything = true; return; } + if (string == '--allow-unsafe') { + action = 'encode'; + options.allowUnsafeSymbols = true; + return; + } if (string == '--decode') { action = 'decode'; return; diff --git a/he.js b/he.js index 30be85d..6d90ba5 100644 --- a/he.js +++ b/he.js @@ -141,6 +141,7 @@ } var encodeEverything = options.encodeEverything; var useNamedReferences = options.useNamedReferences; + var allowUnsafeSymbols = options.allowUnsafeSymbols; if (encodeEverything) { // Encode ASCII symbols. string = string.replace(regexAsciiWhitelist, function(symbol) { @@ -170,9 +171,11 @@ } else if (useNamedReferences) { // Apply named character references. // Encode `<>"'&` using named character references. 
- string = string.replace(regexEscape, function(string) { - return '&' + encodeMap[string] + ';'; // no need to check `has()` here - }); + if (!allowUnsafeSymbols) { + string = string.replace(regexEscape, function(string) { + return '&' + encodeMap[string] + ';'; // no need to check `has()` here + }); + } // Shorten escapes that represent two symbols, of which at least one is // `<>"'&`. string = string @@ -183,7 +186,7 @@ // Note: there is no need to check `has(encodeMap, string)` here. return '&' + encodeMap[string] + ';'; }); - } else { + } else if (!allowUnsafeSymbols) { // Encode `<>"'&` using hexadecimal escapes, now that they’re not handled // using named character references. string = string.replace(regexEscape, hexEscape); @@ -205,7 +208,8 @@ encode.options = { 'encodeEverything': false, 'strict': false, - 'useNamedReferences': false + 'useNamedReferences': false, + 'allowUnsafeSymbols': false }; var decode = function(html, options) { diff --git a/src/he.js b/src/he.js index 2ba0c14..8d5ec37 100644 --- a/src/he.js +++ b/src/he.js @@ -156,6 +156,7 @@ } var encodeEverything = options.encodeEverything; var useNamedReferences = options.useNamedReferences; + var allowUnsafeSymbols = options.allowUnsafeSymbols; if (encodeEverything) { // Encode ASCII symbols. string = string.replace(regexAsciiWhitelist, function(symbol) { @@ -185,9 +186,11 @@ } else if (useNamedReferences) { // Apply named character references. // Encode `<>"'&` using named character references. - string = string.replace(regexEscape, function(string) { - return '&' + encodeMap[string] + ';'; // no need to check `has()` here - }); + if (!allowUnsafeSymbols) { + string = string.replace(regexEscape, function(string) { + return '&' + encodeMap[string] + ';'; // no need to check `has()` here + }); + } // Shorten escapes that represent two symbols, of which at least one is // `<>"'&`. string = string @@ -198,7 +201,7 @@ // Note: there is no need to check `has(encodeMap, string)` here. 
return '&' + encodeMap[string] + ';'; }); - } else { + } else if (!allowUnsafeSymbols) { // Encode `<>"'&` using hexadecimal escapes, now that they’re not handled // using named character references. string = string.replace(regexEscape, hexEscape); @@ -220,7 +223,8 @@ encode.options = { 'encodeEverything': false, 'strict': false, - 'useNamedReferences': false + 'useNamedReferences': false, + 'allowUnsafeSymbols': false }; var decode = function(html, options) { diff --git a/tests/tests.js b/tests/tests.js index 75057f3..d58c946 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -6598,6 +6598,29 @@ '\0\x89', 'Does not encode invalid code points whose character references would refer to another code point, even when `encodeEverything: true` is used' ); + + equal( + he.encode('foo\xA9baz\u2603"qux', { 'allowUnsafeSymbols': true }), + 'foo©baz☃"qux', + 'Markup characters pass through when `allowUnsafeSymbols: true`' + ); + equal( + he.encode('a', { 'allowUnsafeSymbols': true, 'useNamedReferences': true }), + 'a<∾>', + '`useNamedReferences` only affects non-ASCII symbols when `allowUnsafeSymbols: true`' + ) + raises( + function() { + he.encode('\0\x01\x02\x03\x04\x05\x06\x07\b\x0B\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6\uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF\uFFFE\uFFFF\uD83F\uDFFE\uD83F\uDFFF\uD87F\uDFFE\uD87F\uDFFF\uD8BF\uDFFE\uD8BF\uDFFF\uD8FF\uDFFE\uD8FF\uDFFF\uD93F\uDFFE\uD93F\uDFFF\uD97F\uDFFE\uD97F\uDFFF\uD9BF\uDFFE\uD9BF\uDFFF\uD9FF\uDFFE\uD9FF\uDFFF\uDA3F\uDFFE\uDA3F\uDFFF\uDA7F\uDFFE\uDA7F\uDFFF\uDABF\uDFFE\uDABF\uDFFF\uDAFF\uDFFE\uDAFF\uDFFF\uDB3F\uDFFE\uDB3F\uDFFF\uDB7F\uDFFE\uDB7F\uDFFF\uDBBF\uDFFE\uDBBF\uDFFF\uDBFF\uDFFE\uDBFF\uDFFF', { 
'allowUnsafeSymbols': true, 'strict': true }); + }, + Error, + 'Parse error: forbidden code point when `allowUnsafeSymbols: true` and `strict: true`' + ); }); test('escape', function() { equal( diff --git a/tests/tests.src.js b/tests/tests.src.js index e7343e1..b510ed2 100644 --- a/tests/tests.src.js +++ b/tests/tests.src.js @@ -6598,6 +6598,29 @@ '\0\x89', 'Does not encode invalid code points whose character references would refer to another code point, even when `encodeEverything: true` is used' ); + + equal( + he.encode('foo\xA9baz\u2603"qux', { 'allowUnsafeSymbols': true }), + 'foo©baz☃"qux', + 'Markup characters pass through when `allowUnsafeSymbols: true`' + ); + equal( + he.encode('a', { 'allowUnsafeSymbols': true, 'useNamedReferences': true }), + 'a<∾>', + '`useNamedReferences` only affects non-ASCII symbols when `allowUnsafeSymbols: true`' + ) + raises( + function() { + he.encode(<%= stringInvalidCodePoints %>, { 'allowUnsafeSymbols': true, 'strict': true }); + }, + Error, + 'Parse error: forbidden code point when `allowUnsafeSymbols: true` and `strict: true`' + ); }); test('escape', function() { equal(