Extend unicode support

* Support \u{xxxx} syntax
* Add support for surrogate pairs
* Allow identifiers to contain unicode escape sequences
This commit is contained in:
Anthony Van de Gejuchte
2016-06-22 17:12:23 +02:00
committed by Richard van Velzen
parent 07785d0003
commit 63c432f4fa
4 changed files with 292 additions and 64 deletions

View File

@@ -77,9 +77,12 @@ function OutputStream(options) {
var OUTPUT = "";
function to_ascii(str, identifier) {
return str.replace(/[\u0000-\u001f\u007f-\uffff]/g, function(ch) {
var code = ch.charCodeAt(0).toString(16);
if (code.length <= 2 && !identifier) {
return str.replace(/[\ud800-\udbff][\udc00-\udfff]|[\u0000-\u001f\u007f-\uffff]/g, function(ch) {
var code = get_full_char_code(ch, 0).toString(16);
if ((identifier && code.length === 1) || code.length > 4) {
return "\\u{" + code + "}";
} else if (code.length <= 2 && !identifier) {
while (code.length < 2) code = "0" + code;
return "\\x" + code;
} else {
@@ -107,7 +110,7 @@ function OutputStream(options) {
case "\u2029": return "\\u2029";
case "\ufeff": return "\\ufeff";
case "\0":
return /[0-7]/.test(str.charAt(i+1)) ? "\\x00" : "\\0";
return /[0-7]/.test(get_full_char(str, i+1)) ? "\\x00" : "\\0";
}
return s;
});
@@ -158,7 +161,13 @@ function OutputStream(options) {
var last = null;
/**
 * Return the final character of `last` as a whole character rather than a
 * single UTF-16 code unit.
 *
 * When is_surrogate_pair_tail() reports the last code unit as the tail of a
 * surrogate pair, the code unit before it is prepended so both halves are
 * returned together; otherwise only the last code unit is returned.
 *
 * NOTE: `last` is initialised to null where it is declared, so this must only
 * be called once `last` holds a string.
 */
function last_char() {
    var char = last.charAt(last.length - 1);
    if (is_surrogate_pair_tail(char)) {
        // Include the leading half so callers never see a lone tail unit.
        return last.charAt(last.length - 2) + char;
    }
    return char;
};
function maybe_newline() {
@@ -170,7 +179,7 @@ function OutputStream(options) {
function print(str) {
str = String(str);
var ch = str.charAt(0);
var ch = get_full_char(str, 0);
if (might_need_semicolon) {
might_need_semicolon = false;

File diff suppressed because one or more lines are too long

View File

@@ -15,3 +15,29 @@ unicode_parse_variables: {
var l = 3;
}
}
// Identifiers declared through brace-style code-point escapes.
// The escape for U+0061 is expected to come out as the plain letter `a`.
// The expected string below uses a single backslash, so that JS literal holds
// the U+10000 character itself rather than escape text.
// NOTE(review): presumably the harness compares expect_exact verbatim against
// the generated output - confirm against the test runner.
unicode_escaped_identifier: {
input: {
var \u{61} = "foo";
var \u{10000} = "bar";
}
expect_exact: 'var a="foo";var \u{10000}="bar";';
}
// Same kind of input, but with the ascii_only output option switched on.
// Escapes for ASCII letters (U+0061 in the identifier, U+0065 inside the
// string) are expected to come out as the plain letters, while the U+10000
// identifier and the U+101111 string content are expected to be written as
// brace escapes. The doubled backslashes in the expected string make the
// literal contain the escape text itself.
unicode_identifier_ascii_only: {
beautify = {ascii_only: true}
input: {
var \u{0061} = "hi";
var bar = "h\u{0065}llo";
var \u{10000} = "testing \u{101111}";
}
expect_exact: 'var a="hi";var bar="hello";var \\u{10000}="testing \\u{101111}";'
}
// ascii_only output for a string literal holding a code point that needs six
// hex digits (U+101111); the expected output keeps it as a brace escape.
unicode_string_literals: {
beautify = {ascii_only: true}
input: {
var a = "6 length unicode character: \u{101111}";
}
expect_exact: 'var a="6 length unicode character: \\u{101111}";'
}

133
test/mocha/unicode.js Normal file
View File

@@ -0,0 +1,133 @@
var assert = require("assert");
var uglify = require("../../");
describe("Unicode", function() {
    /**
     * Build a validation function for assert.throws() that accepts only a
     * uglify.JS_Parse_Error whose message is exactly `message`.
     *
     * @param {string} message - Full expected error message.
     * @returns {function(*): boolean} Predicate applied to the thrown value.
     */
    function parse_error_with(message) {
        return function(e) {
            return e instanceof uglify.JS_Parse_Error
                && e.message === message;
        };
    }

    /**
     * Assert that uglify.parse() rejects every snippet in `tests` with a
     * JS_Parse_Error carrying exactly `message`.
     *
     * The messages are compared with ===, so they must stay identical to the
     * text the parser throws (spelling included); do not "correct" them here.
     *
     * @param {string[]} tests - Source snippets that must fail to parse.
     * @param {string} message - Full expected error message.
     */
    function assert_all_fail_to_parse(tests, message) {
        var fail = parse_error_with(message);
        tests.forEach(function(test) {
            assert.throws(function() {
                uglify.parse(test);
            }, fail, "Expected a parse error for: " + test);
        });
    }

    it("Should not accept invalid code ranges in unicode escape", function() {
        assert_all_fail_to_parse([
            "\\u{110000}", // A bit over the unicode range
            "\\u{100000061} = 'foo'", // 32-bit overflow resulting in "a"
            "\\u{fffffffffff}", // A bit too much over the unicode range
        ], "SyntaxError: Unicode reference out of bounce");
    });

    it("Should not accept invalid unicode sequences", function() {
        assert_all_fail_to_parse([
            "var foo = '\\u-111'",
            "var bar = '\\u{-1}'",
            "var baz = '\\ugggg'"
        ], "SyntaxError: Invalid hex-character pattern in string");
    });

    it("Should throw error if escaped first identifier char is not part of ID_start", function() {
        assert_all_fail_to_parse([
            'var \\u{0} = "foo";',
            'var \\u{10ffff} = "bar";',
            'var \\u000a = "what\'s up";'
        ], "SyntaxError: First identifier char is an invalid identifier char");
    });

    // Characters after the first are checked against ID_Continue, not ID_Start.
    it("Should throw error if escaped non-first identifier char is not part of ID_Continue", function() {
        assert_all_fail_to_parse([
            'var a\\u{0} = "foo";',
            'var a\\u{10ffff} = "bar";',
            'var z\\u000a = "what\'s up";'
        ], "SyntaxError: Invalid escaped identifier char");
    });

    it("Should throw error if identifier is a keyword with an escape sequence", function() {
        assert_all_fail_to_parse([
            'var \\u0069\\u006e = "foo"', // in
            'var \\u0076\\u0061\\u0072 = "bar"', // var
            'var \\u{66}\\u{6f}\\u{72} = "baz"', // for
            'var \\u0069\\u{66} = "foobar"' // if
        ], "SyntaxError: Escaped characters are not allowed in keywords");
    });

    it("Should read strings containing surrogates correctly", function() {
        // Each entry is [input source, expected ascii_only output].
        // The inputs contain real surrogate pairs (single-backslash escapes),
        // the outputs contain the literal escape text (doubled backslashes).
        var tests = [
            ['var a = "\ud800\udc00";', 'var a="\\u{10000}";'],
            ['var b = "\udbff\udfff";', 'var b="\\u{10ffff}";']
        ];
        tests.forEach(function(test) {
            var result = uglify.minify(test[0], {
                fromString: true, output: { ascii_only: true }
            });
            assert.strictEqual(result.code, test[1]);
        });
    });
});