improve unicode handling (#3648)

This commit is contained in:
Alex Lam S.L
2019-12-28 18:06:51 +00:00
committed by GitHub
parent 4cf612dc9f
commit 22b47cdd63
3 changed files with 91 additions and 11 deletions

View File

@@ -119,15 +119,20 @@ function OutputStream(options) {
}); });
} : function(str) { } : function(str) {
var s = ""; var s = "";
for (var i = 0; i < str.length; i++) { for (var i = 0, j = 0; i < str.length; i++) {
if (is_surrogate_pair_head(str[i]) && !is_surrogate_pair_tail(str[i + 1]) var code = str.charCodeAt(i);
|| is_surrogate_pair_tail(str[i]) && !is_surrogate_pair_head(str[i - 1])) { if (is_surrogate_pair_head(code)) {
s += "\\u" + str.charCodeAt(i).toString(16); if (is_surrogate_pair_tail(str.charCodeAt(i + 1))) {
} else { i++;
s += str[i]; continue;
}
} else if (!is_surrogate_pair_tail(code)) {
continue;
} }
s += str.slice(j, i) + "\\u" + code.toString(16);
j = i + 1;
} }
return s; return j == 0 ? str : s + str.slice(j);
}; };
function make_string(str, quote) { function make_string(str, quote) {

View File

@@ -133,14 +133,10 @@ function is_letter(code) {
} }
function is_surrogate_pair_head(code) { function is_surrogate_pair_head(code) {
if (typeof code == "string")
code = code.charCodeAt(0);
return code >= 0xd800 && code <= 0xdbff; return code >= 0xd800 && code <= 0xdbff;
} }
function is_surrogate_pair_tail(code) { function is_surrogate_pair_tail(code) {
if (typeof code == "string")
code = code.charCodeAt(0);
return code >= 0xdc00 && code <= 0xdfff; return code >= 0xdc00 && code <= 0xdfff;
} }

View File

@@ -16,6 +16,81 @@ unicode_parse_variables: {
} }
} }
unicode_escaped_identifier: {
input: {
var \u0061 = "\ud800\udc00";
console.log(a);
}
expect_exact: 'var a="\ud800\udc00";console.log(a);'
expect_stdout: "\ud800\udc00"
}
unicode_identifier_ascii_only: {
beautify = {
ascii_only: true,
}
input: {
var \u0061 = "testing \udbc4\udd11";
var bar = "h\u0065llo";
console.log(a, \u0062\u0061r);
}
expect_exact: 'var a="testing \\udbc4\\udd11";var bar="hello";console.log(a,bar);'
expect_stdout: "testing \udbc4\udd11 hello"
}
unicode_string_literals: {
beautify = {
ascii_only: true,
}
input: {
var a = "6 length unicode character: \udbc4\udd11";
console.log(\u0061);
}
expect_exact: 'var a="6 length unicode character: \\udbc4\\udd11";console.log(a);'
expect_stdout: "6 length unicode character: \udbc4\udd11"
}
check_escape_style: {
beautify = {
ascii_only: true,
}
input: {
var a = "\x01";
var \ua0081 = "\x10"; // \u0081 only in ID_Continue
var \u0100 = "\u0100";
var \u1000 = "\u1000";
var \u1000 = "\ud800\udc00";
var \u3f80 = "\udbc0\udc00";
console.log(\u0061, \ua0081, \u0100, \u1000, \u3f80);
}
expect_exact: 'var a="\\x01";var \\ua0081="\\x10";var \\u0100="\\u0100";var \\u1000="\\u1000";var \\u1000="\\ud800\\udc00";var \\u3f80="\\udbc0\\udc00";console.log(a,\\ua0081,\\u0100,\\u1000,\\u3f80);'
expect_stdout: "\u0001 \u0010 \u0100 \ud800\udc00 \udbc0\udc00"
}
escape_non_escaped_identifier: {
beautify = {
ascii_only: true,
}
input: {
var µþ = "µþ";
console.log(\u00b5þ);
}
expect_exact: 'var \\u00b5\\u00fe="\\xb5\\xfe";console.log(\\u00b5\\u00fe);'
expect_stdout: "µþ"
}
non_escape_2_non_escape: {
beautify = {
ascii_only: false,
}
input: {
var µþ = "µþ";
console.log(\u00b5þ);
}
expect_exact: 'var µþ="µþ";console.log(µþ);'
expect_stdout: "µþ"
}
issue_2242_1: { issue_2242_1: {
beautify = { beautify = {
ascii_only: false, ascii_only: false,
@@ -24,6 +99,7 @@ issue_2242_1: {
console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00"); console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00");
} }
expect_exact: 'console.log("\\ud83d","\\ude00","\ud83d\ude00","\\ud83d@\\ude00");' expect_exact: 'console.log("\\ud83d","\\ude00","\ud83d\ude00","\\ud83d@\\ude00");'
expect_stdout: "\ud83d \ude00 \ud83d\ude00 \ud83d@\ude00"
} }
issue_2242_2: { issue_2242_2: {
@@ -34,6 +110,7 @@ issue_2242_2: {
console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00"); console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00");
} }
expect_exact: 'console.log("\\ud83d","\\ude00","\\ud83d\\ude00","\\ud83d@\\ude00");' expect_exact: 'console.log("\\ud83d","\\ude00","\\ud83d\\ude00","\\ud83d@\\ude00");'
expect_stdout: "\ud83d \ude00 \ud83d\ude00 \ud83d@\ude00"
} }
issue_2242_3: { issue_2242_3: {
@@ -44,6 +121,7 @@ issue_2242_3: {
console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00"); console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00");
} }
expect_exact: 'console.log("\\ud83d"+"\\ude00","\\ud83d"+"@"+"\\ude00");' expect_exact: 'console.log("\\ud83d"+"\\ude00","\\ud83d"+"@"+"\\ude00");'
expect_stdout: "\ud83d\ude00 \ud83d@\ude00"
} }
issue_2242_4: { issue_2242_4: {
@@ -54,6 +132,7 @@ issue_2242_4: {
console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00"); console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00");
} }
expect_exact: 'console.log("\ud83d\ude00","\\ud83d@\\ude00");' expect_exact: 'console.log("\ud83d\ude00","\\ud83d@\\ude00");'
expect_stdout: "\ud83d\ude00 \ud83d@\ude00"
} }
issue_2569: { issue_2569: {