fix Unicode handling in parser (#1884)
The parser implicitly assumed that a character in the surrogate-pair head (high-surrogate) range is always followed by a character that completes the surrogate pair, which is not necessarily true.
This commit is contained in:
@@ -212,7 +212,7 @@ function run() {
|
||||
fatal("ERROR: " + ex.message);
|
||||
}
|
||||
if (program.output == "spidermonkey") {
|
||||
console.log(JSON.stringify(UglifyJS.parse(new Buffer(result.code).toString()).to_mozilla_ast(), null, 2));
|
||||
console.log(JSON.stringify(UglifyJS.parse(result.code).to_mozilla_ast(), null, 2));
|
||||
} else if (program.output) {
|
||||
fs.writeFileSync(program.output, result.code);
|
||||
if (result.map) {
|
||||
|
||||
@@ -215,16 +215,6 @@ function OutputStream(options) {
|
||||
var might_add_newline = 0;
|
||||
var last = "";
|
||||
|
||||
function last_char() {
|
||||
var char = last.charAt(last.length - 1);
|
||||
|
||||
if (is_surrogate_pair_tail(char)) {
|
||||
return last.charAt(last.length - 2) + char;
|
||||
}
|
||||
|
||||
return char;
|
||||
};
|
||||
|
||||
var ensure_line_len = options.max_line_len ? function() {
|
||||
if (current_col > options.max_line_len) {
|
||||
if (might_add_newline) {
|
||||
@@ -247,7 +237,7 @@ function OutputStream(options) {
|
||||
function print(str) {
|
||||
str = String(str);
|
||||
var ch = get_full_char(str, 0);
|
||||
var prev = last_char();
|
||||
var prev = get_full_char(last, last.length - 1);
|
||||
if (might_need_semicolon) {
|
||||
might_need_semicolon = false;
|
||||
|
||||
|
||||
19
lib/parse.js
19
lib/parse.js
@@ -134,8 +134,17 @@ var UNICODE = {
|
||||
|
||||
function get_full_char(str, pos) {
|
||||
var char = str.charAt(pos);
|
||||
if (char >= "\ud800" && char <= "\udbff") {
|
||||
return char + str.charAt(pos + 1);
|
||||
if (is_surrogate_pair_head(char)) {
|
||||
var next = str.charAt(pos + 1);
|
||||
if (is_surrogate_pair_tail(next)) {
|
||||
return char + next;
|
||||
}
|
||||
}
|
||||
if (is_surrogate_pair_tail(char)) {
|
||||
var prev = str.charAt(pos - 1);
|
||||
if (is_surrogate_pair_head(prev)) {
|
||||
return prev + char;
|
||||
}
|
||||
}
|
||||
return char;
|
||||
}
|
||||
@@ -152,8 +161,8 @@ function get_full_char_length(str) {
|
||||
var surrogates = 0;
|
||||
|
||||
for (var i = 0; i < str.length; i++) {
|
||||
if (str.charCodeAt(i) >= 0xd800 && str.charCodeAt(i) <= 0xdbff) {
|
||||
if (str.charCodeAt(i + 1) >= 0xdc00 && str.charCodeAt(i + 1) <= 0xdfff) {
|
||||
if (is_surrogate_pair_head(str.charCodeAt(i))) {
|
||||
if (is_surrogate_pair_tail(str.charCodeAt(i + 1))) {
|
||||
surrogates++;
|
||||
i++;
|
||||
}
|
||||
@@ -291,7 +300,7 @@ function tokenizer($TEXT, filename, html5_comments, shebang) {
|
||||
ch = "\n";
|
||||
}
|
||||
} else {
|
||||
if (is_surrogate_pair_head(ch)) {
|
||||
if (ch.length > 1) {
|
||||
++S.pos;
|
||||
++S.col;
|
||||
}
|
||||
|
||||
@@ -135,4 +135,11 @@ describe("Unicode", function() {
|
||||
}).code, tests[i][1]);
|
||||
}
|
||||
});
|
||||
|
||||
it("Should parse raw characters correctly", function() {
|
||||
var ast = uglify.parse('console.log("\\udbff");');
|
||||
assert.strictEqual(ast.print_to_string(), 'console.log("\udbff");');
|
||||
ast = uglify.parse(ast.print_to_string());
|
||||
assert.strictEqual(ast.print_to_string(), 'console.log("\udbff");');
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user