fix Unicode handling in parser (#1884)

There was an implicit assumption that a first character within the surrogate head range implies that the next character must complete a surrogate pair, which is not necessarily true.
This commit is contained in:
Alex Lam S.L
2017-05-09 01:58:31 +08:00
committed by GitHub
parent 3fac29a017
commit 2433bb4e52
4 changed files with 23 additions and 17 deletions

View File

@@ -215,16 +215,6 @@ function OutputStream(options) {
var might_add_newline = 0;
var last = "";
function last_char() {
var char = last.charAt(last.length - 1);
if (is_surrogate_pair_tail(char)) {
return last.charAt(last.length - 2) + char;
}
return char;
};
var ensure_line_len = options.max_line_len ? function() {
if (current_col > options.max_line_len) {
if (might_add_newline) {
@@ -247,7 +237,7 @@ function OutputStream(options) {
function print(str) {
str = String(str);
var ch = get_full_char(str, 0);
var prev = last_char();
var prev = get_full_char(last, last.length - 1);
if (might_need_semicolon) {
might_need_semicolon = false;

View File

@@ -134,8 +134,17 @@ var UNICODE = {
function get_full_char(str, pos) {
var char = str.charAt(pos);
if (char >= "\ud800" && char <= "\udbff") {
return char + str.charAt(pos + 1);
if (is_surrogate_pair_head(char)) {
var next = str.charAt(pos + 1);
if (is_surrogate_pair_tail(next)) {
return char + next;
}
}
if (is_surrogate_pair_tail(char)) {
var prev = str.charAt(pos - 1);
if (is_surrogate_pair_head(prev)) {
return prev + char;
}
}
return char;
}
@@ -152,8 +161,8 @@ function get_full_char_length(str) {
var surrogates = 0;
for (var i = 0; i < str.length; i++) {
if (str.charCodeAt(i) >= 0xd800 && str.charCodeAt(i) <= 0xdbff) {
if (str.charCodeAt(i + 1) >= 0xdc00 && str.charCodeAt(i + 1) <= 0xdfff) {
if (is_surrogate_pair_head(str.charCodeAt(i))) {
if (is_surrogate_pair_tail(str.charCodeAt(i + 1))) {
surrogates++;
i++;
}
@@ -291,7 +300,7 @@ function tokenizer($TEXT, filename, html5_comments, shebang) {
ch = "\n";
}
} else {
if (is_surrogate_pair_head(ch)) {
if (ch.length > 1) {
++S.pos;
++S.col;
}