fix Unicode handling in parser (#1884)

The parser implicitly assumed that a character in the surrogate-pair head (high-surrogate) range is always followed by a character that completes the surrogate pair, which is not necessarily true.
2017-05-09 01:58:31 +08:00
parent 3fac29a017
commit 2433bb4e52
4 changed files with 23 additions and 17 deletions
--- a/lib/parse.js
+++ b/lib/parse.js
@@ -134,8 +134,17 @@ var UNICODE = {

 function get_full_char(str, pos) {
    var char = str.charAt(pos);
-    if (char >= "\ud800" && char <= "\udbff") {
-        return char + str.charAt(pos + 1);
+    if (is_surrogate_pair_head(char)) {
+        var next = str.charAt(pos + 1);
+        if (is_surrogate_pair_tail(next)) {
+            return char + next;
+        }
+    }
+    if (is_surrogate_pair_tail(char)) {
+        var prev = str.charAt(pos - 1);
+        if (is_surrogate_pair_head(prev)) {
+            return prev + char;
+        }
    }
    return char;
 }
@@ -152,8 +161,8 @@ function get_full_char_length(str) {
    var surrogates = 0;

    for (var i = 0; i < str.length; i++) {
-        if (str.charCodeAt(i) >= 0xd800 && str.charCodeAt(i) <= 0xdbff) {
-            if (str.charCodeAt(i + 1) >= 0xdc00 && str.charCodeAt(i + 1) <= 0xdfff) {
+        if (is_surrogate_pair_head(str.charCodeAt(i))) {
+            if (is_surrogate_pair_tail(str.charCodeAt(i + 1))) {
                surrogates++;
                i++;
            }
@@ -291,7 +300,7 @@ function tokenizer($TEXT, filename, html5_comments, shebang) {
                ch = "\n";
            }
        } else {
-            if (is_surrogate_pair_head(ch)) {
+            if (ch.length > 1) {
                ++S.pos;
                ++S.col;
            }