THRIFT-2410: Add a UTF-16 to UTF-8 converter and use it to convert escaped Unicode characters to a UTF-8 byte array.

Client: Java ME
Patch: Phongphan Phuttha

This closes #702
2024-11-07 02:45:22 +00:00 · 2015-11-18 19:44:30 +07:00 · 2015-11-18 19:44:30 +07:00 · 69826b21ec
commit 69826b21ec
parent 0ad6ee95e0
1 changed files with 84 additions and 4 deletions
--- a/lib/javame/src/org/apache/thrift/protocol/TJSONProtocol.java
+++ b/lib/javame/src/org/apache/thrift/protocol/TJSONProtocol.java
@ -19,6 +19,7 @@

 package org.apache.thrift.protocol;

+import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.util.Stack;

@ -348,6 +349,47 @@ public class TJSONProtocol extends TProtocol {
    }
  }

+  /**
+   * Tells whether a UTF-16 code unit is a high (leading) surrogate,
+   * i.e. lies in U+D800..U+DBFF.
+   *
+   * @param c the code unit to classify
+   * @return true if c is a high surrogate
+   */
+  private static boolean isHighSurrogate(char c) {
+    // Every code unit in D800..DBFF shares the top six bits 1101 10.
+    final int topSixBits = c & 0xFC00;
+    return topSixBits == 0xD800;
+  }
+
+  /**
+   * Tells whether a UTF-16 code unit is a low (trailing) surrogate,
+   * i.e. lies in U+DC00..U+DFFF.
+   *
+   * @param c the code unit to classify
+   * @return true if c is a low surrogate
+   */
+  private static boolean isLowSurrogate(char c) {
+    // Every code unit in DC00..DFFF shares the top six bits 1101 11.
+    final int topSixBits = c & 0xFC00;
+    return topSixBits == 0xDC00;
+  }
+
+  /**
+   * Encodes a single Unicode code point as UTF-8.
+   *
+   * A value in the range -0x8000..-1 is treated as a 16-bit UTF-16 code unit
+   * that was sign-extended on its way here: readJSONString keeps the decoded
+   * escape in a short, so code units U+8000..U+FFFF arrive as negative ints
+   * and would otherwise be emitted as a single garbage byte.
+   *
+   * Values in the surrogate range U+D800..U+DFFF are not rejected; they are
+   * encoded as three bytes, so callers must pair surrogates themselves.
+   *
+   * @param codepoint the code point to encode, U+0000..U+10FFFF
+   * @return a new array of 1 to 4 bytes holding the UTF-8 encoding
+   * @throws IllegalArgumentException if the value lies outside the Unicode
+   *         range even after undoing a 16-bit sign extension
+   */
+  private static byte[] toUTF8(int codepoint) {
+    // Undo short -> int sign extension for code units U+8000..U+FFFF.
+    if (codepoint < 0 && codepoint >= -0x8000) {
+      codepoint &= 0xffff;
+    }
+    // UTF-8 is only defined up to U+10FFFF, the last Unicode code point.
+    if (codepoint < 0 || codepoint > 0x10ffff) {
+      throw new IllegalArgumentException(
+          "Code point outside U+0000..U+10FFFF: 0x" + Integer.toHexString(codepoint));
+    }
+
+    if (codepoint <= 0x7f) {
+      // 0xxxxxxx
+      return new byte[] { (byte)codepoint };
+    }
+    if (codepoint <= 0x7ff) {
+      // 110xxxxx 10xxxxxx
+      return new byte[] {
+        (byte)(0xc0 | (codepoint >> 6)),
+        (byte)(0x80 | (codepoint & 0x3f))
+      };
+    }
+    if (codepoint <= 0xffff) {
+      // 1110xxxx 10xxxxxx 10xxxxxx
+      return new byte[] {
+        (byte)(0xe0 | (codepoint >> 12)),
+        (byte)(0x80 | ((codepoint >> 6) & 0x3f)),
+        (byte)(0x80 | (codepoint & 0x3f))
+      };
+    }
+    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    return new byte[] {
+      (byte)(0xf0 | (codepoint >> 18)),
+      (byte)(0x80 | ((codepoint >> 12) & 0x3f)),
+      (byte)(0x80 | ((codepoint >> 6) & 0x3f)),
+      (byte)(0x80 | (codepoint & 0x3f))
+    };
+  }
+
+  /**
+   * Combines a UTF-16 surrogate pair into one supplementary code point and
+   * encodes it as UTF-8.
+   *
+   * Only the low ten bits of each argument are used; the arguments are not
+   * checked for actually being surrogates.
+   *
+   * @param high the high (leading) surrogate code unit
+   * @param low the low (trailing) surrogate code unit
+   * @return the UTF-8 bytes of the combined code point
+   */
+  private static byte[] toUTF8(int high, int low) {
+    final int leadBits = high & 0x3ff;
+    final int trailBits = low & 0x3ff;
+    // Supplementary code points start at 0x10000; the pair carries a 20-bit offset.
+    return toUTF8(0x10000 + ((leadBits << 10) | trailBits));
+  }
+
  // Write the bytes in array buf as a JSON characters, escaping as needed
  private void writeJSONString(byte[] b) throws TException {
    context_.write();
@ -596,6 +638,7 @@ public class TJSONProtocol extends TProtocol {
  private TByteArrayOutputStream readJSONString(boolean skipContext)
                                                                    throws TException {
    TByteArrayOutputStream arr = new TByteArrayOutputStream(DEF_STRING_SIZE);
+    int highSurrogate = 0;
    if (!skipContext) {
      context_.read();
    }
@ -608,10 +651,42 @@ public class TJSONProtocol extends TProtocol {
      if (ch == ESCSEQ[0]) {
        ch = reader_.read();
        if (ch == ESCSEQ[1]) {
-          readJSONSyntaxChar(ZERO);
-          readJSONSyntaxChar(ZERO);
-          trans_.readAll(tmpbuf_, 0, 2);
-          ch = (byte)((hexVal(tmpbuf_[0]) << 4) + hexVal(tmpbuf_[1]));
+          trans_.readAll(tmpbuf_, 0, 4);
+          short cu = (short)(
+              ((short)hexVal(tmpbuf_[0]) << 12) +
+              ((short)hexVal(tmpbuf_[1]) << 8) +
+              ((short)hexVal(tmpbuf_[2]) << 4) +
+              (short)hexVal(tmpbuf_[3]));
+          try {
+            if (isHighSurrogate((char)cu)) {
+              if (highSurrogate != 0) {
+                throw new TProtocolException(TProtocolException.INVALID_DATA,
+                    "Expected low surrogate char");
+              }
+              highSurrogate = cu;
+            }
+            else if (isLowSurrogate((char)cu)) {
+              if (highSurrogate == 0) {
+                throw new TProtocolException(TProtocolException.INVALID_DATA,
+                    "Expected high surrogate char");
+              }
+
+              arr.write(toUTF8(highSurrogate, cu));
+              highSurrogate = 0;
+            }
+            else {
+              arr.write(toUTF8(cu));
+            }
+            continue;
+          }
+          catch (UnsupportedEncodingException ex) {
+            throw new TProtocolException(TProtocolException.NOT_IMPLEMENTED,
+                "JVM does not support UTF-8");
+          }
+          catch (IOException ex) {
+            throw new TProtocolException(TProtocolException.INVALID_DATA,
+                "Invalid unicode sequence");
+          }
        }
        else {
          int off = ESCAPE_CHARS.indexOf(ch);
@ -624,6 +699,11 @@ public class TJSONProtocol extends TProtocol {
      }
      arr.write(ch);
    }
+
+    if (highSurrogate != 0) {
+      throw new TProtocolException(TProtocolException.INVALID_DATA,
+          "Expected low surrogate char");
+    }
    return arr;
  }