THRIFT-2410: Add a UTF-16 to UTF-8 converter and use it to convert escaped Unicode characters to UTF-8 byte arrays.

Client: Java ME
Patch: Phongphan Phuttha

This closes #702
This commit is contained in:
Phongphan Phuttha 2015-11-18 19:44:30 +07:00 committed by Nobuaki Sukegawa
parent 0ad6ee95e0
commit 69826b21ec

View File

@ -19,6 +19,7 @@
package org.apache.thrift.protocol;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Stack;
@ -348,6 +349,47 @@ public class TJSONProtocol extends TProtocol {
}
}
/**
 * Tells whether a UTF-16 code unit is a high (leading) surrogate,
 * i.e. lies in the range U+D800..U+DBFF.
 *
 * @param c the UTF-16 code unit to classify
 * @return true when c is a high surrogate
 */
private static boolean isHighSurrogate(char c) {
  // Every value in D800..DBFF shares the same top six bits (110110).
  final int topSixBits = c & 0xFC00;
  return topSixBits == 0xD800;
}
/**
 * Tells whether a UTF-16 code unit is a low (trailing) surrogate,
 * i.e. lies in the range U+DC00..U+DFFF.
 *
 * @param c the UTF-16 code unit to classify
 * @return true when c is a low surrogate
 */
private static boolean isLowSurrogate(char c) {
  // Every value in DC00..DFFF shares the same top six bits (110111).
  final int topSixBits = c & 0xFC00;
  return topSixBits == 0xDC00;
}
/**
 * Encodes a single Unicode code point as a UTF-8 byte sequence.
 *
 * Values in the range -0x8000..-1 are treated as a sign-extended 16-bit
 * code unit and mapped back to U+8000..U+FFFF. readJSONString holds the
 * parsed \\uXXXX value in a short, so without this step every escaped
 * character from U+8000 upwards would be emitted as one garbage byte.
 *
 * @param codepoint the code point to encode (U+0000..U+10FFFF), or a
 *                  sign-extended 16-bit code unit
 * @return a new array of 1 to 4 bytes holding the UTF-8 encoding
 * @throws RuntimeException if the value is not a code point in
 *                          U+0000..U+10FFFF after the adjustment above
 */
private static byte[] toUTF8(int codepoint) {
  // Undo sign extension of a short-typed UTF-16 code unit.
  if (codepoint < 0 && codepoint >= -0x8000) {
    codepoint &= 0xffff;
  }
  // UTF-8 is only defined up to U+10FFFF; anything else would yield an
  // invalid byte sequence.
  if (codepoint < 0 || codepoint > 0x10ffff) {
    throw new RuntimeException("Code point outside U+0000..U+10FFFF is not supported");
  }

  // Pick the sequence length and the marker bits of the lead byte.
  int length;
  int leadMarker;
  if (codepoint <= 0x7f) {
    length = 1;
    leadMarker = 0x00;
  } else if (codepoint <= 0x7ff) {
    length = 2;
    leadMarker = 0xc0;
  } else if (codepoint <= 0xffff) {
    length = 3;
    leadMarker = 0xe0;
  } else {
    length = 4;
    leadMarker = 0xf0;
  }

  byte[] bytes = new byte[length];
  // Fill continuation bytes from the end: each carries the next six
  // low-order bits, tagged with the 10xxxxxx prefix.
  for (int i = length - 1; i > 0; i--) {
    bytes[i] = (byte)((codepoint & 0x3f) | 0x80);
    codepoint >>= 6;
  }
  // Whatever bits remain belong in the lead byte.
  bytes[0] = (byte)(codepoint | leadMarker);
  return bytes;
}
/**
 * Combines a UTF-16 surrogate pair into one supplementary code point and
 * returns its UTF-8 encoding.
 *
 * Only the low ten bits of each argument are used.
 *
 * @param high the high (leading) surrogate code unit
 * @param low  the low (trailing) surrogate code unit
 * @return the bytes produced by toUTF8(int) for the combined code point
 */
private static byte[] toUTF8(int high, int low) {
  final int upperTenBits = (high & 0x3ff) << 10;
  final int lowerTenBits = low & 0x3ff;
  // Supplementary code points start at 0x10000.
  return toUTF8(0x10000 + upperTenBits + lowerTenBits);
}
// Write the bytes in array b as JSON characters, escaping as needed
private void writeJSONString(byte[] b) throws TException {
context_.write();
@ -596,6 +638,7 @@ public class TJSONProtocol extends TProtocol {
private TByteArrayOutputStream readJSONString(boolean skipContext)
throws TException {
TByteArrayOutputStream arr = new TByteArrayOutputStream(DEF_STRING_SIZE);
int highSurrogate = 0;
if (!skipContext) {
context_.read();
}
@ -608,10 +651,42 @@ public class TJSONProtocol extends TProtocol {
if (ch == ESCSEQ[0]) {
ch = reader_.read();
if (ch == ESCSEQ[1]) {
readJSONSyntaxChar(ZERO);
readJSONSyntaxChar(ZERO);
trans_.readAll(tmpbuf_, 0, 2);
ch = (byte)((hexVal(tmpbuf_[0]) << 4) + hexVal(tmpbuf_[1]));
trans_.readAll(tmpbuf_, 0, 4);
short cu = (short)(
((short)hexVal(tmpbuf_[0]) << 12) +
((short)hexVal(tmpbuf_[1]) << 8) +
((short)hexVal(tmpbuf_[2]) << 4) +
(short)hexVal(tmpbuf_[3]));
try {
if (isHighSurrogate((char)cu)) {
if (highSurrogate != 0) {
throw new TProtocolException(TProtocolException.INVALID_DATA,
"Expected low surrogate char");
}
highSurrogate = cu;
}
else if (isLowSurrogate((char)cu)) {
if (highSurrogate == 0) {
throw new TProtocolException(TProtocolException.INVALID_DATA,
"Expected high surrogate char");
}
arr.write(toUTF8(highSurrogate, cu));
highSurrogate = 0;
}
else {
arr.write(toUTF8(cu));
}
continue;
}
catch (UnsupportedEncodingException ex) {
throw new TProtocolException(TProtocolException.NOT_IMPLEMENTED,
"JVM does not support UTF-8");
}
catch (IOException ex) {
throw new TProtocolException(TProtocolException.INVALID_DATA,
"Invalid unicode sequence");
}
}
else {
int off = ESCAPE_CHARS.indexOf(ch);
@ -624,6 +699,11 @@ public class TJSONProtocol extends TProtocol {
}
arr.write(ch);
}
if (highSurrogate != 0) {
throw new TProtocolException(TProtocolException.INVALID_DATA,
"Expected low surrogate char");
}
return arr;
}