mirror of
https://github.com/valitydev/thrift.git
synced 2024-11-07 02:45:22 +00:00
THRIFT-2411 - C++: Fixed support for UTF-16 encoding in JSON protocol
Client: C++ Patch: Phongphan Phuttha <phongphan@acm.org> Supports Unicode-escaped characters, including UTF-16 surrogate pairs, which allow extended characters outside the Basic Multilingual Plane. This closes #648
This commit is contained in:
parent
86da51d2ae
commit
a84e139215
@ -26,6 +26,7 @@
|
|||||||
|
|
||||||
#include <boost/math/special_functions/fpclassify.hpp>
|
#include <boost/math/special_functions/fpclassify.hpp>
|
||||||
#include <boost/lexical_cast.hpp>
|
#include <boost/lexical_cast.hpp>
|
||||||
|
#include <boost/locale.hpp>
|
||||||
|
|
||||||
#include <thrift/protocol/TBase64Utils.h>
|
#include <thrift/protocol/TBase64Utils.h>
|
||||||
#include <thrift/transport/TTransportException.h>
|
#include <thrift/transport/TTransportException.h>
|
||||||
@ -285,6 +286,16 @@ static bool isJSONNumeric(uint8_t ch) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return true if the code unit is high surrogate
|
||||||
|
static bool isHighSurrogate(uint16_t val) {
|
||||||
|
return val >= 0xD800 && val <= 0xDBFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true if the UTF-16 code unit is a low (trailing) surrogate,
// i.e. it lies in the range 0xDC00..0xDFFF.
static bool isLowSurrogate(uint16_t val) {
  // Every value in 0xDC00..0xDFFF shares the same top six bits (110111).
  return (val & 0xFC00) == 0xDC00;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class to serve as base JSON context and as base class for other context
|
* Class to serve as base JSON context and as base class for other context
|
||||||
* implementations
|
* implementations
|
||||||
@ -709,14 +720,17 @@ uint32_t TJSONProtocol::readJSONSyntaxChar(uint8_t ch) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Decodes the four hex parts of a JSON escaped string character and returns
|
// Decodes the four hex parts of a JSON escaped string character and returns
|
||||||
// the character via out. The first two characters must be "00".
|
// the UTF-16 code unit via out.
|
||||||
uint32_t TJSONProtocol::readJSONEscapeChar(uint8_t* out) {
|
uint32_t TJSONProtocol::readJSONEscapeChar(uint16_t* out) {
|
||||||
uint8_t b[2];
|
uint8_t b[4];
|
||||||
readJSONSyntaxChar(kJSONZeroChar);
|
|
||||||
readJSONSyntaxChar(kJSONZeroChar);
|
|
||||||
b[0] = reader_.read();
|
b[0] = reader_.read();
|
||||||
b[1] = reader_.read();
|
b[1] = reader_.read();
|
||||||
*out = (hexVal(b[0]) << 4) + hexVal(b[1]);
|
b[2] = reader_.read();
|
||||||
|
b[3] = reader_.read();
|
||||||
|
|
||||||
|
*out = (hexVal(b[0]) << 12)
|
||||||
|
+ (hexVal(b[1]) << 8) + (hexVal(b[2]) << 4) + hexVal(b[3]);
|
||||||
|
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -724,6 +738,7 @@ uint32_t TJSONProtocol::readJSONEscapeChar(uint8_t* out) {
|
|||||||
uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) {
|
uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) {
|
||||||
uint32_t result = (skipContext ? 0 : context_->read(reader_));
|
uint32_t result = (skipContext ? 0 : context_->read(reader_));
|
||||||
result += readJSONSyntaxChar(kJSONStringDelimiter);
|
result += readJSONSyntaxChar(kJSONStringDelimiter);
|
||||||
|
std::vector<uint16_t> codeunits;
|
||||||
uint8_t ch;
|
uint8_t ch;
|
||||||
str.clear();
|
str.clear();
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -736,7 +751,22 @@ uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) {
|
|||||||
ch = reader_.read();
|
ch = reader_.read();
|
||||||
++result;
|
++result;
|
||||||
if (ch == kJSONEscapeChar) {
|
if (ch == kJSONEscapeChar) {
|
||||||
result += readJSONEscapeChar(&ch);
|
uint16_t cp;
|
||||||
|
result += readJSONEscapeChar(&cp);
|
||||||
|
if (isHighSurrogate(cp)) {
|
||||||
|
codeunits.push_back(cp);
|
||||||
|
} else {
|
||||||
|
if (isLowSurrogate(cp)
|
||||||
|
&& codeunits.empty()) {
|
||||||
|
throw TProtocolException(TProtocolException::INVALID_DATA,
|
||||||
|
"Missing UTF-16 high surrogate pair.");
|
||||||
|
}
|
||||||
|
codeunits.push_back(cp);
|
||||||
|
codeunits.push_back(0);
|
||||||
|
str += boost::locale::conv::utf_to_utf<char>(codeunits.data());
|
||||||
|
codeunits.clear();
|
||||||
|
}
|
||||||
|
continue;
|
||||||
} else {
|
} else {
|
||||||
size_t pos = kEscapeChars.find(ch);
|
size_t pos = kEscapeChars.find(ch);
|
||||||
if (pos == std::string::npos) {
|
if (pos == std::string::npos) {
|
||||||
@ -747,8 +777,17 @@ uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) {
|
|||||||
ch = kEscapeCharVals[pos];
|
ch = kEscapeCharVals[pos];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!codeunits.empty()) {
|
||||||
|
throw TProtocolException(TProtocolException::INVALID_DATA,
|
||||||
|
"Missing UTF-16 low surrogate pair.");
|
||||||
|
}
|
||||||
str += ch;
|
str += ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!codeunits.empty()) {
|
||||||
|
throw TProtocolException(TProtocolException::INVALID_DATA,
|
||||||
|
"Missing UTF-16 low surrogate pair.");
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -127,7 +127,7 @@ private:
|
|||||||
|
|
||||||
uint32_t readJSONSyntaxChar(uint8_t ch);
|
uint32_t readJSONSyntaxChar(uint8_t ch);
|
||||||
|
|
||||||
uint32_t readJSONEscapeChar(uint8_t* out);
|
uint32_t readJSONEscapeChar(uint16_t* out);
|
||||||
|
|
||||||
uint32_t readJSONString(std::string& str, bool skipContext = false);
|
uint32_t readJSONString(std::string& str, bool skipContext = false);
|
||||||
|
|
||||||
|
@ -19,6 +19,8 @@
|
|||||||
|
|
||||||
#define _USE_MATH_DEFINES
|
#define _USE_MATH_DEFINES
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <sstream>
|
||||||
#include <thrift/transport/TBufferTransports.h>
|
#include <thrift/transport/TBufferTransports.h>
|
||||||
#include <thrift/protocol/TJSONProtocol.h>
|
#include <thrift/protocol/TJSONProtocol.h>
|
||||||
#include "gen-cpp/DebugProtoTest_types.h"
|
#include "gen-cpp/DebugProtoTest_types.h"
|
||||||
@ -269,3 +271,70 @@ BOOST_AUTO_TEST_CASE(test_json_proto_8) {
|
|||||||
BOOST_CHECK_THROW(ooe2.read(proto.get()),
|
BOOST_CHECK_THROW(ooe2.read(proto.get()),
|
||||||
apache::thrift::protocol::TProtocolException);
|
apache::thrift::protocol::TProtocolException);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Renders every byte of str as a "\xNN" escape (two lowercase hex digits per
// byte) so that differing byte sequences can be compared in test diagnostics.
// Returns an empty string for empty input.
static std::string toHexSequence(const std::string& str) {
  std::stringstream ss;
  ss << std::hex << std::setfill('0');
  for (std::size_t i = 0; i < str.size(); i++) {
    // The field width resets after each formatted insertion, so setw(2) must
    // be applied per byte; without it the fill character is never used and
    // bytes below 0x10 would be printed as a single digit.
    ss << "\\x" << std::setw(2) << int(uint8_t(str[i]));
  }
  return ss.str();
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE(test_json_unicode_escaped) {
|
||||||
|
const char json_string[] =
|
||||||
|
"{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000},"
|
||||||
|
"\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926"
|
||||||
|
"53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\u0e01 \\ud835\\udd3e\"},"
|
||||||
|
"\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\""
|
||||||
|
":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64"
|
||||||
|
"\",3,1,2,3]}}";
|
||||||
|
const char* expected_zomg_unicode = "\xe0\xb8\x81 \xf0\x9d\x94\xbe";
|
||||||
|
|
||||||
|
boost::shared_ptr<TMemoryBuffer> buffer(new TMemoryBuffer(
|
||||||
|
(uint8_t*)(json_string), sizeof(json_string)));
|
||||||
|
boost::shared_ptr<TJSONProtocol> proto(new TJSONProtocol(buffer));
|
||||||
|
|
||||||
|
OneOfEach ooe2;
|
||||||
|
ooe2.read(proto.get());
|
||||||
|
BOOST_CHECK_MESSAGE(!ooe2.zomg_unicode.compare(expected_zomg_unicode),
|
||||||
|
"Expected:\n" << toHexSequence(expected_zomg_unicode) << "\nGotten:\n"
|
||||||
|
<< toHexSequence(ooe2.zomg_unicode));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE(test_json_unicode_escaped_missing_low_surrogate) {
|
||||||
|
const char json_string[] =
|
||||||
|
"{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000},"
|
||||||
|
"\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926"
|
||||||
|
"53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\ud835\"},"
|
||||||
|
"\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\""
|
||||||
|
":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64"
|
||||||
|
"\",3,1,2,3]}}";
|
||||||
|
|
||||||
|
boost::shared_ptr<TMemoryBuffer> buffer(new TMemoryBuffer(
|
||||||
|
(uint8_t*)(json_string), sizeof(json_string)));
|
||||||
|
boost::shared_ptr<TJSONProtocol> proto(new TJSONProtocol(buffer));
|
||||||
|
|
||||||
|
OneOfEach ooe2;
|
||||||
|
BOOST_CHECK_THROW(ooe2.read(proto.get()),
|
||||||
|
apache::thrift::protocol::TProtocolException);
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE(test_json_unicode_escaped_missing_hi_surrogate) {
|
||||||
|
const char json_string[] =
|
||||||
|
"{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000},"
|
||||||
|
"\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926"
|
||||||
|
"53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\udd3e\"},"
|
||||||
|
"\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\""
|
||||||
|
":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64"
|
||||||
|
"\",3,1,2,3]}}";
|
||||||
|
|
||||||
|
boost::shared_ptr<TMemoryBuffer> buffer(new TMemoryBuffer(
|
||||||
|
(uint8_t*)(json_string), sizeof(json_string)));
|
||||||
|
boost::shared_ptr<TJSONProtocol> proto(new TJSONProtocol(buffer));
|
||||||
|
|
||||||
|
OneOfEach ooe2;
|
||||||
|
BOOST_CHECK_THROW(ooe2.read(proto.get()),
|
||||||
|
apache::thrift::protocol::TProtocolException);
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user