From a84e139215c93a2673a46fa862eb1c05d157e3d0 Mon Sep 17 00:00:00 2001 From: Konrad Grochowski Date: Fri, 16 Oct 2015 11:22:10 +0200 Subject: [PATCH] THRIFT-2411 - C++: Fixed support for UTF-16 encoding in JSON protocol Client: C++ Patch: Phongphan Phuttha Support Unicode-encoded characters, including UTF-16 surrogate pairs, which allow extended characters that are not in the Basic Multilingual Plane. This closes #648 --- lib/cpp/src/thrift/protocol/TJSONProtocol.cpp | 53 ++++++++++++-- lib/cpp/src/thrift/protocol/TJSONProtocol.h | 2 +- lib/cpp/test/JSONProtoTest.cpp | 69 +++++++++++++++++++ 3 files changed, 116 insertions(+), 8 deletions(-) diff --git a/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp b/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp index 6865fdc4f..f1370bb85 100644 --- a/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp +++ b/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -285,6 +286,16 @@ static bool isJSONNumeric(uint8_t ch) { return false; } +// Return true if the code unit is high surrogate +static bool isHighSurrogate(uint16_t val) { + return val >= 0xD800 && val <= 0xDBFF; +} + +// Return true if the code unit is low surrogate +static bool isLowSurrogate(uint16_t val) { + return val >= 0xDC00 && val <= 0xDFFF; +} + /** * Class to serve as base JSON context and as base class for other context * implementations @@ -709,14 +720,17 @@ uint32_t TJSONProtocol::readJSONSyntaxChar(uint8_t ch) { } // Decodes the four hex parts of a JSON escaped string character and returns -// the character via out. The first two characters must be "00". -uint32_t TJSONProtocol::readJSONEscapeChar(uint8_t* out) { - uint8_t b[2]; - readJSONSyntaxChar(kJSONZeroChar); - readJSONSyntaxChar(kJSONZeroChar); +// the UTF-16 code unit via out. 
+uint32_t TJSONProtocol::readJSONEscapeChar(uint16_t* out) { + uint8_t b[4]; b[0] = reader_.read(); b[1] = reader_.read(); - *out = (hexVal(b[0]) << 4) + hexVal(b[1]); + b[2] = reader_.read(); + b[3] = reader_.read(); + + *out = (hexVal(b[0]) << 12) + + (hexVal(b[1]) << 8) + (hexVal(b[2]) << 4) + hexVal(b[3]); + return 4; } @@ -724,6 +738,7 @@ uint32_t TJSONProtocol::readJSONEscapeChar(uint8_t* out) { uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) { uint32_t result = (skipContext ? 0 : context_->read(reader_)); result += readJSONSyntaxChar(kJSONStringDelimiter); + std::vector codeunits; uint8_t ch; str.clear(); while (true) { @@ -736,7 +751,22 @@ uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) { ch = reader_.read(); ++result; if (ch == kJSONEscapeChar) { - result += readJSONEscapeChar(&ch); + uint16_t cp; + result += readJSONEscapeChar(&cp); + if (isHighSurrogate(cp)) { + codeunits.push_back(cp); + } else { + if (isLowSurrogate(cp) + && codeunits.empty()) { + throw TProtocolException(TProtocolException::INVALID_DATA, + "Missing UTF-16 high surrogate pair."); + } + codeunits.push_back(cp); + codeunits.push_back(0); + str += boost::locale::conv::utf_to_utf(codeunits.data()); + codeunits.clear(); + } + continue; } else { size_t pos = kEscapeChars.find(ch); if (pos == std::string::npos) { @@ -747,8 +777,17 @@ uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) { ch = kEscapeCharVals[pos]; } } + if (!codeunits.empty()) { + throw TProtocolException(TProtocolException::INVALID_DATA, + "Missing UTF-16 low surrogate pair."); + } str += ch; } + + if (!codeunits.empty()) { + throw TProtocolException(TProtocolException::INVALID_DATA, + "Missing UTF-16 low surrogate pair."); + } return result; } diff --git a/lib/cpp/src/thrift/protocol/TJSONProtocol.h b/lib/cpp/src/thrift/protocol/TJSONProtocol.h index 80d68a4b9..5834eff00 100644 --- a/lib/cpp/src/thrift/protocol/TJSONProtocol.h +++ 
b/lib/cpp/src/thrift/protocol/TJSONProtocol.h @@ -127,7 +127,7 @@ private: uint32_t readJSONSyntaxChar(uint8_t ch); - uint32_t readJSONEscapeChar(uint8_t* out); + uint32_t readJSONEscapeChar(uint16_t* out); uint32_t readJSONString(std::string& str, bool skipContext = false); diff --git a/lib/cpp/test/JSONProtoTest.cpp b/lib/cpp/test/JSONProtoTest.cpp index f03b2ca5d..e9fe8b58d 100644 --- a/lib/cpp/test/JSONProtoTest.cpp +++ b/lib/cpp/test/JSONProtoTest.cpp @@ -19,6 +19,8 @@ #define _USE_MATH_DEFINES #include +#include +#include #include #include #include "gen-cpp/DebugProtoTest_types.h" @@ -269,3 +271,70 @@ BOOST_AUTO_TEST_CASE(test_json_proto_8) { BOOST_CHECK_THROW(ooe2.read(proto.get()), apache::thrift::protocol::TProtocolException); } + +static std::string toHexSequence(const std::string& str) { + std::stringstream ss; + ss << std::hex << std::setfill('0'); + for (std::size_t i = 0; i < str.size(); i++) { + ss << "\\x" << int(uint8_t(str[i])); + } + return ss.str(); +} + +BOOST_AUTO_TEST_CASE(test_json_unicode_escaped) { + const char json_string[] = + "{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000}," + "\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926" + "53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\u0e01 \\ud835\\udd3e\"}," + "\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\"" + ":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64" + "\",3,1,2,3]}}"; + const char* expected_zomg_unicode = "\xe0\xb8\x81 \xf0\x9d\x94\xbe"; + + boost::shared_ptr buffer(new TMemoryBuffer( + (uint8_t*)(json_string), sizeof(json_string))); + boost::shared_ptr proto(new TJSONProtocol(buffer)); + + OneOfEach ooe2; + ooe2.read(proto.get()); + BOOST_CHECK_MESSAGE(!ooe2.zomg_unicode.compare(expected_zomg_unicode), + "Expected:\n" << toHexSequence(expected_zomg_unicode) << "\nGotten:\n" + << toHexSequence(ooe2.zomg_unicode)); + +} + 
+BOOST_AUTO_TEST_CASE(test_json_unicode_escaped_missing_low_surrogate) { + const char json_string[] = + "{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000}," + "\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926" + "53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\ud835\"}," + "\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\"" + ":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64" + "\",3,1,2,3]}}"; + + boost::shared_ptr buffer(new TMemoryBuffer( + (uint8_t*)(json_string), sizeof(json_string))); + boost::shared_ptr proto(new TJSONProtocol(buffer)); + + OneOfEach ooe2; + BOOST_CHECK_THROW(ooe2.read(proto.get()), + apache::thrift::protocol::TProtocolException); +} + +BOOST_AUTO_TEST_CASE(test_json_unicode_escaped_missing_hi_surrogate) { + const char json_string[] = + "{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000}," + "\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926" + "53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\udd3e\"}," + "\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\"" + ":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64" + "\",3,1,2,3]}}"; + + boost::shared_ptr buffer(new TMemoryBuffer( + (uint8_t*)(json_string), sizeof(json_string))); + boost::shared_ptr proto(new TJSONProtocol(buffer)); + + OneOfEach ooe2; + BOOST_CHECK_THROW(ooe2.read(proto.get()), + apache::thrift::protocol::TProtocolException); +}