encodings.h 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816
  1. // Tencent is pleased to support the open source community by making RapidJSON
  2. // available.
  3. //
  4. // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All
  5. // rights reserved.
  6. //
  7. // Licensed under the MIT License (the "License"); you may not use this file
  8. // except in compliance with the License. You may obtain a copy of the License
  9. // at
  10. //
  11. // http://opensource.org/licenses/MIT
  12. //
  13. // Unless required by applicable law or agreed to in writing, software
  14. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16. // License for the specific language governing permissions and limitations under
  17. // the License.
  18. #ifndef RAPIDJSON_ENCODINGS_H_
  19. #define RAPIDJSON_ENCODINGS_H_
  20. #include "rapidjson.h"
  21. #if defined(_MSC_VER) && !defined(__clang__)
  22. RAPIDJSON_DIAG_PUSH
  23. RAPIDJSON_DIAG_OFF(
  24. 4244) // conversion from 'type1' to 'type2', possible loss of data
  25. RAPIDJSON_DIAG_OFF(4702) // unreachable code
  26. #elif defined(__GNUC__)
  27. RAPIDJSON_DIAG_PUSH
  28. RAPIDJSON_DIAG_OFF(effc++)
  29. RAPIDJSON_DIAG_OFF(overflow)
  30. #endif
  31. RAPIDJSON_NAMESPACE_BEGIN
  32. ///////////////////////////////////////////////////////////////////////////////
  33. // Encoding
  34. /*! \class rapidjson::Encoding
  35. \brief Concept for encoding of Unicode characters.
  36. \code
  37. concept Encoding {
  typename Ch; //! Type of character. A "character" is actually a code unit in Unicode's definition.
  40. enum { supportUnicode = 1 }; // or 0 if not supporting unicode
  41. //! \brief Encode a Unicode codepoint to an output stream.
  42. //! \param os Output stream.
  //! \param codepoint A Unicode codepoint, ranging from 0x0 to 0x10FFFF inclusively.
  template <typename OutputStream>
  static void Encode(OutputStream& os, unsigned codepoint);
  46. //! \brief Decode a Unicode codepoint from an input stream.
  47. //! \param is Input stream.
  48. //! \param codepoint Output of the unicode codepoint.
  49. //! \return true if a valid codepoint can be decoded from the stream.
  50. template <typename InputStream>
  51. static bool Decode(InputStream& is, unsigned* codepoint);
  52. //! \brief Validate one Unicode codepoint from an encoded stream.
  53. //! \param is Input stream to obtain codepoint.
  54. //! \param os Output for copying one codepoint.
  55. //! \return true if it is valid.
  //! \note This function only validates and copies the codepoint, without actually decoding it.
  template <typename InputStream, typename OutputStream>
  58. static bool Validate(InputStream& is, OutputStream& os);
  // The following functions deal with byte streams.
  60. //! Take a character from input byte stream, skip BOM if exist.
  61. template <typename InputByteStream>
  static Ch TakeBOM(InputByteStream& is);
  63. //! Take a character from input byte stream.
  64. template <typename InputByteStream>
  65. static Ch Take(InputByteStream& is);
  66. //! Put BOM to output byte stream.
  67. template <typename OutputByteStream>
  68. static void PutBOM(OutputByteStream& os);
  69. //! Put a character to output byte stream.
  70. template <typename OutputByteStream>
  71. static void Put(OutputByteStream& os, Ch c);
  72. };
  73. \endcode
  74. */
  75. ///////////////////////////////////////////////////////////////////////////////
  76. // UTF8
  77. //! UTF-8 encoding.
  78. /*! http://en.wikipedia.org/wiki/UTF-8
  79. http://tools.ietf.org/html/rfc3629
  80. \tparam CharType Code unit for storing 8-bit UTF-8 data. Default is char.
  81. \note implements Encoding concept
  82. */
  83. template <typename CharType = char>
  84. struct UTF8 {
  85. typedef CharType Ch;
  86. enum { supportUnicode = 1 };
  87. template <typename OutputStream>
  88. static void Encode(OutputStream &os, unsigned codepoint) {
  89. if (codepoint <= 0x7F)
  90. os.Put(static_cast<Ch>(codepoint & 0xFF));
  91. else if (codepoint <= 0x7FF) {
  92. os.Put(static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
  93. os.Put(static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
  94. } else if (codepoint <= 0xFFFF) {
  95. os.Put(static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
  96. os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  97. os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  98. } else {
  99. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  100. os.Put(static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
  101. os.Put(static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
  102. os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  103. os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  104. }
  105. }
  106. template <typename OutputStream>
  107. static void EncodeUnsafe(OutputStream &os, unsigned codepoint) {
  108. if (codepoint <= 0x7F)
  109. PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
  110. else if (codepoint <= 0x7FF) {
  111. PutUnsafe(os, static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
  112. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
  113. } else if (codepoint <= 0xFFFF) {
  114. PutUnsafe(os, static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
  115. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  116. PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  117. } else {
  118. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  119. PutUnsafe(os, static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
  120. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
  121. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  122. PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  123. }
  124. }
  125. template <typename InputStream>
  126. static bool Decode(InputStream &is, unsigned *codepoint) {
  127. #define RAPIDJSON_COPY() \
  128. c = is.Take(); \
  129. *codepoint = (*codepoint << 6) | (static_cast<unsigned char>(c) & 0x3Fu)
  130. #define RAPIDJSON_TRANS(mask) \
  131. result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
  132. #define RAPIDJSON_TAIL() \
  133. RAPIDJSON_COPY(); \
  134. RAPIDJSON_TRANS(0x70)
  135. typename InputStream::Ch c = is.Take();
  136. if (!(c & 0x80)) {
  137. *codepoint = static_cast<unsigned char>(c);
  138. return true;
  139. }
  140. unsigned char type = GetRange(static_cast<unsigned char>(c));
  141. if (type >= 32) {
  142. *codepoint = 0;
  143. } else {
  144. *codepoint = (0xFFu >> type) & static_cast<unsigned char>(c);
  145. }
  146. bool result = true;
  147. switch (type) {
  148. case 2:
  149. RAPIDJSON_TAIL();
  150. return result;
  151. case 3:
  152. RAPIDJSON_TAIL();
  153. RAPIDJSON_TAIL();
  154. return result;
  155. case 4:
  156. RAPIDJSON_COPY();
  157. RAPIDJSON_TRANS(0x50);
  158. RAPIDJSON_TAIL();
  159. return result;
  160. case 5:
  161. RAPIDJSON_COPY();
  162. RAPIDJSON_TRANS(0x10);
  163. RAPIDJSON_TAIL();
  164. RAPIDJSON_TAIL();
  165. return result;
  166. case 6:
  167. RAPIDJSON_TAIL();
  168. RAPIDJSON_TAIL();
  169. RAPIDJSON_TAIL();
  170. return result;
  171. case 10:
  172. RAPIDJSON_COPY();
  173. RAPIDJSON_TRANS(0x20);
  174. RAPIDJSON_TAIL();
  175. return result;
  176. case 11:
  177. RAPIDJSON_COPY();
  178. RAPIDJSON_TRANS(0x60);
  179. RAPIDJSON_TAIL();
  180. RAPIDJSON_TAIL();
  181. return result;
  182. default:
  183. return false;
  184. }
  185. #undef RAPIDJSON_COPY
  186. #undef RAPIDJSON_TRANS
  187. #undef RAPIDJSON_TAIL
  188. }
  189. template <typename InputStream, typename OutputStream>
  190. static bool Validate(InputStream &is, OutputStream &os) {
  191. #define RAPIDJSON_COPY() os.Put(c = is.Take())
  192. #define RAPIDJSON_TRANS(mask) \
  193. result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
  194. #define RAPIDJSON_TAIL() \
  195. RAPIDJSON_COPY(); \
  196. RAPIDJSON_TRANS(0x70)
  197. Ch c;
  198. RAPIDJSON_COPY();
  199. if (!(c & 0x80)) return true;
  200. bool result = true;
  201. switch (GetRange(static_cast<unsigned char>(c))) {
  202. case 2:
  203. RAPIDJSON_TAIL();
  204. return result;
  205. case 3:
  206. RAPIDJSON_TAIL();
  207. RAPIDJSON_TAIL();
  208. return result;
  209. case 4:
  210. RAPIDJSON_COPY();
  211. RAPIDJSON_TRANS(0x50);
  212. RAPIDJSON_TAIL();
  213. return result;
  214. case 5:
  215. RAPIDJSON_COPY();
  216. RAPIDJSON_TRANS(0x10);
  217. RAPIDJSON_TAIL();
  218. RAPIDJSON_TAIL();
  219. return result;
  220. case 6:
  221. RAPIDJSON_TAIL();
  222. RAPIDJSON_TAIL();
  223. RAPIDJSON_TAIL();
  224. return result;
  225. case 10:
  226. RAPIDJSON_COPY();
  227. RAPIDJSON_TRANS(0x20);
  228. RAPIDJSON_TAIL();
  229. return result;
  230. case 11:
  231. RAPIDJSON_COPY();
  232. RAPIDJSON_TRANS(0x60);
  233. RAPIDJSON_TAIL();
  234. RAPIDJSON_TAIL();
  235. return result;
  236. default:
  237. return false;
  238. }
  239. #undef RAPIDJSON_COPY
  240. #undef RAPIDJSON_TRANS
  241. #undef RAPIDJSON_TAIL
  242. }
  243. static unsigned char GetRange(unsigned char c) {
  244. // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  245. // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation
  246. // can test multiple types.
  247. static const unsigned char type[] = {
  248. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  249. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  250. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  251. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  252. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  253. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  254. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  255. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  256. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  257. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  258. 0, 0, 0, 0, 0, 0, 0, 0, 0x10, 0x10, 0x10, 0x10,
  259. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  260. 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
  261. 0x40, 0x40, 0x40, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
  262. 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
  263. 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
  264. 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  265. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  266. 2, 2, 2, 2, 2, 2, 2, 2, 10, 3, 3, 3,
  267. 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
  268. 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8,
  269. 8, 8, 8, 8,
  270. };
  271. return type[c];
  272. }
  273. template <typename InputByteStream>
  274. static CharType TakeBOM(InputByteStream &is) {
  275. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  276. typename InputByteStream::Ch c = Take(is);
  277. if (static_cast<unsigned char>(c) != 0xEFu) return c;
  278. c = is.Take();
  279. if (static_cast<unsigned char>(c) != 0xBBu) return c;
  280. c = is.Take();
  281. if (static_cast<unsigned char>(c) != 0xBFu) return c;
  282. c = is.Take();
  283. return c;
  284. }
  285. template <typename InputByteStream>
  286. static Ch Take(InputByteStream &is) {
  287. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  288. return static_cast<Ch>(is.Take());
  289. }
  290. template <typename OutputByteStream>
  291. static void PutBOM(OutputByteStream &os) {
  292. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  293. os.Put(static_cast<typename OutputByteStream::Ch>(0xEFu));
  294. os.Put(static_cast<typename OutputByteStream::Ch>(0xBBu));
  295. os.Put(static_cast<typename OutputByteStream::Ch>(0xBFu));
  296. }
  297. template <typename OutputByteStream>
  298. static void Put(OutputByteStream &os, Ch c) {
  299. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  300. os.Put(static_cast<typename OutputByteStream::Ch>(c));
  301. }
  302. };
  303. ///////////////////////////////////////////////////////////////////////////////
  304. // UTF16
  305. //! UTF-16 encoding.
  306. /*! http://en.wikipedia.org/wiki/UTF-16
  307. http://tools.ietf.org/html/rfc2781
  308. \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t.
  309. C++11 may use char16_t instead. \note implements Encoding concept
  310. \note For in-memory access, no need to concern endianness. The code units
  311. and code points are represented by CPU's endianness. For streaming, use
  312. UTF16LE and UTF16BE, which handle endianness.
  313. */
  314. template <typename CharType = wchar_t>
  315. struct UTF16 {
  316. typedef CharType Ch;
  317. RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 2);
  318. enum { supportUnicode = 1 };
  319. template <typename OutputStream>
  320. static void Encode(OutputStream &os, unsigned codepoint) {
  321. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
  322. if (codepoint <= 0xFFFF) {
  323. RAPIDJSON_ASSERT(
  324. codepoint < 0xD800 ||
  325. codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
  326. os.Put(static_cast<typename OutputStream::Ch>(codepoint));
  327. } else {
  328. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  329. unsigned v = codepoint - 0x10000;
  330. os.Put(static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
  331. os.Put(static_cast<typename OutputStream::Ch>((v & 0x3FF) | 0xDC00));
  332. }
  333. }
  334. template <typename OutputStream>
  335. static void EncodeUnsafe(OutputStream &os, unsigned codepoint) {
  336. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
  337. if (codepoint <= 0xFFFF) {
  338. RAPIDJSON_ASSERT(
  339. codepoint < 0xD800 ||
  340. codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
  341. PutUnsafe(os, static_cast<typename OutputStream::Ch>(codepoint));
  342. } else {
  343. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  344. unsigned v = codepoint - 0x10000;
  345. PutUnsafe(os, static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
  346. PutUnsafe(os,
  347. static_cast<typename OutputStream::Ch>((v & 0x3FF) | 0xDC00));
  348. }
  349. }
  350. template <typename InputStream>
  351. static bool Decode(InputStream &is, unsigned *codepoint) {
  352. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
  353. typename InputStream::Ch c = is.Take();
  354. if (c < 0xD800 || c > 0xDFFF) {
  355. *codepoint = static_cast<unsigned>(c);
  356. return true;
  357. } else if (c <= 0xDBFF) {
  358. *codepoint = (static_cast<unsigned>(c) & 0x3FF) << 10;
  359. c = is.Take();
  360. *codepoint |= (static_cast<unsigned>(c) & 0x3FF);
  361. *codepoint += 0x10000;
  362. return c >= 0xDC00 && c <= 0xDFFF;
  363. }
  364. return false;
  365. }
  366. template <typename InputStream, typename OutputStream>
  367. static bool Validate(InputStream &is, OutputStream &os) {
  368. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
  369. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
  370. typename InputStream::Ch c;
  371. os.Put(static_cast<typename OutputStream::Ch>(c = is.Take()));
  372. if (c < 0xD800 || c > 0xDFFF)
  373. return true;
  374. else if (c <= 0xDBFF) {
  375. os.Put(c = is.Take());
  376. return c >= 0xDC00 && c <= 0xDFFF;
  377. }
  378. return false;
  379. }
  380. };
  381. //! UTF-16 little endian encoding.
  382. template <typename CharType = wchar_t>
  383. struct UTF16LE : UTF16<CharType> {
  384. template <typename InputByteStream>
  385. static CharType TakeBOM(InputByteStream &is) {
  386. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  387. CharType c = Take(is);
  388. return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
  389. }
  390. template <typename InputByteStream>
  391. static CharType Take(InputByteStream &is) {
  392. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  393. unsigned c = static_cast<uint8_t>(is.Take());
  394. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  395. return static_cast<CharType>(c);
  396. }
  397. template <typename OutputByteStream>
  398. static void PutBOM(OutputByteStream &os) {
  399. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  400. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  401. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  402. }
  403. template <typename OutputByteStream>
  404. static void Put(OutputByteStream &os, CharType c) {
  405. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  406. os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) &
  407. 0xFFu));
  408. os.Put(static_cast<typename OutputByteStream::Ch>(
  409. (static_cast<unsigned>(c) >> 8) & 0xFFu));
  410. }
  411. };
  412. //! UTF-16 big endian encoding.
  413. template <typename CharType = wchar_t>
  414. struct UTF16BE : UTF16<CharType> {
  415. template <typename InputByteStream>
  416. static CharType TakeBOM(InputByteStream &is) {
  417. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  418. CharType c = Take(is);
  419. return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
  420. }
  421. template <typename InputByteStream>
  422. static CharType Take(InputByteStream &is) {
  423. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  424. unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  425. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take()));
  426. return static_cast<CharType>(c);
  427. }
  428. template <typename OutputByteStream>
  429. static void PutBOM(OutputByteStream &os) {
  430. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  431. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  432. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  433. }
  434. template <typename OutputByteStream>
  435. static void Put(OutputByteStream &os, CharType c) {
  436. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  437. os.Put(static_cast<typename OutputByteStream::Ch>(
  438. (static_cast<unsigned>(c) >> 8) & 0xFFu));
  439. os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) &
  440. 0xFFu));
  441. }
  442. };
  443. ///////////////////////////////////////////////////////////////////////////////
  444. // UTF32
  445. //! UTF-32 encoding.
  446. /*! http://en.wikipedia.org/wiki/UTF-32
  447. \tparam CharType Type for storing 32-bit UTF-32 data. Default is unsigned.
  448. C++11 may use char32_t instead. \note implements Encoding concept
  449. \note For in-memory access, no need to concern endianness. The code units
  450. and code points are represented by CPU's endianness. For streaming, use
  451. UTF32LE and UTF32BE, which handle endianness.
  452. */
  453. template <typename CharType = unsigned>
  454. struct UTF32 {
  455. typedef CharType Ch;
  456. RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 4);
  457. enum { supportUnicode = 1 };
  458. template <typename OutputStream>
  459. static void Encode(OutputStream &os, unsigned codepoint) {
  460. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
  461. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  462. os.Put(codepoint);
  463. }
  464. template <typename OutputStream>
  465. static void EncodeUnsafe(OutputStream &os, unsigned codepoint) {
  466. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
  467. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  468. PutUnsafe(os, codepoint);
  469. }
  470. template <typename InputStream>
  471. static bool Decode(InputStream &is, unsigned *codepoint) {
  472. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
  473. Ch c = is.Take();
  474. *codepoint = c;
  475. return c <= 0x10FFFF;
  476. }
  477. template <typename InputStream, typename OutputStream>
  478. static bool Validate(InputStream &is, OutputStream &os) {
  479. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
  480. Ch c;
  481. os.Put(c = is.Take());
  482. return c <= 0x10FFFF;
  483. }
  484. };
  485. //! UTF-32 little endian enocoding.
  486. template <typename CharType = unsigned>
  487. struct UTF32LE : UTF32<CharType> {
  488. template <typename InputByteStream>
  489. static CharType TakeBOM(InputByteStream &is) {
  490. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  491. CharType c = Take(is);
  492. return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
  493. }
  494. template <typename InputByteStream>
  495. static CharType Take(InputByteStream &is) {
  496. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  497. unsigned c = static_cast<uint8_t>(is.Take());
  498. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  499. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
  500. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
  501. return static_cast<CharType>(c);
  502. }
  503. template <typename OutputByteStream>
  504. static void PutBOM(OutputByteStream &os) {
  505. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  506. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  507. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  508. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  509. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  510. }
  511. template <typename OutputByteStream>
  512. static void Put(OutputByteStream &os, CharType c) {
  513. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  514. os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
  515. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
  516. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
  517. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
  518. }
  519. };
  520. //! UTF-32 big endian encoding.
  521. template <typename CharType = unsigned>
  522. struct UTF32BE : UTF32<CharType> {
  523. template <typename InputByteStream>
  524. static CharType TakeBOM(InputByteStream &is) {
  525. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  526. CharType c = Take(is);
  527. return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
  528. }
  529. template <typename InputByteStream>
  530. static CharType Take(InputByteStream &is) {
  531. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  532. unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
  533. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
  534. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  535. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take()));
  536. return static_cast<CharType>(c);
  537. }
  538. template <typename OutputByteStream>
  539. static void PutBOM(OutputByteStream &os) {
  540. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  541. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  542. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  543. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  544. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  545. }
  546. template <typename OutputByteStream>
  547. static void Put(OutputByteStream &os, CharType c) {
  548. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  549. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
  550. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
  551. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
  552. os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
  553. }
  554. };
  555. ///////////////////////////////////////////////////////////////////////////////
  556. // ASCII
  557. //! ASCII encoding.
  558. /*! http://en.wikipedia.org/wiki/ASCII
  559. \tparam CharType Code unit for storing 7-bit ASCII data. Default is char.
  560. \note implements Encoding concept
  561. */
  562. template <typename CharType = char>
  563. struct ASCII {
  564. typedef CharType Ch;
  565. enum { supportUnicode = 0 };
  566. template <typename OutputStream>
  567. static void Encode(OutputStream &os, unsigned codepoint) {
  568. RAPIDJSON_ASSERT(codepoint <= 0x7F);
  569. os.Put(static_cast<Ch>(codepoint & 0xFF));
  570. }
  571. template <typename OutputStream>
  572. static void EncodeUnsafe(OutputStream &os, unsigned codepoint) {
  573. RAPIDJSON_ASSERT(codepoint <= 0x7F);
  574. PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
  575. }
  576. template <typename InputStream>
  577. static bool Decode(InputStream &is, unsigned *codepoint) {
  578. uint8_t c = static_cast<uint8_t>(is.Take());
  579. *codepoint = c;
  580. return c <= 0X7F;
  581. }
  582. template <typename InputStream, typename OutputStream>
  583. static bool Validate(InputStream &is, OutputStream &os) {
  584. uint8_t c = static_cast<uint8_t>(is.Take());
  585. os.Put(static_cast<typename OutputStream::Ch>(c));
  586. return c <= 0x7F;
  587. }
  588. template <typename InputByteStream>
  589. static CharType TakeBOM(InputByteStream &is) {
  590. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  591. uint8_t c = static_cast<uint8_t>(Take(is));
  592. return static_cast<Ch>(c);
  593. }
  594. template <typename InputByteStream>
  595. static Ch Take(InputByteStream &is) {
  596. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  597. return static_cast<Ch>(is.Take());
  598. }
  599. template <typename OutputByteStream>
  600. static void PutBOM(OutputByteStream &os) {
  601. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  602. (void)os;
  603. }
  604. template <typename OutputByteStream>
  605. static void Put(OutputByteStream &os, Ch c) {
  606. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  607. os.Put(static_cast<typename OutputByteStream::Ch>(c));
  608. }
  609. };
  610. ///////////////////////////////////////////////////////////////////////////////
  611. // AutoUTF
  //! UTF encoding type of a stream, specified at run time.
  enum UTFType {
    kUTF8 = 0,     //!< UTF-8.
    kUTF16LE = 1,  //!< UTF-16, little endian byte order.
    kUTF16BE = 2,  //!< UTF-16, big endian byte order.
    kUTF32LE = 3,  //!< UTF-32, little endian byte order.
    kUTF32BE = 4   //!< UTF-32, big endian byte order.
  };
  620. //! Dynamically select encoding according to stream's runtime-specified UTF
  621. //! encoding type.
  622. /*! \note This class can be used with AutoUTFInputtStream and
  623. * AutoUTFOutputStream, which provides GetType().
  624. */
  625. template <typename CharType>
  626. struct AutoUTF {
  627. typedef CharType Ch;
  628. enum { supportUnicode = 1 };
  629. #define RAPIDJSON_ENCODINGS_FUNC(x) \
  630. UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
  631. template <typename OutputStream>
  632. static RAPIDJSON_FORCEINLINE void Encode(OutputStream &os,
  633. unsigned codepoint) {
  634. typedef void (*EncodeFunc)(OutputStream &, unsigned);
  635. static const EncodeFunc f[] = {RAPIDJSON_ENCODINGS_FUNC(Encode)};
  636. (*f[os.GetType()])(os, codepoint);
  637. }
  638. template <typename OutputStream>
  639. static RAPIDJSON_FORCEINLINE void EncodeUnsafe(OutputStream &os,
  640. unsigned codepoint) {
  641. typedef void (*EncodeFunc)(OutputStream &, unsigned);
  642. static const EncodeFunc f[] = {RAPIDJSON_ENCODINGS_FUNC(EncodeUnsafe)};
  643. (*f[os.GetType()])(os, codepoint);
  644. }
  645. template <typename InputStream>
  646. static RAPIDJSON_FORCEINLINE bool Decode(InputStream &is,
  647. unsigned *codepoint) {
  648. typedef bool (*DecodeFunc)(InputStream &, unsigned *);
  649. static const DecodeFunc f[] = {RAPIDJSON_ENCODINGS_FUNC(Decode)};
  650. return (*f[is.GetType()])(is, codepoint);
  651. }
  652. template <typename InputStream, typename OutputStream>
  653. static RAPIDJSON_FORCEINLINE bool Validate(InputStream &is,
  654. OutputStream &os) {
  655. typedef bool (*ValidateFunc)(InputStream &, OutputStream &);
  656. static const ValidateFunc f[] = {RAPIDJSON_ENCODINGS_FUNC(Validate)};
  657. return (*f[is.GetType()])(is, os);
  658. }
  659. #undef RAPIDJSON_ENCODINGS_FUNC
  660. };
  661. ///////////////////////////////////////////////////////////////////////////////
  662. // Transcoder
  663. //! Encoding conversion.
  664. template <typename SourceEncoding, typename TargetEncoding>
  665. struct Transcoder {
  666. //! Take one Unicode codepoint from source encoding, convert it to target
  667. //! encoding and put it to the output stream.
  668. template <typename InputStream, typename OutputStream>
  669. static RAPIDJSON_FORCEINLINE bool Transcode(InputStream &is,
  670. OutputStream &os) {
  671. unsigned codepoint;
  672. if (!SourceEncoding::Decode(is, &codepoint)) return false;
  673. TargetEncoding::Encode(os, codepoint);
  674. return true;
  675. }
  676. template <typename InputStream, typename OutputStream>
  677. static RAPIDJSON_FORCEINLINE bool TranscodeUnsafe(InputStream &is,
  678. OutputStream &os) {
  679. unsigned codepoint;
  680. if (!SourceEncoding::Decode(is, &codepoint)) return false;
  681. TargetEncoding::EncodeUnsafe(os, codepoint);
  682. return true;
  683. }
  684. //! Validate one Unicode codepoint from an encoded stream.
  685. template <typename InputStream, typename OutputStream>
  686. static RAPIDJSON_FORCEINLINE bool Validate(InputStream &is,
  687. OutputStream &os) {
  688. return Transcode(
  689. is, os); // Since source/target encoding is different, must transcode.
  690. }
  691. };
  692. // Forward declaration.
  693. template <typename Stream>
  694. inline void PutUnsafe(Stream &stream, typename Stream::Ch c);
  695. //! Specialization of Transcoder with same source and target encoding.
  696. template <typename Encoding>
  697. struct Transcoder<Encoding, Encoding> {
  698. template <typename InputStream, typename OutputStream>
  699. static RAPIDJSON_FORCEINLINE bool Transcode(InputStream &is,
  700. OutputStream &os) {
  701. os.Put(is.Take()); // Just copy one code unit. This semantic is different
  702. // from primary template class.
  703. return true;
  704. }
  705. template <typename InputStream, typename OutputStream>
  706. static RAPIDJSON_FORCEINLINE bool TranscodeUnsafe(InputStream &is,
  707. OutputStream &os) {
  708. PutUnsafe(os, is.Take()); // Just copy one code unit. This semantic is
  709. // different from primary template class.
  710. return true;
  711. }
  712. template <typename InputStream, typename OutputStream>
  713. static RAPIDJSON_FORCEINLINE bool Validate(InputStream &is,
  714. OutputStream &os) {
  715. return Encoding::Validate(is, os); // source/target encoding are the same
  716. }
  717. };
  718. RAPIDJSON_NAMESPACE_END
  719. #if defined(__GNUC__) || (defined(_MSC_VER) && !defined(__clang__))
  720. RAPIDJSON_DIAG_POP
  721. #endif
  722. #endif // RAPIDJSON_ENCODINGS_H_