encodedstream.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  1. // Tencent is pleased to support the open source community by making RapidJSON
  2. // available.
  3. //
  4. // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All
  5. // rights reserved.
  6. //
  7. // Licensed under the MIT License (the "License"); you may not use this file
  8. // except in compliance with the License. You may obtain a copy of the License
  9. // at
  10. //
  11. // http://opensource.org/licenses/MIT
  12. //
  13. // Unless required by applicable law or agreed to in writing, software
  14. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16. // License for the specific language governing permissions and limitations under
  17. // the License.
  18. #ifndef RAPIDJSON_ENCODEDSTREAM_H_
  19. #define RAPIDJSON_ENCODEDSTREAM_H_
  20. #include "memorystream.h"
  21. #include "stream.h"
  22. #ifdef __GNUC__
  23. RAPIDJSON_DIAG_PUSH
  24. RAPIDJSON_DIAG_OFF(effc++)
  25. #endif
  26. #ifdef __clang__
  27. RAPIDJSON_DIAG_PUSH
  28. RAPIDJSON_DIAG_OFF(padded)
  29. #endif
  30. RAPIDJSON_NAMESPACE_BEGIN
  31. //! Input byte stream wrapper with a statically bound encoding.
  32. /*!
  33. \tparam Encoding The interpretation of encoding of the stream. Either UTF8,
  34. UTF16LE, UTF16BE, UTF32LE, UTF32BE. \tparam InputByteStream Type of input
  35. byte stream. For example, FileReadStream.
  36. */
  37. template <typename Encoding, typename InputByteStream>
  38. class EncodedInputStream {
  39. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  40. public:
  41. typedef typename Encoding::Ch Ch;
  42. EncodedInputStream(InputByteStream &is) : is_(is) {
  43. current_ = Encoding::TakeBOM(is_);
  44. }
  45. Ch Peek() const { return current_; }
  46. Ch Take() {
  47. Ch c = current_;
  48. current_ = Encoding::Take(is_);
  49. return c;
  50. }
  51. size_t Tell() const { return is_.Tell(); }
  52. // Not implemented
  53. void Put(Ch) { RAPIDJSON_ASSERT(false); }
  54. void Flush() { RAPIDJSON_ASSERT(false); }
  55. Ch *PutBegin() {
  56. RAPIDJSON_ASSERT(false);
  57. return 0;
  58. }
  59. size_t PutEnd(Ch *) {
  60. RAPIDJSON_ASSERT(false);
  61. return 0;
  62. }
  63. private:
  64. EncodedInputStream(const EncodedInputStream &);
  65. EncodedInputStream &operator=(const EncodedInputStream &);
  66. InputByteStream &is_;
  67. Ch current_;
  68. };
  69. //! Specialized for UTF8 MemoryStream.
  70. template <>
  71. class EncodedInputStream<UTF8<>, MemoryStream> {
  72. public:
  73. typedef UTF8<>::Ch Ch;
  74. EncodedInputStream(MemoryStream &is) : is_(is) {
  75. if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take();
  76. if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take();
  77. if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take();
  78. }
  79. Ch Peek() const { return is_.Peek(); }
  80. Ch Take() { return is_.Take(); }
  81. size_t Tell() const { return is_.Tell(); }
  82. // Not implemented
  83. void Put(Ch) {}
  84. void Flush() {}
  85. Ch *PutBegin() { return 0; }
  86. size_t PutEnd(Ch *) { return 0; }
  87. MemoryStream &is_;
  88. private:
  89. EncodedInputStream(const EncodedInputStream &);
  90. EncodedInputStream &operator=(const EncodedInputStream &);
  91. };
  92. //! Output byte stream wrapper with statically bound encoding.
  93. /*!
  94. \tparam Encoding The interpretation of encoding of the stream. Either UTF8,
  95. UTF16LE, UTF16BE, UTF32LE, UTF32BE. \tparam OutputByteStream Type of input
  96. byte stream. For example, FileWriteStream.
  97. */
  98. template <typename Encoding, typename OutputByteStream>
  99. class EncodedOutputStream {
  100. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  101. public:
  102. typedef typename Encoding::Ch Ch;
  103. EncodedOutputStream(OutputByteStream &os, bool putBOM = true) : os_(os) {
  104. if (putBOM) Encoding::PutBOM(os_);
  105. }
  106. void Put(Ch c) { Encoding::Put(os_, c); }
  107. void Flush() { os_.Flush(); }
  108. // Not implemented
  109. Ch Peek() const {
  110. RAPIDJSON_ASSERT(false);
  111. return 0;
  112. }
  113. Ch Take() {
  114. RAPIDJSON_ASSERT(false);
  115. return 0;
  116. }
  117. size_t Tell() const {
  118. RAPIDJSON_ASSERT(false);
  119. return 0;
  120. }
  121. Ch *PutBegin() {
  122. RAPIDJSON_ASSERT(false);
  123. return 0;
  124. }
  125. size_t PutEnd(Ch *) {
  126. RAPIDJSON_ASSERT(false);
  127. return 0;
  128. }
  129. private:
  130. EncodedOutputStream(const EncodedOutputStream &);
  131. EncodedOutputStream &operator=(const EncodedOutputStream &);
  132. OutputByteStream &os_;
  133. };
  134. #define RAPIDJSON_ENCODINGS_FUNC(x) \
  135. UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
  136. //! Input stream wrapper with dynamically bound encoding and automatic encoding
  137. //! detection.
  138. /*!
  139. \tparam CharType Type of character for reading.
  140. \tparam InputByteStream type of input byte stream to be wrapped.
  141. */
  142. template <typename CharType, typename InputByteStream>
  143. class AutoUTFInputStream {
  144. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  145. public:
  146. typedef CharType Ch;
  147. //! Constructor.
  148. /*!
  149. \param is input stream to be wrapped.
  150. \param type UTF encoding type if it is not detected from the stream.
  151. */
  152. AutoUTFInputStream(InputByteStream &is, UTFType type = kUTF8)
  153. : is_(&is), type_(type), hasBOM_(false) {
  154. RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
  155. DetectType();
  156. static const TakeFunc f[] = {RAPIDJSON_ENCODINGS_FUNC(Take)};
  157. takeFunc_ = f[type_];
  158. current_ = takeFunc_(*is_);
  159. }
  160. UTFType GetType() const { return type_; }
  161. bool HasBOM() const { return hasBOM_; }
  162. Ch Peek() const { return current_; }
  163. Ch Take() {
  164. Ch c = current_;
  165. current_ = takeFunc_(*is_);
  166. return c;
  167. }
  168. size_t Tell() const { return is_->Tell(); }
  169. // Not implemented
  170. void Put(Ch) { RAPIDJSON_ASSERT(false); }
  171. void Flush() { RAPIDJSON_ASSERT(false); }
  172. Ch *PutBegin() {
  173. RAPIDJSON_ASSERT(false);
  174. return 0;
  175. }
  176. size_t PutEnd(Ch *) {
  177. RAPIDJSON_ASSERT(false);
  178. return 0;
  179. }
  180. private:
  181. AutoUTFInputStream(const AutoUTFInputStream &);
  182. AutoUTFInputStream &operator=(const AutoUTFInputStream &);
  183. // Detect encoding type with BOM or RFC 4627
  184. void DetectType() {
  185. // BOM (Byte Order Mark):
  186. // 00 00 FE FF UTF-32BE
  187. // FF FE 00 00 UTF-32LE
  188. // FE FF UTF-16BE
  189. // FF FE UTF-16LE
  190. // EF BB BF UTF-8
  191. const unsigned char *c =
  192. reinterpret_cast<const unsigned char *>(is_->Peek4());
  193. if (!c) return;
  194. unsigned bom =
  195. static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
  196. hasBOM_ = false;
  197. if (bom == 0xFFFE0000) {
  198. type_ = kUTF32BE;
  199. hasBOM_ = true;
  200. is_->Take();
  201. is_->Take();
  202. is_->Take();
  203. is_->Take();
  204. } else if (bom == 0x0000FEFF) {
  205. type_ = kUTF32LE;
  206. hasBOM_ = true;
  207. is_->Take();
  208. is_->Take();
  209. is_->Take();
  210. is_->Take();
  211. } else if ((bom & 0xFFFF) == 0xFFFE) {
  212. type_ = kUTF16BE;
  213. hasBOM_ = true;
  214. is_->Take();
  215. is_->Take();
  216. } else if ((bom & 0xFFFF) == 0xFEFF) {
  217. type_ = kUTF16LE;
  218. hasBOM_ = true;
  219. is_->Take();
  220. is_->Take();
  221. } else if ((bom & 0xFFFFFF) == 0xBFBBEF) {
  222. type_ = kUTF8;
  223. hasBOM_ = true;
  224. is_->Take();
  225. is_->Take();
  226. is_->Take();
  227. }
  228. // RFC 4627: Section 3
  229. // "Since the first two characters of a JSON text will always be ASCII
  230. // characters [RFC0020], it is possible to determine whether an octet
  231. // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
  232. // at the pattern of nulls in the first four octets."
  233. // 00 00 00 xx UTF-32BE
  234. // 00 xx 00 xx UTF-16BE
  235. // xx 00 00 00 UTF-32LE
  236. // xx 00 xx 00 UTF-16LE
  237. // xx xx xx xx UTF-8
  238. if (!hasBOM_) {
  239. int pattern =
  240. (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
  241. switch (pattern) {
  242. case 0x08:
  243. type_ = kUTF32BE;
  244. break;
  245. case 0x0A:
  246. type_ = kUTF16BE;
  247. break;
  248. case 0x01:
  249. type_ = kUTF32LE;
  250. break;
  251. case 0x05:
  252. type_ = kUTF16LE;
  253. break;
  254. case 0x0F:
  255. type_ = kUTF8;
  256. break;
  257. default:
  258. break; // Use type defined by user.
  259. }
  260. }
  261. // Runtime check whether the size of character type is sufficient. It only
  262. // perform checks with assertion.
  263. if (type_ == kUTF16LE || type_ == kUTF16BE)
  264. RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
  265. if (type_ == kUTF32LE || type_ == kUTF32BE)
  266. RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
  267. }
  268. typedef Ch (*TakeFunc)(InputByteStream &is);
  269. InputByteStream *is_;
  270. UTFType type_;
  271. Ch current_;
  272. TakeFunc takeFunc_;
  273. bool hasBOM_;
  274. };
  275. //! Output stream wrapper with dynamically bound encoding and automatic encoding
  276. //! detection.
  277. /*!
  278. \tparam CharType Type of character for writing.
  279. \tparam OutputByteStream type of output byte stream to be wrapped.
  280. */
  281. template <typename CharType, typename OutputByteStream>
  282. class AutoUTFOutputStream {
  283. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  284. public:
  285. typedef CharType Ch;
  286. //! Constructor.
  287. /*!
  288. \param os output stream to be wrapped.
  289. \param type UTF encoding type.
  290. \param putBOM Whether to write BOM at the beginning of the stream.
  291. */
  292. AutoUTFOutputStream(OutputByteStream &os, UTFType type, bool putBOM)
  293. : os_(&os), type_(type) {
  294. RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
  295. // Runtime check whether the size of character type is sufficient. It only
  296. // perform checks with assertion.
  297. if (type_ == kUTF16LE || type_ == kUTF16BE)
  298. RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
  299. if (type_ == kUTF32LE || type_ == kUTF32BE)
  300. RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
  301. static const PutFunc f[] = {RAPIDJSON_ENCODINGS_FUNC(Put)};
  302. putFunc_ = f[type_];
  303. if (putBOM) PutBOM();
  304. }
  305. UTFType GetType() const { return type_; }
  306. void Put(Ch c) { putFunc_(*os_, c); }
  307. void Flush() { os_->Flush(); }
  308. // Not implemented
  309. Ch Peek() const {
  310. RAPIDJSON_ASSERT(false);
  311. return 0;
  312. }
  313. Ch Take() {
  314. RAPIDJSON_ASSERT(false);
  315. return 0;
  316. }
  317. size_t Tell() const {
  318. RAPIDJSON_ASSERT(false);
  319. return 0;
  320. }
  321. Ch *PutBegin() {
  322. RAPIDJSON_ASSERT(false);
  323. return 0;
  324. }
  325. size_t PutEnd(Ch *) {
  326. RAPIDJSON_ASSERT(false);
  327. return 0;
  328. }
  329. private:
  330. AutoUTFOutputStream(const AutoUTFOutputStream &);
  331. AutoUTFOutputStream &operator=(const AutoUTFOutputStream &);
  332. void PutBOM() {
  333. typedef void (*PutBOMFunc)(OutputByteStream &);
  334. static const PutBOMFunc f[] = {RAPIDJSON_ENCODINGS_FUNC(PutBOM)};
  335. f[type_](*os_);
  336. }
  337. typedef void (*PutFunc)(OutputByteStream &, Ch);
  338. OutputByteStream *os_;
  339. UTFType type_;
  340. PutFunc putFunc_;
  341. };
  342. #undef RAPIDJSON_ENCODINGS_FUNC
  343. RAPIDJSON_NAMESPACE_END
  344. #ifdef __clang__
  345. RAPIDJSON_DIAG_POP
  346. #endif
  347. #ifdef __GNUC__
  348. RAPIDJSON_DIAG_POP
  349. #endif
  350. #endif // RAPIDJSON_ENCODEDSTREAM_H_