textencoding.js 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. /**
  2. * @fileoverview A UTF-8 encoder and decoder.
  3. */
  4. goog.module('protobuf.binary.textencoding');
  5. const {checkElementIndex} = goog.require('protobuf.internal.checks');
  /**
   * Builds a string from an array of numeric character codes.
   *
   * The values are handed to String.fromCharCode, which interprets each one
   * as a single UTF-16 code unit; supplementary characters must therefore
   * already be split into surrogate pairs by the caller.
   *
   * @param {!Array<number>} codePoints
   * @return {string}
   */
  function codePointsToString(codePoints) {
    // Convert in fixed-size slices rather than one giant apply() call.
    // Performance: http://jsperf.com/string-fromcharcode-test/13
    const CHUNK_SIZE = 10000;
    const total = codePoints.length;
    let result = '';
    for (let start = 0; start < total; start += CHUNK_SIZE) {
      const stop = Math.min(start + CHUNK_SIZE, total);
      result += String.fromCharCode.apply(null, codePoints.slice(start, stop));
    }
    return result;
  }
  /**
   * Decodes UTF-8 bytes into a JavaScript (UTF-16) string.
   * Supports code points from U+0000 up to U+10FFFF
   * (http://en.wikipedia.org/wiki/UTF-8).
   *
   * Handling of malformed input:
   * - A stray continuation byte (0x80..0xBF) in lead position is skipped.
   * - A lead byte of 0xF8 or above is skipped.
   * - A four-byte sequence whose value is outside U+10000..U+10FFFF (an
   *   overlong form or a value beyond the Unicode range) decodes to U+FFFD
   *   instead of a bogus surrogate pair.
   * - The index of the last byte a multi-byte sequence needs is passed to
   *   checkElementIndex before reading. NOTE(review): that helper is
   *   presumably what rejects truncated input; confirm against
   *   protobuf.internal.checks.
   * - The high bits of continuation bytes are not validated.
   *
   * @param {!DataView} bytes
   * @return {string}
   */
  function decode(bytes) {
    const length = bytes.byteLength;
    // UTF-16 code units; supplementary characters are stored as two entries.
    const codeUnits = [];
    let cursor = 0;
    while (cursor < length) {
      const c = bytes.getUint8(cursor++);
      if (c < 0x80) {
        // Regular 7-bit ASCII.
        codeUnits.push(c);
      } else if (c < 0xC0) {
        // UTF-8 continuation mark. We are out of sync. This might happen if
        // we attempted to read a character with more than four bytes.
        continue;
      } else if (c < 0xE0) {
        // Two-byte sequence: 110xxxxx 10xxxxxx.
        checkElementIndex(cursor, length);
        const c2 = bytes.getUint8(cursor++);
        codeUnits.push(((c & 0x1F) << 6) | (c2 & 0x3F));
      } else if (c < 0xF0) {
        // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
        checkElementIndex(cursor + 1, length);
        const c2 = bytes.getUint8(cursor++);
        const c3 = bytes.getUint8(cursor++);
        codeUnits.push(((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
      } else if (c < 0xF8) {
        // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        checkElementIndex(cursor + 2, length);
        const c2 = bytes.getUint8(cursor++);
        const c3 = bytes.getUint8(cursor++);
        const c4 = bytes.getUint8(cursor++);
        const codePoint = ((c & 0x07) << 18) | ((c2 & 0x3F) << 12) |
            ((c3 & 0x3F) << 6) | (c4 & 0x3F);
        if (codePoint < 0x10000 || codePoint > 0x10FFFF) {
          // The surrogate formula below is only defined for
          // U+10000..U+10FFFF; anything else would wrap around into an
          // unrelated character, so substitute the replacement character.
          codeUnits.push(0xFFFD);
        } else {
          // 21-bit code points do not fit in one 16-bit code unit, so split
          // the 20-bit offset into a high and a low surrogate.
          const offset = codePoint - 0x10000;
          codeUnits.push(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
        }
      }
    }
    return codePointsToString(codeUnits);
  }
  /**
   * Encodes a UTF-16 JavaScript string as UTF-8.
   *
   * A high surrogate (0xD800..0xDBFF) immediately followed by a low surrogate
   * (0xDC00..0xDFFF) is combined into one supplementary code point and written
   * as four bytes. Any other surrogate code unit is unpaired and is encoded as
   * U+FFFD (bytes EF BF BD), which is also what TextEncoder produces.
   *
   * @param {string} value The string to write.
   * @return {!Uint8Array} An array containing the encoded bytes.
   */
  function encode(value) {
    const buffer = [];
    for (let i = 0; i < value.length; i++) {
      let codePoint = value.charCodeAt(i);
      if (codePoint >= 0xD800 && codePoint < 0xE000) {
        // Only consume the next unit when it really completes the pair;
        // otherwise it must be encoded on its own in the next iteration.
        const next = i + 1 < value.length ? value.charCodeAt(i + 1) : 0;
        const isPair =
            codePoint < 0xDC00 && next >= 0xDC00 && next < 0xE000;
        if (isPair) {
          codePoint = 0x10000 + (((codePoint & 0x3FF) << 10) | (next & 0x3FF));
          i++;
        } else {
          codePoint = 0xFFFD;
        }
      }

      if (codePoint < 0x80) {
        // One byte: 0xxxxxxx.
        buffer.push(codePoint);
      } else if (codePoint < 0x800) {
        // Two bytes: 110xxxxx 10xxxxxx.
        buffer.push((codePoint >> 6) | 0xC0);
        buffer.push((codePoint & 0x3F) | 0x80);
      } else if (codePoint < 0x10000) {
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
        buffer.push((codePoint >> 12) | 0xE0);
        buffer.push(((codePoint >> 6) & 0x3F) | 0x80);
        buffer.push((codePoint & 0x3F) | 0x80);
      } else {
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        buffer.push((codePoint >> 18) | 0xF0);
        buffer.push(((codePoint >> 12) & 0x3F) | 0x80);
        buffer.push(((codePoint >> 6) & 0x3F) | 0x80);
        buffer.push((codePoint & 0x3F) | 0x80);
      }
    }
    return new Uint8Array(buffer);
  }
  // Public API of this module: the UTF-8 decoder and encoder defined above.
  exports = {decode, encode};