Newer
Older
dub_jkp / source / tinyendian.d
  1. // Copyright Ferdinand Majerech 2014.
  2. // Distributed under the Boost Software License, Version 1.0.
  3. // (See accompanying file LICENSE_1_0.txt or copy at
  4. // http://www.boost.org/LICENSE_1_0.txt)
  5.  
  6. /// A minimal library providing functionality for changing the endianness of data.
  7. module tinyendian;
  8.  
  9. import std.system : Endian, endian;
  10.  
  11. /// Unicode UTF encodings.
  12. enum UTFEncoding : ubyte
  13. {
  14. UTF_8,
  15. UTF_16,
  16. UTF_32
  17. }
  18. ///
  19. @safe unittest
  20. {
  21. const ints = [314, -101];
  22. int[2] intsSwapBuffer = ints;
  23. swapByteOrder(intsSwapBuffer[]);
  24. swapByteOrder(intsSwapBuffer[]);
  25. assert(ints == intsSwapBuffer, "Lost information when swapping byte order");
  26.  
  27. const floats = [3.14f, 10.1f];
  28. float[2] floatsSwapBuffer = floats;
  29. swapByteOrder(floatsSwapBuffer[]);
  30. swapByteOrder(floatsSwapBuffer[]);
  31. assert(floats == floatsSwapBuffer, "Lost information when swapping byte order");
  32. }
  33.  
  34. /** Swap byte order of items in an array in place.
  35. *
  36. * Params:
  37. *
  38. * T = Item type. Must be either 2 or 4 bytes long.
  39. * array = Buffer with values to fix byte order of.
  40. */
  41. void swapByteOrder(T)(T[] array) @trusted @nogc pure nothrow
  42. if (T.sizeof == 2 || T.sizeof == 4)
  43. {
  44. // Swap the byte order of all read characters.
  45. foreach (ref item; array)
  46. {
  47. static if (T.sizeof == 2)
  48. {
  49. import std.algorithm.mutation : swap;
  50. swap(*cast(ubyte*)&item, *(cast(ubyte*)&item + 1));
  51. }
  52. else static if (T.sizeof == 4)
  53. {
  54. import core.bitop : bswap;
  55. const swapped = bswap(*cast(uint*)&item);
  56. item = *cast(const(T)*)&swapped;
  57. }
  58. else static assert(false, "Unsupported T: " ~ T.stringof);
  59. }
  60. }
  61.  
  62. /// See fixUTFByteOrder.
  63. struct FixUTFByteOrderResult
  64. {
  65. ubyte[] array;
  66. UTFEncoding encoding;
  67. Endian endian;
  68. uint bytesStripped = 0;
  69. }
  70.  
  71. /** Convert byte order of an array encoded in UTF(8/16/32) to system endianness in place.
  72. *
  73. * Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM
  74. * at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The
  75. * BOM, if any, will be removed from the buffer.
  76. *
  77. * If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
  78. * for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
  79. * 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
  80. *
  81. * Note that this function does $(B not) check if the array is a valid UTF string. It
  82. * only works with the BOM and 1,2 or 4-byte items.
  83. *
  84. * Params:
  85. *
  86. * array = The array with UTF-data.
  87. *
  88. * Returns:
  89. *
  90. * A struct with the following members:
  91. *
  92. * $(D ubyte[] array) A slice of the input array containing data in correct
  93. * byte order, without BOM and in case of UTF-16/UTF-32,
  94. * without stripped bytes, if any.
  95. * $(D UTFEncoding encoding) Encoding of the result (UTF-8, UTF-16 or UTF-32)
  96. * $(D std.system.Endian endian) Endianness of the original array.
  97. * $(D uint bytesStripped) Number of bytes stripped from a UTF-16/UTF-32 array, if
  98. * any. This is non-zero only if array.length was not
  99. * divisible by 2 or 4 for UTF-16 and UTF-32, respectively.
  100. *
  101. * Complexity: (BIGOH array.length)
  102. */
  103. auto fixUTFByteOrder(ubyte[] array) @safe @nogc pure nothrow
  104. {
  105. // Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.
  106. enum BOM: ubyte
  107. {
  108. UTF_8 = 0,
  109. UTF_16_LE = 1,
  110. UTF_16_BE = 2,
  111. UTF_32_LE = 3,
  112. UTF_32_BE = 4,
  113. None = ubyte.max
  114. }
  115.  
  116. // These 2 are from std.stream
  117. static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
  118. [0xFF, 0xFE],
  119. [0xFE, 0xFF],
  120. [0xFF, 0xFE, 0x00, 0x00],
  121. [0x00, 0x00, 0xFE, 0xFF] ];
  122. static immutable Endian[5] bomEndian = [ endian,
  123. Endian.littleEndian,
  124. Endian.bigEndian,
  125. Endian.littleEndian,
  126. Endian.bigEndian ];
  127.  
  128. // Documented in function ddoc.
  129.  
  130. FixUTFByteOrderResult result;
  131.  
  132. // Detect BOM, if any, in the bytes we've read. -1 means no BOM.
  133. // Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we
  134. // used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.
  135. import std.algorithm.searching : startsWith;
  136. BOM bomId = BOM.None;
  137. foreach (i, bom; byteOrderMarks)
  138. if (array.startsWith(bom))
  139. bomId = cast(BOM)i;
  140.  
  141. result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;
  142.  
  143. // Start of UTF data (after BOM, if any)
  144. size_t start = 0;
  145. // If we've read more than just the BOM, put the rest into the array.
  146. with(BOM) final switch(bomId)
  147. {
  148. case None: result.encoding = UTFEncoding.UTF_8; break;
  149. case UTF_8:
  150. start = 3;
  151. result.encoding = UTFEncoding.UTF_8;
  152. break;
  153. case UTF_16_LE, UTF_16_BE:
  154. result.bytesStripped = array.length % 2;
  155. start = 2;
  156. result.encoding = UTFEncoding.UTF_16;
  157. break;
  158. case UTF_32_LE, UTF_32_BE:
  159. result.bytesStripped = array.length % 4;
  160. start = 4;
  161. result.encoding = UTFEncoding.UTF_32;
  162. break;
  163. }
  164.  
  165. // If there's a BOM, we need to move data back to ensure it starts at array[0]
  166. if (start != 0)
  167. {
  168. array = array[start .. $ - result.bytesStripped];
  169. }
  170.  
  171. // We enforce above that array.length is divisible by 2/4 for UTF-16/32
  172. if (endian != result.endian)
  173. {
  174. if (result.encoding == UTFEncoding.UTF_16)
  175. swapByteOrder(cast(wchar[])array);
  176. else if (result.encoding == UTFEncoding.UTF_32)
  177. swapByteOrder(cast(dchar[])array);
  178. }
  179.  
  180. result.array = array;
  181. return result;
  182. }
  183. ///
  184. @safe unittest
  185. {
  186. {
  187. ubyte[] s = [0xEF, 0xBB, 0xBF, 'a'];
  188. FixUTFByteOrderResult r = fixUTFByteOrder(s);
  189. assert(r.encoding == UTFEncoding.UTF_8);
  190. assert(r.array.length == 1);
  191. assert(r.array == ['a']);
  192. assert(r.endian == Endian.littleEndian);
  193. }
  194.  
  195. {
  196. ubyte[] s = ['a'];
  197. FixUTFByteOrderResult r = fixUTFByteOrder(s);
  198. assert(r.encoding == UTFEncoding.UTF_8);
  199. assert(r.array.length == 1);
  200. assert(r.array == ['a']);
  201. assert(r.endian == Endian.bigEndian);
  202. }
  203.  
  204. {
  205. // strip 'a' b/c not complete unit
  206. ubyte[] s = [0xFE, 0xFF, 'a'];
  207. FixUTFByteOrderResult r = fixUTFByteOrder(s);
  208. assert(r.encoding == UTFEncoding.UTF_16);
  209. assert(r.array.length == 0);
  210. assert(r.endian == Endian.bigEndian);
  211. }
  212.  
  213. }