livefiles_hfs_plugin/lf_hfs_sbunicode.h

   1 /* Copyright © 2017-2018 Apple Inc. All rights reserved.
   2  *
   3  *  lf_hfs_sbunicode.h
   4  *  livefiles_hfs
   5  *
   6  *  Created by Oded Shoshani on 31/1/18.
   7  */
   8
   9 #ifndef lf_hfs_sbunicode_h
  10 #define lf_hfs_sbunicode_h
  11
  12 /*
  13  Includes Unicode 3.2 decomposition code derived from Core Foundation
  14  */
  15
  16 /*
  17  * UTF-8 (Unicode Transformation Format)
  18  *
  19  * UTF-8 is the Unicode Transformation Format that serializes a Unicode
  20  * character as a sequence of one to four bytes. Only the shortest form
  21  * required to represent the significant Unicode bits is legal.
  22  *
  23  * UTF-8 Multibyte Codes
  24  *
  25  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
  26  * -----------------------------------------------------------------------------
  27  *   1       7       0x0000        0x007F    0xxxxxxx
  28  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
  29  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
  30  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  31  * -----------------------------------------------------------------------------
  32  */
  33
  34 /*
  35  * UTF-8 encode/decode flags (bit values for the 'flags' argument of the functions below)
  36  */
  37 #define UTF_REVERSE_ENDIAN      0x0001   /* reverse UCS-2 byte order */
  38 #define UTF_ADD_NULL_TERM       0x0002   /* add null termination */
  39 #define UTF_DECOMPOSED          0x0004   /* generate fully decomposed UCS-2 */
  40 #define UTF_PRECOMPOSED         0x0008   /* generate precomposed UCS-2 */
  41 #define UTF_ESCAPE_ILLEGAL      0x0010   /* escape illegal UTF-8 */
  42 #define UTF_SFM_CONVERSIONS     0x0020   /* Use SFM mappings for illegal NTFS chars */
  43 /* Fixed byte-order selectors: 0 when BYTE_ORDER already matches, else UTF_REVERSE_ENDIAN */
  44 #define UTF_BIG_ENDIAN       \
  45 ((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
  46 #define UTF_LITTLE_ENDIAN    \
  47 ((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
  48
  49
  50
  51 /*
  52  * utf8_encodelen - Calculate the UTF-8 encoding length
  53  *
  54  * Takes a Unicode input string, ucsp, of ucslen bytes and calculates the
  55  * size of the UTF-8 output in bytes (not including a terminating NUL
  56  * byte). The string must reside in kernel memory.
  57  * NOTE(review): confirm the kernel-memory requirement applies to this plugin.
  58  *
  59  * PARAMETERS
  60  *    ucsp:      Unicode input string
  61  *    ucslen:    length of ucsp in bytes
  62  *    altslash:  alternate (replacement) char; should be provided if '/'
  63  *               chars are possible in the Unicode input
  64  *    flags:     see FLAGS
  65  *
  66  * FLAGS
  67  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
  68  *    UTF_BIG_ENDIAN:      Unicode byte order is always big endian
  69  *    UTF_LITTLE_ENDIAN:   Unicode byte order is always little endian
  70  *    UTF_DECOMPOSED:      generate fully decomposed output
  71  *    UTF_PRECOMPOSED:     ignored since utf8_encodestr doesn't support it
  72  *
  73  * ERRORS:  None
  74  */
  75 size_t utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags);
  76
  77 /*
  78  * utf8_encodestr - Encodes a Unicode string to UTF-8
  79  *
  80  * NOTES:
  81  *    The resulting UTF-8 string is NUL terminated.
  82  *    NOTE(review): UTF_ADD_NULL_TERM below suggests this is optional -- confirm.
  83  *
  84  *    If '/' chars are allowed on disk then an alternate
  85  *    (replacement) char must be provided in altslash.
  86  *
  87  * input flags:
  88  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
  89  *    UTF_BIG_ENDIAN:      Unicode byte order is always big endian
  90  *    UTF_LITTLE_ENDIAN:   Unicode byte order is always little endian
  91  *    UTF_DECOMPOSED:      generate fully decomposed output
  92  *    UTF_ADD_NULL_TERM:   add NUL termination to UTF-8 output
  93  *
  94  * parameters (NOTE(review): inferred from names and the text here -- confirm):
  95  *    ucsp/ucslen: Unicode input and its length; utf8p/buflen: output buffer
  96  *    and its size; utf8len: receives the encoded length
  97  *
  98  * result:
  99  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 100  *    EINVAL: Illegal char found; char was replaced by an '_'.
 101  */
 102 extern int utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);
 103
 104 /*
 105  * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 106  *
 107  * NOTES:
 108  *    The input UTF-8 string does not need to be NUL terminated
 109  *    if utf8len is set.
 110  *
 111  *    If '/' chars are allowed on disk then an alternate
 112  *    (replacement) char must be provided in altslash.
 113  *
 114  * input flags:
 115  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 116  *    UTF_BIG_ENDIAN:      Unicode byte order is always big endian
 117  *    UTF_LITTLE_ENDIAN:   Unicode byte order is always little endian
 118  *    UTF_DECOMPOSED:      generate fully decomposed output (NFD)
 119  *    UTF_PRECOMPOSED:     generate precomposed output (NFC)
 120  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 121  *    NOTE(review): UTF_SFM_CONVERSIONS is defined in this header but not listed here -- confirm whether it applies.
 122  *
 123  * parameters (NOTE(review): inferred from names and the text here -- confirm):
 124  *    utf8p/utf8len: UTF-8 input and its length; ucsp/buflen: output buffer
 125  *    and its size; ucslen: receives the decoded length
 126  *
 127  * result:
 128  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 129  *
 130  *    EINVAL: Illegal UTF-8 sequence found.
 131  */
 132 int utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);
 133
 134 #endif /* lf_hfs_sbunicode_h */
 135