bsd/sys/utfconv.h

   1 /*
   2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #ifndef _SYS_UTFCONV_H_
  30 #define _SYS_UTFCONV_H_
  31
  32 #include <sys/appleapiopts.h>
  33 #include <sys/cdefs.h>
  34
  35 #ifdef KERNEL
  36 #ifdef __APPLE_API_UNSTABLE
  37
  38 /*
  39  * UTF-8 encode/decode flags (distinct bits for the 'flags' arguments below)
  40  */
  41 #define UTF_REVERSE_ENDIAN   0x0001   /* reverse UCS-2 byte order */
  42 #define UTF_NO_NULL_TERM     0x0002   /* do not add null termination */
  43 #define UTF_DECOMPOSED       0x0004   /* generate fully decomposed output (NFD) */
  44 #define UTF_PRECOMPOSED      0x0008   /* generate precomposed output (NFC) */
  45 #define UTF_ESCAPE_ILLEGAL   0x0010   /* percent escape illegal UTF-8 input */
  46 #define UTF_SFM_CONVERSIONS  0x0020   /* use SFM mappings for illegal NTFS chars */
  47 /* UTF_BIG_ENDIAN: 0 when BYTE_ORDER is BIG_ENDIAN, else UTF_REVERSE_ENDIAN */
  48 #define UTF_BIG_ENDIAN       \
  49         ((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
  50 /* UTF_LITTLE_ENDIAN: 0 when BYTE_ORDER is LITTLE_ENDIAN, else UTF_REVERSE_ENDIAN */
  51 #define UTF_LITTLE_ENDIAN    \
  52         ((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
  53
  54 __BEGIN_DECLS
  55
  56
  57 /*
  58  * unicode_combinable - Test for a combining Unicode character.
  59  *
  60  * This function is similar to __CFUniCharIsNonBaseCharacter except
  61  * that it also includes Hangul Jamo characters.
  62  */
  63
  64 int unicode_combinable(u_int16_t character);
  65
  66 /*
  67  * unicode_decomposeable - Test for a precomposed character.
  68  *
  69  * This function is similar to __CFUniCharIsDecomposableCharacter.
  70  */
  71
  72 int unicode_decomposeable(u_int16_t character);
  73
  74
  75 /*
  76  * utf8_encodelen - Calculate the UTF-8 encoding length
  77  *
  78  * This function takes a Unicode input string, ucsp, of ucslen bytes
  79  * and calculates the size of the UTF-8 output in bytes (not including
  80  * a NULL termination byte). The string must reside in kernel memory.
  81  *
  82  * FLAGS
  83  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
  84  *
  85  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
  86  *
  87  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
  88  *
  89  *    UTF_DECOMPOSED:  assume fully decomposed output
  90  *
  91  * ERRORS
  92  *    None
  93  */
  94 size_t
  95 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
  96     int flags);
  97
  98
  99 /*
 100  * utf8_encodestr - Encodes a Unicode string into UTF-8
 101  *
 102  * This function takes a Unicode input string, ucsp, of ucslen bytes
 103  * and writes the UTF-8 output into a buffer of buflen bytes pointed to
 104  * by utf8p. The output size in bytes (not including a NULL termination
 105  * byte) is returned in utf8len. The output is NULL terminated unless
 106  * UTF_NO_NULL_TERM is set. Both buffers must reside in kernel memory.
 107  *
 108  * If '/' chars are possible in the Unicode input then an alternate
 109  * (replacement) char must be provided in altslash.
 110  *
 111  * FLAGS
 112  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 113  *
 114  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 115  *
 116  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 117  *
 118  *    UTF_NO_NULL_TERM:  do not add null termination to output string
 119  *
 120  *    UTF_DECOMPOSED:  generate fully decomposed output
 121  *
 122  * ERRORS
 123  *    ENAMETOOLONG:  output did not fit; only utf8len bytes were encoded
 124  *
 125  *    EINVAL:  illegal Unicode char encountered
 126  */
 127 int
 128 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
 129     size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);
 130
 131
 132 /*
 133  * utf8_decodestr - Decodes a UTF-8 string into Unicode
 134  *
 135  * This function takes a UTF-8 input string, utf8p, of utf8len bytes
 136  * and produces the Unicode output into a buffer of buflen bytes pointed
 137  * to by ucsp. The size of the output in bytes (not including a NULL
 138  * termination byte) is returned in ucslen. Both buffers must reside
 139  * in kernel memory.
 140  *
 141  * If '/' chars are allowed in the Unicode output then an alternate
 142  * (replacement) char must be provided in altslash.
 143  *
 144  * FLAGS
 145  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 146  *
 147  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 148  *
 149  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 150  *
 151  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 152  *
 153  *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 154  *
 155  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 156  *
 157  * ERRORS
 158  *    ENAMETOOLONG:  output did not fit; only ucslen bytes were decoded.
 159  *
 160  *    EINVAL:  illegal UTF-8 sequence encountered.
 161  */
 162 int
 163 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
 164     size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);
 165
 166
 167 /*
 168  * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
 169  *
 170  * This function takes a UTF-8 input string, instr, of inlen bytes
 171  * and produces normalized UTF-8 output into a buffer of buflen bytes
 172  * pointed to by outstr. The size of the output in bytes (not including
 173  * a NULL termination byte) is returned in outlen. In-place conversions
 174  * are not supported (instr must differ from outstr).  Both buffers must
 175  * reside in kernel memory.
 176  *
 177  * FLAGS
 178  *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
 179  *
 180  *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
 181  *
 182  *    UTF_NO_NULL_TERM:  do not add null termination to output string
 183  *
 184  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 185  *
 186  * ERRORS
 187  *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
 188  *
 189  *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
 190  */
 191 int
 192 utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
 193     size_t *outlen, size_t buflen, int flags);
 194
 195
 196 /*
 197  * utf8_validatestr - validates a UTF-8 string
 198  *
 199  * This function takes a UTF-8 input string, utf8p, of utf8len bytes
 200  * and determines if it is valid UTF-8.  The string must reside in kernel
 201  * memory.
 202  *
 203  * ERRORS
 204  *    EINVAL:  illegal UTF-8 sequence encountered.
 205  */
 206 int
 207 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len);
 208
 209
 210 __END_DECLS
 211
 212 #endif /* __APPLE_API_UNSTABLE */
 213 #endif /* KERNEL */
 214
 215 #endif /* !_SYS_UTFCONV_H_ */