[apple/icu.git] / icuSources / common / ustr_imp.h

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*  
**********************************************************************
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ustr_imp.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2001jan30
*   created by: Markus W. Scherer
*/

#ifndef __USTR_IMP_H__
#define __USTR_IMP_H__

#include "unicode/utypes.h"
#include "unicode/utf8.h"

/**
 * Internal option for unorm_cmpEquivFold() for strncmp style.
 * If set, checks for both string length and terminating NUL.
 *
 * NOTE(review): the name begins with an underscore followed by an uppercase
 * letter, a form that C reserves for the implementation. It is left unchanged
 * here because code outside this header may refer to it.
 */
#define _STRNCMP_STYLE 0x1000

/**
 * Compare two strings in code point order or code unit order.
 * Works in strcmp style (both lengths -1),
 * strncmp style (lengths equal and >=0, flag TRUE),
 * and memcmp/UnicodeString style (at least one length >=0).
 *
 * @param s1 First string to compare.
 * @param length1 Length of s1, or -1 (see the three styles above).
 * @param s2 Second string to compare.
 * @param length2 Length of s2, or -1 (see the three styles above).
 * @param strncmpStyle The "flag" mentioned above: TRUE selects strncmp style.
 * @param codePointOrder Chooses between code point order and code unit order;
 *        presumably TRUE means code point order -- confirm in the implementation.
 * @return The comparison result; presumably negative/zero/positive in the
 *         manner of strcmp -- confirm in the implementation.
 */
U_CFUNC int32_t U_EXPORT2
uprv_strCompare(const UChar *s1, int32_t length1,
                const UChar *s2, int32_t length2,
                UBool strncmpStyle, UBool codePointOrder);

/**
 * Hash helper for a UChar string with an explicit length.
 * NOTE(review): the implementation is not part of this header; presumably
 * the result is a hash code derived from the first `length` UChars of str --
 * confirm against the definition.
 *
 * @param str The UChar string.
 * @param length Length value passed alongside str (presumably in UChars).
 * @return An int32_t value (presumably the hash code).
 */
U_CAPI int32_t U_EXPORT2 
ustr_hashUCharsN(const UChar *str, int32_t length);

/**
 * Hash helper for a char string with an explicit length.
 * NOTE(review): the implementation is not part of this header; presumably
 * the result is a hash code derived from the first `length` chars of str --
 * confirm against the definition.
 *
 * @param str The char string.
 * @param length Length value passed alongside str (presumably in chars).
 * @return An int32_t value (presumably the hash code).
 */
U_CAPI int32_t U_EXPORT2 
ustr_hashCharsN(const char *str, int32_t length);

/**
 * Hash helper for a char string with an explicit length.
 * NOTE(review): the implementation is not part of this header; the "I" in the
 * name presumably marks a case-insensitive variant of ustr_hashCharsN() --
 * confirm against the definition.
 *
 * @param str The char string.
 * @param length Length value passed alongside str (presumably in chars).
 * @return An int32_t value (presumably the hash code).
 */
U_CAPI int32_t U_EXPORT2
ustr_hashICharsN(const char *str, int32_t length);

/**
 * NUL-terminate a UChar * string if possible.
 * The outcome depends on how length compares with destCapacity:
 * - length  < destCapacity: NUL-terminate.
 * - length == destCapacity: do not terminate but set U_STRING_NOT_TERMINATED_WARNING.
 * - length  > destCapacity: do not terminate but set U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): what happens for a negative length, or when *pErrorCode
 * already holds a failure on entry, is not stated here -- check the
 * implementation before relying on either case.
 *
 * @param dest Destination buffer, can be NULL if destCapacity==0.
 * @param destCapacity Number of UChars available at dest.
 * @param length Number of UChars that were (to be) written to dest.
 * @param pErrorCode ICU error code; receives the warning/error listed above.
 * @return length
 */
U_CAPI int32_t U_EXPORT2
u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);

/**
 * NUL-terminate a char * string if possible.
 * Same as u_terminateUChars() but for a different string type:
 * terminates when length < destCapacity, sets U_STRING_NOT_TERMINATED_WARNING
 * when length == destCapacity, and sets U_BUFFER_OVERFLOW_ERROR when
 * length > destCapacity.
 *
 * @param dest Destination buffer of chars.
 * @param destCapacity Number of chars available at dest.
 * @param length Number of chars that were (to be) written to dest.
 * @param pErrorCode ICU error code.
 * @return length
 */
U_CAPI int32_t U_EXPORT2
u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);

/**
 * NUL-terminate a UChar32 * string if possible.
 * Same as u_terminateUChars() but for a different string type:
 * terminates when length < destCapacity, sets U_STRING_NOT_TERMINATED_WARNING
 * when length == destCapacity, and sets U_BUFFER_OVERFLOW_ERROR when
 * length > destCapacity.
 *
 * @param dest Destination buffer of UChar32s.
 * @param destCapacity Number of UChar32s available at dest.
 * @param length Number of UChar32s that were (to be) written to dest.
 * @param pErrorCode ICU error code.
 * @return length
 */
U_CAPI int32_t U_EXPORT2
u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);

/**
 * NUL-terminate a wchar_t * string if possible.
 * Same as u_terminateUChars() but for a different string type:
 * terminates when length < destCapacity, sets U_STRING_NOT_TERMINATED_WARNING
 * when length == destCapacity, and sets U_BUFFER_OVERFLOW_ERROR when
 * length > destCapacity.
 *
 * @param dest Destination buffer of wchar_ts.
 * @param destCapacity Number of wchar_ts available at dest.
 * @param length Number of wchar_ts that were (to be) written to dest.
 * @param pErrorCode ICU error code.
 * @return length
 */
U_CAPI int32_t U_EXPORT2
u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);

/**
 * Number of bytes in a whole valid UTF-8 sequence that begins with leadByte.
 * ASCII bytes 0..0x7f yield 1.
 * Bytes 0x80..0xc1 and 0xf5..0xff yield 0.
 * The argument may be evaluated more than once.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 0..4
 */
#define U8_COUNT_BYTES(leadByte) \
    (!U8_IS_SINGLE(leadByte) ? U8_COUNT_BYTES_NON_ASCII(leadByte) : 1)

/**
 * Number of bytes in a whole valid multi-byte UTF-8 sequence that begins
 * with leadByte.
 * Bytes 0x00..0xc1 and 0xf5..0xff yield 0.
 * The argument may be evaluated more than once.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 0 or 2..4
 */
#define U8_COUNT_BYTES_NON_ASCII(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : 2+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))

#ifdef __cplusplus

U_NAMESPACE_BEGIN

class UTF8 {
public:
    UTF8() = delete;  // static helpers only; never instantiated

    /**
     * Tests whether t is acceptable as the i-th trail byte of a UTF-8 sequence.
     *
     * @param prev The lead byte that precedes t; only consulted when i==1 and
     *             length>=3, ignored in every other case.
     * @param t The i-th byte following the lead byte.
     * @param i The index (1..3) of byte t in the byte sequence. 0<i<length
     * @param length The length (2..4) of the byte sequence according to the lead byte.
     * @return TRUE if t is a valid trail byte in this context.
     */
    static inline UBool isValidTrail(int32_t prev, uint8_t t, int32_t i, int32_t length) {
        // Only the first trail byte of a 3- or 4-byte sequence has to be
        // checked in combination with its lead byte; every other trail byte
        // just needs to look like a trail byte.
        if (!(length > 2 && i <= 1)) {
            return U8_IS_TRAIL(t);
        }
        return (length == 3)
            ? U8_IS_VALID_LEAD3_AND_T1(prev, t)
            : U8_IS_VALID_LEAD4_AND_T1(prev, t);  // length == 4
    }
};

U_NAMESPACE_END

#endif  // __cplusplus

/**
 * Check whether a code point has the emoji property.
 * The implementation lives in uchar.cpp and is a reduced version of
 * u_hasBinaryProperty.
 *
 * @param c Code point to test.
 * @return TRUE or FALSE according to whether c has the emoji property.
 *
 * @internal Apple only
 */
U_CAPI UBool U_EXPORT2
u_isEmoji(UChar32 c);


#endif
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
b75a7d8f A	4	**********************************************************************
b331163b	5	* Copyright (C) 1999-2015, International Business Machines
b75a7d8f A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	* file name: ustr_imp.h
f3c0d7a5	9	* encoding: UTF-8
b75a7d8f A	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2001jan30
	14	* created by: Markus W. Scherer
	15	*/
	16
	17	#ifndef __USTR_IMP_H__
	18	#define __USTR_IMP_H__
	19
	20	#include "unicode/utypes.h"
0f5d89e8	21	#include "unicode/utf8.h"
729e4ab9 A	22
	23	/**
	24	* Internal option for unorm_cmpEquivFold() for strncmp style.
	25	* If set, checks for both string length and terminating NUL.
729e4ab9 A	26	*/
	27	#define _STRNCMP_STYLE 0x1000
	28
b75a7d8f A	29	/**
	30	* Compare two strings in code point order or code unit order.
	31	* Works in strcmp style (both lengths -1),
	32	* strncmp style (lengths equal and >=0, flag TRUE),
	33	* and memcmp/UnicodeString style (at least one length >=0).
b75a7d8f	34	*/
46f4442e	35	U_CFUNC int32_t U_EXPORT2
b75a7d8f A	36	uprv_strCompare(const UChar *s1, int32_t length1,
	37	const UChar *s2, int32_t length2,
	38	UBool strncmpStyle, UBool codePointOrder);
	39
4388f060 A	40	U_CAPI int32_t U_EXPORT2
4388f060 A	41	ustr_hashUCharsN(const UChar *str, int32_t length);
b75a7d8f	42
4388f060 A	43	U_CAPI int32_t U_EXPORT2
	44	ustr_hashCharsN(const char *str, int32_t length);
	45
	46	U_CAPI int32_t U_EXPORT2
	47	ustr_hashICharsN(const char *str, int32_t length);
b75a7d8f A	48
	49	/**
	50	* NUL-terminate a UChar * string if possible.
	51	* If length < destCapacity then NUL-terminate.
	52	* If length == destCapacity then do not terminate but set U_STRING_NOT_TERMINATED_WARNING.
	53	* If length > destCapacity then do not terminate but set U_BUFFER_OVERFLOW_ERROR.
	54	*
	55	* @param dest Destination buffer, can be NULL if destCapacity==0.
	56	* @param destCapacity Number of UChars available at dest.
	57	* @param length Number of UChars that were (to be) written to dest.
	58	* @param pErrorCode ICU error code.
	59	* @return length
b75a7d8f A	60	*/
	61	U_CAPI int32_t U_EXPORT2
	62	u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
	63
	64	/**
	65	* NUL-terminate a char * string if possible.
	66	* Same as u_terminateUChars() but for a different string type.
	67	*/
	68	U_CAPI int32_t U_EXPORT2
	69	u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
	70
	71	/**
	72	* NUL-terminate a UChar32 * string if possible.
	73	* Same as u_terminateUChars() but for a different string type.
	74	*/
	75	U_CAPI int32_t U_EXPORT2
	76	u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
	77
	78	/**
	79	* NUL-terminate a wchar_t * string if possible.
	80	* Same as u_terminateUChars() but for a different string type.
	81	*/
	82	U_CAPI int32_t U_EXPORT2
	83	u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
	84
0f5d89e8 A	85	/**
	86	* Counts the bytes of any whole valid sequence for a UTF-8 lead byte.
	87	* Returns 1 for ASCII 0..0x7f.
	88	* Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff.
	89	* leadByte might be evaluated multiple times.
	90	*
	91	* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
	92	* @return 0..4
	93	*/
	94	#define U8_COUNT_BYTES(leadByte) \
	95	(U8_IS_SINGLE(leadByte) ? 1 : U8_COUNT_BYTES_NON_ASCII(leadByte))
	96
	97	/**
	98	* Counts the bytes of any whole valid sequence for a UTF-8 lead byte.
	99	* Returns 0 for 0x00..0xc1 as well as for 0xf5..0xff.
	100	* leadByte might be evaluated multiple times.
	101	*
	102	* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
	103	* @return 0 or 2..4
	104	*/
	105	#define U8_COUNT_BYTES_NON_ASCII(leadByte) \
	106	(U8_IS_LEAD(leadByte) ? ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+2 : 0)
	107
	108	#ifdef __cplusplus
	109
	110	U_NAMESPACE_BEGIN
	111
	112	class UTF8 {
	113	public:
	114	UTF8() = delete; // all static
	115
	116	/**
	117	* Is t a valid UTF-8 trail byte?
	118	*
	119	* @param prev Must be the preceding lead byte if i==1 and length>=3;
	120	* otherwise ignored.
	121	* @param t The i-th byte following the lead byte.
	122	* @param i The index (1..3) of byte t in the byte sequence. 0<i<length
	123	* @param length The length (2..4) of the byte sequence according to the lead byte.
	124	* @return TRUE if t is a valid trail byte in this context.
	125	*/
	126	static inline UBool isValidTrail(int32_t prev, uint8_t t, int32_t i, int32_t length) {
	127	// The first trail byte after a 3- or 4-byte lead byte
	128	// needs to be validated together with its lead byte.
	129	if (length <= 2 || i > 1) {
	130	return U8_IS_TRAIL(t);
	131	} else if (length == 3) {
	132	return U8_IS_VALID_LEAD3_AND_T1(prev, t);
	133	} else { // length == 4
	134	return U8_IS_VALID_LEAD4_AND_T1(prev, t);
	135	}
	136	}
	137	};
	138
	139	U_NAMESPACE_END
	140
	141	#endif // __cplusplus
	142
	143	/**
	144	* Check whether a code point has the emoji property.
	145	* (implemented in uchar.cpp as reduced version of u_hasBinaryProperty)
	146	*
	147	* @param c Code point to test.
	148	* @return TRUE or FALSE according to whether c has the emoji property.
149	*
150	* @internal Apple only
151	*/
152	U_CAPI UBool U_EXPORT2
153	u_isEmoji(UChar32 c);
154
155
b75a7d8f	156	#endif