X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..fd0068a84e9996f225edba706498f6ed413d0673:/icuSources/common/utf_impl.c diff --git a/icuSources/common/utf_impl.c b/icuSources/common/utf_impl.c index 431b0eb5..8f45546b 100644 --- a/icuSources/common/utf_impl.c +++ b/icuSources/common/utf_impl.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1999-2003, International Business Machines +* Copyright (C) 1999-2006, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -82,6 +82,28 @@ utf8_errorValue[6]={ 0x3ffffff, 0x7fffffff }; +/* + * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling + * UTF8_NEXT_CHAR_SAFE(). + * + * The "strict" parameter controls the error behavior: + * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative + * code point result. + * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE): + * All illegal byte sequences yield a positive code point such that this + * result code point would be encoded with the same number of bytes as + * the illegal sequence. + * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE): + * Same as the obsolete "safe" behavior, but non-characters are also treated + * like illegal sequences. + * + * The special negative (<0) value -2 is used for lenient treatment of surrogate + * code points as legal. Some implementations use this for roundtripping of + * Unicode 16-bit strings that are not well-formed UTF-16, that is, they + * contain unpaired surrogates. + * + * Note that a UBool is the same as an int8_t. 
+ */ U_CAPI UChar32 U_EXPORT2 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { int32_t i=*pi; @@ -139,7 +161,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, /* correct sequence - all trail bytes have (b7..b6)==(10)? */ /* illegal is also set if count>=4 */ - if(illegal || (c)<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { [... intervening diff lines lost in extraction ...] - if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c) || (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) { + if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2) || (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) { /* illegal sequence or (strict and non-character) */ if(count>=4) { count=3;