git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	***************************************************************************
	3	* Copyright (C) 2008-2014, International Business Machines Corporation
	4	* and others. All Rights Reserved.
	5	***************************************************************************
	6	* file name: uspoof.cpp
	7	* encoding: US-ASCII
	8	* tab size: 8 (not used)
	9	* indentation:4
	10	*
	11	* created on: 2008Feb13
	12	* created by: Andy Heninger
	13	*
	14	* Unicode Spoof Detection
	15	*/
	16	#include "unicode/utypes.h"
	17	#include "unicode/normalizer2.h"
	18	#include "unicode/uspoof.h"
	19	#include "unicode/ustring.h"
	20	#include "unicode/utf16.h"
	21	#include "cmemory.h"
	22	#include "cstring.h"
	23	#include "identifier_info.h"
	24	#include "mutex.h"
	25	#include "scriptset.h"
	26	#include "uassert.h"
	27	#include "ucln_in.h"
	28	#include "uspoof_impl.h"
	29	#include "umutex.h"
	30
	31
	32	#if !UCONFIG_NO_NORMALIZATION
	33
	34	U_NAMESPACE_USE
	35
	36
	37	//
	38	// Static objects shared by all spoof checkers. They are created by initializeStatics()
	39	// (run through umtx_initOnce on gSpoofInitOnce) and released by uspoof_cleanup().
	40	static UnicodeSet *gInclusionSet = NULL; // built from inclusionPat in initializeStatics()
	41	static UnicodeSet *gRecommendedSet = NULL; // built from recommendedPat in initializeStatics()
	42	static const Normalizer2 *gNfdNormalizer = NULL; // from Normalizer2::getNFDInstance(); never deleted in this file
	43	static UInitOnce gSpoofInitOnce = U_INITONCE_INITIALIZER; // init-once guard passed to umtx_initOnce()
	44
	45	static UBool U_CALLCONV
	46	uspoof_cleanup(void) {
	47	delete gInclusionSet;
	48	gInclusionSet = NULL;
	49	delete gRecommendedSet;
	50	gRecommendedSet = NULL;
	51	gNfdNormalizer = NULL;
	52	gSpoofInitOnce.reset();
	53	return TRUE;
	54	}
	55
	56	static void U_CALLCONV initializeStatics(UErrorCode &status) {
	57	static const char *inclusionPat =
	58	"[\\u0027\\u002d-\\u002e\\u003A\\u00B7\\u0375\\u058A\\u05F3-\\u05F4"
	59	"\\u06FD-\\u06FE\\u0F0B\\u200C-\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]";
	60	gInclusionSet = new UnicodeSet(UnicodeString(inclusionPat, -1, US_INV), status);
	61
	62	// Note: data from http://unicode.org/Public/security/latest/xidmodifications.txt version 6.3.0
	63	// Note: concatenated string constants do not work with UNICODE_STRING_SIMPLE on all platforms.
	64	static const char *recommendedPat =
	65	"[\\u0030-\\u0039\\u0041-\\u005A\\u005F\\u0061-\\u007A\\u00C0-\\u00D6\\u00D8-\\u00F6"
	66	"\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u01A0-\\u01A1\\u01AF-\\u01B0"
	67	"\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F"
	68	"\\u0226-\\u0233\\u0259\\u02BB-\\u02BC\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311"
	69	"\\u0313-\\u0314\\u031B\\u0323-\\u0328\\u032D-\\u032E\\u0330-\\u0331\\u0335\\u0338-\\u0339"
	70	"\\u0342\\u0345\\u037B-\\u037D\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE"
	71	"\\u03FC-\\u045F\\u048A-\\u0527\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA"
	72	"\\u05F0-\\u05F2\\u0620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674"
	73	"\\u0679-\\u068D\\u068F-\\u06D3\\u06D5\\u06E5-\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1"
	74	"\\u08A0\\u08A2-\\u08AC\\u0901-\\u094D\\u094F-\\u0950\\u0956-\\u0957\\u0960-\\u0963"
	75	"\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F-\\u0990"
	76	"\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7-\\u09C8"
	77	"\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A"
	78	"\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38-\\u0A39\\u0A3C"
	79	"\\u0A3E-\\u0A42\\u0A47-\\u0A48\\u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83"
	80	"\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9"
	81	"\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF"
	82	"\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32-\\u0B33"
	83	"\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47-\\u0B48\\u0B4B-\\u0B4D\\u0B56-\\u0B57\\u0B5F-\\u0B61"
	84	"\\u0B66-\\u0B6F\\u0B71\\u0B82-\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95"
	85	"\\u0B99-\\u0B9A\\u0B9C\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9"
	86	"\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03"
	87	"\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44"
	88	"\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55-\\u0C56\\u0C60-\\u0C61\\u0C66-\\u0C6F\\u0C82-\\u0C83"
	89	"\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4"
	90	"\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5-\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1-\\u0CF2"
	91	"\\u0D02-\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48"
	92	"\\u0D4A-\\u0D4E\\u0D57\\u0D60-\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82-\\u0D83"
	93	"\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD"
	94	"\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32"
	95	"\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59\\u0E81-\\u0E82\\u0E84\\u0E87-\\u0E88"
	96	"\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA-\\u0EAB"
	97	"\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD"
	98	"\\u0ED0-\\u0ED9\\u0EDE-\\u0EDF\\u0F00\\u0F20-\\u0F29\\u0F35\\u0F37\\u0F3E-\\u0F42"
	99	"\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56\\u0F58-\\u0F5B\\u0F5D-\\u0F68"
	100	"\\u0F6A-\\u0F6C\\u0F71-\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0F82-\\u0F84\\u0F86-\\u0F92"
	101	"\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8"
	102	"\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10C7\\u10CD\\u10D0-\\u10F0"
	103	"\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258"
	104	"\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE"
	105	"\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A"
	106	"\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA"
	107	"\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1E00-\\u1E99\\u1EBF\\u1F00-\\u1F15\\u1F18-\\u1F1D"
	108	"\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70"
	109	"\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4"
	110	"\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA"
	111	"\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6"
	112	"\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6"
	113	"\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099-\\u309A\\u309D-\\u309E\\u30A1-\\u30FA"
	114	"\\u30FC-\\u30FE\\u3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FCC\\uA660-\\uA661"
	115	"\\uA674-\\uA67B\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D-\\uA78E\\uA790-\\uA793"
	116	"\\uA7A0-\\uA7AA\\uA7FA\\uA9CF\\uAA60-\\uAA76\\uAA7A-\\uAA7B\\uAB01-\\uAB06\\uAB09-\\uAB0E"
	117	"\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E-\\uFA0F\\uFA11"
	118	"\\uFA13-\\uFA14\\uFA1F\\uFA21\\uFA23-\\uFA24\\uFA27-\\uFA29\\U0001B000-\\U0001B001\\U00020000-\\U0002A6D6"
	119	"\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D]";
	120	gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat, -1, US_INV), status);
	121	gNfdNormalizer = Normalizer2::getNFDInstance(status);
	122	ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
	123	}
	124
	125
	126	U_CAPI USpoofChecker * U_EXPORT2
	127	uspoof_open(UErrorCode *status) {
	128	if (U_FAILURE(*status)) {
	129	return NULL;
	130	}
	131	umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
	132	SpoofImpl si = new SpoofImpl(SpoofData::getDefault(status), *status);
	133	if (U_FAILURE(*status)) {
	134	delete si;
	135	si = NULL;
	136	}
	137	return reinterpret_cast<USpoofChecker *>(si);
	138	}
	139
	140
	141	U_CAPI USpoofChecker * U_EXPORT2
	142	uspoof_openFromSerialized(const void data, int32_t length, int32_t pActualLength,
	143	UErrorCode *status) {
	144	if (U_FAILURE(*status)) {
	145	return NULL;
	146	}
	147	umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
	148	SpoofData sd = new SpoofData(data, length, status);
	149	SpoofImpl si = new SpoofImpl(sd, status);
	150	if (U_FAILURE(*status)) {
	151	delete sd;
	152	delete si;
	153	return NULL;
	154	}
	155	if (sd == NULL \|\| si == NULL) {
	156	*status = U_MEMORY_ALLOCATION_ERROR;
	157	delete sd;
	158	delete si;
	159	return NULL;
	160	}
	161
	162	if (pActualLength != NULL) {
	163	*pActualLength = sd->fRawData->fLength;
	164	}
	165	return reinterpret_cast<USpoofChecker *>(si);
	166	}
	167
	168
	169	U_CAPI USpoofChecker * U_EXPORT2
	170	uspoof_clone(const USpoofChecker sc, UErrorCode status) {
	171	const SpoofImpl src = SpoofImpl::validateThis(sc, status);
	172	if (src == NULL) {
	173	return NULL;
	174	}
	175	SpoofImpl result = new SpoofImpl(src, *status); // copy constructor
	176	if (U_FAILURE(*status)) {
	177	delete result;
	178	result = NULL;
	179	}
	180	return reinterpret_cast<USpoofChecker *>(result);
	181	}
	182
	183
	184	U_CAPI void U_EXPORT2
	185	uspoof_close(USpoofChecker *sc) {
	186	UErrorCode status = U_ZERO_ERROR;
	187	SpoofImpl *This = SpoofImpl::validateThis(sc, status);
	188	delete This;
	189	}
	190
	191
	192	U_CAPI void U_EXPORT2
	193	uspoof_setChecks(USpoofChecker sc, int32_t checks, UErrorCode status) {
	194	SpoofImpl This = SpoofImpl::validateThis(sc, status);
	195	if (This == NULL) {
	196	return;
	197	}
	198
	199	// Verify that the requested checks are all ones (bits) that
	200	// are acceptable, known values.
	201	if (checks & ~(USPOOF_ALL_CHECKS \| USPOOF_AUX_INFO)) {
	202	*status = U_ILLEGAL_ARGUMENT_ERROR;
	203	return;
	204	}
	205
	206	This->fChecks = checks;
	207	}
	208
	209
	210	U_CAPI int32_t U_EXPORT2
	211	uspoof_getChecks(const USpoofChecker sc, UErrorCode status) {
	212	const SpoofImpl This = SpoofImpl::validateThis(sc, status);
	213	if (This == NULL) {
	214	return 0;
	215	}
	216	return This->fChecks;
	217	}
	218
	219	U_CAPI void U_EXPORT2
	220	uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) {
	221	UErrorCode status = U_ZERO_ERROR;
	222	SpoofImpl *This = SpoofImpl::validateThis(sc, status);
	223	if (This != NULL) {
	224	This->fRestrictionLevel = restrictionLevel;
	225	}
	226	}
	227
	228	U_CAPI URestrictionLevel U_EXPORT2
	229	uspoof_getRestrictionLevel(const USpoofChecker *sc) {
	230	UErrorCode status = U_ZERO_ERROR;
	231	const SpoofImpl *This = SpoofImpl::validateThis(sc, status);
	232	if (This == NULL) {
	233	return USPOOF_UNRESTRICTIVE;
	234	}
	235	return This->fRestrictionLevel;
	236	}
	237
	238	U_CAPI void U_EXPORT2
	239	uspoof_setAllowedLocales(USpoofChecker sc, const char localesList, UErrorCode *status) {
	240	SpoofImpl This = SpoofImpl::validateThis(sc, status);
	241	if (This == NULL) {
	242	return;
	243	}
	244	This->setAllowedLocales(localesList, *status);
	245	}
	246
	247	U_CAPI const char * U_EXPORT2
	248	uspoof_getAllowedLocales(USpoofChecker sc, UErrorCode status) {
	249	SpoofImpl This = SpoofImpl::validateThis(sc, status);
	250	if (This == NULL) {
	251	return NULL;
	252	}
	253	return This->getAllowedLocales(*status);
	254	}
	255
	256
	257	U_CAPI const USet * U_EXPORT2
	258	uspoof_getAllowedChars(const USpoofChecker sc, UErrorCode status) {
	259	const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
	260	return result->toUSet();
	261	}
	262
	263	U_CAPI const UnicodeSet * U_EXPORT2
	264	uspoof_getAllowedUnicodeSet(const USpoofChecker sc, UErrorCode status) {
	265	const SpoofImpl This = SpoofImpl::validateThis(sc, status);
	266	if (This == NULL) {
	267	return NULL;
	268	}
	269	return This->fAllowedCharsSet;
	270	}
	271
	272
	273	U_CAPI void U_EXPORT2
	274	uspoof_setAllowedChars(USpoofChecker sc, const USet chars, UErrorCode *status) {
	275	const UnicodeSet *set = UnicodeSet::fromUSet(chars);
	276	uspoof_setAllowedUnicodeSet(sc, set, status);
	277	}
	278
	279
	280	U_CAPI void U_EXPORT2
	281	uspoof_setAllowedUnicodeSet(USpoofChecker sc, const UnicodeSet chars, UErrorCode *status) {
	282	SpoofImpl This = SpoofImpl::validateThis(sc, status);
	283	if (This == NULL) {
	284	return;
	285	}
	286	if (chars->isBogus()) {
	287	*status = U_ILLEGAL_ARGUMENT_ERROR;
	288	return;
	289	}
	290	UnicodeSet clonedSet = static_cast<UnicodeSet >(chars->clone());
	291	if (clonedSet == NULL \|\| clonedSet->isBogus()) {
	292	*status = U_MEMORY_ALLOCATION_ERROR;
	293	return;
	294	}
	295	clonedSet->freeze();
	296	delete This->fAllowedCharsSet;
	297	This->fAllowedCharsSet = clonedSet;
	298	This->fChecks \|= USPOOF_CHAR_LIMIT;
	299	}
	300
	301
	302	U_CAPI int32_t U_EXPORT2
	303	uspoof_check(const USpoofChecker *sc,
	304	const UChar *id, int32_t length,
	305	int32_t *position,
	306	UErrorCode *status) {
	307
	308	const SpoofImpl This = SpoofImpl::validateThis(sc, status);
	309	if (This == NULL) {
	310	return 0;
	311	}
	312	if (length < -1) {
	313	*status = U_ILLEGAL_ARGUMENT_ERROR;
	314	return 0;
	315	}
	316	UnicodeString idStr((length == -1), id, length); // Aliasing constructor.
	317	int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
	318	return result;
	319	}
	320
	321
	322	U_CAPI int32_t U_EXPORT2
	323	uspoof_checkUTF8(const USpoofChecker *sc,
	324	const char *id, int32_t length,
	325	int32_t *position,
	326	UErrorCode *status) {
	327
	328	if (U_FAILURE(*status)) {
	329	return 0;
	330	}
	331	UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
	332	int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
	333	return result;
	334	}
	335
	336
	337	U_CAPI int32_t U_EXPORT2
	338	uspoof_areConfusable(const USpoofChecker *sc,
	339	const UChar *id1, int32_t length1,
	340	const UChar *id2, int32_t length2,
	341	UErrorCode *status) {
	342	SpoofImpl::validateThis(sc, *status);
	343	if (U_FAILURE(*status)) {
	344	return 0;
	345	}
	346	if (length1 < -1 \|\| length2 < -1) {
	347	*status = U_ILLEGAL_ARGUMENT_ERROR;
	348	return 0;
	349	}
	350
	351	UnicodeString id1Str((length1==-1), id1, length1); // Aliasing constructor
	352	UnicodeString id2Str((length2==-1), id2, length2); // Aliasing constructor
	353	return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
	354	}
	355
	356
	357	U_CAPI int32_t U_EXPORT2
	358	uspoof_areConfusableUTF8(const USpoofChecker *sc,
	359	const char *id1, int32_t length1,
	360	const char *id2, int32_t length2,
	361	UErrorCode *status) {
	362	SpoofImpl::validateThis(sc, *status);
	363	if (U_FAILURE(*status)) {
	364	return 0;
	365	}
	366	if (length1 < -1 \|\| length2 < -1) {
	367	*status = U_ILLEGAL_ARGUMENT_ERROR;
	368	return 0;
	369	}
	370	UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : uprv_strlen(id1)));
	371	UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? length2 : uprv_strlen(id2)));
	372	int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
	373	return results;
	374	}
	375
	376
	377	U_CAPI int32_t U_EXPORT2
	378	uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
	379	const icu::UnicodeString &id1,
	380	const icu::UnicodeString &id2,
	381	UErrorCode *status) {
	382	const SpoofImpl This = SpoofImpl::validateThis(sc, status);
	383	if (U_FAILURE(*status)) {
	384	return 0;
	385	}
	386	//
	387	// See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
	388	// and for definitions of the types (single, whole, mixed-script) of confusables.
	389
	390	// We only care about a few of the check flags. Ignore the others.
	391	// If no tests relavant to this function have been specified, return an error.
	392	// TODO: is this really the right thing to do? It's probably an error on the caller's part,
	393	// but logically we would just return 0 (no error).
	394	if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE \| USPOOF_MIXED_SCRIPT_CONFUSABLE \|
	395	USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
	396	*status = U_INVALID_STATE_ERROR;
	397	return 0;
	398	}
	399	int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
	400
	401	int32_t result = 0;
	402	IdentifierInfo identifierInfo = This->getIdentifierInfo(status);
	403	if (U_FAILURE(*status)) {
	404	return 0;
	405	}
	406	identifierInfo->setIdentifier(id1, *status);
	407	int32_t id1ScriptCount = identifierInfo->getScriptCount();
	408	identifierInfo->setIdentifier(id2, *status);
	409	int32_t id2ScriptCount = identifierInfo->getScriptCount();
	410	This->releaseIdentifierInfo(identifierInfo);
	411	identifierInfo = NULL;
	412
	413	if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
	414	UnicodeString id1Skeleton;
	415	UnicodeString id2Skeleton;
	416	if (id1ScriptCount <= 1 && id2ScriptCount <= 1) {
	417	flagsForSkeleton \|= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
	418	uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
	419	uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
	420	if (id1Skeleton == id2Skeleton) {
	421	result \|= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
	422	}
	423	}
	424	}
	425
	426	if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
	427	// If the two inputs are single script confusable they cannot also be
	428	// mixed or whole script confusable, according to the UAX39 definitions.
	429	// So we can skip those tests.
	430	return result;
	431	}
	432
	433	// Two identifiers are whole script confusable if each is of a single script
	434	// and they are mixed script confusable.
	435	UBool possiblyWholeScriptConfusables =
	436	id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
	437
	438	//
	439	// Mixed Script Check
	440	//
	441	if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) \|\| possiblyWholeScriptConfusables ) {
	442	// For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
	443	// the mixed script table skeleton, which is what we want.
	444	// The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
	445	UnicodeString id1Skeleton;
	446	UnicodeString id2Skeleton;
	447	flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
	448	uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
	449	uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
	450	if (id1Skeleton == id2Skeleton) {
	451	result \|= USPOOF_MIXED_SCRIPT_CONFUSABLE;
	452	if (possiblyWholeScriptConfusables) {
	453	result \|= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
	454	}
	455	}
	456	}
	457
	458	return result;
	459	}
	460
	461
	462
	463
	464	U_CAPI int32_t U_EXPORT2
	465	uspoof_checkUnicodeString(const USpoofChecker *sc,
	466	const icu::UnicodeString &id,
	467	int32_t *position,
	468	UErrorCode *status) {
	469	const SpoofImpl This = SpoofImpl::validateThis(sc, status);
	470	if (This == NULL) {
	471	return 0;
	472	}
	473	int32_t result = 0;
	474
	475	IdentifierInfo *identifierInfo = NULL;
	476	if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL \| USPOOF_MIXED_NUMBERS)) {
	477	identifierInfo = This->getIdentifierInfo(*status);
	478	if (U_FAILURE(*status)) {
	479	goto cleanupAndReturn;
	480	}
	481	identifierInfo->setIdentifier(id, *status);
	482	identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
	483	}
	484
	485
	486	if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
	487	URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
	488	if (idRestrictionLevel > This->fRestrictionLevel) {
	489	result \|= USPOOF_RESTRICTION_LEVEL;
	490	}
	491	if (This->fChecks & USPOOF_AUX_INFO) {
	492	result \|= idRestrictionLevel;
	493	}
	494	}
	495
	496	if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
	497	const UnicodeSet *numerics = identifierInfo->getNumerics();
	498	if (numerics->size() > 1) {
	499	result \|= USPOOF_MIXED_NUMBERS;
	500	}
	501
	502	// TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
	503	// We have no easy way to do the same in C.
	504	// if (checkResult != null) {
	505	// checkResult.numerics = numerics;
	506	// }
	507	}
	508
	509
	510	if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
	511	int32_t i;
	512	UChar32 c;
	513	int32_t length = id.length();
	514	for (i=0; i<length ;) {
	515	c = id.char32At(i);
	516	i += U16_LENGTH(c);
	517	if (!This->fAllowedCharsSet->contains(c)) {
	518	result \|= USPOOF_CHAR_LIMIT;
	519	break;
	520	}
	521	}
	522	}
	523
	524	if (This->fChecks &
	525	(USPOOF_WHOLE_SCRIPT_CONFUSABLE \| USPOOF_MIXED_SCRIPT_CONFUSABLE \| USPOOF_INVISIBLE)) {
	526	// These are the checks that need to be done on NFD input
	527	UnicodeString nfdText;
	528	gNfdNormalizer->normalize(id, nfdText, *status);
	529	int32_t nfdLength = nfdText.length();
	530
	531	if (This->fChecks & USPOOF_INVISIBLE) {
	532
	533	// scan for more than one occurence of the same non-spacing mark
	534	// in a sequence of non-spacing marks.
	535	int32_t i;
	536	UChar32 c;
	537	UChar32 firstNonspacingMark = 0;
	538	UBool haveMultipleMarks = FALSE;
	539	UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
	540
	541	for (i=0; i<nfdLength ;) {
	542	c = nfdText.char32At(i);
	543	i += U16_LENGTH(c);
	544	if (u_charType(c) != U_NON_SPACING_MARK) {
	545	firstNonspacingMark = 0;
	546	if (haveMultipleMarks) {
	547	marksSeenSoFar.clear();
	548	haveMultipleMarks = FALSE;
	549	}
	550	continue;
	551	}
	552	if (firstNonspacingMark == 0) {
	553	firstNonspacingMark = c;
	554	continue;
	555	}
	556	if (!haveMultipleMarks) {
	557	marksSeenSoFar.add(firstNonspacingMark);
	558	haveMultipleMarks = TRUE;
	559	}
	560	if (marksSeenSoFar.contains(c)) {
	561	// report the error, and stop scanning.
	562	// No need to find more than the first failure.
	563	result \|= USPOOF_INVISIBLE;
	564	break;
	565	}
	566	marksSeenSoFar.add(c);
	567	}
	568	}
	569
	570
	571	if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE \| USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
	572	// The basic test is the same for both whole and mixed script confusables.
	573	// Compute the set of scripts that every input character has a confusable in.
	574	// For this computation an input character is always considered to be
	575	// confusable with itself in its own script.
	576	//
	577	// If the number of such scripts is two or more, and the input consisted of
	578	// characters all from a single script, we have a whole script confusable.
	579	// (The two scripts will be the original script and the one that is confusable)
	580	//
	581	// If the number of such scripts >= one, and the original input contained characters from
	582	// more than one script, we have a mixed script confusable. (We can transform
	583	// some of the characters, and end up with a visually similar string all in
	584	// one script.)
	585
	586	if (identifierInfo == NULL) {
	587	identifierInfo = This->getIdentifierInfo(*status);
	588	if (U_FAILURE(*status)) {
	589	goto cleanupAndReturn;
	590	}
	591	identifierInfo->setIdentifier(id, *status);
	592	}
	593
	594	int32_t scriptCount = identifierInfo->getScriptCount();
	595
	596	ScriptSet scripts;
	597	This->wholeScriptCheck(nfdText, &scripts, *status);
	598	int32_t confusableScriptCount = scripts.countMembers();
	599	//printf("confusableScriptCount = %d\n", confusableScriptCount);
	600
	601	if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
	602	confusableScriptCount >= 2 &&
	603	scriptCount == 1) {
	604	result \|= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
	605	}
	606
	607	if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
	608	confusableScriptCount >= 1 &&
	609	scriptCount > 1) {
	610	result \|= USPOOF_MIXED_SCRIPT_CONFUSABLE;
	611	}
	612	}
	613	}
	614
	615	cleanupAndReturn:
	616	This->releaseIdentifierInfo(identifierInfo);
	617	if (position != NULL) {
	618	*position = 0;
	619	}
	620	return result;
	621	}
	622
	623
	624	U_CAPI int32_t U_EXPORT2
	625	uspoof_getSkeleton(const USpoofChecker *sc,
	626	uint32_t type,
	627	const UChar *id, int32_t length,
	628	UChar *dest, int32_t destCapacity,
	629	UErrorCode *status) {
	630
	631	SpoofImpl::validateThis(sc, *status);
	632	if (U_FAILURE(*status)) {
	633	return 0;
	634	}
	635	if (length<-1 \|\| destCapacity<0 \|\| (destCapacity==0 && dest!=NULL)) {
	636	*status = U_ILLEGAL_ARGUMENT_ERROR;
	637	return 0;
	638	}
	639
	640	UnicodeString idStr((length==-1), id, length); // Aliasing constructor
	641	UnicodeString destStr;
	642	uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status);
	643	destStr.extract(dest, destCapacity, *status);
	644	return destStr.length();
	645	}
	646
	647
	648
	649	U_I18N_API UnicodeString & U_EXPORT2
	650	uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
	651	uint32_t type,
	652	const UnicodeString &id,
	653	UnicodeString &dest,
	654	UErrorCode *status) {
	655	const SpoofImpl This = SpoofImpl::validateThis(sc, status);
	656	if (U_FAILURE(*status)) {
	657	return dest;
	658	}
	659
	660	int32_t tableMask = 0;
	661	switch (type) {
	662	case 0:
	663	tableMask = USPOOF_ML_TABLE_FLAG;
	664	break;
	665	case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
	666	tableMask = USPOOF_SL_TABLE_FLAG;
	667	break;
	668	case USPOOF_ANY_CASE:
	669	tableMask = USPOOF_MA_TABLE_FLAG;
	670	break;
	671	case USPOOF_SINGLE_SCRIPT_CONFUSABLE \| USPOOF_ANY_CASE:
	672	tableMask = USPOOF_SA_TABLE_FLAG;
	673	break;
	674	default:
	675	*status = U_ILLEGAL_ARGUMENT_ERROR;
	676	return dest;
	677	}
	678
	679	UnicodeString nfdId;
	680	gNfdNormalizer->normalize(id, nfdId, *status);
	681
	682	// Apply the skeleton mapping to the NFD normalized input string
	683	// Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
	684	int32_t inputIndex = 0;
	685	UnicodeString skelStr;
	686	int32_t normalizedLen = nfdId.length();
	687	for (inputIndex=0; inputIndex < normalizedLen; ) {
	688	UChar32 c = nfdId.char32At(inputIndex);
	689	inputIndex += U16_LENGTH(c);
	690	This->confusableLookup(c, tableMask, skelStr);
	691	}
	692
	693	gNfdNormalizer->normalize(skelStr, dest, *status);
	694	return dest;
	695	}
	696
	697
	698	U_CAPI int32_t U_EXPORT2
	699	uspoof_getSkeletonUTF8(const USpoofChecker *sc,
	700	uint32_t type,
	701	const char *id, int32_t length,
	702	char *dest, int32_t destCapacity,
	703	UErrorCode *status) {
	704	SpoofImpl::validateThis(sc, *status);
	705	if (U_FAILURE(*status)) {
	706	return 0;
	707	}
	708	if (length<-1 \|\| destCapacity<0 \|\| (destCapacity==0 && dest!=NULL)) {
	709	*status = U_ILLEGAL_ARGUMENT_ERROR;
	710	return 0;
	711	}
	712
	713	UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
	714	UnicodeString destStr;
	715	uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
	716	if (U_FAILURE(*status)) {
	717	return 0;
	718	}
	719
	720	int32_t lengthInUTF8 = 0;
	721	u_strToUTF8(dest, destCapacity, &lengthInUTF8,
	722	destStr.getBuffer(), destStr.length(), status);
	723	return lengthInUTF8;
	724	}
	725
	726
	727	U_CAPI int32_t U_EXPORT2
	728	uspoof_serialize(USpoofChecker sc,void buf, int32_t capacity, UErrorCode *status) {
	729	SpoofImpl This = SpoofImpl::validateThis(sc, status);
	730	if (This == NULL) {
	731	U_ASSERT(U_FAILURE(*status));
	732	return 0;
	733	}
	734	int32_t dataSize = This->fSpoofData->fRawData->fLength;
	735	if (capacity < dataSize) {
	736	*status = U_BUFFER_OVERFLOW_ERROR;
	737	return dataSize;
	738	}
	739	uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
	740	return dataSize;
	741	}
	742
	743	U_CAPI const USet * U_EXPORT2
	744	uspoof_getInclusionSet(UErrorCode *status) {
	745	umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
	746	return gInclusionSet->toUSet();
	747	}
	748
	749	U_CAPI const USet * U_EXPORT2
	750	uspoof_getRecommendedSet(UErrorCode *status) {
	751	umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
	752	return gRecommendedSet->toUSet();
	753	}
	754
	755	U_I18N_API const UnicodeSet * U_EXPORT2
	756	uspoof_getInclusionUnicodeSet(UErrorCode *status) {
	757	umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
	758	return gInclusionSet;
	759	}
	760
	761	U_I18N_API const UnicodeSet * U_EXPORT2
	762	uspoof_getRecommendedUnicodeSet(UErrorCode *status) {
	763	umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
	764	return gRecommendedSet;
	765	}
	766
	767
	768
	769	#endif // !UCONFIG_NO_NORMALIZATION