git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/ucsdetst.c

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf A	3	/*
73c04bcf A	4	****************************************************************************
2ca993e8	5	* Copyright (c) 2005-2016, International Business Machines Corporation and *
73c04bcf A	6	* others. All Rights Reserved. *
	7	****************************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#include "unicode/ucsdet.h"
	13	#include "unicode/ucnv.h"
	14	#include "unicode/ustring.h"
	15
	16	#include "cintltst.h"
2ca993e8	17	#include "cmemory.h"
73c04bcf A	18
	19	#include <stdlib.h>
	20	#include <string.h>
	21
/* Allocate an uninitialized array of `count` elements of `type` with malloc(); may yield NULL. */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
/* Release an array obtained from NEW_ARRAY. */
#define DELETE_ARRAY(array) free(array)
73c04bcf A	24
	25	static void TestConstruction(void);
	26	static void TestUTF8(void);
	27	static void TestUTF16(void);
	28	static void TestC1Bytes(void);
	29	static void TestInputFilter(void);
	30	static void TestChaining(void);
46f4442e	31	static void TestBufferOverflow(void);
729e4ab9 A	32	static void TestIBM424(void);
729e4ab9 A	33	static void TestIBM420(void);
3d1f044b A	34	#if U_PLATFORM_IS_DARWIN_BASED
	35	static void TestMailFilterCSS(void);
	36	#endif
73c04bcf A	37
	38	void addUCsdetTest(TestNode** root);
	39
	40	void addUCsdetTest(TestNode** root)
	41	{
	42	addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
	43	addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
	44	addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
	45	addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
	46	addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
	47	addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
46f4442e	48	addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
729e4ab9 A	49	#if !UCONFIG_NO_LEGACY_CONVERSION
	50	addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
	51	addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
	52	#endif
3d1f044b A	53	#if U_PLATFORM_IS_DARWIN_BASED
	54	addTest(root, &TestMailFilterCSS, "ucsdetst/TestMailFilterCSS");
	55	#endif
73c04bcf A	56	}
	57
	58	static int32_t preflight(const UChar src, int32_t length, UConverter cnv)
	59	{
	60	UErrorCode status;
	61	char buffer[1024];
	62	char dest, destLimit = buffer + sizeof(buffer);
	63	const UChar *srcLimit = src + length;
	64	int32_t result = 0;
	65
	66	do {
	67	dest = buffer;
	68	status = U_ZERO_ERROR;
	69	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
	70	result += (int32_t) (dest - buffer);
	71	} while (status == U_BUFFER_OVERFLOW_ERROR);
	72
	73	return result;
	74	}
	75
73c04bcf A	76	static char extractBytes(const UChar src, int32_t length, const char codepage, int32_t byteLength)
	77	{
	78	UErrorCode status = U_ZERO_ERROR;
	79	UConverter *cnv = ucnv_open(codepage, &status);
	80	int32_t byteCount = preflight(src, length, cnv);
	81	const UChar *srcLimit = src + length;
	82	char *bytes = NEW_ARRAY(char, byteCount + 1);
	83	char dest = bytes, destLimit = bytes + byteCount + 1;
	84
	85	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
	86	ucnv_close(cnv);
	87
	88	*byteLength = byteCount;
	89	return bytes;
	90	}
	91
	/* Releases a buffer returned by extractBytes(). */
	static void freeBytes(char *bytes)
	{
	    DELETE_ARRAY(bytes);
	}
	96
	97	static void TestConstruction(void)
	98	{
	99	UErrorCode status = U_ZERO_ERROR;
	100	UCharsetDetector *csd = ucsdet_open(&status);
	101	UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
	102	const char *name;
	103	int32_t count = uenum_count(e, &status);
	104	int32_t i, length;
	105
	106	for(i = 0; i < count; i += 1) {
	107	name = uenum_next(e, &length, &status);
	108
	109	if(name == NULL \|\| length <= 0) {
	110	log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
	111	}
	112	}
	113	/* one past the list of all names must return NULL */
	114	name = uenum_next(e, &length, &status);
	115	if(name != NULL \|\| length != 0 \|\| U_FAILURE(status)) {
	116	log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
	117	}
	118
	119	uenum_close(e);
	120	ucsdet_close(csd);
	121	}
	122
	123	static void TestUTF8(void)
	124	{
	125	UErrorCode status = U_ZERO_ERROR;
46f4442e	126	static const char ss[] = "This is a string with some non-ascii characters that will "
73c04bcf A	127	"be converted to UTF-8, then shoved through the detection process. "
	128	"\\u0391\\u0392\\u0393\\u0394\\u0395"
	129	"Sure would be nice if our source could contain Unicode directly!";
	130	int32_t byteLength = 0, sLength = 0, dLength = 0;
46f4442e A	131	UChar s[sizeof(ss)];
46f4442e A	132	char *bytes;
73c04bcf A	133	UCharsetDetector *csd = ucsdet_open(&status);
73c04bcf A	134	const UCharsetMatch *match;
46f4442e A	135	UChar detected[sizeof(ss)];
	136
	137	sLength = u_unescape(ss, s, sizeof(ss));
	138	bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
73c04bcf A	139
73c04bcf A	140	ucsdet_setText(csd, bytes, byteLength, &status);
46f4442e A	141	if (U_FAILURE(status)) {
	142	log_err("status is %s\n", u_errorName(status));
	143	goto bail;
	144	}
	145
73c04bcf A	146	match = ucsdet_detect(csd, &status);
	147
	148	if (match == NULL) {
	149	log_err("Detection failure for UTF-8: got no matches.\n");
	150	goto bail;
	151	}
	152
	153	dLength = ucsdet_getUChars(match, detected, sLength, &status);
	154
	155	if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
	156	log_err("Round-trip test failed!\n");
	157	}
	158
	159	ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
	160
	161	bail:
73c04bcf A	162	freeBytes(bytes);
	163	ucsdet_close(csd);
	164	}
	165
	166	static void TestUTF16(void)
	167	{
	168	UErrorCode status = U_ZERO_ERROR;
	169	/* Notice the BOM on the start of this string */
46f4442e	170	static const UChar chars[] = {
73c04bcf A	171	0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
	172	0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
	173	0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
	174	0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
	175	0x064a, 0x062a, 0x0000};
2ca993e8	176	int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
73c04bcf A	177	char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
	178	char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
	179	UCharsetDetector *csd = ucsdet_open(&status);
	180	const UCharsetMatch *match;
	181	const char *name;
	182	int32_t conf;
	183
	184	ucsdet_setText(csd, beBytes, beLength, &status);
	185	match = ucsdet_detect(csd, &status);
	186
	187	if (match == NULL) {
	188	log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
	189	goto try_le;
	190	}
	191
	192	name = ucsdet_getName(match, &status);
	193	conf = ucsdet_getConfidence(match, &status);
	194
	195	if (strcmp(name, "UTF-16BE") != 0) {
	196	log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
	197	}
	198
	199	if (conf != 100) {
	200	log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
	201	}
	202
	203	try_le:
	204	ucsdet_setText(csd, leBytes, leLength, &status);
	205	match = ucsdet_detect(csd, &status);
	206
	207	if (match == NULL) {
	208	log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
	209	goto bail;
	210	}
	211
	212	name = ucsdet_getName(match, &status);
	213	conf = ucsdet_getConfidence(match, &status);
	214
	215
	216	if (strcmp(name, "UTF-16LE") != 0) {
	217	log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
	218	}
	219
	220	if (conf != 100) {
	221	log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
	222	}
	223
	224	bail:
	225	freeBytes(leBytes);
	226	freeBytes(beBytes);
	227	ucsdet_close(csd);
	228	}
	229
	230	static void TestC1Bytes(void)
	231	{
	232	#if !UCONFIG_NO_LEGACY_CONVERSION
	233	UErrorCode status = U_ZERO_ERROR;
46f4442e A	234	static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
46f4442e A	235	static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
73c04bcf	236	int32_t sISOLength = 0, sWindowsLength = 0;
46f4442e A	237	UChar sISO[sizeof(ssISO)];
46f4442e A	238	UChar sWindows[sizeof(ssWindows)];
73c04bcf	239	int32_t lISO = 0, lWindows = 0;
46f4442e A	240	char *bISO;
46f4442e A	241	char *bWindows;
73c04bcf A	242	UCharsetDetector *csd = ucsdet_open(&status);
	243	const UCharsetMatch *match;
	244	const char *name;
	245
46f4442e A	246	sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
	247	sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
	248	bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
	249	bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
	250
73c04bcf A	251	ucsdet_setText(csd, bWindows, lWindows, &status);
	252	match = ucsdet_detect(csd, &status);
	253
	254	if (match == NULL) {
	255	log_err("English test with C1 bytes got no matches.\n");
	256	goto bail;
	257	}
	258
	259	name = ucsdet_getName(match, &status);
	260
	261	if (strcmp(name, "windows-1252") != 0) {
729e4ab9	262	log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
73c04bcf A	263	}
	264
	265	ucsdet_setText(csd, bISO, lISO, &status);
	266	match = ucsdet_detect(csd, &status);
	267
	268	if (match == NULL) {
	269	log_err("English text without C1 bytes got no matches.\n");
	270	goto bail;
	271	}
	272
	273	name = ucsdet_getName(match, &status);
	274
	275	if (strcmp(name, "ISO-8859-1") != 0) {
	276	log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
	277	}
	278
	279	bail:
	280	freeBytes(bWindows);
	281	freeBytes(bISO);
	282
	283	ucsdet_close(csd);
	284	#endif
	285	}
	286
	287	static void TestInputFilter(void)
	288	{
	289	UErrorCode status = U_ZERO_ERROR;
46f4442e	290	static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
73c04bcf	291	int32_t sLength = 0;
46f4442e	292	UChar s[sizeof(ss)];
73c04bcf	293	int32_t byteLength = 0;
46f4442e	294	char *bytes;
73c04bcf A	295	UCharsetDetector *csd = ucsdet_open(&status);
	296	const UCharsetMatch *match;
	297	const char lang, name;
	298
46f4442e A	299	sLength = u_unescape(ss, s, sizeof(ss));
	300	bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
	301
73c04bcf A	302	ucsdet_enableInputFilter(csd, TRUE);
	303
	304	if (!ucsdet_isInputFilterEnabled(csd)) {
	305	log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
	306	}
	307
	308
	309	ucsdet_setText(csd, bytes, byteLength, &status);
	310	match = ucsdet_detect(csd, &status);
	311
	312	if (match == NULL) {
	313	log_err("Turning on the input filter resulted in no matches.\n");
	314	goto turn_off;
	315	}
	316
	317	name = ucsdet_getName(match, &status);
	318
	319	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
	320	log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
	321	} else {
	322	lang = ucsdet_getLanguage(match, &status);
	323
	324	if (lang == NULL \|\| strcmp(lang, "fr") != 0) {
	325	log_err("Input filter did not strip markup!\n");
	326	}
	327	}
	328
	329	turn_off:
	330	ucsdet_enableInputFilter(csd, FALSE);
	331	ucsdet_setText(csd, bytes, byteLength, &status);
	332	match = ucsdet_detect(csd, &status);
	333
	334	if (match == NULL) {
	335	log_err("Turning off the input filter resulted in no matches.\n");
	336	goto bail;
	337	}
	338
	339	name = ucsdet_getName(match, &status);
	340
	341	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
	342	log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
	343	} else {
	344	lang = ucsdet_getLanguage(match, &status);
	345
	346	if (lang == NULL \|\| strcmp(lang, "en") != 0) {
	347	log_err("Unfiltered input did not detect as English!\n");
	348	}
	349	}
	350
	351	bail:
	352	freeBytes(bytes);
	353	ucsdet_close(csd);
	354	}
	355
	356	static void TestChaining(void) {
	357	UErrorCode status = U_USELESS_COLLATOR_ERROR;
	358
	359	ucsdet_open(&status);
	360	ucsdet_setText(NULL, NULL, 0, &status);
	361	ucsdet_getName(NULL, &status);
	362	ucsdet_getConfidence(NULL, &status);
	363	ucsdet_getLanguage(NULL, &status);
	364	ucsdet_detect(NULL, &status);
	365	ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
366	ucsdet_detectAll(NULL, NULL, &status);
367	ucsdet_getUChars(NULL, NULL, 0, &status);
368	ucsdet_getUChars(NULL, NULL, 0, &status);
369	ucsdet_close(NULL);
370
371	/* All of this code should have done nothing. */
372	if (status != U_USELESS_COLLATOR_ERROR) {
373	log_err("Status got changed to %s\n", u_errorName(status));
374	}
375	}
46f4442e A	376
	377	static void TestBufferOverflow(void) {
	378	UErrorCode status = U_ZERO_ERROR;
	379	static const char *testStrings[] = {
	380	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
	381	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
	382	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
	383	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
	384	"\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
	385	"\xa1", /* Could be a single byte shift-jis at the end */
	386	"\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
	387	"\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
	388	};
	389	static const char *testResults[] = {
	390	"windows-1252",
	391	"windows-1252",
	392	"windows-1252",
	393	"windows-1252",
	394	"ISO-2022-JP",
	395	NULL,
	396	NULL,
	397	"ISO-8859-1"
	398	};
	399	int32_t idx = 0;
	400	UCharsetDetector *csd = ucsdet_open(&status);
	401	const UCharsetMatch *match;
	402
	403	ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
	404
	405	if (U_FAILURE(status)) {
	406	log_err("Couldn't open detector. %s\n", u_errorName(status));
	407	goto bail;
	408	}
	409
2ca993e8	410	for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
46f4442e A	411	ucsdet_setText(csd, testStrings[idx], -1, &status);
	412	match = ucsdet_detect(csd, &status);
	413
	414	if (match == NULL) {
	415	if (testResults[idx] != NULL) {
	416	log_err("Unexpectedly got no results at index %d.\n", idx);
	417	}
	418	else {
	419	log_verbose("Got no result as expected at index %d.\n", idx);
	420	}
	421	continue;
	422	}
	423
	424	if (testResults[idx] == NULL \|\| strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
	425	log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
	426	ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
	427	goto bail;
	428	}
	429	}
	430
	431	bail:
	432	ucsdet_close(csd);
	433	}
	434
729e4ab9 A	435	static void TestIBM424(void)
	436	{
	437	UErrorCode status = U_ZERO_ERROR;
	438
	439	static const UChar chars[] = {
	440	0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
	441	0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
	442	0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
	443	0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
	444	0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
	445	0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
	446	0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
	447	0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
	448	0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
	449	0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
	450	0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
	451	0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
	452	0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
	453	0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
	454	0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
	455	0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
	456	0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
	457	};
	458
	459	static const UChar chars_reverse[] = {
	460	0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
	461	0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
	462	0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
	463	0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
	464	0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
	465	0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
	466	0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
	467	0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
	468	0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
	469	0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
	470	0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
	471	0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
	472	0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
	473	0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
	474	0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
	475	0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
	476	0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
	477	0x0000
	478	};
	479
2ca993e8	480	int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
729e4ab9 A	481
	482	char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
	483	char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
	484
	485	UCharsetDetector *csd = ucsdet_open(&status);
	486	const UCharsetMatch *match;
	487	const char *name;
	488
	489	ucsdet_setText(csd, bytes, bLength, &status);
	490	match = ucsdet_detect(csd, &status);
	491
	492	if (match == NULL) {
	493	log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
	494	goto bail;
	495	}
	496
	497	name = ucsdet_getName(match, &status);
	498	if (strcmp(name, "IBM424_rtl") != 0) {
	499	log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
	500	}
	501
	502	ucsdet_setText(csd, bytes_r, brLength, &status);
	503	match = ucsdet_detect(csd, &status);
	504
	505	if (match == NULL) {
	506	log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
	507	goto bail;
	508	}
	509
	510	name = ucsdet_getName(match, &status);
	511	if (strcmp(name, "IBM424_ltr") != 0) {
	512	log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
	513	}
	514
	515	bail:
	516	freeBytes(bytes);
	517	freeBytes(bytes_r);
	518	ucsdet_close(csd);
	519	}
	520
	521	static void TestIBM420(void)
	522	{
	523	UErrorCode status = U_ZERO_ERROR;
	524
	525	static const UChar chars[] = {
	526	0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
	527	0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
	528	0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
	529	0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
	530	0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
	531	0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
	532	0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
	533	0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
	534	0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
	535	0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
	536	0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
	537	0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
	538	0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
	539	0x0000
	540	};
	541	static const UChar chars_reverse[] = {
	542	0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
	543	0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
	544	0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
545	0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
546	0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
547	0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
548	0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
549	0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
550	0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
551	0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
552	0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
553	0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
554	0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
555	0x0000,
556	};
557
2ca993e8	558	int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
729e4ab9 A	559
	560	char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
	561	char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
	562
	563	UCharsetDetector *csd = ucsdet_open(&status);
	564	const UCharsetMatch *match;
	565	const char *name;
	566
	567	ucsdet_setText(csd, bytes, bLength, &status);
	568	match = ucsdet_detect(csd, &status);
	569
	570	if (match == NULL) {
	571	log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
	572	goto bail;
	573	}
	574
	575	name = ucsdet_getName(match, &status);
	576	if (strcmp(name, "IBM420_rtl") != 0) {
	577	log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
	578	}
	579
	580	ucsdet_setText(csd, bytes_r, brLength, &status);
	581	match = ucsdet_detect(csd, &status);
	582
	583	if (match == NULL) {
	584	log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
	585	goto bail;
	586	}
	587
	588	name = ucsdet_getName(match, &status);
	589	if (strcmp(name, "IBM420_ltr") != 0) {
	590	log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
	591	}
	592
	593	bail:
	594	freeBytes(bytes);
	595	freeBytes(bytes_r);
	596	ucsdet_close(csd);
	597	}
3d1f044b A	598
	599	#if U_PLATFORM_IS_DARWIN_BASED
	600	#include <stdio.h>
	// Reads the whole file at `path` into a buffer obtained from uprv_malloc();
	// the caller must release it with uprv_free(). Copied from cbiapts.c.
	// If dataBufSizeP is non-NULL it receives the number of bytes read (0 on failure).
	// Returns NULL, after logging a data error, if the file cannot be opened,
	// its size cannot be determined, the allocation fails, or the read is short.
	static void* dataBufFromFile(const char* path, long* dataBufSizeP) {
	    FILE * dataFile;
	    void * dataBuf;
	    long dataBufSize;
	    size_t dataFileRead = 0;

	    if (dataBufSizeP) {
	        *dataBufSizeP = 0;
	    }
	    dataFile = fopen(path, "r");
	    if (dataFile == NULL) {
	        log_data_err("FAIL: for %s, fopen fails\n", path);
	        return NULL;
	    }
	    // ftell() reports failure as -1; that must never reach the allocator as a size.
	    if (fseek(dataFile, 0, SEEK_END) != 0 || (dataBufSize = ftell(dataFile)) < 0) {
	        log_data_err("FAIL: for %s, fseek/ftell fails\n", path);
	        fclose(dataFile);
	        return NULL;
	    }
	    rewind(dataFile);

	    dataBuf = uprv_malloc((size_t)dataBufSize);
	    if (dataBuf != NULL) {
	        dataFileRead = fread(dataBuf, 1, (size_t)dataBufSize, dataFile);
	    }
	    fclose(dataFile);
	    if (dataBuf == NULL) {
	        log_data_err("FAIL: for %s, uprv_malloc fails for dataBuf[%ld]\n", path, dataBufSize);
	        return NULL;
	    }
	    if (dataFileRead < (size_t)dataBufSize) {
	        log_data_err("FAIL: for %s, fread fails, read %ld of %ld\n", path, (long)dataFileRead, dataBufSize);
	        uprv_free(dataBuf);
	        return NULL;
	    }
	    if (dataBufSizeP) {
	        *dataBufSizeP = dataBufSize;
	    }
	    return dataBuf;
	}
	639
	// One entry of the mail-sample table: a sample file and the encoding expected for it.
	typedef struct {
	    const char* sampleTextPath; // relative to cintltst directory
	    const char* encodingName;   // expected
	} SampleTextAndEncoding;
	644
// Directory prefix for the sample files; it differs when APPLE_XCODE_BUILD is defined.
#ifdef APPLE_XCODE_BUILD
#define TESTDATA_DIR "testdata"
#else
#define TESTDATA_DIR "../testdata"
#endif
	650
3d1f044b	651	static const SampleTextAndEncoding mailSampleTests[] = {
340931cb A	652	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_2.txt", "iso-8859-1" },
	653	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_3.txt", "iso-8859-1" },
	654	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_4.txt", "iso-8859-1" },
	655	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_6.txt", "iso-8859-1" },
	656	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_7.txt", "iso-8859-1" },
	657	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_8.txt", "iso-8859-1" },
	658	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_9.txt", "iso-8859-1" },
	659	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_2.txt", "iso-8859-1" },
	660	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_3.txt", "iso-8859-1" },
	661	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_4.txt", "iso-8859-1" },
	662	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_6.txt", "iso-8859-1" },
	663	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_7.txt", "iso-8859-1" },
	664	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_8.txt", "iso-8859-1" },
	665	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_9.txt", "iso-8859-1" },
1a147d09	666	// additions for <rdar://problem/56373519>
340931cb	667	{ TESTDATA_DIR "/encodingSamples/mailExample_Latin1_11.txt", "iso-8859-1" },
3d1f044b A	668	{ NULL, NULL }
	669	};
	670
	671	static void TestMailFilterCSS(void) {
	672	UErrorCode status = U_ZERO_ERROR;
	673	UCharsetDetector *detector = ucsdet_open(&status);
	674	if (U_FAILURE(status)) {
	675	log_data_err("ucsdet_open fails. %s\n", u_errorName(status));
	676	} else {
	677	const SampleTextAndEncoding* testPtr;
	678	for (testPtr = mailSampleTests; testPtr->sampleTextPath != NULL; testPtr++) {
	679	long sampleTextLen;
	680	char * sampleText = (char *)dataBufFromFile(testPtr->sampleTextPath, &sampleTextLen);
	681	if (sampleText != NULL) { // dataBufFromFile reports the errors that would produce NULL
	682	status = U_ZERO_ERROR;
	683	ucsdet_setText(detector, sampleText, sampleTextLen, &status);
	684	if (U_FAILURE(status)) {
	685	log_data_err("ucsdet_setText fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
	686	} else {
	687	const UCharsetMatch *highestMatch = NULL;
	688	ucsdet_enableInputFilter(detector, TRUE);
	689	highestMatch = ucsdet_detect(detector, &status);
	690	if (U_FAILURE(status) \|\| highestMatch==NULL) {
	691	log_err("ucsdet_detect fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
	692	} else {
	693	const char *icuName = ucsdet_getName(highestMatch, &status);
	694	int32_t confidence = ucsdet_getConfidence(highestMatch, &status);
1a147d09	695	const char *langCode = ucsdet_getLanguage(highestMatch, &status);
3d1f044b A	696	if (U_FAILURE(status) \|\| icuName==NULL) {
	697	log_err("ucsdet_getName and/or ucsdet_getConfidence fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
	698	} else {
1a147d09 A	699	log_info("For text file %s: expect %s; get %s with confidence %d, langCode %s; text length %ld\n",
1a147d09 A	700	testPtr->sampleTextPath, testPtr->encodingName, icuName, confidence, langCode, sampleTextLen);
3d1f044b A	701	}
	702	}
	703	}
	704	uprv_free(sampleText);
	705	}
	706	}
	707	ucsdet_close(detector);
	708	}
	709	}
	710	#endif /* U_PLATFORM_IS_DARWIN_BASED */