[apple/icu.git] / icuSources / test / cintltst / ucsdetst.c

/*
 ****************************************************************************
 * Copyright (c) 2005-2008, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 */

#include "unicode/utypes.h"

#include "unicode/ucsdet.h"
#include "unicode/ucnv.h"
#include "unicode/ustring.h"

#include "cintltst.h"

#include <stdlib.h>
#include <string.h>

/* Element count of a true array; do not use on a pointer or an array parameter. */
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))

/*
 * Allocate an uninitialized array of `count` elements of `type` with malloc().
 * The result may be NULL; release it with DELETE_ARRAY().
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) free(array)

/* Forward declarations of the test cases that addUCsdetTest() registers. */
static void TestConstruction(void);
static void TestUTF8(void);
static void TestUTF16(void);
static void TestC1Bytes(void);
static void TestInputFilter(void);
static void TestChaining(void);
static void TestBufferOverflow(void);

void addUCsdetTest(TestNode** root);

/*
 * Registers every charset-detection test case in this file with the test
 * tree rooted at `root`, each under a "ucsdetst/..." path.
 */
void addUCsdetTest(TestNode** root)
{
    /* Registration table; entries are added in the order listed. */
    static const struct {
        void (*run)(void);
        const char *path;
    } cases[] = {
        { TestConstruction,   "ucsdetst/TestConstruction" },
        { TestUTF8,           "ucsdetst/TestUTF8" },
        { TestUTF16,          "ucsdetst/TestUTF16" },
        { TestC1Bytes,        "ucsdetst/TestC1Bytes" },
        { TestInputFilter,    "ucsdetst/TestInputFilter" },
        { TestChaining,       "ucsdetst/TestErrorChaining" },
        { TestBufferOverflow, "ucsdetst/TestBufferOverflow" }
    };
    size_t i;

    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); ++i) {
        addTest(root, cases[i].run, cases[i].path);
    }
}

/*
 * Measures how many bytes converting `length` UChars at `src` with `cnv`
 * produces. The text is converted repeatedly into a fixed scratch buffer,
 * and the bytes written by each pass are summed for as long as a pass ends
 * with U_BUFFER_OVERFLOW_ERROR. The converted bytes themselves are discarded.
 */
static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
{
    char scratch[1024];
    char *const scratchEnd = scratch + sizeof(scratch);
    const UChar *const srcEnd = src + length;
    int32_t total = 0;
    UErrorCode ec;

    for (;;) {
        char *out = scratch;

        /* each pass starts with a clean status and an empty scratch buffer */
        ec = U_ZERO_ERROR;
        ucnv_fromUnicode(cnv, &out, scratchEnd, &src, srcEnd, 0, TRUE, &ec);
        total += (int32_t) (out - scratch);

        if (ec != U_BUFFER_OVERFLOW_ERROR) {
            break;
        }
    }

    return total;
}

/*
 * Converts `length` UChars starting at `src` into bytes of the named
 * `codepage`.
 *
 * Returns a malloc'ed buffer of byteCount + 1 bytes that the caller releases
 * with freeBytes(), and stores the byte count measured by preflight() in
 * *byteLength. Any part of the buffer the converter does not write, including
 * the final extra byte, is set to zero, so the result is always
 * NUL-terminated. Returns NULL and sets *byteLength to 0 if the buffer cannot
 * be allocated.
 */
static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *cnv = ucnv_open(codepage, &status);
    /* first pass only measures the output so the buffer can be sized exactly */
    int32_t byteCount = preflight(src, length, cnv);
    const UChar *srcLimit = src + length;
    char *bytes = NEW_ARRAY(char, byteCount + 1);
    char *dest, *destLimit;

    if (bytes == NULL) {
        /* nothing to convert into; do not let the converter write through NULL */
        ucnv_close(cnv);
        *byteLength = 0;
        return NULL;
    }

    dest = bytes;
    destLimit = bytes + byteCount + 1;

    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
    ucnv_close(cnv);

    /* zero whatever the converter left unwritten, including the terminator slot */
    while (dest < destLimit) {
        *dest++ = 0;
    }

    *byteLength = byteCount;
    return bytes;
}

/* Releases a buffer returned by extractBytes(). */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}

/*
 * Opens a detector, enumerates all detectable charset names, and checks that
 * every name is non-NULL and non-empty and that asking for one more name past
 * the end of the list yields NULL with a zero length and no error.
 */
static void TestConstruction(void)
{
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *csd = ucsdet_open(&status);
    UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
    const char *name;
    int32_t count = uenum_count(e, &status);
    /* length starts defined so it is never read uninitialized below */
    int32_t i, length = 0;

    if (U_FAILURE(status)) {
        /* report the real cause instead of a misleading "non-null name" failure */
        log_err("Could not enumerate the detectable charsets: %s\n", u_errorName(status));
        goto bail;
    }

    for(i = 0; i < count; i += 1) {
        name = uenum_next(e, &length, &status);

        if(name == NULL || length <= 0) {
            log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
        }
    }
    /* one past the list of all names must return NULL */
    name = uenum_next(e, &length, &status);
    if(name != NULL || length != 0 || U_FAILURE(status)) {
        log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
    }

bail:
    uenum_close(e);
    ucsdet_close(csd);
}

/*
 * Round-trip test: unescapes a sample string containing \u escapes, converts
 * it to UTF-8 bytes, runs the detector on those bytes, converts the bytes back
 * to UChars through the best match, and compares the result with the original.
 */
static void TestUTF8(void)
{
    UErrorCode status = U_ZERO_ERROR;
    static const char ss[] = "This is a string with some non-ascii characters that will "
               "be converted to UTF-8, then shoved through the detection process.  "
               "\\u0391\\u0392\\u0393\\u0394\\u0395"
               "Sure would be nice if our source could contain Unicode directly!";
    int32_t byteLength = 0, sLength = 0, dLength = 0;
    UChar s[sizeof(ss)];
    char *bytes;
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    UChar detected[sizeof(ss)];

    sLength = u_unescape(ss, s, sizeof(ss));
    bytes = extractBytes(s, sLength, "UTF-8", &byteLength);

    ucsdet_setText(csd, bytes, byteLength, &status);
    if (U_FAILURE(status)) {
        log_err("status is %s\n", u_errorName(status));
        goto bail;
    }

    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        log_err("Detection failure for UTF-8: got no matches.\n");
        goto bail;
    }

    /*
     * Offer the whole `detected` buffer, not just sLength elements, and check
     * the status, so the comparison below never reads elements that were not
     * written.
     */
    dLength = ucsdet_getUChars(match, detected, (int32_t) (sizeof(detected) / sizeof(detected[0])), &status);

    if (U_FAILURE(status)) {
        log_err("ucsdet_getUChars() failed: %s\n", u_errorName(status));
        goto bail;
    }

    if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
        log_err("Round-trip test failed!\n");
    }

    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */

bail:
    freeBytes(bytes);
    ucsdet_close(csd);
}

/*
 * Converts a BOM-prefixed sample to UTF-16BE and UTF-16LE bytes and checks
 * that the detector names each encoding correctly with 100% confidence.
 * A failure to get any match for the BE bytes still lets the LE half run.
 */
static void TestUTF16(void)
{
    UErrorCode status = U_ZERO_ERROR;
    /* Notice the BOM on the start of this string */
    static const UChar chars[] = {
        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
        0x064a, 0x062a, 0x0000};
    /* cLength counts every element of chars[], including the final 0x0000 */
    int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
    char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
    char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *name;
    int32_t conf;

    ucsdet_setText(csd, beBytes, beLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
        goto try_le;
    }

    name  = ucsdet_getName(match, &status);
    conf  = ucsdet_getConfidence(match, &status);

    /* guard against a NULL name before strcmp() and %s, as TestInputFilter does */
    if (name == NULL || strcmp(name, "UTF-16BE") != 0) {
        log_err("Encoding detection failure for UTF-16BE: got %s\n", name != NULL ? name : "(null)");
    }

    if (conf != 100) {
        log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
    }

try_le:
    ucsdet_setText(csd, leBytes, leLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
        goto bail;
    }

    name  = ucsdet_getName(match, &status);
    conf = ucsdet_getConfidence(match, &status);

    if (name == NULL || strcmp(name, "UTF-16LE") != 0) {
        log_err("Enconding detection failure for UTF-16LE: got %s\n", name != NULL ? name : "(null)");
    }

    if (conf != 100) {
        log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
    }

bail:
    freeBytes(leBytes);
    freeBytes(beBytes);
    ucsdet_close(csd);
}

/*
 * Checks that English text containing \u201C / \u201D, converted with the
 * windows-1252 converter, is detected as windows-1252, while plain English
 * text converted with the ISO-8859-1 converter is detected as ISO-8859-1.
 * The body is compiled out when UCONFIG_NO_LEGACY_CONVERSION is set.
 */
static void TestC1Bytes(void)
{
#if !UCONFIG_NO_LEGACY_CONVERSION
    UErrorCode status = U_ZERO_ERROR;
    static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
    int32_t sISOLength = 0, sWindowsLength = 0;
    UChar sISO[sizeof(ssISO)];
    UChar sWindows[sizeof(ssWindows)];
    int32_t lISO = 0, lWindows = 0;
    char *bISO;
    char *bWindows;
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *name;

    sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
    sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
    bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
    bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);

    ucsdet_setText(csd, bWindows, lWindows, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        log_err("English test with C1 bytes got no matches.\n");
        goto bail;
    }

    name  = ucsdet_getName(match, &status);

    /* guard against a NULL name before strcmp() and %s, as TestInputFilter does */
    if (name == NULL || strcmp(name, "windows-1252") != 0) {
        log_err("English text with C1 bytes does not detect as windows-1252, but as %s\n", name != NULL ? name : "(null)");
    }

    ucsdet_setText(csd, bISO, lISO, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        log_err("English text without C1 bytes got no matches.\n");
        goto bail;
    }

    name  = ucsdet_getName(match, &status);

    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
        log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name != NULL ? name : "(null)");
    }

bail:
    freeBytes(bWindows);
    freeBytes(bISO);

    ucsdet_close(csd);
#endif
}

/*
 * Runs detection twice on the same ISO-8859-1 bytes: French text surrounded
 * by English words inside angle-bracket markup. With the input filter enabled
 * the expected language is "fr"; with it disabled the expected language is
 * "en". Both runs expect the charset name ISO-8859-1. If the filtered run
 * gets no match, the unfiltered run is still executed.
 */
static void TestInputFilter(void)
{
    UErrorCode status = U_ZERO_ERROR;
    static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    int32_t sLength = 0;
    UChar s[sizeof(ss)];
    int32_t byteLength = 0;
    char *bytes;
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *lang, *name;

    sLength = u_unescape(ss, s, sizeof(ss));
    bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);

    ucsdet_enableInputFilter(csd, TRUE);

    if (!ucsdet_isInputFilterEnabled(csd)) {
        log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
    }


    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        log_err("Turning on the input filter resulted in no matches.\n");
        goto turn_off;
    }

    name = ucsdet_getName(match, &status);

    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
        /* name may be NULL on this branch; never hand NULL to a %s conversion */
        log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name != NULL ? name : "(null)");
    } else {
        lang = ucsdet_getLanguage(match, &status);

        if (lang == NULL || strcmp(lang, "fr") != 0) {
            log_err("Input filter did not strip markup!\n");
        }
    }

turn_off:
    ucsdet_enableInputFilter(csd, FALSE);
    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        log_err("Turning off the input filter resulted in no matches.\n");
        goto bail;
    }

    name = ucsdet_getName(match, &status);

    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
        log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name != NULL ? name : "(null)");
    } else {
        lang = ucsdet_getLanguage(match, &status);

        if (lang == NULL || strcmp(lang, "en") != 0) {
            log_err("Unfiltered input did not detect as English!\n");
        }
    }

bail:
    freeBytes(bytes);
    ucsdet_close(csd);
}

/*
 * Calls each ucsdet_* entry point with a status that already holds an error
 * code (and NULL for every other argument), then verifies that none of the
 * calls changed that status.
 */
static void TestChaining(void) {
    const UErrorCode sentinel = U_USELESS_COLLATOR_ERROR;
    UErrorCode status = sentinel;

    (void) ucsdet_open(&status);
    (void) ucsdet_setText(NULL, NULL, 0, &status);
    (void) ucsdet_getName(NULL, &status);
    (void) ucsdet_getConfidence(NULL, &status);
    (void) ucsdet_getLanguage(NULL, &status);
    (void) ucsdet_detect(NULL, &status);
    (void) ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
    (void) ucsdet_detectAll(NULL, NULL, &status);
    (void) ucsdet_getUChars(NULL, NULL, 0, &status);
    (void) ucsdet_getUChars(NULL, NULL, 0, &status);
    ucsdet_close(NULL);

    /* Every call above was expected to leave the incoming error untouched. */
    if (status == sentinel) {
        return;
    }

    log_err("Status got changed to %s\n", u_errorName(status));
}

/*
 * Feeds the detector short inputs that end in partial or complete escape
 * sequences or a lone high byte, with ISO-2022-JP set as the declared
 * encoding, and compares the detected charset name with testResults[]. A NULL
 * entry in testResults[] means no match is expected for that input. The test
 * stops at the first mismatching name.
 */
static void TestBufferOverflow(void) {
    UErrorCode status = U_ZERO_ERROR;
    static const char *testStrings[] = {
        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
        "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
        "\xa1", /* Could be a single byte shift-jis at the end */
        "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
        "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
    };
    /* Expected charset name per entry of testStrings[]; NULL means "no match expected". */
    static const char *testResults[] = {
        "windows-1252",
        "windows-1252",
        "windows-1252",
        "windows-1252",
        "ISO-2022-JP",
        NULL,
        NULL,
        "ISO-8859-1"
    };
    int32_t idx = 0;
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *name, *expected;

    ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);

    if (U_FAILURE(status)) {
        log_err("Couldn't open detector. %s\n", u_errorName(status));
        goto bail;
    }

    for (idx = 0; idx < (int32_t) ARRAY_SIZE(testStrings); idx++) {
        ucsdet_setText(csd, testStrings[idx], -1, &status);
        match = ucsdet_detect(csd, &status);

        if (match == NULL) {
            if (testResults[idx] != NULL) {
                log_err("Unexpectedly got no results at index %d.\n", idx);
            }
            else {
                log_verbose("Got no result as expected at index %d.\n", idx);
            }
            continue;
        }

        name = ucsdet_getName(match, &status);
        expected = testResults[idx];

        if (expected == NULL || name == NULL || strcmp(name, expected) != 0) {
            /* either string may be NULL here; never hand NULL to a %s conversion */
            log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
                name != NULL ? name : "(null)", expected != NULL ? expected : "(null)",
                idx, ucsdet_getConfidence(match, &status));
            goto bail;
        }
    }

bail:
    ucsdet_close(csd);
}
Commit	Line	Data
73c04bcf A	1	/*
73c04bcf A	2	****************************************************************************
46f4442e	3	* Copyright (c) 2005-2008, International Business Machines Corporation and *
73c04bcf A	4	* others. All Rights Reserved. *
	5	****************************************************************************
	6	*/
	7
	8	#include "unicode/utypes.h"
	9
	10	#include "unicode/ucsdet.h"
	11	#include "unicode/ucnv.h"
	12	#include "unicode/ustring.h"
	13
	14	#include "cintltst.h"
	15
	16	#include <stdlib.h>
	17	#include <string.h>
	18
46f4442e	19	#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
73c04bcf	20
46f4442e A	21	#define NEW_ARRAY(type,count) (type ) malloc((count) sizeof(type))
46f4442e A	22	#define DELETE_ARRAY(array) free(array)
73c04bcf A	23
	24	static void TestConstruction(void);
	25	static void TestUTF8(void);
	26	static void TestUTF16(void);
	27	static void TestC1Bytes(void);
	28	static void TestInputFilter(void);
	29	static void TestChaining(void);
46f4442e	30	static void TestBufferOverflow(void);
73c04bcf A	31
	32	void addUCsdetTest(TestNode** root);
	33
	34	void addUCsdetTest(TestNode** root)
	35	{
	36	addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
	37	addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
	38	addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
	39	addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
	40	addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
	41	addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
46f4442e	42	addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
73c04bcf A	43	}
	44
	45	static int32_t preflight(const UChar src, int32_t length, UConverter cnv)
	46	{
	47	UErrorCode status;
	48	char buffer[1024];
	49	char dest, destLimit = buffer + sizeof(buffer);
	50	const UChar *srcLimit = src + length;
	51	int32_t result = 0;
	52
	53	do {
	54	dest = buffer;
	55	status = U_ZERO_ERROR;
	56	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
	57	result += (int32_t) (dest - buffer);
	58	} while (status == U_BUFFER_OVERFLOW_ERROR);
	59
	60	return result;
	61	}
	62
73c04bcf A	63	static char extractBytes(const UChar src, int32_t length, const char codepage, int32_t byteLength)
	64	{
	65	UErrorCode status = U_ZERO_ERROR;
	66	UConverter *cnv = ucnv_open(codepage, &status);
	67	int32_t byteCount = preflight(src, length, cnv);
	68	const UChar *srcLimit = src + length;
	69	char *bytes = NEW_ARRAY(char, byteCount + 1);
	70	char dest = bytes, destLimit = bytes + byteCount + 1;
	71
	72	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
	73	ucnv_close(cnv);
	74
	75	*byteLength = byteCount;
	76	return bytes;
	77	}
	78
	79	static void freeBytes(char *bytes)
	80	{
	81	DELETE_ARRAY(bytes);
	82	}
	83
	84	static void TestConstruction(void)
	85	{
	86	UErrorCode status = U_ZERO_ERROR;
	87	UCharsetDetector *csd = ucsdet_open(&status);
	88	UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
	89	const char *name;
	90	int32_t count = uenum_count(e, &status);
	91	int32_t i, length;
	92
	93	for(i = 0; i < count; i += 1) {
	94	name = uenum_next(e, &length, &status);
	95
	96	if(name == NULL \|\| length <= 0) {
	97	log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
	98	}
	99	}
	100	/* one past the list of all names must return NULL */
	101	name = uenum_next(e, &length, &status);
	102	if(name != NULL \|\| length != 0 \|\| U_FAILURE(status)) {
	103	log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
	104	}
	105
	106	uenum_close(e);
	107	ucsdet_close(csd);
	108	}
	109
	110	static void TestUTF8(void)
	111	{
	112	UErrorCode status = U_ZERO_ERROR;
46f4442e	113	static const char ss[] = "This is a string with some non-ascii characters that will "
73c04bcf A	114	"be converted to UTF-8, then shoved through the detection process. "
	115	"\\u0391\\u0392\\u0393\\u0394\\u0395"
	116	"Sure would be nice if our source could contain Unicode directly!";
	117	int32_t byteLength = 0, sLength = 0, dLength = 0;
46f4442e A	118	UChar s[sizeof(ss)];
46f4442e A	119	char *bytes;
73c04bcf A	120	UCharsetDetector *csd = ucsdet_open(&status);
73c04bcf A	121	const UCharsetMatch *match;
46f4442e A	122	UChar detected[sizeof(ss)];
	123
	124	sLength = u_unescape(ss, s, sizeof(ss));
	125	bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
73c04bcf A	126
73c04bcf A	127	ucsdet_setText(csd, bytes, byteLength, &status);
46f4442e A	128	if (U_FAILURE(status)) {
	129	log_err("status is %s\n", u_errorName(status));
	130	goto bail;
	131	}
	132
73c04bcf A	133	match = ucsdet_detect(csd, &status);
	134
	135	if (match == NULL) {
	136	log_err("Detection failure for UTF-8: got no matches.\n");
	137	goto bail;
	138	}
	139
	140	dLength = ucsdet_getUChars(match, detected, sLength, &status);
	141
	142	if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
	143	log_err("Round-trip test failed!\n");
	144	}
	145
	146	ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
	147
	148	bail:
73c04bcf A	149	freeBytes(bytes);
	150	ucsdet_close(csd);
	151	}
	152
	153	static void TestUTF16(void)
	154	{
	155	UErrorCode status = U_ZERO_ERROR;
	156	/* Notice the BOM on the start of this string */
46f4442e	157	static const UChar chars[] = {
73c04bcf A	158	0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
	159	0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
	160	0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
	161	0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
	162	0x064a, 0x062a, 0x0000};
	163	int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
	164	char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
	165	char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
	166	UCharsetDetector *csd = ucsdet_open(&status);
	167	const UCharsetMatch *match;
	168	const char *name;
	169	int32_t conf;
	170
	171	ucsdet_setText(csd, beBytes, beLength, &status);
	172	match = ucsdet_detect(csd, &status);
	173
	174	if (match == NULL) {
	175	log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
	176	goto try_le;
	177	}
	178
	179	name = ucsdet_getName(match, &status);
	180	conf = ucsdet_getConfidence(match, &status);
	181
	182	if (strcmp(name, "UTF-16BE") != 0) {
	183	log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
	184	}
	185
	186	if (conf != 100) {
	187	log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
	188	}
	189
	190	try_le:
	191	ucsdet_setText(csd, leBytes, leLength, &status);
	192	match = ucsdet_detect(csd, &status);
	193
	194	if (match == NULL) {
	195	log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
	196	goto bail;
	197	}
	198
	199	name = ucsdet_getName(match, &status);
	200	conf = ucsdet_getConfidence(match, &status);
	201
	202
	203	if (strcmp(name, "UTF-16LE") != 0) {
	204	log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
	205	}
	206
	207	if (conf != 100) {
	208	log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
	209	}
	210
	211	bail:
	212	freeBytes(leBytes);
	213	freeBytes(beBytes);
	214	ucsdet_close(csd);
	215	}
	216
	217	static void TestC1Bytes(void)
	218	{
	219	#if !UCONFIG_NO_LEGACY_CONVERSION
	220	UErrorCode status = U_ZERO_ERROR;
46f4442e A	221	static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
46f4442e A	222	static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
73c04bcf	223	int32_t sISOLength = 0, sWindowsLength = 0;
46f4442e A	224	UChar sISO[sizeof(ssISO)];
46f4442e A	225	UChar sWindows[sizeof(ssWindows)];
73c04bcf	226	int32_t lISO = 0, lWindows = 0;
46f4442e A	227	char *bISO;
46f4442e A	228	char *bWindows;
73c04bcf A	229	UCharsetDetector *csd = ucsdet_open(&status);
	230	const UCharsetMatch *match;
	231	const char *name;
	232
46f4442e A	233	sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
	234	sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
	235	bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
	236	bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
	237
73c04bcf A	238	ucsdet_setText(csd, bWindows, lWindows, &status);
	239	match = ucsdet_detect(csd, &status);
	240
	241	if (match == NULL) {
	242	log_err("English test with C1 bytes got no matches.\n");
	243	goto bail;
	244	}
	245
	246	name = ucsdet_getName(match, &status);
	247
	248	if (strcmp(name, "windows-1252") != 0) {
	249	log_err("English text with C1 bytes does not detect as windows-1252, but as %s\n", name);
	250	}
	251
	252	ucsdet_setText(csd, bISO, lISO, &status);
	253	match = ucsdet_detect(csd, &status);
	254
	255	if (match == NULL) {
	256	log_err("English text without C1 bytes got no matches.\n");
	257	goto bail;
	258	}
	259
	260	name = ucsdet_getName(match, &status);
	261
	262	if (strcmp(name, "ISO-8859-1") != 0) {
	263	log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
	264	}
	265
	266	bail:
	267	freeBytes(bWindows);
	268	freeBytes(bISO);
	269
	270	ucsdet_close(csd);
	271	#endif
	272	}
	273
	274	static void TestInputFilter(void)
	275	{
	276	UErrorCode status = U_ZERO_ERROR;
46f4442e	277	static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
73c04bcf	278	int32_t sLength = 0;
46f4442e	279	UChar s[sizeof(ss)];
73c04bcf	280	int32_t byteLength = 0;
46f4442e	281	char *bytes;
73c04bcf A	282	UCharsetDetector *csd = ucsdet_open(&status);
	283	const UCharsetMatch *match;
	284	const char lang, name;
	285
46f4442e A	286	sLength = u_unescape(ss, s, sizeof(ss));
	287	bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
	288
73c04bcf A	289	ucsdet_enableInputFilter(csd, TRUE);
	290
	291	if (!ucsdet_isInputFilterEnabled(csd)) {
	292	log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
	293	}
	294
	295
	296	ucsdet_setText(csd, bytes, byteLength, &status);
	297	match = ucsdet_detect(csd, &status);
	298
	299	if (match == NULL) {
	300	log_err("Turning on the input filter resulted in no matches.\n");
	301	goto turn_off;
	302	}
	303
	304	name = ucsdet_getName(match, &status);
	305
	306	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
	307	log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
	308	} else {
	309	lang = ucsdet_getLanguage(match, &status);
	310
	311	if (lang == NULL \|\| strcmp(lang, "fr") != 0) {
	312	log_err("Input filter did not strip markup!\n");
	313	}
	314	}
	315
	316	turn_off:
	317	ucsdet_enableInputFilter(csd, FALSE);
	318	ucsdet_setText(csd, bytes, byteLength, &status);
	319	match = ucsdet_detect(csd, &status);
	320
	321	if (match == NULL) {
	322	log_err("Turning off the input filter resulted in no matches.\n");
	323	goto bail;
	324	}
	325
	326	name = ucsdet_getName(match, &status);
	327
	328	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
	329	log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
	330	} else {
	331	lang = ucsdet_getLanguage(match, &status);
	332
	333	if (lang == NULL \|\| strcmp(lang, "en") != 0) {
	334	log_err("Unfiltered input did not detect as English!\n");
	335	}
	336	}
	337
	338	bail:
	339	freeBytes(bytes);
	340	ucsdet_close(csd);
	341	}
	342
	343	static void TestChaining(void) {
	344	UErrorCode status = U_USELESS_COLLATOR_ERROR;
	345
	346	ucsdet_open(&status);
	347	ucsdet_setText(NULL, NULL, 0, &status);
	348	ucsdet_getName(NULL, &status);
	349	ucsdet_getConfidence(NULL, &status);
	350	ucsdet_getLanguage(NULL, &status);
	351	ucsdet_detect(NULL, &status);
	352	ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
353	ucsdet_detectAll(NULL, NULL, &status);
354	ucsdet_getUChars(NULL, NULL, 0, &status);
355	ucsdet_getUChars(NULL, NULL, 0, &status);
356	ucsdet_close(NULL);
357
358	/* All of this code should have done nothing. */
359	if (status != U_USELESS_COLLATOR_ERROR) {
360	log_err("Status got changed to %s\n", u_errorName(status));
361	}
362	}
46f4442e A	363
	364	static void TestBufferOverflow(void) {
	365	UErrorCode status = U_ZERO_ERROR;
	366	static const char *testStrings[] = {
	367	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
	368	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
	369	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
	370	"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
	371	"\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
	372	"\xa1", /* Could be a single byte shift-jis at the end */
	373	"\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
	374	"\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
	375	};
	376	static const char *testResults[] = {
	377	"windows-1252",
	378	"windows-1252",
	379	"windows-1252",
	380	"windows-1252",
	381	"ISO-2022-JP",
	382	NULL,
	383	NULL,
	384	"ISO-8859-1"
	385	};
	386	int32_t idx = 0;
	387	UCharsetDetector *csd = ucsdet_open(&status);
	388	const UCharsetMatch *match;
	389
	390	ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
	391
	392	if (U_FAILURE(status)) {
	393	log_err("Couldn't open detector. %s\n", u_errorName(status));
	394	goto bail;
	395	}
	396
	397	for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) {
	398	ucsdet_setText(csd, testStrings[idx], -1, &status);
	399	match = ucsdet_detect(csd, &status);
	400
	401	if (match == NULL) {
	402	if (testResults[idx] != NULL) {
	403	log_err("Unexpectedly got no results at index %d.\n", idx);
	404	}
	405	else {
	406	log_verbose("Got no result as expected at index %d.\n", idx);
	407	}
	408	continue;
	409	}
	410
	411	if (testResults[idx] == NULL \|\| strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
	412	log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
	413	ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
	414	goto bail;
	415	}
	416	}
	417
	418	bail:
	419	ucsdet_close(csd);
	420	}
	421