[apple/icu.git] / icuSources / test / intltest / csdetest.cpp

/*
 **********************************************************************
 *   Copyright (C) 2005-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */


#include "unicode/utypes.h"
#include "unicode/ucsdet.h"
#include "unicode/ucnv.h"
#include "unicode/unistr.h"
#include "unicode/putil.h"

#include "intltest.h"
#include "csdetest.h"

#include "xmlparser.h"

#include <stdlib.h>
#include <string.h>

#ifdef DEBUG_DETECT
#include <stdio.h>
#endif

// Element count of a statically sized array (sizeof arithmetic, so it is
// only meaningful for true arrays, not pointers).
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

// Raw storage helpers built directly on malloc()/free(). NEW_ARRAY returns
// uninitialized storage for `count` objects of `type`; DELETE_ARRAY releases
// storage obtained that way. No constructors or destructors are run.
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Separator code units passed to split(): U+0020 SPACE and U+002F SOLIDUS.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

//---------------------------------------------------------------------------
//
//  Test class boilerplate
//
//---------------------------------------------------------------------------
// Constructor: intentionally empty, the body performs no work.
CharsetDetectionTest::CharsetDetectionTest()
{
}


// Destructor: intentionally empty, the body releases nothing.
CharsetDetectionTest::~CharsetDetectionTest()
{
}


// Test dispatcher.
//
// index - zero-based number of the test being asked about.
// exec  - when true, the selected test is also run; when false only `name`
//         is filled in.
// name  - receives the name of the selected test, or "" when `index` does
//         not correspond to any test.
// The fourth parameter is unused.
void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) {
        logln("TestSuite CharsetDetectionTest: ");
    }

    switch (index) {
    case 0:
        name = "ConstructionTest";
        if (exec) {
            ConstructionTest();
        }
        break;

    case 1:
        name = "UTF8Test";
        if (exec) {
            UTF8Test();
        }
        break;

    case 2:
        name = "UTF16Test";
        if (exec) {
            UTF16Test();
        }
        break;

    case 3:
        name = "C1BytesTest";
        if (exec) {
            C1BytesTest();
        }
        break;

    case 4:
        name = "InputFilterTest";
        if (exec) {
            InputFilterTest();
        }
        break;

    case 5:
        name = "DetectionTest";
        if (exec) {
            DetectionTest();
        }
        break;

    default:
        // The empty name is what ends the caller's loop over test indices.
        name = "";
        break;
    }
}

// Break `src` into the pieces that lie between occurrences of `ch`.
//
// src    - the string to break apart.
// ch     - the delimiter code unit; it is not included in any piece.
// splits - receives the number of pieces (delimiter count + 1, so always
//          at least 1, and empty pieces are kept).
//
// Returns an array of `splits` strings allocated with new[]; the caller
// releases it with delete[].
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // First pass: count the delimiters so the array can be sized exactly.
    int32_t delimiters = 0;
    int32_t searchFrom = 0;
    int32_t hit;

    while ((hit = src.indexOf(ch, searchFrom)) >= 0) {
        delimiters += 1;
        searchFrom = hit + 1;
    }

    splits = delimiters + 1;

    UnicodeString *pieces = new UnicodeString[splits];

    // Second pass: copy out each delimited run.
    int32_t pieceStart = 0;
    int32_t pieceIndex = 0;
    int32_t pieceEnd   = src.indexOf(ch, pieceStart);

    while (pieceEnd >= 0) {
        src.extractBetween(pieceStart, pieceEnd, pieces[pieceIndex]);
        pieceIndex += 1;
        pieceStart  = pieceEnd + 1;
        pieceEnd    = src.indexOf(ch, pieceStart);
    }

    // Whatever follows the last delimiter (possibly nothing) is the final piece.
    src.extractBetween(pieceStart, src.length(), pieces[pieceIndex]);

    return pieces;
}

// Convert `source` to bytes in the named codepage.
//
// source   - the Unicode text to convert.
// codepage - converter name handed to UnicodeString::extract().
// length   - receives the byte count reported by the sizing call.
//
// Returns a NEW_ARRAY-allocated buffer of length + 1 bytes holding the
// converted text, or NULL when the sizing call reports no bytes. Release
// the buffer with freeBytes().
static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
{
    const int32_t uLength = source.length();

    // Sizing call: a NULL target makes extract() report the needed length only.
    length = source.extract(0, uLength, NULL, codepage);

    if (length <= 0) {
        return NULL;
    }

    char *buffer = NEW_ARRAY(char, length + 1);

    source.extract(0, uLength, buffer, codepage);
    return buffer;
}

// Release a buffer through DELETE_ARRAY; the callers in this file pass the
// pointer they got from extractBytes().
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}

void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
{
    int32_t splits = 0;
    int32_t testLength = testString.length();
    UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
    UErrorCode status = U_ZERO_ERROR;
    int32_t cpLength = eSplit[0].length();
    char codepage[64];

    u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
    codepage[cpLength] = '\0';

    UCharsetDetector *csd = ucsdet_open(&status);

    int32_t byteLength = 0;
    char *bytes = extractBytes(testString, codepage, byteLength);

    if (bytes == NULL) {
#if !UCONFIG_NO_LEGACY_CONVERSION
        errln("Can't open a " + encoding + " converter for " + id);
#endif
        return;
    }

    ucsdet_setText(csd, bytes, byteLength, &status);

    int32_t matchCount = 0;
    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);


    UnicodeString name(ucsdet_getName(matches[0], &status));
    UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
    UChar *decoded = NULL;
    int32_t dLength = 0;

    if (matchCount == 0) {
        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
        goto bail;
    }

    if (name.compare(eSplit[0]) != 0) {
        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);

#ifdef DEBUG_DETECT
        for (int32_t m = 0; m < matchCount; m += 1) {
            const char *name = ucsdet_getName(matches[m], &status);
            const char *lang = ucsdet_getLanguage(matches[m], &status);
            int32_t confidence = ucsdet_getConfidence(matches[m], &status);

            printf("%s (%s) %d\n", name, lang, confidence);
        }
#endif
        goto bail;
    }

    if (splits > 1 && lang.compare(eSplit[1]) != 0) {
        errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
        goto bail;
    }

    decoded = NEW_ARRAY(UChar, testLength);
    dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);

    if (testString.compare(decoded, dLength) != 0) {
        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");

#ifdef DEBUG_DETECT
        for(int32_t i = 0; i < testLength; i += 1) {
            if(testString[i] != decoded[i]) {
                printf("Strings differ at byte %d\n", i);
                break;
            }
        }
#endif

    }

    DELETE_ARRAY(decoded);

bail:
    freeBytes(bytes);
    ucsdet_close(csd);
    delete[] eSplit;
}

const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
    UErrorCode status = U_ZERO_ERROR;
    const char *testDataDirectory = IntlTest::getSourceTestData(status);

    if (U_FAILURE(status)) {
        errln("ERROR: getPath() failed - %s", u_errorName(status));
        return NULL;
    }

    strcpy(buffer, testDataDirectory);
    strcat(buffer, filename);
    return buffer;
}

void CharsetDetectionTest::ConstructionTest()
{
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *csd = ucsdet_open(&status);
    UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
    int32_t count = uenum_count(e, &status);

#ifdef DEBUG_DETECT
    printf("There are %d recognizers.\n", count);
#endif

    for(int32_t i = 0; i < count; i += 1) {
        int32_t length;
        const char *name = uenum_next(e, &length, &status);

        if(name == NULL || length <= 0) {
            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
        }

#ifdef DEBUG_DETECT
        printf("%s\n", name);
#endif
    }

    uenum_close(e);
    ucsdet_close(csd);
}

// Convert a mostly-ASCII string containing a few Greek letters to UTF-8, run
// it through the detector, and verify that decoding the best match gives
// back exactly the original string.
void CharsetDetectionTest::UTF8Test()
{
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString ss = "This is a string with some non-ascii characters that will "
                       "be converted to UTF-8, then shoved through the detection process.  "
                       "\\u0391\\u0392\\u0393\\u0394\\u0395"
                       "Sure would be nice if our source could contain Unicode directly!";
    UnicodeString s = ss.unescape();
    int32_t byteLength = 0, sLength = s.length();
    char *bytes = extractBytes(s, "UTF-8", byteLength);
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    UChar *detected = NEW_ARRAY(UChar, sLength);
    int32_t dLength = 0;

    if (bytes == NULL || detected == NULL) {
        errln("UTF8Test: could not allocate or convert the test data.");
        goto bail;
    }

    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        errln("Detection failure for UTF-8: got no matches.");
        goto bail;
    }

    dLength = ucsdet_getUChars(match, detected, sLength, &status);

    // Compare only what was actually decoded: a shorter result would leave
    // part of the buffer uninitialized, and a longer one was truncated.
    if (dLength != sLength || s.compare(detected, dLength) != 0) {
        errln("Round-trip test failed!");
    }

    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */

bail:
    DELETE_ARRAY(detected);
    freeBytes(bytes);
    ucsdet_close(csd);
}

// Encode a BOM-prefixed Arabic string as UTF-16BE and as UTF-16LE, and check
// that each byte stream is detected as the right charset with a confidence
// of 100. The LE half still runs when the BE half fails.
void CharsetDetectionTest::UTF16Test()
{
    UErrorCode status = U_ZERO_ERROR;
    /* Notice the BOM on the start of this string */
    UChar chars[] = {
        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
        0x064a, 0x062a, 0x0000};
    UnicodeString s(chars);
    int32_t beLength = 0, leLength = 0;
    char *beBytes = extractBytes(s, "UTF-16BE", beLength);
    char *leBytes = extractBytes(s, "UTF-16LE", leLength);
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *name;
    int32_t conf;

    ucsdet_setText(csd, beBytes, beLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        errln("Encoding detection failure for UTF-16BE: got no matches.");
        goto try_le;
    }

    name  = ucsdet_getName(match, &status);
    conf  = ucsdet_getConfidence(match, &status);

    // Guard against a NULL name before strcmp() and before printing it,
    // the same way InputFilterTest checks its names.
    if (name == NULL || strcmp(name, "UTF-16BE") != 0) {
        errln("Encoding detection failure for UTF-16BE: got %s", name == NULL ? "(null)" : name);
        goto try_le; // no point in looking at confidence if we got the wrong character set.
    }

    if (conf != 100) {
        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
    }

try_le:
    ucsdet_setText(csd, leBytes, leLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        errln("Encoding detection failure for UTF-16LE: got no matches.");
        goto bail;
    }

    name  = ucsdet_getName(match, &status);
    conf = ucsdet_getConfidence(match, &status);

    if (name == NULL || strcmp(name, "UTF-16LE") != 0) {
        errln("Enconding detection failure for UTF-16LE: got %s", name == NULL ? "(null)" : name);
        goto bail; // no point in looking at confidence if we got the wrong character set.
    }

    if (conf != 100) {
        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
    }

bail:
    freeBytes(leBytes);
    freeBytes(beBytes);
    ucsdet_close(csd);
}

// Detect a short French sentence wrapped in English-looking markup, once
// with the input filter enabled and once with it disabled. Both runs must
// report ISO-8859-1; the filtered run must report language "fr" and the
// unfiltered run "en".
void CharsetDetectionTest::InputFilterTest()
{
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    UnicodeString s  = ss.unescape();
    int32_t byteLength = 0;
    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *lang, *name;

    ucsdet_enableInputFilter(csd, TRUE);

    if (!ucsdet_isInputFilterEnabled(csd)) {
        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
    }


    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        errln("Turning on the input filter resulted in no matches.");
        goto turn_off;
    }

    name = ucsdet_getName(match, &status);

    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
        // name may be NULL on this path; never pass a NULL pointer to %s.
        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name == NULL ? "(null)" : name);
    } else {
        lang = ucsdet_getLanguage(match, &status);

        if (lang == NULL || strcmp(lang, "fr") != 0) {
            errln("Input filter did not strip markup!");
        }
    }

turn_off:
    ucsdet_enableInputFilter(csd, FALSE);
    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        errln("Turning off the input filter resulted in no matches.");
        goto bail;
    }

    name = ucsdet_getName(match, &status);

    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name == NULL ? "(null)" : name);
    } else {
        lang = ucsdet_getLanguage(match, &status);

        if (lang == NULL || strcmp(lang, "en") != 0) {
            errln("Unfiltered input did not detect as English!");
        }
    }

bail:
    freeBytes(bytes);
    ucsdet_close(csd);
}

// Check that English text containing curly quotes (encoded as C1-range bytes
// in windows-1252) is detected as windows-1252, while plain English text is
// detected as ISO-8859-1. Compiled out when legacy conversion is disabled.
void CharsetDetectionTest::C1BytesTest()
{
#if !UCONFIG_NO_LEGACY_CONVERSION
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
    UnicodeString sWindows  = ssWindows.unescape();
    int32_t lISO = 0, lWindows = 0;
    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *name;

    ucsdet_setText(csd, bWindows, lWindows, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        errln("English test with C1 bytes got no matches.");
        goto bail;
    }

    name  = ucsdet_getName(match, &status);

    // Guard against a NULL name before strcmp() and before printing it,
    // the same way InputFilterTest checks its names.
    if (name == NULL || strcmp(name, "windows-1252") != 0) {
        errln("English text with C1 bytes does not detect as windows-1252, but as %s", name == NULL ? "(null)" : name);
    }

    ucsdet_setText(csd, bISO, lISO, &status);
    match = ucsdet_detect(csd, &status);

    if (match == NULL) {
        errln("English text without C1 bytes got no matches.");
        goto bail;
    }

    name  = ucsdet_getName(match, &status);

    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name == NULL ? "(null)" : name);
    }

bail:
    freeBytes(bWindows);
    freeBytes(bISO);

    ucsdet_close(csd);
#endif
}

void CharsetDetectionTest::DetectionTest()
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    UErrorCode status = U_ZERO_ERROR;
    char path[2048];
    const char *testFilePath = getPath(path, "csdetest.xml");

    if (testFilePath == NULL) {
        return; /* Couldn't get path: error message already output. */
    }

    UXMLParser  *parser = UXMLParser::createParser(status);
    if (!assertSuccess("UXMLParser::createParser",status)) return;
    UXMLElement *root   = parser->parseFile(testFilePath, status);
    if (!assertSuccess( "parseFile",status)) return;

    UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
    UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
    UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");

    const UXMLElement *testCase;
    int32_t tc = 0;

    while((testCase = root->nextChildElement(tc)) != NULL) {
        if (testCase->getTagName().compare(test_case) == 0) {
            const UnicodeString *id = testCase->getAttribute(id_attr);
            const UnicodeString *encodings = testCase->getAttribute(enc_attr);
            const UnicodeString  text = testCase->getText(TRUE);
            int32_t encodingCount;
            UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);

            for(int32_t e = 0; e < encodingCount; e += 1) {
                checkEncoding(text, encodingList[e], *id);
            }

            delete[] encodingList;
        }
    }

    delete root;
    delete parser;
#endif
}
Commit	Line	Data
73c04bcf A	1	/*
73c04bcf A	2	**********************************************************************
46f4442e	3	* Copyright (C) 2005-2008, International Business Machines
73c04bcf A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	*/
	7
	8
	9	#include "unicode/utypes.h"
	10	#include "unicode/ucsdet.h"
	11	#include "unicode/ucnv.h"
	12	#include "unicode/unistr.h"
	13	#include "unicode/putil.h"
	14
	15	#include "intltest.h"
	16	#include "csdetest.h"
	17
	18	#include "xmlparser.h"
	19
	20	#include <stdlib.h>
	21	#include <string.h>
	22
	23	#ifdef DEBUG_DETECT
	24	#include <stdio.h>
	25	#endif
	26
	27	#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
	28
	29	#define NEW_ARRAY(type,count) (type ) /uprv_/malloc((count) sizeof(type))
	30	#define DELETE_ARRAY(array) /uprv_/free((void *) (array))
	31
	32	#define CH_SPACE 0x0020
	33	#define CH_SLASH 0x002F
	34
	35	//---------------------------------------------------------------------------
	36	//
	37	// Test class boilerplate
	38	//
	39	//---------------------------------------------------------------------------
	40	CharsetDetectionTest::CharsetDetectionTest()
	41	{
	42	}
	43
	44
	45	CharsetDetectionTest::~CharsetDetectionTest()
	46	{
	47	}
	48
	49
	50
	51	void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /par/ )
	52	{
	53	if (exec) logln("TestSuite CharsetDetectionTest: ");
	54	switch (index) {
	55	case 0: name = "ConstructionTest";
	56	if (exec) ConstructionTest();
	57	break;
	58
	59	case 1: name = "UTF8Test";
	60	if (exec) UTF8Test();
	61	break;
	62
	63	case 2: name = "UTF16Test";
	64	if (exec) UTF16Test();
	65	break;
	66
	67	case 3: name = "C1BytesTest";
68	if (exec) C1BytesTest();
69	break;
70
71	case 4: name = "InputFilterTest";
72	if (exec) InputFilterTest();
73	break;
74
75	case 5: name = "DetectionTest";
76	if (exec) DetectionTest();
77	break;
78
79	default: name = "";
80	break; //needed to end loop
81	}
82	}
83
84	static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
85	{
86	int32_t offset = -1;
87
88	splits = 1;
89	while((offset = src.indexOf(ch, offset + 1)) >= 0) {
90	splits += 1;
91	}
92
93	UnicodeString *result = new UnicodeString[splits];
94
95	int32_t start = 0;
96	int32_t split = 0;
97	int32_t end;
98
99	while((end = src.indexOf(ch, start)) >= 0) {
100	src.extractBetween(start, end, result[split++]);
101	start = end + 1;
102	}
103
104	src.extractBetween(start, src.length(), result[split]);
105
106	return result;
107	}
108
109	static char extractBytes(const UnicodeString &source, const char codepage, int32_t &length)
110	{
111	int32_t sLength = source.length();
112	char *bytes = NULL;
113
114	length = source.extract(0, sLength, NULL, codepage);
115
116	if (length > 0) {
117	bytes = NEW_ARRAY(char, length + 1);
118	source.extract(0, sLength, bytes, codepage);
119	}
120
121	return bytes;
122	}
123
124	static void freeBytes(char *bytes)
125	{
126	DELETE_ARRAY(bytes);
127	}
128
129	void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
130	{
131	int32_t splits = 0;
132	int32_t testLength = testString.length();
133	UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
134	UErrorCode status = U_ZERO_ERROR;
135	int32_t cpLength = eSplit[0].length();
136	char codepage[64];
137
138	u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
139	codepage[cpLength] = '\0';
140
141	UCharsetDetector *csd = ucsdet_open(&status);
142
143	int32_t byteLength = 0;
144	char *bytes = extractBytes(testString, codepage, byteLength);
145
146	if (bytes == NULL) {
147	#if !UCONFIG_NO_LEGACY_CONVERSION
148	errln("Can't open a " + encoding + " converter for " + id);
149	#endif
150	return;
151	}
152
153	ucsdet_setText(csd, bytes, byteLength, &status);
154
155	int32_t matchCount = 0;
156	const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
157
158
159	UnicodeString name(ucsdet_getName(matches[0], &status));
160	UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
161	UChar *decoded = NULL;
162	int32_t dLength = 0;
163
164	if (matchCount == 0) {
165	errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
166	goto bail;
167	}
168
169	if (name.compare(eSplit[0]) != 0) {
170	errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
171
172	#ifdef DEBUG_DETECT
173	for (int32_t m = 0; m < matchCount; m += 1) {
174	const char *name = ucsdet_getName(matches[m], &status);
175	const char *lang = ucsdet_getLanguage(matches[m], &status);
176	int32_t confidence = ucsdet_getConfidence(matches[m], &status);
177
178	printf("%s (%s) %d\n", name, lang, confidence);
179	}
180	#endif
181	goto bail;
182	}
183
184	if (splits > 1 && lang.compare(eSplit[1]) != 0) {
185	errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
186	goto bail;
187	}
188
189	decoded = NEW_ARRAY(UChar, testLength);
190	dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
191
192	if (testString.compare(decoded, dLength) != 0) {
193	errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
194
195	#ifdef DEBUG_DETECT
196	for(int32_t i = 0; i < testLength; i += 1) {
197	if(testString[i] != decoded[i]) {
198	printf("Strings differ at byte %d\n", i);
199	break;
200	}
201	}
202	#endif
203
204	}
205
206	DELETE_ARRAY(decoded);
207
208	bail:
209	freeBytes(bytes);
210	ucsdet_close(csd);
211	delete[] eSplit;
212	}
213
214	const char CharsetDetectionTest::getPath(char buffer[2048], const char filename) {
215	UErrorCode status = U_ZERO_ERROR;
216	const char *testDataDirectory = IntlTest::getSourceTestData(status);
217
218	if (U_FAILURE(status)) {
219	errln("ERROR: getPath() failed - %s", u_errorName(status));
220	return NULL;
221	}
222
223	strcpy(buffer, testDataDirectory);
224	strcat(buffer, filename);
225	return buffer;
226	}
227
228	void CharsetDetectionTest::ConstructionTest()
229	{
230	UErrorCode status = U_ZERO_ERROR;
231	UCharsetDetector *csd = ucsdet_open(&status);
232	UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
233	int32_t count = uenum_count(e, &status);
234
235	#ifdef DEBUG_DETECT
236	printf("There are %d recognizers.\n", count);
237	#endif
238
239	for(int32_t i = 0; i < count; i += 1) {
240	int32_t length;
241	const char *name = uenum_next(e, &length, &status);
242
243	if(name == NULL \|\| length <= 0) {
244	errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
245	}
246
247	#ifdef DEBUG_DETECT
248	printf("%s\n", name);
249	#endif
250	}
251
252	uenum_close(e);
253	ucsdet_close(csd);
254	}
255
256	void CharsetDetectionTest::UTF8Test()
257	{
258	UErrorCode status = U_ZERO_ERROR;
259	UnicodeString ss = "This is a string with some non-ascii characters that will "
260	"be converted to UTF-8, then shoved through the detection process. "
261	"\\u0391\\u0392\\u0393\\u0394\\u0395"
262	"Sure would be nice if our source could contain Unicode directly!";
263	UnicodeString s = ss.unescape();
264	int32_t byteLength = 0, sLength = s.length();
265	char *bytes = extractBytes(s, "UTF-8", byteLength);
266	UCharsetDetector *csd = ucsdet_open(&status);
267	const UCharsetMatch *match;
268	UChar *detected = NEW_ARRAY(UChar, sLength);
269
270	ucsdet_setText(csd, bytes, byteLength, &status);
271	match = ucsdet_detect(csd, &status);
272
273	if (match == NULL) {
274	errln("Detection failure for UTF-8: got no matches.");
275	goto bail;
276	}
277
278	ucsdet_getUChars(match, detected, sLength, &status);
279
280	if (s.compare(detected, sLength) != 0) {
281	errln("Round-trip test failed!");
282	}
283
284	ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
285
286	bail:
287	DELETE_ARRAY(detected);
288	freeBytes(bytes);
289	ucsdet_close(csd);
290	}
291
292	void CharsetDetectionTest::UTF16Test()
293	{
294	UErrorCode status = U_ZERO_ERROR;
295	/* Notice the BOM on the start of this string */
296	UChar chars[] = {
297	0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
298	0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
299	0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
300	0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
301	0x064a, 0x062a, 0x0000};
302	UnicodeString s(chars);
303	int32_t beLength = 0, leLength = 0;
304	char *beBytes = extractBytes(s, "UTF-16BE", beLength);
305	char *leBytes = extractBytes(s, "UTF-16LE", leLength);
306	UCharsetDetector *csd = ucsdet_open(&status);
307	const UCharsetMatch *match;
308	const char *name;
309	int32_t conf;
310
311	ucsdet_setText(csd, beBytes, beLength, &status);
312	match = ucsdet_detect(csd, &status);
313
314	if (match == NULL) {
315	errln("Encoding detection failure for UTF-16BE: got no matches.");
316	goto try_le;
317	}
318
319	name = ucsdet_getName(match, &status);
320	conf = ucsdet_getConfidence(match, &status);
321
322	if (strcmp(name, "UTF-16BE") != 0) {
323	errln("Encoding detection failure for UTF-16BE: got %s", name);
324	goto try_le; // no point in looking at confidence if we got the wrong character set.
325	}
326
327	if (conf != 100) {
328	errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
329	}
330
331	try_le:
332	ucsdet_setText(csd, leBytes, leLength, &status);
333	match = ucsdet_detect(csd, &status);
334
335	if (match == NULL) {
336	errln("Encoding detection failure for UTF-16LE: got no matches.");
337	goto bail;
338	}
339
340	name = ucsdet_getName(match, &status);
341	conf = ucsdet_getConfidence(match, &status);
342
343
344	if (strcmp(name, "UTF-16LE") != 0) {
345	errln("Enconding detection failure for UTF-16LE: got %s", name);
346	goto bail; // no point in looking at confidence if we got the wrong character set.
347	}
348
349	if (conf != 100) {
350	errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
351	}
352
353	bail:
354	freeBytes(leBytes);
355	freeBytes(beBytes);
356	ucsdet_close(csd);
357	}
358
359	void CharsetDetectionTest::InputFilterTest()
360	{
361	UErrorCode status = U_ZERO_ERROR;
362	UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
363	UnicodeString s = ss.unescape();
364	int32_t byteLength = 0;
365	char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
366	UCharsetDetector *csd = ucsdet_open(&status);
367	const UCharsetMatch *match;
368	const char lang, name;
369
370	ucsdet_enableInputFilter(csd, TRUE);
371
372	if (!ucsdet_isInputFilterEnabled(csd)) {
373	errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
374	}
375
376
377	ucsdet_setText(csd, bytes, byteLength, &status);
378	match = ucsdet_detect(csd, &status);
379
380	if (match == NULL) {
381	errln("Turning on the input filter resulted in no matches.");
382	goto turn_off;
383	}
384
385	name = ucsdet_getName(match, &status);
386
387	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
388	errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
389	} else {
390	lang = ucsdet_getLanguage(match, &status);
391
392	if (lang == NULL \|\| strcmp(lang, "fr") != 0) {
393	errln("Input filter did not strip markup!");
394	}
395	}
396
397	turn_off:
398	ucsdet_enableInputFilter(csd, FALSE);
399	ucsdet_setText(csd, bytes, byteLength, &status);
400	match = ucsdet_detect(csd, &status);
401
402	if (match == NULL) {
403	errln("Turning off the input filter resulted in no matches.");
404	goto bail;
405	}
406
407	name = ucsdet_getName(match, &status);
408
409	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
410	errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
411	} else {
412	lang = ucsdet_getLanguage(match, &status);
413
414	if (lang == NULL \|\| strcmp(lang, "en") != 0) {
415	errln("Unfiltered input did not detect as English!");
416	}
417	}
418
419	bail:
420	freeBytes(bytes);
421	ucsdet_close(csd);
422	}
423
424	void CharsetDetectionTest::C1BytesTest()
425	{
426	#if !UCONFIG_NO_LEGACY_CONVERSION
427	UErrorCode status = U_ZERO_ERROR;
428	UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
46f4442e	429	UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
73c04bcf A	430	UnicodeString sWindows = ssWindows.unescape();
	431	int32_t lISO = 0, lWindows = 0;
	432	char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
	433	char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
	434	UCharsetDetector *csd = ucsdet_open(&status);
	435	const UCharsetMatch *match;
	436	const char *name;
	437
	438	ucsdet_setText(csd, bWindows, lWindows, &status);
	439	match = ucsdet_detect(csd, &status);
	440
	441	if (match == NULL) {
	442	errln("English test with C1 bytes got no matches.");
	443	goto bail;
	444	}
	445
	446	name = ucsdet_getName(match, &status);
	447
	448	if (strcmp(name, "windows-1252") != 0) {
	449	errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
	450	}
	451
	452	ucsdet_setText(csd, bISO, lISO, &status);
	453	match = ucsdet_detect(csd, &status);
	454
	455	if (match == NULL) {
	456	errln("English text without C1 bytes got no matches.");
	457	goto bail;
	458	}
	459
	460	name = ucsdet_getName(match, &status);
	461
	462	if (strcmp(name, "ISO-8859-1") != 0) {
	463	errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
	464	}
	465
	466	bail:
	467	freeBytes(bWindows);
	468	freeBytes(bISO);
	469
	470	ucsdet_close(csd);
	471	#endif
	472	}
	473
	474	void CharsetDetectionTest::DetectionTest()
	475	{
	476	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
	477	UErrorCode status = U_ZERO_ERROR;
	478	char path[2048];
	479	const char *testFilePath = getPath(path, "csdetest.xml");
	480
	481	if (testFilePath == NULL) {
	482	return; /* Couldn't get path: error message already output. */
	483	}
	484
	485	UXMLParser *parser = UXMLParser::createParser(status);
	486	if (!assertSuccess("UXMLParser::createParser",status)) return;
	487	UXMLElement *root = parser->parseFile(testFilePath, status);
	488	if (!assertSuccess( "parseFile",status)) return;
	489
	490	UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
	491	UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
	492	UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
	493
494	const UXMLElement *testCase;
495	int32_t tc = 0;
496
497	while((testCase = root->nextChildElement(tc)) != NULL) {
498	if (testCase->getTagName().compare(test_case) == 0) {
499	const UnicodeString *id = testCase->getAttribute(id_attr);
500	const UnicodeString *encodings = testCase->getAttribute(enc_attr);
501	const UnicodeString text = testCase->getText(TRUE);
502	int32_t encodingCount;
503	UnicodeString encodingList = split(encodings, CH_SPACE, encodingCount);
504
505	for(int32_t e = 0; e < encodingCount; e += 1) {
506	checkEncoding(text, encodingList[e], *id);
507	}
508
509	delete[] encodingList;
510	}
511	}
512
513	delete root;
514	delete parser;
515	#endif
516	}
517
518