[apple/icu.git] / icuSources / test / intltest / canittst.cpp

/********************************************************************
 * COPYRIGHT: 
 * Copyright (c) 2002-2004, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************
 *
 * @author Mark E. Davis
 * @author Vladimir Weinstein
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "intltest.h"
#include "cstring.h"
#include "canittst.h"
#include "unicode/caniter.h"
#include "unicode/normlzr.h"
#include "unicode/uchar.h"
#include "hash.h"

// Number of elements in a true array (not a pointer), as a signed 32-bit count.
// The argument is fully parenthesized so that any array-valued expression works.
#define ARRAY_LENGTH(array) ((int32_t)(sizeof (array) / sizeof (*(array))))

// Emits one `case` label for the switch in runIndexedTest: it stores the
// stringified test name in `name` and, when `exec` is set, logs a header and
// an empty line and then calls the member function `test`.
// Relies on `name`, `exec` and logln being visible at the expansion site.
#define CASE(id,test) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln((UnicodeString)""); \
                              test();                   \
                          }                             \
                          break

/**
 * Test dispatcher: maps a numeric index to one of this class's test methods.
 *
 * @param index  Which test to select (0 = TestBasic, 1 = TestExhaustive, 2 = TestAPI).
 * @param exec   When true the selected test is run; otherwise only its name is reported.
 * @param name   Receives the selected test's name, or "" for an unknown index.
 */
void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
                                         const char* &name, char* /*par*/) {
    switch (index) {
        CASE(0, TestBasic);
        CASE(1, TestExhaustive);
        CASE(2, TestAPI);
      default: name = ""; break;
    }
}

/**
 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects.
 * (This helper is currently disabled; the code is kept here for reference only.)
static UnicodeString str(const char *input)
{
    UnicodeString str(input, ""); // Invariant conversion
    return str.unescape();
}
 */


/**
 * Creates the test object with both transliterator pointers unset (NULL).
 */
CanonicalIteratorTest::CanonicalIteratorTest()
    : nameTrans(NULL),
      hexTrans(NULL)
{}

/**
 * Frees the transliterators owned by this test object, if any were created.
 * Applying delete to a null pointer does nothing, so no explicit checks are
 * needed.
 */
CanonicalIteratorTest::~CanonicalIteratorTest()
{
#if !UCONFIG_NO_TRANSLITERATION
    delete nameTrans;
    delete hexTrans;
#endif
}

void CanonicalIteratorTest::TestExhaustive() {
    UErrorCode status = U_ZERO_ERROR;
    CanonicalIterator it("", status);
    UChar32 i = 0;
    UnicodeString s, decomp, comp;
    // Test static and dynamic class IDs
    if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
        errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
    }
    for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
        //for (i = 0xae00; i < 0xaf00; ++i) {
        
        if ((i % 0x100) == 0) {
            logln("Testing U+%06X", i);
        }
        
        // skip characters we know don't have decomps
        int8_t type = u_charType(i);
        if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
            || type == U_SURROGATE) continue;
        
        s = i;
        s += (UChar32)0x0345; //"\\u0345";
        
        Normalizer::decompose(s, FALSE, 0, decomp, status);
        Normalizer::compose(s, FALSE, 0, comp, status);
        
        // skip characters that don't have either decomp.
        // need quick test for this!
        if (s == decomp && s == comp) {
            continue;
        }
        
        it.setSource(s, status);
        UBool gotDecomp = FALSE;
        UBool gotComp = FALSE;
        UBool gotSource = FALSE;
        
        while (TRUE) {
            UnicodeString item = it.next();
            if (item.isBogus()) break;
            if (item == s) gotSource = TRUE;
            if (item == decomp) gotDecomp = TRUE;
            if (item == comp) gotComp = TRUE;
        }
        
        if (!gotSource || !gotDecomp || !gotComp) {
            errln("FAIL CanonicalIterator: " + s + (int)i);
        }
    }
}

void CanonicalIteratorTest::TestBasic() {

    UErrorCode status = U_ZERO_ERROR;

    static const char * const testArray[][2] = {
        {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
            "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
            "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
            "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
        {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
        {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
    };
    
#if 0
    // This is not interesting for C/C++ as the data is already built beforehand
    // check build
    UnicodeSet ss = CanonicalIterator.getSafeStart();
    logln("Safe Start: " + ss.toPattern(true));
    ss = CanonicalIterator.getStarts('a');
    expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
        new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
        + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
            );
#endif

    // check permute
    // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!

    Hashtable *permutations = new Hashtable(FALSE, status);
    permutations->setValueDeleter(uhash_deleteUnicodeString);
    UnicodeString toPermute("ABC");

    CanonicalIterator::permute(toPermute, FALSE, permutations, status);

    logln("testing permutation");
  
    expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");

    delete permutations;
    
    // try samples
    logln("testing samples");
    Hashtable *set = new Hashtable(FALSE, status);
    set->setValueDeleter(uhash_deleteUnicodeString);
    int32_t i = 0;
    CanonicalIterator it("", status);
    if(U_SUCCESS(status)) {
      for (i = 0; i < ARRAY_LENGTH(testArray); ++i) {
          //logln("Results for: " + name.transliterate(testArray[i]));
          UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
          it.setSource(testStr, status);
          set->removeAll();
          while (TRUE) {
              //UnicodeString *result = new UnicodeString(it.next());
              UnicodeString result(it.next());
              if (result.isBogus()) {
                  break;
              }
              set->put(result, new UnicodeString(result), status); // Add result to the table
              //logln(++counter + ": " + hex.transliterate(result));
              //logln(" = " + name.transliterate(result));
          }
          expectEqual(i + ": ", testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));

      }
    } else {
      errln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
    }
    delete set;
}

void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
    if (!(a==b)) {
        errln("FAIL: " + message + getReadable(item));
        errln("\t" + getReadable(a));
        errln("\t" + getReadable(b));
    } else {
        logln("Checked: " + message + getReadable(item));
        logln("\t" + getReadable(a));
        logln("\t" + getReadable(b));
    }
}

/**
 * Formats a string for log output.
 *
 * @param s  The string to display.
 * @return   "" for an empty string; otherwise "[" + text + "]", where text is
 *           s passed through the "hex" transliterator for characters outside
 *           the range space..U+007F. In verbose mode the same string passed
 *           through the "name" transliterator, followed by ";", is placed
 *           before the hex form.
 *
 * The transliterators are created on first use and cached in nameTrans and
 * hexTrans. If one cannot be created (for example because its data is
 * missing), that part of the output is degraded instead of dereferencing a
 * null pointer: the name part is omitted and the hex part falls back to the
 * unmodified string.
 */
UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
    if (s.length() == 0) return "";
    UnicodeString result = "[";
    // set up for readable display
#if !UCONFIG_NO_TRANSLITERATION
    if(verbose) {
      if (nameTrans == NULL) {
          // Separate error code per factory call so that a failure here cannot
          // carry over into the creation of hexTrans below.
          UErrorCode nameStatus = U_ZERO_ERROR;
          nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, nameStatus);
      }
      if (nameTrans != NULL) { // maybe there is no data and transliterator cannot be instantiated
        UnicodeString sName = s;
        nameTrans->transliterate(sName);
        result += sName;
        result += ";";
      }
    }
    if (hexTrans == NULL) {
        UErrorCode hexStatus = U_ZERO_ERROR;
        hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, hexStatus);
    }
#endif
    UnicodeString sHex = s;
#if !UCONFIG_NO_TRANSLITERATION
    if(hexTrans != NULL) { // maybe there is no data and transliterator cannot be instantiated
      hexTrans->transliterate(sHex);
    }
#endif
    result += sHex;
    result += "]";
    return result;
    //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]";
}

/**
 * qsort() comparison callback for an array of UnicodeString pointers.
 *
 * @param s1  Address of the first array element (a UnicodeString pointer).
 * @param s2  Address of the second array element (a UnicodeString pointer).
 * @return    The result of UnicodeString::compare() of the first string
 *            against the second.
 */
U_CFUNC int U_CALLCONV
compareUnicodeStrings(const void *s1, const void *s2) {
  // Each argument points at an array slot that itself holds a string pointer.
  const UnicodeString *lhs = *static_cast<const UnicodeString * const *>(s1);
  const UnicodeString *rhs = *static_cast<const UnicodeString * const *>(s2);

  return lhs->compare(*rhs);
}


/**
 * Joins the UnicodeString values of a hashtable into one sorted,
 * ", "-separated string.
 *
 * @param col  Hashtable whose values are UnicodeString pointers; the table
 *             keeps ownership of them.
 * @return     The values in the order produced by compareUnicodeStrings,
 *             separated by ", ". An empty string is returned for a NULL or
 *             empty table.
 *
 * Each value is also written to the log (in iteration order) before sorting.
 */
UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
    UnicodeString result;

    // Nothing to join: return early instead of reading resArray[0] of an
    // empty array.
    const int32_t capacity = (col == NULL) ? 0 : col->count();
    if (capacity <= 0) {
        return result;
    }

    // Iterate over the Hashtable, then qsort.

    UnicodeString **resArray = new UnicodeString*[capacity];
    int32_t filled = 0;

    int32_t el = -1;
    // The extra bound keeps the writes inside the allocation even if the
    // iteration were to yield more elements than count() reported.
    for (const UHashElement *ne = col->nextElement(el);
         ne != NULL && filled < capacity;
         ne = col->nextElement(el)) {
      resArray[filled++] = (UnicodeString *)(ne->value.pointer);
    }

    int32_t i = 0;
    for(i = 0; i<filled; ++i) {
      logln(*resArray[i]);
    }

    qsort(resArray, filled, sizeof(UnicodeString *), compareUnicodeStrings);

    for(i = 0; i<filled; ++i) {
      if (i != 0) {
        result += ", ";
      }
      result += *resArray[i];
    }

    delete [] resArray;

    return result;
}

void CanonicalIteratorTest::TestAPI() {
  UErrorCode status = U_ZERO_ERROR;
  // Test reset and getSource
  UnicodeString start("ljubav");
  logln("Testing CanonicalIterator::getSource");
  logln("Instantiating canonical iterator with string "+start);
  CanonicalIterator can(start, status);
  UnicodeString source = can.getSource();
  logln("CanonicalIterator::getSource returned "+source);
  if(start != source) {
    errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
  }
  logln("Testing CanonicalIterator::reset");
  UnicodeString next = can.next();
  logln("CanonicalIterator::next returned "+next);

  can.reset();

  UnicodeString afterReset = can.next();
  logln("After reset, CanonicalIterator::next returned "+afterReset);

  if(next != afterReset) {
    errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
  }
  
  logln("Testing getStaticClassID and getDynamicClassID");
  if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
      errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
  }
}

#endif /* #if !UCONFIG_NO_NORMALIZATION */
Commit	Line	Data
b75a7d8f A	1	/********************************************************************
b75a7d8f A	2	* COPYRIGHT:
374ca955	3	* Copyright (c) 2002-2004, International Business Machines Corporation and
b75a7d8f A	4	* others. All Rights Reserved.
	5	********************************************************************
	6	*
	7	* @author Mark E. Davis
	8	* @author Vladimir Weinstein
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_NORMALIZATION
	14
	15	#include "intltest.h"
	16	#include "cstring.h"
	17	#include "canittst.h"
	18	#include "unicode/caniter.h"
374ca955	19	#include "unicode/normlzr.h"
b75a7d8f	20	#include "unicode/uchar.h"
374ca955	21	#include "hash.h"
b75a7d8f A	22
	23	#define ARRAY_LENGTH(array) ((int32_t)(sizeof (array) / sizeof (*array)))
	24
	25	#define CASE(id,test) case id: \
	26	name = #test; \
	27	if (exec) { \
	28	logln(#test "---"); \
	29	logln((UnicodeString)""); \
	30	test(); \
	31	} \
	32	break
	33
	34	void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
	35	const char* &name, char* /par/) {
	36	switch (index) {
	37	CASE(0, TestBasic);
	38	CASE(1, TestExhaustive);
	39	CASE(2, TestAPI);
	40	default: name = ""; break;
	41	}
	42	}
	43
	44	/**
	45	* Convert Java-style strings with \u Unicode escapes into UnicodeString objects
	46	static UnicodeString str(const char *input)
	47	{
	48	UnicodeString str(input, ""); // Invariant conversion
	49	return str.unescape();
	50	}
	51	*/
	52
	53
	54	CanonicalIteratorTest::CanonicalIteratorTest() :
	55	nameTrans(NULL), hexTrans(NULL)
	56	{
	57	}
	58
	59	CanonicalIteratorTest::~CanonicalIteratorTest()
	60	{
	61	#if !UCONFIG_NO_TRANSLITERATION
	62	if(nameTrans != NULL) {
	63	delete(nameTrans);
	64	}
	65	if(hexTrans != NULL) {
	66	delete(hexTrans);
	67	}
	68	#endif
	69	}
	70
	71	void CanonicalIteratorTest::TestExhaustive() {
	72	UErrorCode status = U_ZERO_ERROR;
	73	CanonicalIterator it("", status);
	74	UChar32 i = 0;
	75	UnicodeString s, decomp, comp;
374ca955	76	// Test static and dynamic class IDs
b75a7d8f	77	if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
374ca955 A	78	errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
374ca955 A	79	}
b75a7d8f A	80	for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
	81	//for (i = 0xae00; i < 0xaf00; ++i) {
	82
	83	if ((i % 0x100) == 0) {
	84	logln("Testing U+%06X", i);
	85	}
	86
	87	// skip characters we know don't have decomps
	88	int8_t type = u_charType(i);
	89	if (type == U_UNASSIGNED \|\| type == U_PRIVATE_USE_CHAR
	90	\|\| type == U_SURROGATE) continue;
	91
	92	s = i;
	93	s += (UChar32)0x0345; //"\\u0345";
	94
	95	Normalizer::decompose(s, FALSE, 0, decomp, status);
	96	Normalizer::compose(s, FALSE, 0, comp, status);
	97
	98	// skip characters that don't have either decomp.
	99	// need quick test for this!
	100	if (s == decomp && s == comp) {
	101	continue;
	102	}
	103
	104	it.setSource(s, status);
	105	UBool gotDecomp = FALSE;
	106	UBool gotComp = FALSE;
	107	UBool gotSource = FALSE;
	108
	109	while (TRUE) {
	110	UnicodeString item = it.next();
	111	if (item.isBogus()) break;
	112	if (item == s) gotSource = TRUE;
	113	if (item == decomp) gotDecomp = TRUE;
	114	if (item == comp) gotComp = TRUE;
	115	}
	116
	117	if (!gotSource \|\| !gotDecomp \|\| !gotComp) {
	118	errln("FAIL CanonicalIterator: " + s + (int)i);
	119	}
	120	}
	121	}
	122
	123	void CanonicalIteratorTest::TestBasic() {
	124
	125	UErrorCode status = U_ZERO_ERROR;
	126
	127	static const char * const testArray[][2] = {
	128	{"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
	129	"A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
	130	"\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
	131	"\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
	132	{"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
	133	{"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
	134	};
	135
	136	#if 0
	137	// This is not interesting for C/C++ as the data is already built beforehand
	138	// check build
	139	UnicodeSet ss = CanonicalIterator.getSafeStart();
	140	logln("Safe Start: " + ss.toPattern(true));
	141	ss = CanonicalIterator.getStarts('a');
	142	expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
	143	new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
144	+ "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
145	);
146	#endif
147
148	// check permute
149	// NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
150
151	Hashtable *permutations = new Hashtable(FALSE, status);
152	permutations->setValueDeleter(uhash_deleteUnicodeString);
153	UnicodeString toPermute("ABC");
154
155	CanonicalIterator::permute(toPermute, FALSE, permutations, status);
156
157	logln("testing permutation");
158
159	expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
160
161	delete permutations;
162
163	// try samples
164	logln("testing samples");
165	Hashtable *set = new Hashtable(FALSE, status);
166	set->setValueDeleter(uhash_deleteUnicodeString);
167	int32_t i = 0;
168	CanonicalIterator it("", status);
169	if(U_SUCCESS(status)) {
170	for (i = 0; i < ARRAY_LENGTH(testArray); ++i) {
171	//logln("Results for: " + name.transliterate(testArray[i]));
172	UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
173	it.setSource(testStr, status);
174	set->removeAll();
175	while (TRUE) {
176	//UnicodeString *result = new UnicodeString(it.next());
177	UnicodeString result(it.next());
178	if (result.isBogus()) {
179	break;
180	}
181	set->put(result, new UnicodeString(result), status); // Add result to the table
182	//logln(++counter + ": " + hex.transliterate(result));
183	//logln(" = " + name.transliterate(result));
184	}
185	expectEqual(i + ": ", testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
186
187	}
188	} else {
189	errln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
190	}
191	delete set;
192	}
193
194	void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
195	if (!(a==b)) {
196	errln("FAIL: " + message + getReadable(item));
197	errln("\t" + getReadable(a));
198	errln("\t" + getReadable(b));
199	} else {
200	logln("Checked: " + message + getReadable(item));
201	logln("\t" + getReadable(a));
202	logln("\t" + getReadable(b));
203	}
204	}
205
206	UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
207	UErrorCode status = U_ZERO_ERROR;
208	UnicodeString result = "[";
209	if (s.length() == 0) return "";
210	// set up for readable display
211	#if !UCONFIG_NO_TRANSLITERATION
212	if(verbose) {
213	if (nameTrans == NULL)
214	nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status);
215	UnicodeString sName = s;
216	nameTrans->transliterate(sName);
217	result += sName;
218	result += ";";
219	}
220	if (hexTrans == NULL)
221	hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status);
222	#endif
223	UnicodeString sHex = s;
224	#if !UCONFIG_NO_TRANSLITERATION
225	if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated
226	hexTrans->transliterate(sHex);
227	}
228	#endif
229	result += sHex;
230	result += "]";
231	return result;
232	//return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]";
233	}
234
374ca955 A	235	U_CFUNC int U_CALLCONV
374ca955 A	236	compareUnicodeStrings(const void s1, const void s2) {
b75a7d8f A	237	UnicodeString st1 = (UnicodeString )s1;
	238	UnicodeString st2 = (UnicodeString )s2;
	239
	240	return (st1)->compare(*st2);
	241	}
	242
	243
	244	UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
	245	UnicodeString result;
	246
	247	// Iterate over the Hashtable, then qsort.
	248
	249	UnicodeString *resArray = new UnicodeString[col->count()];
	250	int32_t i = 0;
	251
	252	const UHashElement *ne = NULL;
	253	int32_t el = -1;
	254	//Iterator it = basic.iterator();
	255	ne = col->nextElement(el);
	256	//while (it.hasNext())
	257	while (ne != NULL) {
	258	//String item = (String) it.next();
	259	UnicodeString item = (UnicodeString )(ne->value.pointer);
	260	resArray[i++] = item;
	261	ne = col->nextElement(el);
	262	}
	263
	264	for(i = 0; i<col->count(); ++i) {
	265	logln(*resArray[i]);
	266	}
	267
	268	qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
	269
	270	result = *resArray[0];
	271
	272	for(i = 1; i<col->count(); ++i) {
	273	result += ", ";
	274	result += *resArray[i];
	275	}
	276
	277	/*
	278	Iterator it = col.iterator();
	279	while (it.hasNext()) {
	280	if (result.length() != 0) result.append(", ");
	281	result.append(it.next().toString());
	282	}
	283	*/
	284
	285	delete [] resArray;
	286
	287	return result;
	288	}
	289
	290	void CanonicalIteratorTest::TestAPI() {
	291	UErrorCode status = U_ZERO_ERROR;
	292	// Test reset and getSource
	293	UnicodeString start("ljubav");
	294	logln("Testing CanonicalIterator::getSource");
	295	logln("Instantiating canonical iterator with string "+start);
	296	CanonicalIterator can(start, status);
	297	UnicodeString source = can.getSource();
	298	logln("CanonicalIterator::getSource returned "+source);
	299	if(start != source) {
	300	errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
301	}
302	logln("Testing CanonicalIterator::reset");
303	UnicodeString next = can.next();
304	logln("CanonicalIterator::next returned "+next);
305
306	can.reset();
307
308	UnicodeString afterReset = can.next();
309	logln("After reset, CanonicalIterator::next returned "+afterReset);
310
311	if(next != afterReset) {
312	errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
313	}
314
315	logln("Testing getStaticClassID and getDynamicClassID");
316	if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
317	errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
318	}
319	}
320
321	#endif /* #if !UCONFIG_NO_NORMALIZATION */