git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/dumpce/dumpce.cpp

... / ...

Commit	Line	Data
	1	/********************************************************************
	2	* COPYRIGHT:
	3	* Copyright (C) 2001-2003 IBM, Inc. All Rights Reserved.
	4	*
	5	********************************************************************/
	6	/********************************************************************************
	7	*
	8	* File dumpce.cpp
	9	*
	10	* Modification History:
	11	* Name Date Description
	12	* synwee May 31 2001 Creation
	13	*
	14	*********************************************************************************
	15	*/
	16
	17	/**
	18	* This program outputs the collation elements used for a requested tailoring.
	19	*
	20	* Usage:
	21	* dumpce options... please check main function.
	22	*/
	23	#include <unicode/utypes.h>
	24	#include <unicode/ucol.h>
	25	#include <unicode/uloc.h>
	26	#include <unicode/ucoleitr.h>
	27	#include <unicode/uchar.h>
	28	#include <unicode/uscript.h>
	29	#include <unicode/utf16.h>
	30	#include <unicode/putil.h>
	31	#include <unicode/ustring.h>
	32	#include <stdio.h>
	33	#include <stdlib.h>
	34	#include <string.h>
	35	#include <time.h>
	36	#include "ucol_tok.h"
	37	#include "cstring.h"
	38	#include "uoptions.h"
	39	#include "ucol_imp.h"
	40	#include <unicode/ures.h>
	41	#include <unicode/uniset.h>
	42	#include <unicode/usetiter.h>
	43
	44	/**
	45	* Command line option variables.
	46	* These global variables are set according to the options specified on the
	47	* command line by the user.
	48	*/
	49	static UOption options[]={
	50	/* 00 */ UOPTION_HELP_H,
	51	/* 01 */ UOPTION_HELP_QUESTION_MARK,
	52	/* 02 */ {"locale", NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0},
	53	/* 03 */ {"serialize", NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0},
	54	/* 04 */ UOPTION_DESTDIR,
	55	/* 05 */ UOPTION_SOURCEDIR,
	56	/* 06 */ {"attribute", NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0},
	57	/* 07 */ {"rule", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0},
	58	/* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0},
	59	/* 09 */ {"scripts", NULL, NULL, NULL, 't', UOPT_NO_ARG, 0},
	60	/* 10 */ {"reducehan", NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0},
	61	/* 11 */ UOPTION_VERBOSE,
	62	/* 12 */ {"wholescripts", NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0}
	63	};
	64
	65	/**
	66	* Collator used in this program
	67	*/
	68	static UCollator *COLLATOR_;
	69	/**
	70	* Output strea, used in this program
	71	*/
	72	static FILE *OUTPUT_;
	73
	74	static UColAttributeValue ATTRIBUTE_[UCOL_ATTRIBUTE_COUNT] = {
	75	UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT,
	76	UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT,
	77	};
	78
	79	typedef struct {
	80	int value;
	81	char *name;
	82	} EnumNameValuePair;
	83
	84	static const EnumNameValuePair ATTRIBUTE_NAME_[] = {
	85	{UCOL_FRENCH_COLLATION, "UCOL_FRENCH_COLLATION"},
	86	{UCOL_ALTERNATE_HANDLING, "UCOL_ALTERNATE_HANDLING"},
	87	{UCOL_CASE_FIRST, "UCOL_CASE_FIRST"},
	88	{UCOL_CASE_LEVEL, "UCOL_CASE_LEVEL"},
	89	{UCOL_NORMALIZATION_MODE,
	90	"UCOL_NORMALIZATION_MODE\|UCOL_DECOMPOSITION_MODE"},
	91	{UCOL_STRENGTH, "UCOL_STRENGTH"},
	92	{UCOL_HIRAGANA_QUATERNARY_MODE, "UCOL_HIRAGANA_QUATERNARY_MODE"},
	93	{UCOL_NUMERIC_COLLATION, "UCOL_NUMERIC_COLLATION"},
	94	NULL
	95	};
	96
	97	static const EnumNameValuePair ATTRIBUTE_VALUE_[] = {
	98	{UCOL_PRIMARY, "UCOL_PRIMARY"},
	99	{UCOL_SECONDARY, "UCOL_SECONDARY"},
	100	{UCOL_TERTIARY, "UCOL_TERTIARY\|UCOL_DEFAULT_STRENGTH"},
	101	{UCOL_QUATERNARY, "UCOL_QUATERNARY"},
	102	{UCOL_IDENTICAL, "UCOL_IDENTICAL"},
	103	{UCOL_OFF, "UCOL_OFF"},
	104	{UCOL_ON, "UCOL_ON"},
	105	{UCOL_SHIFTED, "UCOL_SHIFTED"},
	106	{UCOL_NON_IGNORABLE, "UCOL_NON_IGNORABLE"},
	107	{UCOL_LOWER_FIRST, "UCOL_LOWER_FIRST"},
	108	{UCOL_UPPER_FIRST, "UCOL_UPPER_FIRST"},
	109	NULL
	110	};
	111
	112	typedef struct {
	113	UChar ch[32];
	114	int count; // number of codepoint
	115	UBool tailored;
	116	} ScriptElement;
	117
	118	/**
	119	* Writes the hexadecimal of a null-terminated array of codepoints into a
	120	* file
	121	* @param f UFILE instance to store
	122	* @param c codepoints array
	123	*/
	124	void serialize(FILE f, const UChar c)
	125	{
	126	UChar cp = *(c ++);
	127
	128	fprintf(f, " %04x", cp);
	129
	130	while (*c != 0) {
	131	cp = *(c ++);
	132	fprintf(f, " %04x", cp);
	133	}
	134	}
	135
	136	/**
	137	* Writes the hexadecimal of a non-null-terminated array of codepoints into a
	138	* file
	139	* @param f UFILE instance to store
	140	* @param c codepoints array
	141	* @param l codepoints array length
	142	*/
	143	void serialize(FILE f, const UChar c, int l)
	144	{
	145	int count = 1;
	146	UChar cp = *(c ++);
	147
	148	fprintf(f, " %04x", cp);
	149
	150	while (count < l) {
	151	cp = *(c ++);
	152	fprintf(f, " %04x", cp);
	153	count ++;
	154	}
	155	}
	156
	157	/**
	158	* Sets the iterator to the argument string and outputs the collation elements.
	159	* @param f file output stream
	160	* @param iter collation element iterator
	161	*/
	162	void serialize(FILE f, UCollationElements iter) {
	163	UChar *codepoint = iter->iteratordata_.string;
	164	// unlikely that sortkeys will be over this size
	165	uint8_t sortkey[64];
	166	uint8_t *psortkey = sortkey;
	167	int sortkeylength = 0;
	168
	169	if (iter->iteratordata_.flags & UCOL_ITER_HASLEN) {
	170	serialize(f, codepoint, iter->iteratordata_.endp - codepoint);
	171	sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint,
	172	iter->iteratordata_.endp - codepoint, sortkey, 64);
	173	}
	174	else {
	175	serialize(f, codepoint);
	176	sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint,
	177	-1, sortkey, 64);
	178	}
	179	if (options[11].doesOccur) {
	180	serialize(stdout, codepoint);
	181	fprintf(stdout, "\n");
	182	}
	183
	184	fprintf(f, "; ");
	185
	186	UErrorCode error = U_ZERO_ERROR;
	187	uint32_t ce = ucol_next(iter, &error);
	188	if (U_FAILURE(error)) {
	189	fprintf(f, "Error retrieving collation elements\n");
	190	return;
	191	}
	192
	193	while (TRUE) {
	194	fprintf(f, "[");
	195	if (UCOL_PRIMARYORDER(ce) != 0) {
	196	fprintf(f, "%04x", UCOL_PRIMARYORDER(ce));
	197	}
	198	fprintf(f, ",");
	199	if (UCOL_SECONDARYORDER(ce) != 0) {
	200	fprintf(f, " %02x", UCOL_SECONDARYORDER(ce));
	201	}
	202	fprintf(f, ",");
	203	if (UCOL_TERTIARYORDER(ce) != 0) {
	204	fprintf(f, " %02x", UCOL_TERTIARYORDER(ce));
	205	}
	206	fprintf(f, "] ");
	207
	208	ce = ucol_next(iter, &error);
	209	if (ce == UCOL_NULLORDER) {
	210	break;
	211	}
	212	if (U_FAILURE(error)) {
	213	fprintf(stdout, "Error retrieving collation elements");
	214	return;
	215	}
	216	}
	217
	218	if (sortkeylength > 64) {
	219	fprintf(f, "Sortkey exceeds pre-allocated size");
	220	}
	221
	222	fprintf(f, "[");
	223	while (TRUE) {
	224	fprintf(f, "%02x", *psortkey);
	225	psortkey ++;
	226	if ((*psortkey) == 0) {
	227	break;
	228	}
	229	fprintf(f, " ");
	230	}
	231	fprintf(f, "]\n");
	232	}
	233
	234	/**
	235	* Serializes the contraction within the given argument rule
	236	* @param f file output stream
	237	* @param r rule
	238	* @param rlen rule length
	239	* @param contractionsonly flag to indicate if only contractions are to be
	240	* output or all collation elements
	241	* @param iter iterator to iterate over collation elements
	242	*/
	243	void serialize(FILE f, UChar rule, int rlen, UBool contractiononly,
	244	UCollationElements *iter) {
	245	const UChar *current = NULL;
	246	uint32_t strength = 0;
	247	uint32_t chOffset = 0;
	248	uint32_t chLen = 0;
	249	uint32_t exOffset = 0;
	250	uint32_t exLen = 0;
	251	uint32_t prefixOffset = 0;
	252	uint32_t prefixLen = 0;
	253	uint8_t specs = 0;
	254	UBool rstart = TRUE;
	255	UColTokenParser src;
	256	UColOptionSet opts;
	257	UParseError parseError;
	258	UErrorCode error = U_ZERO_ERROR;
	259
	260	src.opts = &opts;
	261
	262	src.source = rule;
	263	src.current = rule;
	264	src.end = rule + rlen;
	265	src.extraCurrent = src.end;
	266	src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
	267
	268
	269	while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,
	270	&error)) != NULL) {
	271	chOffset = src.parsedToken.charsOffset;
	272	chLen = src.parsedToken.charsLen;
	273	// contractions handled here
	274	if (!contractiononly \|\| chLen > 1) {
	275	ucol_setText(iter, rule + chOffset, chLen, &error);
	276	if (U_FAILURE(error)) {
	277	fprintf(stdout, "Error setting text in iterator\n");
	278	return;
	279	}
	280	serialize(f, iter);
	281	}
	282	rstart = FALSE;
	283	}
	284	}
	285
	286	/**
	287	* Prints the attribute values in the argument collator into the output stream
	288	* @param collator
	289	*/
	290	void outputAttribute(UCollator collator, UErrorCode error)
	291	{
	292	UColAttribute attribute = UCOL_FRENCH_COLLATION;
	293	while (attribute < UCOL_ATTRIBUTE_COUNT) {
	294	int count = 0;
	295	while (TRUE) {
	296	// getting attribute name
	297	if (ATTRIBUTE_NAME_[count].value == attribute) {
	298	fprintf(OUTPUT_, "%s = ", ATTRIBUTE_NAME_[count].name);
	299	break;
	300	}
	301	count ++;
	302	}
	303	count = 0;
	304	int attributeval = ucol_getAttribute(collator, attribute, error);
	305	if (U_FAILURE(*error)) {
	306	fprintf(stdout, "Failure in reading collator attribute\n");
	307	return;
	308	}
	309	while (TRUE) {
	310	// getting attribute value
	311	if (ATTRIBUTE_VALUE_[count].value == attributeval) {
	312	fprintf(OUTPUT_, "%s\n", ATTRIBUTE_VALUE_[count].name);
	313	break;
	314	}
	315	count ++;
	316	}
	317	attribute = (UColAttribute)(attribute + 1);
	318	}
	319	}
	320
	321	/**
	322	* Prints the normalization mode in the argument collator into the output stream
	323	* @param collator
	324	*/
	325	void outputNormalization(UCollator *collator)
	326	{
	327	UErrorCode status = U_ZERO_ERROR;
	328	int normmode = ucol_getAttribute(collator, UCOL_NORMALIZATION_MODE, &status);
	329	int count = 0;
	330	while (TRUE) {
	331	// getting attribute name
	332	if (ATTRIBUTE_VALUE_[count].value == normmode) {
	333	break;
	334	}
	335	count ++;
	336	}
	337	fprintf(OUTPUT_, "NORMALIZATION MODE = %s\n",
	338	ATTRIBUTE_VALUE_[count].name);
	339	}
	340
	341	/**
	342	* Output the collation element belonging to the locale into a file
	343	* @param locale string
	344	* @param fullrules flag to indicate if only tailored collation elements are to
	345	* be output or all collation elements
	346	*/
	347	void serialize(const char *locale, UBool tailoredonly) {
	348	UErrorCode error = U_ZERO_ERROR;
	349	UChar str[128];
	350	int strlen = 0;
	351
	352	fprintf(OUTPUT_, "# This file contains the serialized collation elements\n");
	353	fprintf(OUTPUT_, "# as of the collation version indicated below.\n");
	354	fprintf(OUTPUT_, "# Data format: xxxx xxxx..; [yyyy, yy, yy] [yyyy, yy, yy] ... [yyyy, yy, yy] [zz zz..\n");
	355	fprintf(OUTPUT_, "# where xxxx are codepoints in hexadecimals,\n");
	356	fprintf(OUTPUT_, "# yyyyyyyy are the corresponding\n");
	357	fprintf(OUTPUT_, "# collation elements in hexadecimals\n");
	358	fprintf(OUTPUT_, "# and zz are the sortkey values in hexadecimals\n");
	359
	360	fprintf(OUTPUT_, "\n# Collator information\n");
	361
	362	fprintf(OUTPUT_, "\nLocale: %s\n", locale);
	363	fprintf(stdout, "Locale: %s\n", locale);
	364	UVersionInfo version;
	365	ucol_getVersion(COLLATOR_, version);
	366	fprintf(OUTPUT_, "Version number: %d.%d.%d.%d\n",
	367	version[0], version[1], version[2], version[3]);
	368	outputAttribute(COLLATOR_, &error);
	369	outputNormalization(COLLATOR_);
	370
	371	UCollationElements *iter = ucol_openElements(COLLATOR_, str, strlen,
	372	&error);
	373	if (U_FAILURE(error)) {
	374	fprintf(stdout, "Error creating iterator\n");
	375	return;
	376	}
	377
	378	if (!tailoredonly) {
	379	fprintf(OUTPUT_, "\n# Range of unicode characters\n\n");
	380	UChar32 codepoint = 0;
	381	while (codepoint <= UCHAR_MAX_VALUE) {
	382	if (u_isdefined(codepoint)) {
	383	strlen = 0;
	384	UTF16_APPEND_CHAR_UNSAFE(str, strlen, codepoint);
	385	str[strlen] = 0;
	386	ucol_setText(iter, str, strlen, &error);
	387	if (U_FAILURE(error)) {
	388	fprintf(stdout, "Error setting text in iterator\n");
	389	return;
	390	}
	391	serialize(OUTPUT_, iter);
	392	}
	393	codepoint ++;
	394	}
	395	}
	396
	397	UChar ucarules[0x10000];
	398	UChar *rules;
	399	int32_t rulelength = 0;
	400	rules = ucarules;
	401
	402	if (tailoredonly) {
	403	int32_t rulelength = 0;
	404	const UChar *temp = ucol_getRules(COLLATOR_, &rulelength);
	405	if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {
	406	rules = (UChar )malloc(sizeof(UChar)
	407	(rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
	408	}
	409	memcpy(rules, temp, rulelength * sizeof(UChar));
	410	rules[rulelength] = 0;
	411	fprintf(OUTPUT_, "\n# Tailorings\n\n");
	412	serialize(OUTPUT_, rules, rulelength, FALSE, iter);
	413	if (rules != ucarules) {
	414	free(rules);
	415	}
	416	}
	417	else {
	418	rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, ucarules,
	419	0x10000);
	420	if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {
	421	rules = (UChar )malloc(sizeof(UChar)
	422	(rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
	423	rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, rules,
	424	rulelength);
	425	}
	426	fprintf(OUTPUT_, "\n# Contractions\n\n");
	427	serialize(OUTPUT_, rules, rulelength, TRUE, iter);
	428	if (rules != ucarules) {
	429	free(rules);
	430	}
	431	}
	432
	433	ucol_closeElements(iter);
	434	}
	435
	436	/**
	437	* Sets the collator with the attribute values
	438	* @param collator
	439	* @param error status
	440	*/
	441	void setAttributes(UCollator collator, UErrorCode error)
	442	{
	443	int count = 0;
	444	while (count < UCOL_ATTRIBUTE_COUNT) {
	445	if (ATTRIBUTE_[count] != UCOL_DEFAULT) {
	446	ucol_setAttribute(collator, (UColAttribute)count,
	447	ATTRIBUTE_[count], error);
	448	if (U_FAILURE(*error)) {
	449	return;
	450	}
	451	}
	452	count ++;
	453	}
	454	}
	455
	456	/**
	457	* Appends directory path with an ending seperator if necessary.
	458	* @param path with enough space to append one seperator
	459	* @return new directory path length
	460	*/
	461	int appendDirSeparator(char *dir)
	462	{
	463	int dirlength = strlen(dir);
	464	char dirending = dir[dirlength - 1];
	465	if (dirending != U_FILE_SEP_CHAR) {
	466	dir[dirlength] = U_FILE_SEP_CHAR;
	467	dir[dirlength + 1] = 0;
	468	return dirlength + 1;
	469	}
	470	return dirlength;
	471	}
	472
	473	/**
	474	* Output the collation element into a file
	475	*/
	476	void serialize() {
	477	char filename[128];
	478	int dirlength = 0;
	479
	480	if (options[4].doesOccur) {
	481	strcpy(filename, options[4].value);
	482	dirlength = appendDirSeparator(filename);
	483	}
	484
	485	if (options[2].doesOccur) {
	486	const char locale = (char )options[2].value;
	487	int32_t localeindex = 0;
	488
	489	if (strcmp(locale, "all") == 0) {
	490	if (options[4].doesOccur) {
	491	strcat(filename, "UCA.txt");
	492	OUTPUT_ = fopen(filename, "w");
	493	if (OUTPUT_ == NULL) {
	494	fprintf(stdout, "Cannot open file:%s\n", filename);
	495	return;
	496	}
	497	}
	498	fprintf(stdout, "UCA\n");
	499	UErrorCode error = U_ZERO_ERROR;
	500	COLLATOR_ = ucol_open("en_US", &error);
	501	if (U_FAILURE(error)) {
	502	fprintf(stdout, "Collator creation failed:");
	503	fprintf(stdout, u_errorName(error));
	504	goto CLOSEUCA;
	505	return;
	506	}
	507	setAttributes(COLLATOR_, &error);
	508	if (U_FAILURE(error)) {
	509	fprintf(stdout, "Collator attribute setting failed:");
	510	fprintf(stdout, u_errorName(error));
	511	goto CLOSEUCA;
	512	return;
	513	}
	514
	515	serialize("UCA", FALSE);
	516	CLOSEUCA :
	517	if (options[4].doesOccur) {
	518	filename[dirlength] = 0;
	519	fclose(OUTPUT_);
	520	}
	521	ucol_close(COLLATOR_);
	522	localeindex = ucol_countAvailable() - 1;
	523	fprintf(stdout, "Number of locales: %d\n", localeindex + 1);
	524	locale = ucol_getAvailable(localeindex);
	525	}
	526
	527	while (TRUE) {
	528	UErrorCode error = U_ZERO_ERROR;
	529	COLLATOR_ = ucol_open(locale, &error);
	530	if (U_FAILURE(error)) {
	531	fprintf(stdout, "Collator creation failed:");
	532	fprintf(stdout, u_errorName(error));
	533	goto CLOSETAILOR;
	534	return;
	535	}
	536	setAttributes(COLLATOR_, &error);
	537	if (U_FAILURE(error)) {
	538	fprintf(stdout, "Collator attribute setting failed:");
	539	fprintf(stdout, u_errorName(error));
	540	goto CLOSETAILOR;
	541	return;
	542	}
	543
	544	if (options[4].doesOccur) {
	545	strcat(filename, locale);
	546	strcat(filename, ".txt");
	547	OUTPUT_ = fopen(filename, "w");
	548	if (OUTPUT_ == NULL) {
	549	fprintf(stdout, "Cannot open file:%s\n", filename);
	550	return;
	551	}
	552	}
	553
	554	if (options[3].doesOccur) {
	555	serialize(locale, TRUE);
	556	}
	557
	558	ucol_close(COLLATOR_);
	559
	560	CLOSETAILOR :
	561	if (options[4].doesOccur) {
	562	filename[dirlength] = 0;
	563	fclose(OUTPUT_);
	564	}
	565
	566	localeindex --;
	567	if (localeindex < 0) {
	568	break;
	569	}
	570	locale = ucol_getAvailable(localeindex);
	571	}
	572	}
	573
	574	if (options[7].doesOccur) {
	575	char inputfilename[128];
	576	// rules are to be used
	577	if (options[5].doesOccur) {
	578	strcpy(inputfilename, options[5].value);
	579	appendDirSeparator(inputfilename);
	580	}
	581	strcat(inputfilename, options[7].value);
	582	FILE *input = fopen(inputfilename, "r");
	583	if (input == NULL) {
	584	fprintf(stdout, "Cannot open file:%s\n", filename);
	585	return;
	586	}
	587
	588	char s[1024];
	589	UChar rule[1024];
	590	UChar *prule = rule;
	591	int size = 1024;
	592	// synwee TODO: make this part dynamic
	593	while (fscanf(input, "%[^\n]s", s) != EOF) {
	594	size -= u_unescape(s, prule, size);
	595	prule = prule + u_strlen(prule);
	596	}
	597	fclose(input);
	598
	599	if (options[4].doesOccur) {
	600	strcat(filename, "Rules.txt");
	601	OUTPUT_ = fopen(filename, "w");
	602	if (OUTPUT_ == NULL) {
	603	fprintf(stdout, "Cannot open file:%s\n", filename);
	604	return;
	605	}
	606	}
	607
	608	fprintf(stdout, "Rules\n");
	609	UErrorCode error = U_ZERO_ERROR;
	610	UParseError parseError;
	611	COLLATOR_ = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
	612	UCOL_DEFAULT_STRENGTH, &parseError, &error);
	613	if (U_FAILURE(error)) {
	614	fprintf(stdout, "Collator creation failed:");
	615	fprintf(stdout, u_errorName(error));
	616	goto CLOSERULES;
	617	return;
	618	}
	619	setAttributes(COLLATOR_, &error);
	620	if (U_FAILURE(error)) {
	621	fprintf(stdout, "Collator attribute setting failed:");
	622	fprintf(stdout, u_errorName(error));
	623	goto CLOSERULES;
	624	return;
	625	}
	626
	627	serialize("Rule-based", TRUE);
	628	ucol_close(COLLATOR_);
	629
	630	CLOSERULES :
	631	if (options[4].doesOccur) {
	632	filename[dirlength] = 0;
	633	fclose(OUTPUT_);
	634	}
	635	}
	636	}
	637
	638	/**
	639	* Parse for enum values.
	640	* Note this only works for positive enum values.
	641	* @param enumarray array containing names of the enum values in string and
	642	* their corresponding value.
	643	* declared enum value.
	644	* @param str string to be parsed
	645	* @return corresponding integer enum value or -1 if value is not found.
	646	*/
	647	int parseEnums(const EnumNameValuePair enumarray[], const char *str)
	648	{
	649	const char *enumname = enumarray[0].name;
	650	int result = atoi(str);
	651	if (result == 0 && str[0] != '0') {
	652	while (strcmp(enumname, str) != 0) {
	653	// checking for multiple enum names sharing the same values
	654	enumname = strstr(enumname, str);
	655	if (enumname != NULL) {
	656	int size = strchr(enumname, '\|') - enumname;
	657	if (size < 0) {
	658	size = strlen(enumname);
	659	}
	660	if (size == (int)strlen(str)) {
	661	return enumarray[result].value;
	662	}
	663	}
	664	result ++;
	665	if (&(enumarray[result]) == NULL) {
	666	return -1;
	667	}
	668	enumname = enumarray[result].name;
	669	}
	670	}
	671	return -1;
	672	}
	673
	674	/**
	675	* Parser for attribute name value pair
	676	*/
	677	void parseAttributes() {
	678	char str[32];
	679	const char *pname = options[6].value;
	680	const char *pend = options[6].value + strlen(options[6].value);
	681	const char *pvalue;
	682
	683	while (pname < pend) {
	684	pvalue = strchr(pname, '=');
	685	if (pvalue == NULL) {
	686	fprintf(stdout,
	687	"No matching value found for attribute argument %s\n",
	688	pname);
	689	return;
	690	}
	691	int count = pvalue - pname;
	692	strncpy(str, pname, count);
	693	str[count] = 0;
	694
	695	int name = parseEnums(ATTRIBUTE_NAME_, str);
	696	if (name == -1) {
	697	fprintf(stdout, "Attribute name not found: %s\n", str);
	698	return;
	699	}
	700
	701	pvalue ++;
	702	// getting corresponding enum value
	703	pname = strchr(pvalue, ',');
	704	if (pname == NULL) {
	705	pname = pend;
	706	}
	707	count = pname - pvalue;
	708	strncpy(str, pvalue, count);
	709	str[count] = 0;
	710	int value = parseEnums(ATTRIBUTE_VALUE_, str);
	711	if (value == -1) {
	712	fprintf(stdout, "Attribute value not found: %s\n", str);
	713	return;
	714	}
	715	ATTRIBUTE_[name] = (UColAttributeValue)value;
	716	pname ++;
	717	}
	718	}
	719
	720	/**
	721	* Checks if the locale argument is a base language
	722	* @param locale to be checked
	723	* @return TRUE if it is a base language
	724	*/
	725	inline UBool checkLocaleForLanguage(const char *locale)
	726	{
	727	return strlen(locale) <= 2;
	728	}
	729
	730	/**
	731	* Converts a UChar array into its string form "xxxx xxxx"
	732	* @param ch array of UChar characters
	733	* @param count number of UChar characters
	734	*/
	735	void outputUChar(UChar ch[], int count)
	736	{
	737	for (int i = 0; i < count; i ++) {
	738	fprintf(OUTPUT_, "%04X ", ch[i]);
	739	}
	740	}
	741
	742	/**
	743	* If it is a primary difference returns -1 or 1.
	744	* If it is a secondary difference returns -2 or 2.
	745	* If it is a tertiary difference returns -3 or 3.
	746	* If equals returns 0.
	747	*/
	748	int compareSortKey(const void elem1, const void elem2)
	749	{
	750	// compare the 2 script element sort key
	751	UChar ch1 = ((ScriptElement )elem1)->ch;
	752	UChar ch2 = ((ScriptElement )elem2)->ch;
	753	int size1 = ((ScriptElement *)elem1)->count;
	754	int size2 = ((ScriptElement *)elem2)->count;
	755	UErrorCode error = U_ZERO_ERROR;
	756
	757	ucol_setStrength(COLLATOR_, UCOL_PRIMARY);
	758	int result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);
	759	if (result == 0) {
	760	ucol_setStrength(COLLATOR_, UCOL_SECONDARY);
	761	result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);
	762	if (result == 0) {
	763	ucol_setStrength(COLLATOR_, UCOL_TERTIARY);
	764	result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);
	765	if (result < 0) {
	766	return -3;
	767	}
	768	if (result > 0) {
	769	return 3;
	770	}
	771	}
	772	if (result < 0) {
	773	return -2;
	774	}
	775	if (result > 0) {
	776	return 2;
	777	}
	778	}
	779	return result;
	780	}
	781
	782	/**
	783	* Output serialized script elements
	784	* @param element the element to output
	785	* @param compare the comparison with the previous element
	786	* @param expansion flags TRUE if element has an expansion
	787	*/
	788	void outputScriptElem(ScriptElement &element, int compare, UBool expansion)
	789	{
	790	switch (compare) {
	791	case 0:
	792	if (expansion) {
	793	fprintf(OUTPUT_, "<tr><td class='eq' title='[");
	794	}
	795	else {
	796	fprintf(OUTPUT_, "<tr><td class='q' title='[");
	797	}
	798	break;
	799	case -1:
	800	if (expansion) {
	801	fprintf(OUTPUT_, "<tr><td class='ep' title='[");
	802	}
	803	else {
	804	fprintf(OUTPUT_, "<tr><td class='p' title='[");
	805	}
	806	break;
	807	case -2:
	808	if (expansion) {
	809	fprintf(OUTPUT_, "<tr><td class='es' title='[");
	810	}
	811	else {
	812	fprintf(OUTPUT_, "<tr><td class='s' title='[");
	813	}
	814	break;
	815	default:
	816	if (expansion) {
	817	fprintf(OUTPUT_, "<tr><td class='et' title='[");
	818	}
	819	else {
	820	fprintf(OUTPUT_, "<tr><td class='t' title='[");
	821	}
	822	}
	823
	824	uint8_t sortkey[32];
	825	ucol_setStrength(COLLATOR_, UCOL_TERTIARY);
	826	ucol_getSortKey(COLLATOR_, element.ch, element.count, sortkey, 32);
	827	int i = 0;
	828	while (sortkey[i] != 0) {
	829	if (sortkey[i] == 1) {
	830	fprintf(OUTPUT_, " \| ");
	831	}
	832	else {
	833	fprintf(OUTPUT_, "%02x", sortkey[i]);
	834	}
	835
	836	i ++;
	837	}
	838
	839	fprintf(OUTPUT_, "]'>");
	840
	841	UErrorCode error = U_ZERO_ERROR;
	842	char utf8[64];
	843	UChar nfc[32];
	844	int32_t length = unorm_normalize(element.ch, element.count, UNORM_NFC, 0, nfc,
	845	32, &error);
	846	if (U_FAILURE(error)) {
	847	fprintf(stdout, "Error normalizing contractions to NFC\n");
	848	}
	849	u_strToUTF8(utf8, 64, &length, nfc, length, &error);
	850	if (U_FAILURE(error)) {
	851	fprintf(stdout, "Error converting UChar to utf8\n");
	852	return;
	853	}
	854
	855	fprintf(OUTPUT_, "%s<br>", utf8);
	856	fprintf(OUTPUT_, "<tt>");
	857	outputUChar(element.ch, element.count);
	858
	859	if (compare == 0) {
	860	fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td> </td><td>Q</td><td>");
	861	}
	862	else if (compare == -1) {
	863	fprintf(OUTPUT_, "</tt></td><td>P</td><td> </td><td> </td><td> </td><td>");
	864	}
	865	else if (compare == -2) {
	866	fprintf(OUTPUT_, "</tt></td><td> </td><td>S</td><td> </td><td> </td><td>");
	867	}
	868	else if (compare == -3) {
	869	fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td>T</td><td> </td><td>");
	870	}
	871
	872	i = 0;
	873	while (i < element.count) {
	874	char str[128];
	875	UChar32 codepoint;
	876	UTF_NEXT_CHAR(element.ch, i, element.count, codepoint);
	877	int32_t temp = u_charName(codepoint, U_UNICODE_CHAR_NAME, str, 128,
	878	&error);
	879	if (U_FAILURE(error)) {
	880	fprintf(stdout, "Error getting character name\n");
	881	return;
	882	}
	883	if (element.tailored) {
	884	fprintf(OUTPUT_, "<b>");
	885	}
	886	fprintf(OUTPUT_, "%s", str);
	887	if (element.tailored) {
	888	fprintf(OUTPUT_, " *</b>");
	889	}
	890	if (i < element.count) {
	891	fprintf(OUTPUT_, "<br>\n");
	892	}
	893	}
	894
	895	fprintf(OUTPUT_, "</td></tr>\n");
	896	}
	897
	898	/**
	899	* Checks if codepoint belongs to scripts
	900	* @param script list
	901	* @param scriptcount number of scripts
	902	* @param codepoint to test
	903	* @return TRUE if codepoint belongs to scripts
	904	*/
	905	UBool checkInScripts(UScriptCode script[], int scriptcount,
	906	UChar32 codepoint)
	907	{
	908	UErrorCode error = U_ZERO_ERROR;
	909	for (int i = 0; i < scriptcount; i ++) {
	910	if (script[i] == USCRIPT_HAN && options[10].doesOccur) {
	911	if ((codepoint >= 0x2E80 && codepoint <= 0x2EE4) \|\|
	912	(codepoint >= 0x2A672 && codepoint <= 0x2A6D6)) {
	913	// reduce han
	914	return TRUE;
	915	}
	916	}
	917	else if (uscript_getScript(codepoint, &error) == script[i]) {
	918	return TRUE;
	919	}
	920	if (U_FAILURE(error)) {
	921	fprintf(stdout, "Error checking character in scripts\n");
	922	return FALSE;
	923	}
	924	}
	925	return FALSE;
	926	}
	927
	928	/**
	929	* Checks if the set of codepoints belongs to the script
	930	* @param script list
	931	* @param scriptcount number of scripts
	932	* @param scriptelem
	933	* @return TRUE if all codepoints belongs to the script
	934	*/
	935	inline UBool checkInScripts(UScriptCode script[], int scriptcount,
	936	ScriptElement scriptelem)
	937	{
	938	int i = 0;
	939	while (i < scriptelem.count) {
	940	UChar32 codepoint;
	941	UTF_NEXT_CHAR(scriptelem.ch, i, scriptelem.count, codepoint);
	942	UErrorCode error = U_ZERO_ERROR;
	943	if (checkInScripts(script, scriptcount, codepoint)) {
	944	return TRUE;
	945	}
	946	}
	947	return FALSE;
	948	}
	949
	950	/**
	951	* Gets the script elements and contractions belonging to the script
	952	* @param elems output list
	953	* @param locale locale
	954	* @return number of script elements
	955	* Add by Richard
	956	*/
	957	int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* locale) {
	958	UErrorCode error = U_ZERO_ERROR;
	959	UChar32 codepoint = 0;
	960
	961	UResourceBundle* ures = ures_open(NULL, locale, &error);
	962	if (U_FAILURE(error)) {
	963	fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale);
	964	return -1;
	965	}
	966	int32_t length;
	967	const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacters", &length, &error);
	968
	969	if (U_FAILURE(error)) {
	970	fprintf(stdout, "Can not find ExemplarCharacters in resource bundle\n");
	971	return -1;
	972	}
	973
	974	UChar* upperChars = new UChar[length*2];
	975	if (upperChars == 0) {
	976	fprintf(stdout, "Memory error\n");
	977	return -1;
	978	}
	979
	980	int32_t destLength = u_strToUpper(upperChars, length*2, exemplarChars, -1, locale, &error);
	981	if (U_FAILURE(error)) {
	982	fprintf(stdout, "Error when u_strToUpper() \n");
	983	return -1;
	984	}
	985
	986	UChar* pattern = new UChar[length + destLength + 10];
	987	UChar left[2] = {0x005b, 0x0};
	988	UChar right[2] = {0x005d, 0x0};
	989	pattern = u_strcpy(pattern, left);
	990	pattern = u_strcat(pattern, exemplarChars);
	991	pattern = u_strcat(pattern, upperChars);
	992	pattern = u_strcat(pattern, right);
	993
	994	UnicodeSet * uniset = new UnicodeSet(UnicodeString(pattern), error);
	995	if (U_FAILURE(error)) {
	996	fprintf(stdout, "Can not open USet \n");
	997	return -1;
	998	}
	999
	1000	UnicodeSetIterator* usetiter = new UnicodeSetIterator(*uniset);
	1001
	1002	int32_t count = 0;
	1003
	1004	while (usetiter -> next()) {
	1005	if (usetiter -> isString()) {
	1006	UnicodeString strItem = usetiter -> getString();
	1007
	1008	scriptelem[count].count = 0;
	1009	for (int i = 0; i < strItem.length(); i++) {
	1010	codepoint = strItem.char32At(i);
	1011	UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,
	1012	scriptelem[count].count, codepoint);
	1013	scriptelem[count].tailored = FALSE;
	1014	}
	1015	} else {
	1016	codepoint = usetiter -> getCodepoint();
	1017	scriptelem[count].count = 0;
	1018	UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,
	1019	scriptelem[count].count, codepoint);
	1020	scriptelem[count].tailored = FALSE;
	1021	}
	1022
	1023	count++;
	1024	}
	1025
	1026	return count;
	1027	}
	1028
	1029	/**
	1030	* Gets the script elements and contractions belonging to the script
	1031	* @param script list
	1032	* @param scriptcount number of scripts
	1033	* @param elems output list
	1034	* @return number of script elements
	1035	*/
	1036	int getScriptElements(UScriptCode script[], int scriptcount,
	1037	ScriptElement scriptelem[])
	1038	{
	1039	UErrorCode error = U_ZERO_ERROR;
	1040	UChar32 codepoint = 0;
	1041	int count = 0;
	1042	while (codepoint <= UCHAR_MAX_VALUE) {
	1043	if (checkInScripts(script, scriptcount, codepoint)) {
	1044	scriptelem[count].count = 0;
	1045	UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,
	1046	scriptelem[count].count, codepoint);
	1047	scriptelem[count].tailored = FALSE;
	1048	count ++;
	1049	}
	1050	if (U_FAILURE(error)) {
	1051	fprintf(stdout, "Error determining codepoint in script\n");
	1052	return -1;
	1053	}
	1054	codepoint ++;
	1055	}
	1056
	1057	const UChar *current = NULL;
	1058	uint32_t strength = 0;
	1059	uint32_t chOffset = 0;
	1060	uint32_t chLen = 0;
	1061	uint32_t exOffset = 0;
	1062	uint32_t exLen = 0;
	1063	uint32_t prefixOffset = 0;
	1064	uint32_t prefixLen = 0;
	1065	uint8_t specs = 0;
	1066	UBool rstart = TRUE;
	1067	UColTokenParser src;
	1068	UColOptionSet opts;
	1069	UParseError parseError;
	1070
	1071	int32_t rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, NULL, 0);
	1072	src.source = (UChar )malloc(sizeof(UChar)
	1073	(rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
	1074	rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, src.source,
	1075	rulelength);
	1076	src.current = src.source;
	1077	src.end = src.source + rulelength;
	1078	src.extraCurrent = src.end;
	1079	src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
	1080	src.opts = &opts;
	1081
	1082	/*
	1083	ucol_tok_parseNextToken(&src, &strength, &chOffset,
	1084	&chLen, &exOffset, &exLen,
	1085	&prefixOffset, &prefixLen,
	1086	&specs, rstart, &parseError,
	1087	&error)
	1088	*/
	1089	while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,
	1090	&error)) != NULL) {
	1091	// contractions handled here
	1092	if (chLen > 1) {
	1093	u_strncpy(scriptelem[count].ch, src.source + chOffset, chLen);
	1094	scriptelem[count].count = chLen;
	1095	if (checkInScripts(script, scriptcount, scriptelem[count])) {
	1096	scriptelem[count].tailored = FALSE;
	1097	count ++;
	1098	}
	1099	}
	1100	rstart = FALSE;
	1101	}
	1102	if (U_FAILURE(error)) {
	1103	fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error));
	1104	}
	1105	// rule might have been reallocated, so delete this instead
	1106	free(src.source);
	1107	return count;
	1108	}
	1109
	1110	int compareCodepoints(const void elem1, const void elem2)
	1111	{
	1112	UChar ch1 = ((ScriptElement )elem1)->ch; // key
	1113	UChar ch2 = ((ScriptElement )elem2)->ch;
	1114	ch1[((ScriptElement *)elem1)->count] = 0;
	1115	ch2[((ScriptElement *)elem2)->count] = 0;
	1116
	1117	// compare the 2 codepoints
	1118	return u_strcmp(ch1, ch2);
	1119	}
	1120
	1121	UBool hasSubNFD(ScriptElement &se, ScriptElement &key)
	1122	{
	1123	UChar *ch1 = se.ch;
	1124	UChar *ch2 = key.ch; // key
	1125	ch1[se.count] = 0;
	1126	ch2[key.count] = 0;
	1127
	1128	// compare the 2 codepoints
	1129	if (u_strstr(ch1, ch2) != NULL) {
	1130	return TRUE;
	1131	}
	1132
	1133	// check the decomposition
	1134	UChar norm[32];
	1135	UErrorCode error = U_ZERO_ERROR;
	1136	int size = unorm_normalize(ch1, se.count, UNORM_NFD, 0, norm, 32,
	1137	&error);
	1138	if (U_FAILURE(error)) {
	1139	fprintf(stdout, "Error normalizing\n");
	1140	}
	1141	if (u_strstr(norm, ch2) != NULL) {
	1142	return TRUE;
	1143	}
	1144	return FALSE;
	1145	}
	1146
	1147	/**
	1148	* Marks tailored elements
	1149	* @param script list
	1150	* @param scriptcount number of scripts
	1151	* @param scriptelem script element list
	1152	* @param scriptelemlength size of the script element list
	1153	*/
	1154	void markTailored(UScriptCode script[], int scriptcount,
	1155	ScriptElement scriptelem[], int scriptelemlength)
	1156	{
	1157	int32_t rulelength;
	1158	const UChar *rule = ucol_getRules(COLLATOR_, &rulelength);
	1159
	1160	const UChar *current = NULL;
	1161	uint32_t strength = 0;
	1162	uint32_t chOffset = 0;
	1163	uint32_t chLen = 0;
	1164	uint32_t exOffset = 0;
	1165	uint32_t exLen = 0;
	1166	uint32_t prefixOffset = 0;
	1167	uint32_t prefixLen = 0;
	1168	uint8_t specs = 0;
	1169	UBool rstart = TRUE;
	1170	UColTokenParser src;
	1171	UColOptionSet opts;
	1172	UParseError parseError;
	1173
	1174	src.opts = &opts;
	1175	src.source = (UChar *)malloc(
	1176	(rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
	1177	memcpy(src.source, rule, rulelength * sizeof(UChar));
	1178	src.current = src.source;
	1179	src.end = (UChar *)src.source + rulelength;
	1180	src.extraCurrent = src.end;
	1181	src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
	1182
	1183	UErrorCode error = U_ZERO_ERROR;
	1184
	1185	while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,
	1186	&error)) != NULL) {
	1187	if (chLen >= 1 && strength != UCOL_TOK_RESET) {
	1188	// skipping the reset characters and non useful stuff.
	1189	ScriptElement se;
	1190	u_strncpy(se.ch, src.source + chOffset, chLen);
	1191	se.count = chLen;
	1192
	1193	if (checkInScripts(script, scriptcount, se)) {
	1194	/*
	1195	ScriptElement tse = (ScriptElement )bsearch(&se, scriptelem,
	1196	scriptelemlength,
	1197	sizeof(ScriptElement),
	1198	compareCodepoints);
	1199	*/
	1200	for (int i = 0; i < scriptelemlength; i ++) {
	1201	if (!scriptelem[i].tailored &&
	1202	hasSubNFD(scriptelem[i], se)) {
	1203	scriptelem[i].tailored = TRUE;
	1204	}
	1205	}
	1206	}
	1207	}
	1208	rstart = FALSE;
	1209	}
	1210	free(src.source);
	1211	if (U_FAILURE(error)) {
	1212	fprintf(stdout, "Error parsing rules\n");
	1213	}
	1214	}
	1215
	1216	/**
	1217	* Checks if the collation iterator has more than 1 collation element
	1218	* @parem coleiter collation element iterator
	1219	* @return TRUE if collation iterator has more than 1 collation element
	1220	*/
	1221	UBool hasExpansions(UCollationElements *coleiter)
	1222	{
	1223	UErrorCode error = U_ZERO_ERROR;
	1224	int32_t ce = ucol_next(coleiter, &error);
	1225	int count = 0;
	1226
	1227	if (U_FAILURE(error)) {
	1228	fprintf(stdout, "Error getting next collation element\n");
	1229	}
	1230	while (ce != UCOL_NULLORDER) {
	1231	if ((UCOL_PRIMARYORDER(ce) != 0) && !isContinuation(ce)) {
	1232	count ++;
	1233	if (count == 2) {
	1234	return TRUE;
	1235	}
	1236	}
	1237	ce = ucol_next(coleiter, &error);
	1238	if (U_FAILURE(error)) {
	1239	fprintf(stdout, "Error getting next collation element\n");
	1240	}
	1241	}
	1242	return FALSE;
	1243	}
	1244
	1245	/**
	1246	* Prints the footer for index.html
	1247	* @param file output file
	1248	*/
	1249	void outputHTMLFooter()
	1250	{
	1251	fprintf(OUTPUT_, "</table>\n");
	1252	fprintf(OUTPUT_, "</body>\n");
	1253	fprintf(OUTPUT_, "</html>\n");
	1254	}
	1255
	1256	/**
	1257	* Serialize the codepoints from start to end into an html file.
	1258	* Arranging them into ascending collation order.
	1259	* @param script code list
	1260	* @param scriptcount number of scripts
	1261	*/
	1262	//void serializeScripts(UScriptCode script[], int scriptcount)
	1263	//Richard
	1264	void serializeScripts(UScriptCode script[], int scriptcount, const char* locale = NULL)
	1265	{
	1266	UErrorCode error = U_ZERO_ERROR;
	1267
	1268	ScriptElement *scriptelem =
	1269	(ScriptElement )malloc(sizeof(ScriptElement) 0x20000);
	1270	if (scriptelem == NULL) {
	1271	fprintf(stdout, "Memory error\n");
	1272	return;
	1273	}
	1274	int count = 0;
	1275	if(locale) {
	1276	count = getScriptElementsFromExemplars(scriptelem, locale);
	1277	} else {
	1278	count = getScriptElements(script, scriptcount, scriptelem);
	1279	}
	1280
	1281	// Sort script elements using Quicksort algorithm:
	1282	qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints);
	1283	markTailored(script, scriptcount, scriptelem, count);
	1284	// Sort script elements using Quicksort algorithm:
	1285	qsort(scriptelem, count, sizeof(ScriptElement), compareSortKey);
	1286
	1287	UCollationElements* coleiter = ucol_openElements(COLLATOR_,
	1288	scriptelem[0].ch,
	1289	scriptelem[0].count,
	1290	&error);
	1291	if (U_FAILURE(error)) {
	1292	fprintf(stdout, "Error creating collation element iterator\n");
	1293	return;
	1294	}
	1295
	1296	outputScriptElem(scriptelem[0], -1, hasExpansions(coleiter));
	1297	for (int i = 0; i < count - 1; i ++) {
	1298	ucol_setText(coleiter, scriptelem[i + 1].ch, scriptelem[i + 1].count,
	1299	&error);
	1300	if (U_FAILURE(error)) {
	1301	fprintf(stdout, "Error setting text in collation element iterator\n");
	1302	return;
	1303	}
	1304	outputScriptElem(scriptelem[i + 1],
	1305	compareSortKey(scriptelem + i, scriptelem + i + 1),
	1306	hasExpansions(coleiter));
	1307	}
	1308	free(scriptelem);
	1309	outputHTMLFooter();
	1310	}
	1311
	1312	/**
	1313	* Prints the header for the html
	1314	* @param locale name
	1315	* @param script
	1316	* @param scriptcount number of scripts
	1317	*/
	1318	void outputHTMLHeader(const char *locale, UScriptCode script[],
	1319	int scriptcount)
	1320	{
	1321	fprintf(OUTPUT_, "<html>\n");
	1322	fprintf(OUTPUT_, "<head>\n");
	1323	fprintf(OUTPUT_, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n");
	1324	fprintf(OUTPUT_, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n");
	1325	fprintf(OUTPUT_, "<link rel=\"stylesheet\" href=\"charts.css\" type=\"text/css\">\n");
	1326	fprintf(OUTPUT_, "<title>ICU Collation charts</title>\n");
	1327	fprintf(OUTPUT_, "<base target=\"main\">\n");
	1328	fprintf(OUTPUT_, "</head>\n");
	1329
	1330	fprintf(OUTPUT_, "<body bgcolor=#FFFFFF>\n");
	1331	fprintf(OUTPUT_, "<!--\n");
	1332	fprintf(OUTPUT_, "This file contains sorted characters in ascending order according to the locale stated\n");
	1333	fprintf(OUTPUT_, "If the character is in red, it is tailored in the collation rules.\n");
	1334	fprintf(OUTPUT_, "Background colours have certain meanings:\n");
	1335	fprintf(OUTPUT_, "White - equals the previous character\n");
	1336	fprintf(OUTPUT_, "dark blue - primary greater than the previous character\n");
	1337	fprintf(OUTPUT_, "blue - secondary greater than the previous character\n");
	1338	fprintf(OUTPUT_, "light blue - tertiary greater than the previous character\n");
	1339	fprintf(OUTPUT_, "--!>\n");
	1340
	1341	fprintf(OUTPUT_, "<table border=0>\n");
	1342	UChar displayname[64];
	1343	UErrorCode error = U_ZERO_ERROR;
	1344	int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error);
	1345	char utf8displayname[128];
	1346	if (U_FAILURE(error)) {
	1347	utf8displayname[0] = 0;
	1348	}
	1349	else {
	1350	int32_t utf8size = 0;
	1351	u_strToUTF8(utf8displayname, 128, &utf8size, displayname, size, &error);
	1352	}
	1353
	1354	fprintf(OUTPUT_, "<tr><th>Locale</th><td class='noborder'>%s</td></tr>\n", utf8displayname);
	1355	fprintf(OUTPUT_, "<tr><th>Script(s)</th>");
	1356	fprintf(OUTPUT_, "<td class='noborder'>");
	1357	for (int i = 0; i < scriptcount; i ++) {
	1358	fprintf(OUTPUT_, "%s", uscript_getName(script[i]));
	1359	if (i + 1 != scriptcount) {
	1360	fprintf(OUTPUT_, ", ");
	1361	}
	1362	}
	1363	fprintf(OUTPUT_, "</td></tr>\n");
	1364
	1365	fprintf(OUTPUT_, "<tr><th>Rules</th><td class='noborder'><a href=http://oss.software.ibm.com/cvs/icu/~checkout~/icu/source/data/locales/%s.txt>%s.txt</a></td></tr>\n", locale, locale);
	1366
	1367	UVersionInfo version;
	1368	ucol_getVersion(COLLATOR_, version);
	1369	fprintf(OUTPUT_, "<tr><th>Collator version</th><td class='noborder'>%d.%d.%d.%d</td></tr>\n",
	1370	version[0], version[1], version[2], version[3]);
	1371
	1372	UColAttribute attr = UCOL_FRENCH_COLLATION;
	1373	while (attr < UCOL_ATTRIBUTE_COUNT) {
	1374	UColAttributeValue value = ucol_getAttribute(COLLATOR_, attr, &error);
	1375	if (U_FAILURE(error)) {
	1376	fprintf(stdout, "Error getting attribute\n");
	1377	return;
	1378	}
	1379	if (value != UCOL_DEFAULT) {
	1380	if (attr == UCOL_FRENCH_COLLATION && value != UCOL_OFF) {
	1381	fprintf(OUTPUT_, "<tr><th>French Collation</th><td class='noborder'>on, code %d</td></tr>\n", value);
	1382	}
	1383	if (attr == UCOL_ALTERNATE_HANDLING && value != UCOL_NON_IGNORABLE) {
	1384	fprintf(OUTPUT_, "<tr><th>Alternate Handling</th><td class='noborder'>shifted, code%d</td></tr>\n", value);
	1385	}
	1386	if (attr == UCOL_CASE_FIRST && value != UCOL_OFF) {
	1387	fprintf(OUTPUT_, "<tr><th>Case First</th><td class='noborder'>on, code %d</td></tr>\n", value);
	1388	}
	1389	if (attr == UCOL_CASE_LEVEL && value != UCOL_OFF) {
	1390	fprintf(OUTPUT_, "<tr><th>Case Level</th><td class='noborder'>on, code %d</td></tr>\n", value);
	1391	}
	1392	if (attr == UCOL_NORMALIZATION_MODE && value != UCOL_OFF) {
	1393	fprintf(OUTPUT_, "<tr><th>Normalization</th><td class='noborder'>on, code %d</td></tr>\n", value);
	1394	}
	1395	if (attr == UCOL_STRENGTH && value != UCOL_TERTIARY) {
	1396	fprintf(OUTPUT_, "<tr><th>Strength</th><td class='noborder'>code %d</td></tr>\n", value);
	1397	}
	1398	if (attr == UCOL_HIRAGANA_QUATERNARY_MODE && value != UCOL_OFF) {
	1399	fprintf(OUTPUT_, "<tr><th>Hiragana Quaternary</th><td class='noborder'>on, code %d</td></tr>\n", value);
	1400	}
	1401	}
	1402	attr = (UColAttribute)(attr + 1);
	1403	}
	1404
	1405	// Get UNIX-style time and display as number and string.
	1406	time_t ltime;
	1407	time( &ltime );
	1408	fprintf(OUTPUT_, "<tr><th>Date Generated</th><td class='noborder'>%s</td></tr>", ctime(&ltime));
	1409
	1410	fprintf(OUTPUT_, "</table>\n");
	1411
	1412	fprintf(OUTPUT_, "<p><a href=help.html>How to read the table</a><br>\n");
	1413	fprintf(OUTPUT_, "<a href=http://www.jtcsv.com/cgi-bin/icu-bugs/ target=new>Submit a bug</a></p>\n");
	1414	fprintf(OUTPUT_, "\n<table>\n");
	1415	fprintf(OUTPUT_, "\n<tr><th>Codepoint</th><th>P</th><th>S</th><th>T</th><th>Q</th><th>Name</th></tr>\n");
	1416	}
	1417
	/**
	* Prints the opening part of the locale list page: the html head, the
	* page title and the link to the UCA charts.
	* @param file output file
	*/
	void outputListHTMLHeader(FILE *file)
	{
	    static const char *const HEADER_LINES[] = {
	        "<html>\n",
	        "<head>\n",
	        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n",
	        "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n",
	        "<title>ICU Collation Charts</title>\n",
	        "<base target=\"main\">\n",
	        "</head>\n",
	        "<body bgcolor=#FFFFFF>\n",
	        "<h2 align=center>ICU Collation Charts</h2>\n",
	        "<p align=center>\n",
	        "<a href=http://www.unicode.org/charts/collation/ target=new>UCA Charts</a><br>"
	    };
	    const unsigned int lineCount =
	        (unsigned int)(sizeof(HEADER_LINES) / sizeof(HEADER_LINES[0]));

	    for (unsigned int i = 0; i < lineCount; i ++) {
	        fputs(HEADER_LINES[i], file);
	    }
	}
	1436
	/**
	* Prints the closing part of the locale list page: the end of the
	* paragraph, the centered image and the closing body and html tags.
	* @param file output file
	*/
	void outputListHTMLFooter(FILE *file)
	{
	    fputs("</p>\n"
	          "<center><image src=http://oss.software.ibm.com/icu/images/w24.gif></center>\n"
	          "</body>\n"
	          "</html>\n", file);
	}
	1448
	1449	/**
	1450	* Gets all scripts and serialize their codepoints into an html file.
	1451	*/
	1452	void serializeScripts() {
	1453	char filename[128];
	1454	int dirlength = 0;
	1455
	1456	if (options[4].doesOccur) {
	1457	strcpy(filename, options[4].value);
	1458	dirlength = appendDirSeparator(filename);
	1459	} else {
	1460	filename[0] = 0;
	1461	}
	1462
	1463	const char *locale;
	1464	int32_t localelist = 0;
	1465	int32_t localesize;
	1466
	1467	localesize = ucol_countAvailable();
	1468	locale = ucol_getAvailable(localelist);
	1469
	1470	strcat(filename, "list.html");
	1471	FILE *list = fopen(filename, "w");
	1472	filename[dirlength] = 0;
	1473	if (list == NULL) {
	1474	fprintf(stdout, "Cannot open file: %s\n", filename);
	1475	return;
	1476	}
	1477
	1478	outputListHTMLHeader(list);
	1479	fprintf(list, "<blockquote>\n");
	1480	while (TRUE) {
	1481	UErrorCode error = U_ZERO_ERROR;
	1482	COLLATOR_ = ucol_open(locale, &error);
	1483	if (U_FAILURE(error)) {
	1484	fprintf(stdout, "Collator creation failed:");
	1485	fprintf(stdout, u_errorName(error));
	1486	return;
	1487	}
	1488	if ((error != U_USING_FALLBACK_WARNING && // not tailored
	1489	error != U_USING_DEFAULT_WARNING) \|\|
	1490	checkLocaleForLanguage(locale)) {
	1491	fprintf(list, "<a href=%s.html>%s</a> ", locale, locale);
	1492	setAttributes(COLLATOR_, &error);
	1493	if (U_FAILURE(error)) {
	1494	fprintf(stdout, "Collator attribute setting failed:");
	1495	fprintf(stdout, u_errorName(error));
	1496	return;
	1497	}
	1498
	1499	UScriptCode scriptcode[32];
	1500	uint32_t scriptcount = uscript_getCode(locale, scriptcode, 32,
	1501	&error);
	1502	if (U_FAILURE(error)) {
	1503	fprintf(stdout, "Error getting lcale scripts\n");
	1504	return;
	1505	}
	1506
	1507	strcat(filename, locale);
	1508	strcat(filename, ".html");
	1509	OUTPUT_ = fopen(filename, "w");
	1510	if (OUTPUT_ == NULL) {
	1511	fprintf(stdout, "Cannot open file:%s\n", filename);
	1512	return;
	1513	}
	1514	outputHTMLHeader(locale, scriptcode, scriptcount);
	1515	fprintf(stdout, "%s\n", locale);
	1516
	1517	if(options[12].doesOccur) {
	1518	// use whole scripts
	1519	serializeScripts(scriptcode, scriptcount);
	1520	} else {
	1521	// use exemplar chars
	1522	serializeScripts(scriptcode, scriptcount, locale);
	1523	}
	1524	fclose(OUTPUT_);
	1525	}
	1526	ucol_close(COLLATOR_);
	1527
	1528	filename[dirlength] = 0;
	1529	localelist ++;
	1530	if (localelist == localesize) {
	1531	break;
	1532	}
	1533	locale = ucol_getAvailable(localelist);
	1534	}
	1535	fprintf(list, "<br><a href=help.html>help</a><br>");
	1536	fprintf(list, "</blockquote>\n");
	1537	outputListHTMLFooter(list);
	1538	fclose(list);
	1539	}
	1540
	1541	/**
	1542	* Main -- process command line, read in and pre-process the test file,
	1543	* call other functions to do the actual tests.
	1544	*/
	1545	int main(int argc, char *argv[]) {
	1546
	1547	argc = u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]),
	1548	options);
	1549
	1550	// error handling, printing usage message
	1551	if (argc < 0) {
	1552	fprintf(stdout, "error in command line argument: ");
	1553	fprintf(stdout, argv[-argc]);
	1554	fprintf(stdout, "\n");
	1555	}
	1556	if (argc < 0 \|\| options[0].doesOccur \|\| options[1].doesOccur) {
	1557	fprintf(stdout, "Usage: dumpce options...\n"
	1558	"--help\n"
	1559	" Display this message.\n"
	1560	"--locale name\|all\n"
	1561	" ICU locale to use. Default is en_US\n"
	1562	"--serialize\n"
	1563	" Serializes the collation elements in -locale or all locales available and outputs them into --outputdir/locale_ce.txt\n"
	1564	"--destdir dir_name\n"
	1565	" Path for outputing the serialized collation elements. Defaults to stdout if no defined\n"
	1566	"--sourcedir dir_name\n"
	1567	" Path for the input rule file for collation\n"
	1568	"--attribute name=value,name=value...\n"
	1569	" Pairs of attribute names and values for setting\n"
	1570	"--rule filename\n"
	1571	" Name of file containing the collation rules.\n"
	1572	"--normalizaton mode\n"
	1573	" UNormalizationMode mode to be used.\n"
	1574	"--scripts\n"
	1575	" Codepoints from all scripts are sorted and serialized.\n"
	1576	"--reducehan\n"
	1577	" Only 200 Han script characters will be displayed with the use of --scripts.\n"
	1578	"--wholescripts\n"
	1579	" Show collation order for whole scripts instead of just for exemplar characters of a locale\n\n");
	1580
	1581	fprintf(stdout, "Example to generate *.txt files : dumpce --serialize --locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n\n");
	1582	fprintf(stdout, "Example to generate *.html files for oss web display: dumpce --scripts --destdir /temp --reducehan\n");
	1583	return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	1584	}
	1585
	1586	OUTPUT_ = stdout;
	1587	if (options[6].doesOccur) {
	1588	fprintf(stdout, "attributes %s\n", options[6].value);
	1589	parseAttributes();
	1590	}
	1591	if (options[3].doesOccur) {
	1592	serialize();
	1593	}
	1594	if (options[9].doesOccur) {
	1595	serializeScripts();
	1596	}
	1597	return 0;
	1598	}