git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennorm2/n2builder.cpp

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9 A	3	/*
	4	*******************************************************************************
	5	*
f3c0d7a5	6	* Copyright (C) 2009-2016, International Business Machines
729e4ab9 A	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	* file name: n2builder.cpp
f3c0d7a5	11	* encoding: UTF-8
729e4ab9 A	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2009nov25
	16	* created by: Markus W. Scherer
	17	*
	18	* Builds Normalizer2 data and writes a binary .nrm file.
	19	* For the file format see source/common/normalizer2impl.h.
	20	*/
	21
	22	#include "unicode/utypes.h"
	23	#include "n2builder.h"
	24
	25	#include <stdio.h>
	26	#include <stdlib.h>
	27	#include <string.h>
729e4ab9	28	#include <vector>
729e4ab9 A	29	#include "unicode/errorcode.h"
	30	#include "unicode/localpointer.h"
	31	#include "unicode/putil.h"
3d1f044b	32	#include "unicode/ucptrie.h"
729e4ab9	33	#include "unicode/udata.h"
3d1f044b	34	#include "unicode/umutablecptrie.h"
729e4ab9 A	35	#include "unicode/uniset.h"
729e4ab9 A	36	#include "unicode/unistr.h"
0f5d89e8	37	#include "unicode/usetiter.h"
729e4ab9	38	#include "unicode/ustring.h"
b331163b	39	#include "charstr.h"
0f5d89e8	40	#include "extradata.h"
729e4ab9 A	41	#include "hash.h"
729e4ab9 A	42	#include "normalizer2impl.h"
0f5d89e8	43	#include "norms.h"
729e4ab9 A	44	#include "toolutil.h"
729e4ab9 A	45	#include "unewdata.h"
729e4ab9	46	#include "uvectr32.h"
b331163b	47	#include "writesrc.h"
729e4ab9 A	48
	49	#if !UCONFIG_NO_NORMALIZATION
	50
	51	/* UDataInfo cf. udata.h */
	52	static UDataInfo dataInfo={
	53	sizeof(UDataInfo),
	54	0,
	55
	56	U_IS_BIG_ENDIAN,
	57	U_CHARSET_FAMILY,
	58	U_SIZEOF_UCHAR,
	59	0,
	60
	61	{ 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
3d1f044b A	62	{ 4, 0, 0, 0 }, /* formatVersion */
3d1f044b A	63	{ 11, 0, 0, 0 } /* dataVersion (Unicode version) */
729e4ab9 A	64	};
	65
	66	U_NAMESPACE_BEGIN
	67
	68	class HangulIterator {
	69	public:
	70	struct Range {
0f5d89e8	71	UChar32 start, end;
729e4ab9 A	72	};
	73
	74	HangulIterator() : rangeIndex(0) {}
	75	const Range *nextRange() {
b331163b	76	if(rangeIndex<UPRV_LENGTHOF(ranges)) {
729e4ab9 A	77	return ranges+rangeIndex++;
	78	} else {
	79	return NULL;
	80	}
	81	}
729e4ab9 A	82	private:
	83	static const Range ranges[4];
	84	int32_t rangeIndex;
	85	};
	86
	87	const HangulIterator::Range HangulIterator::ranges[4]={
0f5d89e8 A	88	{ Hangul::JAMO_L_BASE, Hangul::JAMO_L_END },
0f5d89e8 A	89	{ Hangul::JAMO_V_BASE, Hangul::JAMO_V_END },
729e4ab9	90	// JAMO_T_BASE+1: not U+11A7
0f5d89e8 A	91	{ Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END },
0f5d89e8 A	92	{ Hangul::HANGUL_BASE, Hangul::HANGUL_END },
729e4ab9 A	93	};
729e4ab9 A	94
729e4ab9	95	Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
0f5d89e8	96	norms(errorCode),
b331163b	97	phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
3d1f044b	98	norm16TrieBytes(nullptr), norm16TrieLength(0) {
729e4ab9	99	memset(unicodeVersion, 0, sizeof(unicodeVersion));
729e4ab9	100	memset(indexes, 0, sizeof(indexes));
4388f060	101	memset(smallFCD, 0, sizeof(smallFCD));
729e4ab9 A	102	}
	103
	104	Normalizer2DataBuilder::~Normalizer2DataBuilder() {
3d1f044b	105	delete[] norm16TrieBytes;
729e4ab9 A	106	}
	107
	108	void
	109	Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
4388f060 A	110	UVersionInfo nullVersion={ 0, 0, 0, 0 };
	111	UVersionInfo version;
	112	u_versionFromString(version, v);
	113	if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
	114	0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
	115	) {
	116	char buffer[U_MAX_VERSION_STRING_LENGTH];
	117	u_versionToString(unicodeVersion, buffer);
	118	fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
	119	buffer, v);
	120	exit(U_ILLEGAL_ARGUMENT_ERROR);
	121	}
	122	memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
729e4ab9 A	123	}
729e4ab9 A	124
729e4ab9 A	125	Norm Normalizer2DataBuilder::checkNormForMapping(Norm p, UChar32 c) {
	126	if(p!=NULL) {
	127	if(p->mappingType!=Norm::NONE) {
	128	if( overrideHandling==OVERRIDE_NONE \|\|
	129	(overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
	130	) {
	131	fprintf(stderr,
	132	"error in gennorm2 phase %d: "
	133	"not permitted to override mapping for U+%04lX from phase %d\n",
	134	(int)phase, (long)c, (int)p->mappingPhase);
	135	exit(U_INVALID_FORMAT_ERROR);
	136	}
	137	delete p->mapping;
	138	p->mapping=NULL;
	139	}
	140	p->mappingPhase=phase;
	141	}
	142	return p;
	143	}
	144
	145	void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
	146	overrideHandling=oh;
	147	++phase;
	148	}
	149
	150	void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
0f5d89e8 A	151	norms.createNorm(c)->cc=cc;
0f5d89e8 A	152	norms.ccSet.add(c);
729e4ab9 A	153	}
	154
	155	static UBool isWellFormed(const UnicodeString &s) {
	156	UErrorCode errorCode=U_ZERO_ERROR;
f3c0d7a5	157	u_strToUTF8(NULL, 0, NULL, toUCharPtr(s.getBuffer()), s.length(), &errorCode);
729e4ab9 A	158	return U_SUCCESS(errorCode) \|\| errorCode==U_BUFFER_OVERFLOW_ERROR;
	159	}
	160
	161	void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
	162	if(!isWellFormed(m)) {
	163	fprintf(stderr,
	164	"error in gennorm2 phase %d: "
	165	"illegal one-way mapping from U+%04lX to malformed string\n",
	166	(int)phase, (long)c);
	167	exit(U_INVALID_FORMAT_ERROR);
	168	}
0f5d89e8	169	Norm *p=checkNormForMapping(norms.createNorm(c), c);
729e4ab9 A	170	p->mapping=new UnicodeString(m);
	171	p->mappingType=Norm::ONE_WAY;
	172	p->setMappingCP();
0f5d89e8	173	norms.mappingSet.add(c);
729e4ab9 A	174	}
	175
	176	void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
	177	if(U_IS_SURROGATE(c)) {
	178	fprintf(stderr,
	179	"error in gennorm2 phase %d: "
	180	"illegal round-trip mapping from surrogate code point U+%04lX\n",
	181	(int)phase, (long)c);
	182	exit(U_INVALID_FORMAT_ERROR);
	183	}
	184	if(!isWellFormed(m)) {
	185	fprintf(stderr,
	186	"error in gennorm2 phase %d: "
	187	"illegal round-trip mapping from U+%04lX to malformed string\n",
	188	(int)phase, (long)c);
	189	exit(U_INVALID_FORMAT_ERROR);
	190	}
f3c0d7a5	191	int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length());
729e4ab9 A	192	if(numCP!=2) {
	193	fprintf(stderr,
	194	"error in gennorm2 phase %d: "
	195	"illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
	196	(int)phase, (long)c, (int)numCP);
	197	exit(U_INVALID_FORMAT_ERROR);
	198	}
0f5d89e8	199	Norm *p=checkNormForMapping(norms.createNorm(c), c);
729e4ab9 A	200	p->mapping=new UnicodeString(m);
	201	p->mappingType=Norm::ROUND_TRIP;
	202	p->mappingCP=U_SENTINEL;
0f5d89e8	203	norms.mappingSet.add(c);
729e4ab9 A	204	}
	205
	206	void Normalizer2DataBuilder::removeMapping(UChar32 c) {
0f5d89e8 A	207	// createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
	208	Norm *p=checkNormForMapping(norms.createNorm(c), c);
	209	p->mappingType=Norm::REMOVED;
	210	norms.mappingSet.add(c);
729e4ab9 A	211	}
729e4ab9 A	212
0f5d89e8 A	213	UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
0f5d89e8 A	214	Norm::MappingType mappingType) const {
729e4ab9	215	if(buffer.isEmpty()) {
0f5d89e8	216	return FALSE; // Maps-to-empty-string is no boundary of any kind.
729e4ab9 A	217	}
	218	int32_t lastStarterIndex=buffer.lastStarterIndex();
	219	if(lastStarterIndex<0) {
0f5d89e8 A	220	return FALSE; // no starter
	221	}
	222	const int32_t lastIndex=buffer.length()-1;
	223	if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
	224	// One-way mapping where after the last starter is at least one combining mark
	225	// with a combining class greater than 1,
	226	// which means that another combining mark can reorder before it.
	227	// By contrast, in a round-trip mapping this does not prevent a boundary as long as
	228	// the starter or composite does not combine-forward with a following combining mark.
	229	return FALSE;
729e4ab9 A	230	}
729e4ab9 A	231	UChar32 starter=buffer.charAt(lastStarterIndex);
0f5d89e8 A	232	if(lastStarterIndex==0 && norms.combinesBack(starter)) {
	233	// The last starter is at the beginning of the mapping and combines backward.
	234	return FALSE;
	235	}
	236	if(Hangul::isJamoL(starter) \|\|
	237	(Hangul::isJamoV(starter) &&
	238	0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
729e4ab9 A	239	// A Jamo leading consonant or an LV pair combines-forward if it is at the end,
729e4ab9 A	240	// otherwise it is blocked.
0f5d89e8	241	return lastStarterIndex!=lastIndex;
729e4ab9	242	}
4388f060	243	// Note: There can be no Hangul syllable in the fully decomposed mapping.
0f5d89e8 A	244
	245	// Multiple starters can combine into one.
	246	// Look for the first of the last sequence of starters, excluding Jamos.
	247	int32_t i=lastStarterIndex;
	248	UChar32 c;
	249	while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) {
	250	starter=c;
	251	--i;
	252	}
	253	// Compose as far as possible, and see if further compositions with
	254	// characters following this mapping are possible.
	255	const Norm *starterNorm=norms.getNorm(starter);
	256	if(i==lastStarterIndex &&
	257	(starterNorm==nullptr \|\| starterNorm->compositions==nullptr)) {
	258	return TRUE; // The last starter does not combine forward.
729e4ab9	259	}
729e4ab9	260	uint8_t prevCC=0;
0f5d89e8 A	261	while(++i<buffer.length()) {
	262	uint8_t cc=buffer.ccAt(i); // !=0 if after last starter
	263	if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
	264	// The starter combines with a mark that reorders before the current one.
	265	return FALSE;
729e4ab9	266	}
0f5d89e8 A	267	UChar32 c=buffer.charAt(i);
	268	if(starterNorm!=nullptr && (prevCC<cc \|\| prevCC==0) &&
	269	norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
	270	// The starter combines with c into a composite replacement starter.
	271	starterNorm=norms.getNorm(starter);
	272	if(i>=lastStarterIndex &&
	273	(starterNorm==nullptr \|\| starterNorm->compositions==nullptr)) {
	274	return TRUE; // The composite does not combine further.
729e4ab9	275	}
0f5d89e8 A	276	// Keep prevCC because we "removed" the combining mark.
	277	} else if(cc==0) {
	278	starterNorm=norms.getNorm(c);
	279	if(i==lastStarterIndex &&
	280	(starterNorm==nullptr \|\| starterNorm->compositions==nullptr)) {
	281	return TRUE; // The new starter does not combine forward.
	282	}
	283	prevCC=0;
729e4ab9 A	284	} else {
729e4ab9 A	285	prevCC=cc;
4388f060	286	}
729e4ab9	287	}
0f5d89e8 A	288	if(prevCC==0) {
0f5d89e8 A	289	return FALSE; // forward-combining starter at the very end
729e4ab9	290	}
0f5d89e8 A	291	if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
	292	// The starter combines with another mark.
	293	return FALSE;
729e4ab9	294	}
0f5d89e8	295	return TRUE;
729e4ab9 A	296	}
729e4ab9 A	297
0f5d89e8 A	298	UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const {
	299	if(buffer.lastStarterIndex()<0) {
	300	return FALSE; // no starter
729e4ab9	301	}
0f5d89e8 A	302	const Norm *starterNorm=nullptr;
	303	uint8_t prevCC=0;
	304	for(int32_t i=0; i<buffer.length(); ++i) {
	305	UChar32 c=buffer.charAt(i);
	306	uint8_t cc=buffer.ccAt(i);
	307	if(starterNorm!=nullptr && (prevCC<cc \|\| prevCC==0) &&
	308	norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) {
	309	return TRUE; // normal composite
	310	} else if(cc==0) {
	311	if(Hangul::isJamoL(c)) {
	312	if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) {
	313	return TRUE; // Hangul syllable
	314	}
	315	starterNorm=nullptr;
729e4ab9	316	} else {
0f5d89e8	317	starterNorm=norms.getNorm(c);
729e4ab9	318	}
729e4ab9	319	}
0f5d89e8	320	prevCC=cc;
729e4ab9	321	}
0f5d89e8	322	return FALSE;
729e4ab9 A	323	}
729e4ab9 A	324
0f5d89e8 A	325	void Normalizer2DataBuilder::postProcess(Norm &norm) {
	326	// Prerequisites: Compositions are built, mappings are recursively decomposed.
	327	// Mappings are not yet in canonical order.
	328	//
	329	// This function works on a Norm struct. We do not know which code point(s) map(s) to it.
	330	// Therefore, we cannot compute algorithmic mapping deltas here.
	331	// Error conditions are checked, but printed later when we do know the offending code point.
	332	if(norm.hasMapping()) {
	333	if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
	334	norm.error="mapping longer than maximum of 31";
	335	return;
729e4ab9	336	}
0f5d89e8 A	337	// Ensure canonical order.
	338	BuilderReorderingBuffer buffer;
	339	if(norm.rawMapping!=nullptr) {
	340	norms.reorder(*norm.rawMapping, buffer);
	341	buffer.reset();
729e4ab9	342	}
0f5d89e8 A	343	norms.reorder(*norm.mapping, buffer);
	344	if(buffer.isEmpty()) {
	345	// A character that is deleted (maps to an empty string) must
	346	// get the worst-case lccc and tccc values because arbitrary
	347	// characters on both sides will become adjacent.
	348	norm.leadCC=1;
	349	norm.trailCC=0xff;
4388f060	350	} else {
0f5d89e8 A	351	norm.leadCC=buffer.ccAt(0);
0f5d89e8 A	352	norm.trailCC=buffer.ccAt(buffer.length()-1);
729e4ab9	353	}
0f5d89e8 A	354
	355	norm.hasCompBoundaryBefore=
	356	!buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
	357	norm.hasCompBoundaryAfter=
	358	norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType);
	359
	360	if(norm.combinesBack) {
	361	norm.error="combines-back and decomposes, not possible in Unicode normalization";
	362	} else if(norm.mappingType==Norm::ROUND_TRIP) {
	363	if(norm.compositions!=NULL) {
	364	norm.type=Norm::YES_NO_COMBINES_FWD;
	365	} else {
	366	norm.type=Norm::YES_NO_MAPPING_ONLY;
	367	}
	368	} else { // one-way mapping
	369	if(norm.compositions!=NULL) {
	370	norm.error="combines-forward and has a one-way mapping, "
	371	"not possible in Unicode normalization";
	372	} else if(buffer.isEmpty()) {
	373	norm.type=Norm::NO_NO_EMPTY;
	374	} else if(!norm.hasCompBoundaryBefore) {
	375	norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC;
	376	} else if(mappingRecomposes(buffer)) {
	377	norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE;
	378	} else {
	379	// The mapping is comp-normalized.
	380	norm.type=Norm::NO_NO_COMP_YES;
729e4ab9 A	381	}
729e4ab9 A	382	}
0f5d89e8 A	383	} else { // no mapping
	384	norm.leadCC=norm.trailCC=norm.cc;
	385
	386	norm.hasCompBoundaryBefore=
	387	norm.cc==0 && !norm.combinesBack;
	388	norm.hasCompBoundaryAfter=
	389	norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr;
	390
	391	if(norm.combinesBack) {
	392	if(norm.compositions!=nullptr) {
	393	// Earlier code checked ccc=0.
	394	norm.type=Norm::MAYBE_YES_COMBINES_FWD;
729e4ab9	395	} else {
0f5d89e8	396	norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc
729e4ab9	397	}
0f5d89e8 A	398	} else if(norm.compositions!=nullptr) {
	399	// Earlier code checked ccc=0.
	400	norm.type=Norm::YES_YES_COMBINES_FWD;
	401	} else if(norm.cc!=0) {
	402	norm.type=Norm::YES_YES_WITH_CC;
	403	} else {
	404	norm.type=Norm::INERT;
729e4ab9 A	405	}
	406	}
	407	}
	408
0f5d89e8	409	class Norm16Writer : public Norms::Enumerator {
729e4ab9	410	public:
3d1f044b A	411	Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b) :
3d1f044b A	412	Norms::Enumerator(n), builder(b), norm16Trie(trie) {}
0f5d89e8	413	void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE {
3d1f044b	414	builder.writeNorm16(norm16Trie, start, end, norm);
729e4ab9	415	}
0f5d89e8	416	Normalizer2DataBuilder &builder;
3d1f044b	417	UMutableCPTrie *norm16Trie;
729e4ab9 A	418	};
729e4ab9 A	419
0f5d89e8 A	420	void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
	421	UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
	422	smallFCD[lead>>8]\|=(uint8_t)1<<((lead>>5)&7);
	423	}
	424
3d1f044b	425	void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm) {
0f5d89e8 A	426	if((norm.leadCC\|norm.trailCC)!=0) {
	427	for(UChar32 c=start; c<=end; ++c) {
	428	setSmallFCD(c);
	429	}
	430	}
	431
	432	int32_t norm16;
	433	switch(norm.type) {
	434	case Norm::INERT:
	435	norm16=Normalizer2Impl::INERT;
	436	break;
	437	case Norm::YES_YES_COMBINES_FWD:
	438	norm16=norm.offset*2;
	439	break;
	440	case Norm::YES_NO_COMBINES_FWD:
	441	norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2;
	442	break;
	443	case Norm::YES_NO_MAPPING_ONLY:
	444	norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2;
	445	break;
	446	case Norm::NO_NO_COMP_YES:
	447	norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2;
	448	break;
	449	case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
	450	norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2;
	451	break;
	452	case Norm::NO_NO_COMP_NO_MAYBE_CC:
	453	norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2;
	454	break;
	455	case Norm::NO_NO_EMPTY:
	456	norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2;
	457	break;
	458	case Norm::NO_NO_DELTA:
	459	{
	460	// Positive offset from minNoNoDelta, shifted left for additional bits.
	461	int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT;
	462	if(norm.trailCC==0) {
	463	// DELTA_TCCC_0==0
	464	} else if(norm.trailCC==1) {
	465	offset\|=Normalizer2Impl::DELTA_TCCC_1;
	466	} else {
	467	offset\|=Normalizer2Impl::DELTA_TCCC_GT_1;
729e4ab9	468	}
0f5d89e8	469	norm16=getMinNoNoDelta()+offset;
729e4ab9	470	break;
729e4ab9	471	}
0f5d89e8 A	472	case Norm::MAYBE_YES_COMBINES_FWD:
	473	norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2;
	474	break;
	475	case Norm::MAYBE_YES_SIMPLE:
	476	norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255
	477	break;
	478	case Norm::YES_YES_WITH_CC:
	479	U_ASSERT(norm.cc!=0);
	480	norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255
	481	break;
	482	default: // Should not occur.
	483	exit(U_INTERNAL_PROGRAM_ERROR);
	484	}
	485	U_ASSERT((norm16&1)==0);
	486	if(norm.hasCompBoundaryAfter) {
	487	norm16\|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
	488	}
	489	IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
3d1f044b	490	umutablecptrie_setRange(norm16Trie, start, end, (uint32_t)norm16, errorCode);
0f5d89e8 A	491
	492	// Set the minimum code points for real data lookups in the quick check loops.
	493	UBool isDecompNo=
	494	(Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) \|\|
	495	norm.cc!=0;
	496	if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
	497	indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
	498	}
	499	UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES;
	500	if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
	501	indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
	502	}
	503	if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) {
	504	indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start;
729e4ab9 A	505	}
	506	}
	507
3d1f044b	508	void Normalizer2DataBuilder::setHangulData(UMutableCPTrie *norm16Trie) {
729e4ab9 A	509	HangulIterator hi;
	510	const HangulIterator::Range *range;
	511	// Check that none of the Hangul/Jamo code points have data.
	512	while((range=hi.nextRange())!=NULL) {
0f5d89e8	513	for(UChar32 c=range->start; c<=range->end; ++c) {
3d1f044b	514	if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) {
729e4ab9 A	515	fprintf(stderr,
	516	"gennorm2 error: "
	517	"illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
	518	(long)c);
	519	exit(U_INVALID_FORMAT_ERROR);
	520	}
	521	}
	522	}
	523	// Set data for algorithmic runtime handling.
	524	IcuToolErrorCode errorCode("gennorm2/setHangulData()");
0f5d89e8 A	525
	526	// Jamo V/T are maybeYes
	527	if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
	528	indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE;
729e4ab9	529	}
3d1f044b A	530	umutablecptrie_setRange(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END,
	531	Normalizer2Impl::JAMO_L, errorCode);
	532	umutablecptrie_setRange(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END,
	533	Normalizer2Impl::JAMO_VT, errorCode);
0f5d89e8	534	// JAMO_T_BASE+1: not U+11A7
3d1f044b A	535	umutablecptrie_setRange(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END,
3d1f044b A	536	Normalizer2Impl::JAMO_VT, errorCode);
0f5d89e8 A	537
	538	// Hangul LV encoded as minYesNo
	539	uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO];
	540	// Hangul LVT encoded as minYesNoMappingsOnly\|HAS_COMP_BOUNDARY_AFTER
	541	uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]\|
	542	Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
	543	if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
	544	indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE;
	545	}
	546	// Set the first LV, then write all other Hangul syllables as LVT,
	547	// then overwrite the remaining LV.
3d1f044b A	548	umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode);
3d1f044b A	549	umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode);
0f5d89e8 A	550	UChar32 c=Hangul::HANGUL_BASE;
0f5d89e8 A	551	while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) {
3d1f044b	552	umutablecptrie_set(norm16Trie, c, lv, errorCode);
0f5d89e8 A	553	}
0f5d89e8 A	554	errorCode.assertSuccess();
729e4ab9 A	555	}
729e4ab9 A	556
3d1f044b	557	LocalUCPTriePointer Normalizer2DataBuilder::processData() {
0f5d89e8 A	558	// Build composition lists before recursive decomposition,
	559	// so that we still have the raw, pair-wise mappings.
	560	CompositionBuilder compBuilder(norms);
	561	norms.enumRanges(compBuilder);
729e4ab9	562
0f5d89e8 A	563	// Recursively decompose all mappings.
0f5d89e8 A	564	Decomposer decomposer(norms);
729e4ab9 A	565	do {
729e4ab9 A	566	decomposer.didDecompose=FALSE;
0f5d89e8	567	norms.enumRanges(decomposer);
729e4ab9 A	568	} while(decomposer.didDecompose);
729e4ab9 A	569
0f5d89e8 A	570	// Set the Norm::Type and other properties.
0f5d89e8 A	571	int32_t normsLength=norms.length();
729e4ab9	572	for(int32_t i=1; i<normsLength; ++i) {
0f5d89e8	573	postProcess(norms.getNormRefByIndex(i));
729e4ab9 A	574	}
729e4ab9 A	575
0f5d89e8 A	576	// Write the properties, mappings and composition lists to
	577	// appropriate parts of the "extra data" array.
	578	ExtraData extra(norms, optimization==OPTIMIZE_FAST);
	579	norms.enumRanges(extra);
	580
	581	extraData=extra.yesYesCompositions;
	582	indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2;
	583	extraData.append(extra.yesNoMappingsAndCompositions);
	584	indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2;
	585	extraData.append(extra.yesNoMappingsOnly);
	586	indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2;
	587	extraData.append(extra.noNoMappingsCompYes);
	588	indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2;
	589	extraData.append(extra.noNoMappingsCompBoundaryBefore);
	590	indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2;
	591	extraData.append(extra.noNoMappingsCompNoMaybeCC);
	592	indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2;
	593	extraData.append(extra.noNoMappingsEmpty);
	594	indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2;
	595
	596	// Pad the maybeYesCompositions length to a multiple of 4,
	597	// so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
	598	while(extra.maybeYesCompositions.length()&3) {
	599	extra.maybeYesCompositions.append((UChar)0);
	600	}
	601	extraData.insert(0, extra.maybeYesCompositions);
	602	indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
	603	Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
	604	extra.maybeYesCompositions.length()*2;
729e4ab9	605
729e4ab9 A	606	// Pad to even length for 4-byte alignment of following data.
	607	if(extraData.length()&1) {
	608	extraData.append((UChar)0);
	609	}
	610
0f5d89e8 A	611	int32_t minNoNoDelta=getMinNoNoDelta();
0f5d89e8 A	612	U_ASSERT((minNoNoDelta&7)==0);
729e4ab9 A	613	if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
	614	fprintf(stderr,
	615	"gennorm2 error: "
	616	"data structure overflow, too much mapping composition data\n");
	617	exit(U_BUFFER_OVERFLOW_ERROR);
	618	}
	619
0f5d89e8 A	620	// writeNorm16() and setHangulData() reduce these as needed.
	621	indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
	622	indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
	623	indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000;
	624
3d1f044b A	625	IcuToolErrorCode errorCode("gennorm2/processData()");
	626	UMutableCPTrie *norm16Trie = umutablecptrie_open(
	627	Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode);
	628	errorCode.assertSuccess();
	629
0f5d89e8 A	630	// Map each code point to its norm16 value,
	631	// including the properties that fit directly,
	632	// and the offset to the "extra data" if necessary.
3d1f044b	633	Norm16Writer norm16Writer(norm16Trie, norms, *this);
0f5d89e8	634	norms.enumRanges(norm16Writer);
3d1f044b	635	// TODO: iterate via getRange() instead of callback?
729e4ab9	636
3d1f044b	637	setHangulData(norm16Trie);
729e4ab9 A	638
	639	// Look for the "worst" norm16 value of any supplementary code point
	640	// corresponding to a lead surrogate, and set it as that surrogate's value.
0f5d89e8	641	// Enables UTF-16 quick check inner loops to look at only code units.
729e4ab9 A	642	//
	643	// We could be more sophisticated:
	644	// We could collect a bit set for whether there are values in the different
	645	// norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
	646	// and select the best value that only breaks the composition and/or decomposition
	647	// inner loops if necessary.
	648	// However, that seems like overkill for an optimization for supplementary characters.
3d1f044b A	649	//
	650	// First check that surrogate code points are inert.
	651	// The parser should have rejected values/mappings for them.
	652	uint32_t value;
	653	UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0,
	654	nullptr, nullptr, &value);
	655	if (value != Normalizer2Impl::INERT \|\| end < 0xdfff) {
	656	fprintf(stderr,
	657	"gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n",
	658	(int)end, (long)value);
	659	exit(U_INTERNAL_PROGRAM_ERROR);
	660	}
	661	uint32_t maxNorm16 = 0;
	662	// ANDing values yields 0 bits where any value has a 0.
	663	// Used for worst-case HAS_COMP_BOUNDARY_AFTER.
	664	uint32_t andedNorm16 = 0;
	665	end = 0;
	666	for (UChar32 start = 0x10000;;) {
	667	if (start > end) {
	668	end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0,
	669	nullptr, nullptr, &value);
	670	if (end < 0) { break; }
	671	}
	672	if ((start & 0x3ff) == 0) {
	673	// Data for a new lead surrogate.
	674	maxNorm16 = andedNorm16 = value;
	675	} else {
	676	if (value > maxNorm16) {
	677	maxNorm16 = value;
	678	}
	679	andedNorm16 &= value;
	680	}
	681	// Intersect each range with the code points for one lead surrogate.
	682	UChar32 leadEnd = start \| 0x3ff;
	683	if (leadEnd <= end) {
	684	// End of the supplementary block for a lead surrogate.
	685	if (maxNorm16 >= (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]) {
	686	// Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
	687	// Otherwise it might end up at something like JAMO_VT which stays in
	688	// the inner decomposition quick check loop.
	689	maxNorm16 = (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO];
	690	}
	691	maxNorm16 =
	692	(maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)\|
	693	(andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER);
	694	if (maxNorm16 != Normalizer2Impl::INERT) {
	695	umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode);
	696	}
	697	if (value == Normalizer2Impl::INERT) {
	698	// Potentially skip inert supplementary blocks for several lead surrogates.
	699	start = (end + 1) & ~0x3ff;
	700	} else {
	701	start = leadEnd + 1;
	702	}
	703	} else {
	704	start = end + 1;
729e4ab9	705	}
729e4ab9 A	706	}
	707
	708	// Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
	709	// For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
	710	// which is harmless.
	711	// As a result, the minimum code points are always BMP code points.
	712	int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
	713	if(minCP>=0x10000) {
	714	indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
	715	}
	716	minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
	717	if(minCP>=0x10000) {
	718	indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
	719	}
0f5d89e8 A	720	minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP];
	721	if(minCP>=0x10000) {
	722	indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP);
	723	}
729e4ab9	724
3d1f044b A	725	LocalUCPTriePointer builtTrie(
	726	umutablecptrie_buildImmutable(norm16Trie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, errorCode));
	727	norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode);
729e4ab9	728	if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
3d1f044b	729	fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n",
729e4ab9 A	730	errorCode.errorName());
	731	exit(errorCode.reset());
	732	}
3d1f044b	733	umutablecptrie_close(norm16Trie);
729e4ab9	734	errorCode.reset();
3d1f044b A	735	norm16TrieBytes=new uint8_t[norm16TrieLength];
	736	ucptrie_toBinary(builtTrie.getAlias(), norm16TrieBytes, norm16TrieLength, errorCode);
	737	errorCode.assertSuccess();
729e4ab9 A	738
	739	int32_t offset=(int32_t)sizeof(indexes);
	740	indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
	741	offset+=norm16TrieLength;
	742	indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
4388f060 A	743	offset+=extraData.length()*2;
	744	indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
	745	offset+=sizeof(smallFCD);
	746	int32_t totalSize=offset;
	747	for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
729e4ab9 A	748	indexes[i]=totalSize;
	749	}
	750
	751	if(beVerbose) {
	752	printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength);
	753	printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length());
4388f060	754	printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD));
729e4ab9 A	755	printf("size of binary data file contents: %5ld bytes\n", (long)totalSize);
	756	printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
	757	printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
0f5d89e8 A	758	printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]);
0f5d89e8 A	759	printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
4388f060	760	printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
0f5d89e8 A	761	printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
	762	printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
	763	printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
	764	printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]);
729e4ab9	765	printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
0f5d89e8	766	printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta);
729e4ab9 A	767	printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
	768	}
	769
4388f060 A	770	UVersionInfo nullVersion={ 0, 0, 0, 0 };
	771	if(0==memcmp(nullVersion, unicodeVersion, 4)) {
	772	u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
	773	}
729e4ab9	774	memcpy(dataInfo.dataVersion, unicodeVersion, 4);
3d1f044b	775	return builtTrie;
b331163b A	776	}
	777
	778	void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
	779	processData();
	780
	781	IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
729e4ab9 A	782	UNewDataMemory *pData=
	783	udata_create(NULL, NULL, filename, &dataInfo,
	784	haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
	785	if(errorCode.isFailure()) {
	786	fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
	787	filename, errorCode.errorName());
	788	exit(errorCode.reset());
	789	}
	790	udata_writeBlock(pData, indexes, sizeof(indexes));
3d1f044b	791	udata_writeBlock(pData, norm16TrieBytes, norm16TrieLength);
f3c0d7a5	792	udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length());
4388f060	793	udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
729e4ab9 A	794	int32_t writtenSize=udata_finish(pData, errorCode);
	795	if(errorCode.isFailure()) {
	796	fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
	797	exit(errorCode.reset());
	798	}
b331163b	799	int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
729e4ab9 A	800	if(writtenSize!=totalSize) {
	801	fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
	802	(long)writtenSize, (long)totalSize);
	803	exit(U_INTERNAL_PROGRAM_ERROR);
	804	}
	805	}
	806
b331163b A	807	void
b331163b A	808	Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
3d1f044b	809	LocalUCPTriePointer norm16Trie = processData();
b331163b A	810
	811	IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
	812	const char *basename=findBasename(filename);
	813	CharString path(filename, (int32_t)(basename-filename), errorCode);
	814	CharString dataName(basename, errorCode);
	815	const char *extension=strrchr(basename, '.');
	816	if(extension!=NULL) {
	817	dataName.truncate((int32_t)(extension-basename));
	818	}
3d1f044b	819	const char *name=dataName.data();
b331163b A	820	errorCode.assertSuccess();
b331163b A	821
3d1f044b	822	FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp");
b331163b A	823	if(f==NULL) {
	824	fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
	825	filename);
	826	exit(U_FILE_ACCESS_ERROR);
b331163b	827	}
f3c0d7a5	828	fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f);
3d1f044b	829
b331163b	830	char line[100];
3d1f044b	831	sprintf(line, "static const UVersionInfo %s_formatVersion={", name);
b331163b	832	usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
3d1f044b	833	sprintf(line, "static const UVersionInfo %s_dataVersion={", name);
b331163b	834	usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
3d1f044b A	835	sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name);
	836	usrc_writeArray(f, line, indexes, 32, Normalizer2Impl::IX_COUNT, "\n};\n\n");
	837
	838	usrc_writeUCPTrie(f, name, norm16Trie.getAlias());
	839
	840	sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", name);
	841	usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "\n};\n\n");
	842	sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", name);
	843	usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "\n};\n\n");
	844
	845	fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f);
b331163b A	846	fclose(f);
	847	}
	848
0f5d89e8 A	849	namespace {
	850
	851	bool equalStrings(const UnicodeString s1, const UnicodeString s2) {
	852	if(s1 == nullptr) {
	853	return s2 == nullptr;
	854	} else if(s2 == nullptr) {
	855	return false;
	856	} else {
	857	return s1 == s2;
	858	}
	859	}
	860
	861	const char *typeChars = "?-=>";
	862
	863	void writeMapping(FILE f, const UnicodeString m) {
	864	if(m != nullptr && !m->isEmpty()) {
	865	int32_t i = 0;
	866	UChar32 c = m->char32At(i);
	867	fprintf(f, "%04lX", (long)c);
	868	while((i += U16_LENGTH(c)) < m->length()) {
	869	c = m->char32At(i);
	870	fprintf(f, " %04lX", (long)c);
	871	}
	872	}
	873	fputs("\n", f);
	874	}
	875
	876	} // namespace
	877
	878	void
	879	Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
	880	// Do not processData() before writing the input-syntax data file.
	881	FILE *f = fopen(filename, "w");
	882	if(f == nullptr) {
	883	fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
	884	filename);
	885	exit(U_FILE_ACCESS_ERROR);
	886	return;
	887	}
	888
	889	if(unicodeVersion[0] != 0 \|\| unicodeVersion[1] != 0 \|\|
	890	unicodeVersion[2] != 0 \|\| unicodeVersion[3] != 0) {
	891	char uv[U_MAX_VERSION_STRING_LENGTH];
	892	u_versionToString(unicodeVersion, uv);
	893	fprintf(f, "* Unicode %s\n\n", uv);
	894	}
	895
	896	UnicodeSetIterator ccIter(norms.ccSet);
	897	UChar32 start = U_SENTINEL;
	898	UChar32 end = U_SENTINEL;
	899	uint8_t prevCC = 0;
	900	bool done = false;
	901	bool didWrite = false;
	902	do {
	903	UChar32 c;
	904	uint8_t cc;
	905	if(ccIter.next() && !ccIter.isString()) {
	906	c = ccIter.getCodepoint();
	907	cc = norms.getCC(c);
	908	} else {
	909	c = 0x110000;
	910	cc = 0;
	911	done = true;
	912	}
913	if(cc == prevCC && c == (end + 1)) {
914	end = c;
915	} else {
916	if(prevCC != 0) {
917	if(start == end) {
918	fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
919	} else {
920	fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
921	}
922	didWrite = true;
923	}
924	start = end = c;
925	prevCC = cc;
926	}
927	} while(!done);
928	if(didWrite) {
929	fputs("\n", f);
930	}
931
932	UnicodeSetIterator mIter(norms.mappingSet);
933	start = U_SENTINEL;
934	end = U_SENTINEL;
935	const UnicodeString *prevMapping = nullptr;
936	Norm::MappingType prevType = Norm::NONE;
937	done = false;
938	do {
939	UChar32 c;
940	const Norm *norm;
941	if(mIter.next() && !mIter.isString()) {
942	c = mIter.getCodepoint();
943	norm = norms.getNorm(c);
944	} else {
945	c = 0x110000;
946	norm = nullptr;
947	done = true;
948	}
949	const UnicodeString *mapping;
950	Norm::MappingType type;
951	if(norm == nullptr) {
952	mapping = nullptr;
953	type = Norm::NONE;
954	} else {
955	type = norm->mappingType;
956	if(type == Norm::NONE) {
957	mapping = nullptr;
958	} else {
959	mapping = norm->mapping;
960	}
961	}
962	if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
963	end = c;
964	} else {
965	if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
966	if(start == end) {
967	fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
968	} else {
969	fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
970	}
971	writeMapping(f, prevMapping);
972	}
973	start = end = c;
974	prevMapping = mapping;
975	prevType = type;
976	}
977	} while(!done);
978
979	fclose(f);
980	}
981
982	void
983	Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
984	const Normalizer2DataBuilder &b2,
985	Normalizer2DataBuilder &diff) {
986	// Compute diff = b1 - b2
987	// so that we should be able to get b1 = b2 + diff.
988	if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
989	memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
990	}
991
992	UnicodeSet ccSet(b1.norms.ccSet);
993	ccSet.addAll(b2.norms.ccSet);
994	UnicodeSetIterator ccIter(ccSet);
995	while(ccIter.next() && !ccIter.isString()) {
996	UChar32 c = ccIter.getCodepoint();
997	uint8_t cc1 = b1.norms.getCC(c);
998	uint8_t cc2 = b2.norms.getCC(c);
999	if(cc1 != cc2) {
1000	diff.setCC(c, cc1);
1001	}
1002	}
1003
1004	UnicodeSet mSet(b1.norms.mappingSet);
1005	mSet.addAll(b2.norms.mappingSet);
1006	UnicodeSetIterator mIter(mSet);
1007	while(mIter.next() && !mIter.isString()) {
1008	UChar32 c = mIter.getCodepoint();
1009	const Norm *norm1 = b1.norms.getNorm(c);
1010	const Norm *norm2 = b2.norms.getNorm(c);
1011	const UnicodeString *mapping1;
1012	Norm::MappingType type1;
1013	if(norm1 == nullptr \|\| !norm1->hasMapping()) {
1014	mapping1 = nullptr;
1015	type1 = Norm::NONE;
1016	} else {
1017	mapping1 = norm1->mapping;
1018	type1 = norm1->mappingType;
1019	}
1020	const UnicodeString *mapping2;
1021	Norm::MappingType type2;
1022	if(norm2 == nullptr \|\| !norm2->hasMapping()) {
1023	mapping2 = nullptr;
1024	type2 = Norm::NONE;
1025	} else {
1026	mapping2 = norm2->mapping;
1027	type2 = norm2->mappingType;
1028	}
1029	if(type1 == type2 && equalStrings(mapping1, mapping2)) {
1030	// Nothing to do.
1031	} else if(type1 == Norm::NONE) {
1032	diff.removeMapping(c);
1033	} else if(type1 == Norm::ROUND_TRIP) {
1034	diff.setRoundTripMapping(c, *mapping1);
1035	} else if(type1 == Norm::ONE_WAY) {
1036	diff.setOneWayMapping(c, *mapping1);
1037	}
1038	}
1039	}
1040
729e4ab9 A	1041	U_NAMESPACE_END
	1042
	1043	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	1044
	1045	/*
	1046	* Hey, Emacs, please set the following:
	1047	*
	1048	* Local Variables:
	1049	* indent-tabs-mode: nil
	1050	* End:
	1051	*/