git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rbt

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f	3	/*
374ca955	4	**********************************************************************
2ca993e8	5	* Copyright (C) 1999-2016, International Business Machines
374ca955 A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	* Date Name Description
	9	* 11/17/99 aliu Creation.
	10	**********************************************************************
	11	*/
b75a7d8f A	12
	13	#include "unicode/utypes.h"
	14
	15	#if !UCONFIG_NO_TRANSLITERATION
	16
	17	#include "unicode/uobject.h"
	18	#include "unicode/parseerr.h"
	19	#include "unicode/parsepos.h"
	20	#include "unicode/putil.h"
	21	#include "unicode/uchar.h"
	22	#include "unicode/ustring.h"
	23	#include "unicode/uniset.h"
4388f060	24	#include "unicode/utf16.h"
b75a7d8f A	25	#include "cstring.h"
	26	#include "funcrepl.h"
	27	#include "hash.h"
	28	#include "quant.h"
	29	#include "rbt.h"
	30	#include "rbt_data.h"
	31	#include "rbt_pars.h"
	32	#include "rbt_rule.h"
	33	#include "strmatch.h"
	34	#include "strrepl.h"
374ca955	35	#include "unicode/symtable.h"
b75a7d8f A	36	#include "tridpars.h"
b75a7d8f A	37	#include "uvector.h"
73c04bcf	38	#include "hash.h"
4388f060	39	#include "patternprops.h"
b75a7d8f A	40	#include "util.h"
	41	#include "cmemory.h"
	42	#include "uprops.h"
374ca955	43	#include "putilimp.h"
b75a7d8f A	44
	45	// Operators
	46	#define VARIABLE_DEF_OP ((UChar)0x003D) /=/
	47	#define FORWARD_RULE_OP ((UChar)0x003E) />/
	48	#define REVERSE_RULE_OP ((UChar)0x003C) /</
	49	#define FWDREV_RULE_OP ((UChar)0x007E) /~/ // internal rep of <> op
	50
	51	// Other special characters
	52	#define QUOTE ((UChar)0x0027) /'/
	53	#define ESCAPE ((UChar)0x005C) /\/
	54	#define END_OF_RULE ((UChar)0x003B) /;/
	55	#define RULE_COMMENT_CHAR ((UChar)0x0023) /#/
	56
	57	#define SEGMENT_OPEN ((UChar)0x0028) /(/
	58	#define SEGMENT_CLOSE ((UChar)0x0029) /)/
	59	#define CONTEXT_ANTE ((UChar)0x007B) /{/
	60	#define CONTEXT_POST ((UChar)0x007D) /}/
	61	#define CURSOR_POS ((UChar)0x007C) /\|/
	62	#define CURSOR_OFFSET ((UChar)0x0040) /@/
	63	#define ANCHOR_START ((UChar)0x005E) /^/
	64	#define KLEENE_STAR ((UChar)0x002A) /***/
	65	#define ONE_OR_MORE ((UChar)0x002B) /+/
	66	#define ZERO_OR_ONE ((UChar)0x003F) /?/
	67
	68	#define DOT ((UChar)46) /./
	69
	70	static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
	71	91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
	72	108, 58, 93, 92, 114, 92, 110, 36, 93, 0
	73	};
	74
	75	// A function is denoted &Source-Target/Variant(text)
	76	#define FUNCTION ((UChar)38) /&/
	77
	78	// Aliases for some of the syntax characters. These are provided so
	79	// transliteration rules can be expressed in XML without clashing with
	80	// XML syntax characters '<', '>', and '&'.
	81	#define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow
	82	#define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow
	83	#define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow
	84	#define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta)
	85
	86	// Special characters disallowed at the top level
	87	static const UChar ILLEGAL_TOP[] = {41,0}; // ")"
	88
	89	// Special characters disallowed within a segment
	90	static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}\|@"
	91
	92	// Special characters disallowed within a function argument
	93	static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}\|@"
	94
	95	// By definition, the ANCHOR_END special character is a
	96	// trailing SymbolTable.SYMBOL_REF character.
	97	// private static final char ANCHOR_END = '$';
	98
	99	static const UChar gOPERATORS[] = { // "=><"
	100	VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
	101	ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
	102	0
	103	};
	104
	105	static const UChar HALF_ENDERS[] = { // "=><;"
	106	VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
	107	ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
108	END_OF_RULE,
109	0
110	};
111
112	// These are also used in Transliterator::toRules()
113	static const int32_t ID_TOKEN_LEN = 2;
114	static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
115
73c04bcf A	116	/*
	117	commented out until we do real ::BEGIN/::END functionality
	118	static const int32_t BEGIN_TOKEN_LEN = 5;
	119	static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'
	120
	121	static const int32_t END_TOKEN_LEN = 3;
	122	static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
	123	*/
	124
b75a7d8f A	125	U_NAMESPACE_BEGIN
	126
	127	//----------------------------------------------------------------------
	128	// BEGIN ParseData
	129	//----------------------------------------------------------------------
	130
	131	/**
	132	* This class implements the SymbolTable interface. It is used
	133	* during parsing to give UnicodeSet access to variables that
	134	* have been defined so far. Note that it uses variablesVector,
	135	* _not_ data.setVariables.
	136	*/
	137	class ParseData : public UMemory, public SymbolTable {
	138	public:
	139	const TransliterationRuleData* data; // alias
	140
	141	const UVector* variablesVector; // alias
	142
73c04bcf A	143	const Hashtable* variableNames; // alias
73c04bcf A	144
b75a7d8f	145	ParseData(const TransliterationRuleData* data = 0,
73c04bcf A	146	const UVector* variablesVector = 0,
73c04bcf A	147	const Hashtable* variableNames = 0);
b75a7d8f	148
4388f060 A	149	virtual ~ParseData();
4388f060 A	150
b75a7d8f A	151	virtual const UnicodeString* lookup(const UnicodeString& s) const;
	152
	153	virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
	154
	155	virtual UnicodeString parseReference(const UnicodeString& text,
	156	ParsePosition& pos, int32_t limit) const;
	157	/**
	158	* Return true if the given character is a matcher standin or a plain
	159	* character (non standin).
	160	*/
	161	UBool isMatcher(UChar32 ch);
	162
	163	/**
	164	* Return true if the given character is a replacer standin or a plain
	165	* character (non standin).
	166	*/
	167	UBool isReplacer(UChar32 ch);
	168
	169	private:
	170	ParseData(const ParseData &other); // forbid copying of this class
	171	ParseData &operator=(const ParseData &other); // forbid copying of this class
	172	};
	173
	174	ParseData::ParseData(const TransliterationRuleData* d,
73c04bcf A	175	const UVector* sets,
	176	const Hashtable* vNames) :
	177	data(d), variablesVector(sets), variableNames(vNames) {}
b75a7d8f	178
4388f060 A	179	ParseData::~ParseData() {}
4388f060 A	180
b75a7d8f A	181	/**
	182	* Implement SymbolTable API.
	183	*/
	184	const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
73c04bcf	185	return (const UnicodeString*) variableNames->get(name);
b75a7d8f A	186	}
	187
	188	/**
	189	* Implement SymbolTable API.
	190	*/
	191	const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const {
	192	// Note that we cannot use data.lookupSet() because the
	193	// set array has not been constructed yet.
	194	const UnicodeFunctor* set = NULL;
	195	int32_t i = ch - data->variablesBase;
	196	if (i >= 0 && i < variablesVector->size()) {
3d1f044b A	197	int32_t j = ch - data->variablesBase;
	198	set = (j < variablesVector->size()) ?
	199	(UnicodeFunctor*) variablesVector->elementAt(j) : 0;
b75a7d8f A	200	}
	201	return set;
	202	}
	203
	204	/**
	205	* Implement SymbolTable API. Parse out a symbol reference
	206	* name.
	207	*/
	208	UnicodeString ParseData::parseReference(const UnicodeString& text,
	209	ParsePosition& pos, int32_t limit) const {
	210	int32_t start = pos.getIndex();
	211	int32_t i = start;
	212	UnicodeString result;
	213	while (i < limit) {
	214	UChar c = text.charAt(i);
	215	if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {
	216	break;
	217	}
	218	++i;
	219	}
	220	if (i == start) { // No valid name chars
	221	return result; // Indicate failure with empty string
	222	}
	223	pos.setIndex(i);
	224	text.extractBetween(start, i, result);
	225	return result;
	226	}
	227
	228	UBool ParseData::isMatcher(UChar32 ch) {
	229	// Note that we cannot use data.lookup() because the
	230	// set array has not been constructed yet.
	231	int32_t i = ch - data->variablesBase;
	232	if (i >= 0 && i < variablesVector->size()) {
	233	UnicodeFunctor f = (UnicodeFunctor) variablesVector->elementAt(i);
	234	return f != NULL && f->toMatcher() != NULL;
	235	}
	236	return TRUE;
	237	}
	238
	239	/**
	240	* Return true if the given character is a replacer standin or a plain
	241	* character (non standin).
	242	*/
	243	UBool ParseData::isReplacer(UChar32 ch) {
	244	// Note that we cannot use data.lookup() because the
	245	// set array has not been constructed yet.
	246	int i = ch - data->variablesBase;
	247	if (i >= 0 && i < variablesVector->size()) {
	248	UnicodeFunctor f = (UnicodeFunctor) variablesVector->elementAt(i);
	249	return f != NULL && f->toReplacer() != NULL;
	250	}
	251	return TRUE;
	252	}
	253
	254	//----------------------------------------------------------------------
	255	// BEGIN RuleHalf
	256	//----------------------------------------------------------------------
	257
	258	/**
	259	* A class representing one side of a rule. This class knows how to
	260	* parse half of a rule. It is tightly coupled to the method
	261	* RuleBasedTransliterator.Parser.parseRule().
	262	*/
	263	class RuleHalf : public UMemory {
264
265	public:
266
267	UnicodeString text;
268
269	int32_t cursor; // position of cursor in text
270	int32_t ante; // position of ante context marker '{' in text
271	int32_t post; // position of post context marker '}' in text
272
273	// Record the offset to the cursor either to the left or to the
274	// right of the key. This is indicated by characters on the output
275	// side that allow the cursor to be positioned arbitrarily within
276	// the matching text. For example, abc{def} > \| @@@ xyz; changes
277	// def to xyz and moves the cursor to before abc. Offset characters
278	// must be at the start or end, and they cannot move the cursor past
279	// the ante- or postcontext text. Placeholders are only valid in
280	// output text. The length of the ante and post context is
281	// determined at runtime, because of supplementals and quantifiers.
282	int32_t cursorOffset; // only nonzero on output side
283
284	// Position of first CURSOR_OFFSET on _right_. This will be -1
285	// for \|@, -2 for \|@@, etc., and 1 for @\|, 2 for @@\|, etc.
286	int32_t cursorOffsetPos;
287
288	UBool anchorStart;
289	UBool anchorEnd;
b75a7d8f A	290
	291	/**
	292	* The segment number from 1..n of the next '(' we see
	293	* during parsing; 1-based.
	294	*/
	295	int32_t nextSegmentNumber;
	296
	297	TransliteratorParser& parser;
	298
	299	//--------------------------------------------------
	300	// Methods
	301
	302	RuleHalf(TransliteratorParser& parser);
	303	~RuleHalf();
	304
73c04bcf	305	int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
b75a7d8f A	306
	307	int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
	308	UnicodeString& buf,
	309	const UnicodeString& illegal,
73c04bcf A	310	UBool isSegment,
73c04bcf A	311	UErrorCode& status);
b75a7d8f A	312
	313	/**
	314	* Remove context.
	315	*/
	316	void removeContext();
	317
	318	/**
	319	* Return true if this half looks like valid output, that is, does not
	320	* contain quantifiers or other special input-only elements.
	321	*/
	322	UBool isValidOutput(TransliteratorParser& parser);
	323
	324	/**
	325	* Return true if this half looks like valid input, that is, does not
	326	* contain functions or other special output-only elements.
	327	*/
	328	UBool isValidInput(TransliteratorParser& parser);
	329
	330	int syntaxError(UErrorCode code,
	331	const UnicodeString& rule,
73c04bcf A	332	int32_t start,
	333	UErrorCode& status) {
	334	return parser.syntaxError(code, rule, start, status);
b75a7d8f A	335	}
	336
	337	private:
	338	// Disallowed methods; no impl.
	339	RuleHalf(const RuleHalf&);
	340	RuleHalf& operator=(const RuleHalf&);
	341	};
	342
	343	RuleHalf::RuleHalf(TransliteratorParser& p) :
b75a7d8f A	344	parser(p)
	345	{
	346	cursor = -1;
	347	ante = -1;
	348	post = -1;
	349	cursorOffset = 0;
	350	cursorOffsetPos = 0;
	351	anchorStart = anchorEnd = FALSE;
	352	nextSegmentNumber = 1;
	353	}
	354
	355	RuleHalf::~RuleHalf() {
	356	}
	357
	358	/**
	359	* Parse one side of a rule, stopping at either the limit,
	360	* the END_OF_RULE character, or an operator.
	361	* @return the index after the terminating character, or
	362	* if limit was reached, limit
	363	*/
73c04bcf	364	int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
b75a7d8f A	365	int32_t start = pos;
b75a7d8f A	366	text.truncate(0);
4388f060	367	pos = parseSection(rule, pos, limit, text, UnicodeString(TRUE, ILLEGAL_TOP, -1), FALSE, status);
b75a7d8f A	368
b75a7d8f A	369	if (cursorOffset > 0 && cursor != cursorOffsetPos) {
73c04bcf	370	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
b75a7d8f A	371	}
	372
	373	return pos;
	374	}
	375
	376	/**
	377	* Parse a section of one side of a rule, stopping at either
	378	* the limit, the END_OF_RULE character, an operator, or a
	379	* segment close character. This method parses both a
	380	* top-level rule half and a segment within such a rule half.
	381	* It calls itself recursively to parse segments and nested
	382	* segments.
	383	* @param buf buffer into which to accumulate the rule pattern
	384	* characters, either literal characters from the rule or
	385	* standins for UnicodeMatcher objects including segments.
	386	* @param illegal the set of special characters that is illegal during
	387	* this parse.
	388	* @param isSegment if true, then we've already seen a '(' and
	389	* pos on entry points right after it. Accumulate everything
	390	* up to the closing ')', put it in a segment matcher object,
	391	* generate a standin for it, and add the standin to buf. As
	392	* a side effect, update the segments vector with a reference
	393	* to the segment matcher. This works recursively for nested
	394	* segments. If isSegment is false, just accumulate
	395	* characters into buf.
	396	* @return the index after the terminating character, or
	397	* if limit was reached, limit
	398	*/
	399	int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
	400	UnicodeString& buf,
	401	const UnicodeString& illegal,
73c04bcf	402	UBool isSegment, UErrorCode& status) {
b75a7d8f A	403	int32_t start = pos;
	404	ParsePosition pp;
	405	UnicodeString scratch;
	406	UBool done = FALSE;
	407	int32_t quoteStart = -1; // Most recent 'single quoted string'
	408	int32_t quoteLimit = -1;
	409	int32_t varStart = -1; // Most recent $variableReference
	410	int32_t varLimit = -1;
	411	int32_t bufStart = buf.length();
	412
	413	while (pos < limit && !done) {
	414	// Since all syntax characters are in the BMP, fetching
	415	// 16-bit code units suffices here.
	416	UChar c = rule.charAt(pos++);
4388f060	417	if (PatternProps::isWhiteSpace(c)) {
b75a7d8f A	418	// Ignore whitespace. Note that this is not Unicode
	419	// spaces, but Java spaces -- a subset, representing
	420	// whitespace likely to be seen in code.
	421	continue;
	422	}
	423	if (u_strchr(HALF_ENDERS, c) != NULL) {
	424	if (isSegment) {
	425	// Unclosed segment
73c04bcf	426	return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status);
b75a7d8f A	427	}
	428	break;
	429	}
	430	if (anchorEnd) {
	431	// Text after a presumed end anchor is a syntax err
73c04bcf	432	return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status);
b75a7d8f A	433	}
	434	if (UnicodeSet::resemblesPattern(rule, pos-1)) {
	435	pp.setIndex(pos-1); // Backup to opening '['
73c04bcf A	436	buf.append(parser.parseSet(rule, pp, status));
	437	if (U_FAILURE(status)) {
	438	return syntaxError(U_MALFORMED_SET, rule, start, status);
b75a7d8f A	439	}
	440	pos = pp.getIndex();
	441	continue;
	442	}
	443	// Handle escapes
	444	if (c == ESCAPE) {
	445	if (pos == limit) {
73c04bcf	446	return syntaxError(U_TRAILING_BACKSLASH, rule, start, status);
b75a7d8f A	447	}
	448	UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\'
	449	if (escaped == (UChar32) -1) {
73c04bcf	450	return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status);
b75a7d8f A	451	}
b75a7d8f A	452	if (!parser.checkVariableRange(escaped)) {
73c04bcf	453	return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
b75a7d8f A	454	}
	455	buf.append(escaped);
	456	continue;
	457	}
	458	// Handle quoted matter
	459	if (c == QUOTE) {
	460	int32_t iq = rule.indexOf(QUOTE, pos);
	461	if (iq == pos) {
	462	buf.append(c); // Parse [''] outside quotes as [']
	463	++pos;
	464	} else {
	465	/* This loop picks up a run of quoted text of the
	466	* form 'aaaa' each time through. If this run
	467	* hasn't really ended ('aaaa''bbbb') then it keeps
	468	* looping, each time adding on a new run. When it
	469	* reaches the final quote it breaks.
	470	*/
	471	quoteStart = buf.length();
	472	for (;;) {
	473	if (iq < 0) {
73c04bcf	474	return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status);
b75a7d8f A	475	}
	476	scratch.truncate(0);
	477	rule.extractBetween(pos, iq, scratch);
	478	buf.append(scratch);
	479	pos = iq+1;
	480	if (pos < limit && rule.charAt(pos) == QUOTE) {
	481	// Parse [''] inside quotes as [']
	482	iq = rule.indexOf(QUOTE, pos+1);
	483	// Continue looping
	484	} else {
	485	break;
	486	}
	487	}
	488	quoteLimit = buf.length();
	489
	490	for (iq=quoteStart; iq<quoteLimit; ++iq) {
	491	if (!parser.checkVariableRange(buf.charAt(iq))) {
73c04bcf	492	return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
b75a7d8f A	493	}
	494	}
	495	}
	496	continue;
	497	}
	498
	499	if (!parser.checkVariableRange(c)) {
73c04bcf	500	return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
b75a7d8f A	501	}
	502
	503	if (illegal.indexOf(c) >= 0) {
73c04bcf	504	syntaxError(U_ILLEGAL_CHARACTER, rule, start, status);
b75a7d8f A	505	}
	506
	507	switch (c) {
	508
	509	//------------------------------------------------------
	510	// Elements allowed within and out of segments
	511	//------------------------------------------------------
	512	case ANCHOR_START:
	513	if (buf.length() == 0 && !anchorStart) {
	514	anchorStart = TRUE;
	515	} else {
	516	return syntaxError(U_MISPLACED_ANCHOR_START,
73c04bcf	517	rule, start, status);
b75a7d8f A	518	}
	519	break;
	520	case SEGMENT_OPEN:
	521	{
	522	// bufSegStart is the offset in buf to the first
	523	// character of the segment we are parsing.
	524	int32_t bufSegStart = buf.length();
	525
	526	// Record segment number now, since nextSegmentNumber
	527	// will be incremented during the call to parseSection
	528	// if there are nested segments.
	529	int32_t segmentNumber = nextSegmentNumber++; // 1-based
	530
	531	// Parse the segment
4388f060	532	pos = parseSection(rule, pos, limit, buf, UnicodeString(TRUE, ILLEGAL_SEG, -1), TRUE, status);
b75a7d8f A	533
	534	// After parsing a segment, the relevant characters are
	535	// in buf, starting at offset bufSegStart. Extract them
	536	// into a string matcher, and replace them with a
	537	// standin for that matcher.
	538	StringMatcher* m =
	539	new StringMatcher(buf, bufSegStart, buf.length(),
73c04bcf	540	segmentNumber, *parser.curData);
46f4442e A	541	if (m == NULL) {
	542	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
	543	}
b75a7d8f A	544
b75a7d8f A	545	// Record and associate object and segment number
73c04bcf	546	parser.setSegmentObject(segmentNumber, m, status);
b75a7d8f	547	buf.truncate(bufSegStart);
73c04bcf	548	buf.append(parser.getSegmentStandin(segmentNumber, status));
b75a7d8f A	549	}
	550	break;
	551	case FUNCTION:
	552	case ALT_FUNCTION:
	553	{
	554	int32_t iref = pos;
	555	TransliteratorIDParser::SingleID* single =
	556	TransliteratorIDParser::parseFilterID(rule, iref);
	557	// The next character MUST be a segment open
	558	if (single == NULL \|\|
	559	!ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) {
73c04bcf	560	return syntaxError(U_INVALID_FUNCTION, rule, start, status);
b75a7d8f A	561	}
	562
	563	Transliterator *t = single->createInstance();
	564	delete single;
	565	if (t == NULL) {
73c04bcf	566	return syntaxError(U_INVALID_FUNCTION, rule, start, status);
b75a7d8f A	567	}
	568
	569	// bufSegStart is the offset in buf to the first
	570	// character of the segment we are parsing.
	571	int32_t bufSegStart = buf.length();
	572
	573	// Parse the segment
4388f060	574	pos = parseSection(rule, iref, limit, buf, UnicodeString(TRUE, ILLEGAL_FUNC, -1), TRUE, status);
b75a7d8f A	575
	576	// After parsing a segment, the relevant characters are
	577	// in buf, starting at offset bufSegStart.
	578	UnicodeString output;
	579	buf.extractBetween(bufSegStart, buf.length(), output);
	580	FunctionReplacer *r =
73c04bcf	581	new FunctionReplacer(t, new StringReplacer(output, parser.curData));
46f4442e A	582	if (r == NULL) {
	583	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
	584	}
b75a7d8f A	585
	586	// Replace the buffer contents with a stand-in
	587	buf.truncate(bufSegStart);
73c04bcf	588	buf.append(parser.generateStandInFor(r, status));
b75a7d8f A	589	}
	590	break;
	591	case SymbolTable::SYMBOL_REF:
	592	// Handle variable references and segment references "$1" .. "$9"
	593	{
	594	// A variable reference must be followed immediately
	595	// by a Unicode identifier start and zero or more
	596	// Unicode identifier part characters, or by a digit
	597	// 1..9 if it is a segment reference.
	598	if (pos == limit) {
	599	// A variable ref character at the end acts as
	600	// an anchor to the context limit, as in perl.
	601	anchorEnd = TRUE;
	602	break;
	603	}
	604	// Parse "$1" "$2" .. "$9" .. (no upper limit)
	605	c = rule.charAt(pos);
	606	int32_t r = u_digit(c, 10);
	607	if (r >= 1 && r <= 9) {
	608	r = ICU_Utility::parseNumber(rule, pos, 10);
	609	if (r < 0) {
	610	return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE,
73c04bcf	611	rule, start, status);
b75a7d8f	612	}
73c04bcf	613	buf.append(parser.getSegmentStandin(r, status));
b75a7d8f A	614	} else {
	615	pp.setIndex(pos);
	616	UnicodeString name = parser.parseData->
	617	parseReference(rule, pp, limit);
	618	if (name.length() == 0) {
	619	// This means the '$' was not followed by a
	620	// valid name. Try to interpret it as an
	621	// end anchor then. If this also doesn't work
	622	// (if we see a following character) then signal
	623	// an error.
	624	anchorEnd = TRUE;
	625	break;
	626	}
	627	pos = pp.getIndex();
	628	// If this is a variable definition statement,
	629	// then the LHS variable will be undefined. In
	630	// that case appendVariableDef() will append the
	631	// special placeholder char variableLimit-1.
	632	varStart = buf.length();
73c04bcf	633	parser.appendVariableDef(name, buf, status);
b75a7d8f A	634	varLimit = buf.length();
	635	}
	636	}
	637	break;
	638	case DOT:
73c04bcf	639	buf.append(parser.getDotStandIn(status));
b75a7d8f A	640	break;
	641	case KLEENE_STAR:
	642	case ONE_OR_MORE:
	643	case ZERO_OR_ONE:
	644	// Quantifiers. We handle single characters, quoted strings,
	645	// variable references, and segments.
	646	// a+ matches aaa
	647	// 'foo'+ matches foofoofoo
	648	// $v+ matches xyxyxy if $v == xy
	649	// (seg)+ matches segsegseg
	650	{
	651	if (isSegment && buf.length() == bufStart) {
	652	// The */+ immediately follows '('
73c04bcf	653	return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status);
b75a7d8f A	654	}
	655
	656	int32_t qstart, qlimit;
	657	// The */+ follows an isolated character or quote
	658	// or variable reference
	659	if (buf.length() == quoteLimit) {
	660	// The */+ follows a 'quoted string'
	661	qstart = quoteStart;
	662	qlimit = quoteLimit;
	663	} else if (buf.length() == varLimit) {
	664	// The */+ follows a $variableReference
	665	qstart = varStart;
	666	qlimit = varLimit;
	667	} else {
	668	// The */+ follows a single character, possibly
	669	// a segment standin
	670	qstart = buf.length() - 1;
	671	qlimit = qstart + 1;
	672	}
	673
	674	UnicodeFunctor *m =
73c04bcf	675	new StringMatcher(buf, qstart, qlimit, 0, *parser.curData);
46f4442e A	676	if (m == NULL) {
	677	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
	678	}
b75a7d8f A	679	int32_t min = 0;
	680	int32_t max = Quantifier::MAX;
	681	switch (c) {
	682	case ONE_OR_MORE:
	683	min = 1;
	684	break;
	685	case ZERO_OR_ONE:
	686	min = 0;
	687	max = 1;
	688	break;
	689	// case KLEENE_STAR:
	690	// do nothing -- min, max already set
	691	}
	692	m = new Quantifier(m, min, max);
46f4442e A	693	if (m == NULL) {
	694	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
	695	}
b75a7d8f	696	buf.truncate(qstart);
73c04bcf	697	buf.append(parser.generateStandInFor(m, status));
b75a7d8f A	698	}
	699	break;
	700
	701	//------------------------------------------------------
	702	// Elements allowed ONLY WITHIN segments
	703	//------------------------------------------------------
	704	case SEGMENT_CLOSE:
	705	// assert(isSegment);
	706	// We're done parsing a segment.
	707	done = TRUE;
	708	break;
	709
	710	//------------------------------------------------------
	711	// Elements allowed ONLY OUTSIDE segments
	712	//------------------------------------------------------
	713	case CONTEXT_ANTE:
	714	if (ante >= 0) {
73c04bcf	715	return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status);
b75a7d8f A	716	}
	717	ante = buf.length();
	718	break;
	719	case CONTEXT_POST:
	720	if (post >= 0) {
73c04bcf	721	return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status);
b75a7d8f A	722	}
	723	post = buf.length();
	724	break;
	725	case CURSOR_POS:
	726	if (cursor >= 0) {
73c04bcf	727	return syntaxError(U_MULTIPLE_CURSORS, rule, start, status);
b75a7d8f A	728	}
	729	cursor = buf.length();
	730	break;
	731	case CURSOR_OFFSET:
	732	if (cursorOffset < 0) {
	733	if (buf.length() > 0) {
73c04bcf	734	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
b75a7d8f A	735	}
	736	--cursorOffset;
	737	} else if (cursorOffset > 0) {
	738	if (buf.length() != cursorOffsetPos \|\| cursor >= 0) {
73c04bcf	739	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
b75a7d8f A	740	}
	741	++cursorOffset;
	742	} else {
	743	if (cursor == 0 && buf.length() == 0) {
	744	cursorOffset = -1;
	745	} else if (cursor < 0) {
	746	cursorOffsetPos = buf.length();
	747	cursorOffset = 1;
	748	} else {
73c04bcf	749	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
b75a7d8f A	750	}
	751	}
	752	break;
	753
	754
	755	//------------------------------------------------------
	756	// Non-special characters
	757	//------------------------------------------------------
	758	default:
	759	// Disallow unquoted characters other than [0-9A-Za-z]
	760	// in the printable ASCII range. These characters are
	761	// reserved for possible future use.
	762	if (c >= 0x0021 && c <= 0x007E &&
	763	!((c >= 0x0030/'0'/ && c <= 0x0039/'9'/) \|\|
	764	(c >= 0x0041/'A'/ && c <= 0x005A/'Z'/) \|\|
	765	(c >= 0x0061/'a'/ && c <= 0x007A/'z'/))) {
73c04bcf	766	return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
b75a7d8f A	767	}
	768	buf.append(c);
	769	break;
	770	}
	771	}
	772
	773	return pos;
	774	}
	775
	776	/**
	777	* Remove context.
	778	*/
	779	void RuleHalf::removeContext() {
	780	//text = text.substring(ante < 0 ? 0 : ante,
	781	// post < 0 ? text.length() : post);
	782	if (post >= 0) {
	783	text.remove(post);
	784	}
	785	if (ante >= 0) {
	786	text.removeBetween(0, ante);
	787	}
	788	ante = post = -1;
	789	anchorStart = anchorEnd = FALSE;
	790	}
	791
	792	/**
	793	* Return true if this half looks like valid output, that is, does not
	794	* contain quantifiers or other special input-only elements.
	795	*/
	796	UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {
	797	for (int32_t i=0; i<text.length(); ) {
	798	UChar32 c = text.char32At(i);
4388f060	799	i += U16_LENGTH(c);
b75a7d8f A	800	if (!transParser.parseData->isReplacer(c)) {
	801	return FALSE;
	802	}
	803	}
	804	return TRUE;
	805	}
	806
	807	/**
	808	* Return true if this half looks like valid input, that is, does not
	809	* contain functions or other special output-only elements.
	810	*/
	811	UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {
	812	for (int32_t i=0; i<text.length(); ) {
	813	UChar32 c = text.char32At(i);
4388f060	814	i += U16_LENGTH(c);
b75a7d8f A	815	if (!transParser.parseData->isMatcher(c)) {
	816	return FALSE;
	817	}
	818	}
	819	return TRUE;
	820	}
	821
	822	//----------------------------------------------------------------------
	823	// PUBLIC API
	824	//----------------------------------------------------------------------
	825
	826	/**
	827	* Constructor.
	828	*/
73c04bcf A	829	TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) :
	830	dataVector(statusReturn),
	831	idBlockVector(statusReturn),
	832	variablesVector(statusReturn),
	833	segmentObjects(statusReturn)
	834	{
4388f060	835	idBlockVector.setDeleter(uprv_deleteUObject);
73c04bcf	836	curData = NULL;
b75a7d8f A	837	compoundFilter = NULL;
b75a7d8f A	838	parseData = NULL;
4388f060	839	variableNames.setValueDeleter(uprv_deleteUObject);
b75a7d8f A	840	}
	841
	842	/**
	843	* Destructor.
	844	*/
	845	TransliteratorParser::~TransliteratorParser() {
73c04bcf A	846	while (!dataVector.isEmpty())
73c04bcf A	847	delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));
b75a7d8f A	848	delete compoundFilter;
b75a7d8f A	849	delete parseData;
73c04bcf A	850	while (!variablesVector.isEmpty())
73c04bcf A	851	delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);
b75a7d8f A	852	}
	853
	854	void
	855	TransliteratorParser::parse(const UnicodeString& rules,
	856	UTransDirection transDirection,
	857	UParseError& pe,
	858	UErrorCode& ec) {
	859	if (U_SUCCESS(ec)) {
73c04bcf	860	parseRules(rules, transDirection, ec);
b75a7d8f	861	pe = parseError;
b75a7d8f A	862	}
	863	}
	864
	865	/**
	866	* Return the compound filter parsed by parse(). Caller owns result.
	867	*/
	868	UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
	869	UnicodeSet* f = compoundFilter;
	870	compoundFilter = NULL;
	871	return f;
	872	}
	873
b75a7d8f A	874	//----------------------------------------------------------------------
	875	// Private implementation
	876	//----------------------------------------------------------------------
	877
	878	/**
	879	* Parse the given string as a sequence of rules, separated by newline
	880	* characters ('\n'), and cause this object to implement those rules. Any
	881	* previous rules are discarded. Typically this method is called exactly
	882	* once, during construction.
	883	* @exception IllegalArgumentException if there is a syntax error in the
	884	* rules
	885	*/
	886	void TransliteratorParser::parseRules(const UnicodeString& rule,
73c04bcf	887	UTransDirection theDirection,
46f4442e A	888	UErrorCode& status)
46f4442e A	889	{
b75a7d8f	890	// Clear error struct
46f4442e	891	uprv_memset(&parseError, 0, sizeof(parseError));
b75a7d8f	892	parseError.line = parseError.offset = -1;
b75a7d8f	893
73c04bcf A	894	UBool parsingIDs = TRUE;
	895	int32_t ruleCount = 0;
	896
	897	while (!dataVector.isEmpty()) {
	898	delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));
	899	}
b75a7d8f A	900	if (U_FAILURE(status)) {
	901	return;
	902	}
	903
73c04bcf A	904	idBlockVector.removeAllElements();
73c04bcf A	905	curData = NULL;
b75a7d8f A	906	direction = theDirection;
	907	ruleCount = 0;
	908
	909	delete compoundFilter;
	910	compoundFilter = NULL;
	911
73c04bcf A	912	while (!variablesVector.isEmpty()) {
73c04bcf A	913	delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);
b75a7d8f	914	}
73c04bcf A	915	variableNames.removeAll();
73c04bcf A	916	parseData = new ParseData(0, &variablesVector, &variableNames);
b75a7d8f A	917	if (parseData == NULL) {
	918	status = U_MEMORY_ALLOCATION_ERROR;
	919	return;
	920	}
b75a7d8f	921
b75a7d8f A	922	dotStandIn = (UChar) -1;
b75a7d8f A	923
46f4442e	924	UnicodeString *tempstr = NULL; // used for memory allocation error checking
b75a7d8f	925	UnicodeString str; // scratch
73c04bcf	926	UnicodeString idBlockResult;
b75a7d8f A	927	int32_t pos = 0;
b75a7d8f A	928	int32_t limit = rule.length();
b75a7d8f A	929
	930	// The compound filter offset is an index into idBlockResult.
	931	// If it is 0, then the compound filter occurred at the start,
	932	// and it is the offset to the _start_ of the compound filter
	933	// pattern. Otherwise it is the offset to the _limit_ of the
	934	// compound filter pattern within idBlockResult.
	935	compoundFilter = NULL;
	936	int32_t compoundFilterOffset = -1;
	937
b75a7d8f A	938	while (pos < limit && U_SUCCESS(status)) {
b75a7d8f A	939	UChar c = rule.charAt(pos++);
4388f060	940	if (PatternProps::isWhiteSpace(c)) {
b75a7d8f A	941	// Ignore leading whitespace.
	942	continue;
	943	}
	944	// Skip lines starting with the comment character
	945	if (c == RULE_COMMENT_CHAR) {
	946	pos = rule.indexOf((UChar)0x000A /\n/, pos) + 1;
	947	if (pos == 0) {
	948	break; // No "\n" found; rest of rule is a commnet
	949	}
	950	continue; // Either fall out or restart with next line
	951	}
73c04bcf A	952
	953	// skip empty rules
	954	if (c == END_OF_RULE)
	955	continue;
	956
	957	// keep track of how many rules we've seen
	958	++ruleCount;
	959
b75a7d8f A	960	// We've found the start of a rule or ID. c is its first
	961	// character, and pos points past c.
	962	--pos;
	963	// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
	964	// chars left.
	965	if ((pos + ID_TOKEN_LEN + 1) <= limit &&
73c04bcf	966	rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
b75a7d8f A	967	pos += ID_TOKEN_LEN;
b75a7d8f A	968	c = rule.charAt(pos);
4388f060	969	while (PatternProps::isWhiteSpace(c) && pos < limit) {
b75a7d8f A	970	++pos;
	971	c = rule.charAt(pos);
	972	}
	973
b75a7d8f A	974	int32_t p = pos;
b75a7d8f A	975
73c04bcf A	976	if (!parsingIDs) {
	977	if (curData != NULL) {
	978	if (direction == UTRANS_FORWARD)
	979	dataVector.addElement(curData, status);
	980	else
	981	dataVector.insertElementAt(curData, 0, status);
	982	curData = NULL;
	983	}
	984	parsingIDs = TRUE;
	985	}
	986
b75a7d8f	987	TransliteratorIDParser::SingleID* id =
374ca955	988	TransliteratorIDParser::parseSingleID(rule, p, direction, status);
b75a7d8f A	989	if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) {
b75a7d8f A	990	// Successful ::ID parse.
73c04bcf	991
b75a7d8f	992	if (direction == UTRANS_FORWARD) {
73c04bcf	993	idBlockResult.append(id->canonID).append(END_OF_RULE);
b75a7d8f	994	} else {
73c04bcf A	995	idBlockResult.insert(0, END_OF_RULE);
73c04bcf A	996	idBlockResult.insert(0, id->canonID);
b75a7d8f	997	}
73c04bcf	998
b75a7d8f A	999	} else {
	1000	// Couldn't parse an ID. Try to parse a global filter
	1001	int32_t withParens = -1;
73c04bcf	1002	UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL);
b75a7d8f A	1003	if (f != NULL) {
	1004	if (ICU_Utility::parseChar(rule, p, END_OF_RULE)
	1005	&& (direction == UTRANS_FORWARD) == (withParens == 0))
	1006	{
	1007	if (compoundFilter != NULL) {
	1008	// Multiple compound filters
73c04bcf	1009	syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status);
b75a7d8f A	1010	delete f;
	1011	} else {
	1012	compoundFilter = f;
73c04bcf	1013	compoundFilterOffset = ruleCount;
b75a7d8f A	1014	}
	1015	} else {
	1016	delete f;
	1017	}
	1018	} else {
	1019	// Invalid ::id
	1020	// Can be parsed as neither an ID nor a global filter
73c04bcf	1021	syntaxError(U_INVALID_ID, rule, pos, status);
b75a7d8f A	1022	}
	1023	}
	1024	delete id;
b75a7d8f	1025	pos = p;
b75a7d8f	1026	} else {
73c04bcf	1027	if (parsingIDs) {
46f4442e A	1028	tempstr = new UnicodeString(idBlockResult);
	1029	// NULL pointer check
	1030	if (tempstr == NULL) {
	1031	status = U_MEMORY_ALLOCATION_ERROR;
	1032	return;
	1033	}
73c04bcf	1034	if (direction == UTRANS_FORWARD)
46f4442e	1035	idBlockVector.addElement(tempstr, status);
73c04bcf	1036	else
46f4442e	1037	idBlockVector.insertElementAt(tempstr, 0, status);
73c04bcf A	1038	idBlockResult.remove();
	1039	parsingIDs = FALSE;
	1040	curData = new TransliterationRuleData(status);
46f4442e A	1041	// NULL pointer check
	1042	if (curData == NULL) {
	1043	status = U_MEMORY_ALLOCATION_ERROR;
	1044	return;
	1045	}
73c04bcf A	1046	parseData->data = curData;
	1047
	1048	// By default, rules use part of the private use area
	1049	// E000..F8FF for variables and other stand-ins. Currently
	1050	// the range F000..F8FF is typically sufficient. The 'use
	1051	// variable range' pragma allows rule sets to modify this.
	1052	setVariableRange(0xF000, 0xF8FF, status);
	1053	}
	1054
	1055	if (resemblesPragma(rule, pos, limit)) {
	1056	int32_t ppp = parsePragma(rule, pos, limit, status);
	1057	if (ppp < 0) {
	1058	syntaxError(U_MALFORMED_PRAGMA, rule, pos, status);
b75a7d8f	1059	}
73c04bcf A	1060	pos = ppp;
	1061	// Parse a rule
	1062	} else {
	1063	pos = parseRule(rule, pos, limit, status);
b75a7d8f	1064	}
b75a7d8f A	1065	}
b75a7d8f A	1066	}
b75a7d8f	1067
73c04bcf	1068	if (parsingIDs && idBlockResult.length() > 0) {
46f4442e A	1069	tempstr = new UnicodeString(idBlockResult);
	1070	// NULL pointer check
	1071	if (tempstr == NULL) {
	1072	status = U_MEMORY_ALLOCATION_ERROR;
	1073	return;
	1074	}
73c04bcf	1075	if (direction == UTRANS_FORWARD)
46f4442e	1076	idBlockVector.addElement(tempstr, status);
73c04bcf	1077	else
46f4442e	1078	idBlockVector.insertElementAt(tempstr, 0, status);
b75a7d8f	1079	}
73c04bcf A	1080	else if (!parsingIDs && curData != NULL) {
	1081	if (direction == UTRANS_FORWARD)
	1082	dataVector.addElement(curData, status);
	1083	else
	1084	dataVector.insertElementAt(curData, 0, status);
b75a7d8f	1085	}
73c04bcf	1086
b75a7d8f	1087	if (U_SUCCESS(status)) {
73c04bcf A	1088	// Convert the set vector to an array
	1089	int32_t i, dataVectorSize = dataVector.size();
	1090	for (i = 0; i < dataVectorSize; i++) {
	1091	TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
	1092	data->variablesLength = variablesVector.size();
	1093	if (data->variablesLength == 0) {
	1094	data->variables = 0;
	1095	} else {
	1096	data->variables = (UnicodeFunctor*)uprv_malloc(data->variablesLength sizeof(UnicodeFunctor*));
46f4442e A	1097	// NULL pointer check
	1098	if (data->variables == NULL) {
	1099	status = U_MEMORY_ALLOCATION_ERROR;
	1100	return;
	1101	}
73c04bcf A	1102	data->variablesAreOwned = (i == 0);
	1103	}
	1104
	1105	for (int32_t j = 0; j < data->variablesLength; j++) {
	1106	data->variables[j] =
2ca993e8	1107	static_cast<UnicodeFunctor *>(variablesVector.elementAt(j));
73c04bcf A	1108	}
	1109
	1110	data->variableNames.removeAll();
3d1f044b A	1111	int32_t p = UHASH_FIRST;
3d1f044b A	1112	const UHashElement* he = variableNames.nextElement(p);
73c04bcf	1113	while (he != NULL) {
340931cb	1114	UnicodeString* tempus = ((UnicodeString*)(he->value.pointer))->clone();
46f4442e A	1115	if (tempus == NULL) {
	1116	status = U_MEMORY_ALLOCATION_ERROR;
	1117	return;
	1118	}
73c04bcf	1119	data->variableNames.put(((UnicodeString)(he->key.pointer)),
46f4442e	1120	tempus, status);
3d1f044b	1121	he = variableNames.nextElement(p);
73c04bcf A	1122	}
	1123	}
	1124	variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed
	1125
	1126	// Index the rules
b75a7d8f	1127	if (compoundFilter != NULL) {
73c04bcf A	1128	if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) \|\|
73c04bcf A	1129	(direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) {
b75a7d8f A	1130	status = U_MISPLACED_COMPOUND_FILTER;
	1131	}
	1132	}
	1133
73c04bcf A	1134	for (i = 0; i < dataVectorSize; i++) {
	1135	TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
	1136	data->ruleSet.freeze(parseError, status);
b75a7d8f	1137	}
73c04bcf A	1138	if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) {
73c04bcf A	1139	idBlockVector.removeElementAt(0);
b75a7d8f A	1140	}
	1141	}
	1142	}
	1143
	1144	/**
	1145	* Set the variable range to [start, end] (inclusive).
	1146	*/
73c04bcf	1147	void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) {
b75a7d8f A	1148	if (start > end \|\| start < 0 \|\| end > 0xFFFF) {
	1149	status = U_MALFORMED_PRAGMA;
	1150	return;
	1151	}
	1152
73c04bcf A	1153	curData->variablesBase = (UChar) start;
	1154	if (dataVector.size() == 0) {
	1155	variableNext = (UChar) start;
	1156	variableLimit = (UChar) (end + 1);
	1157	}
b75a7d8f A	1158	}
	1159
	1160	/**
	1161	* Assert that the given character is NOT within the variable range.
	1162	* If it is, return FALSE. This is neccesary to ensure that the
	1163	* variable range does not overlap characters used in a rule.
	1164	*/
	1165	UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {
73c04bcf	1166	return !(ch >= curData->variablesBase && ch < variableLimit);
b75a7d8f A	1167	}
	1168
	1169	/**
	1170	* Set the maximum backup to 'backup', in response to a pragma
	1171	* statement.
	1172	*/
374ca955	1173	void TransliteratorParser::pragmaMaximumBackup(int32_t /backup/) {
b75a7d8f A	1174	//TODO Finish
	1175	}
	1176
	1177	/**
	1178	* Begin normalizing all rules using the given mode, in response
	1179	* to a pragma statement.
	1180	*/
374ca955	1181	void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /mode/) {
b75a7d8f A	1182	//TODO Finish
	1183	}
	1184
	1185	static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use "
	1186
	1187	static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;"
	1188
	1189	static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;"
	1190
	1191	static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;"
	1192
	1193	static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;"
	1194
	1195	/**
	1196	* Return true if the given rule looks like a pragma.
	1197	* @param pos offset to the first non-whitespace character
	1198	* of the rule.
	1199	* @param limit pointer past the last character of the rule.
	1200	*/
	1201	UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) {
	1202	// Must start with /use\s/i
4388f060	1203	return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_USE, 4), NULL) >= 0;
b75a7d8f A	1204	}
	1205
	1206	/**
	1207	* Parse a pragma. This method assumes resemblesPragma() has
	1208	* already returned true.
	1209	* @param pos offset to the first non-whitespace character
	1210	* of the rule.
	1211	* @param limit pointer past the last character of the rule.
	1212	* @return the position index after the final ';' of the pragma,
	1213	* or -1 on failure.
	1214	*/
73c04bcf	1215	int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
b75a7d8f A	1216	int32_t array[2];
	1217
	1218	// resemblesPragma() has already returned true, so we
	1219	// know that pos points to /use\s/i; we can skip 4 characters
	1220	// immediately
	1221	pos += 4;
	1222
	1223	// Here are the pragmas we recognize:
	1224	// use variable range 0xE000 0xEFFF;
	1225	// use maximum backup 16;
	1226	// use nfd rules;
	1227	// use nfc rules;
4388f060	1228	int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_VARIABLE_RANGE, -1), array);
b75a7d8f	1229	if (p >= 0) {
73c04bcf	1230	setVariableRange(array[0], array[1], status);
b75a7d8f A	1231	return p;
	1232	}
	1233
4388f060	1234	p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_MAXIMUM_BACKUP, -1), array);
b75a7d8f A	1235	if (p >= 0) {
	1236	pragmaMaximumBackup(array[0]);
	1237	return p;
	1238	}
	1239
4388f060	1240	p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFD_RULES, -1), NULL);
b75a7d8f A	1241	if (p >= 0) {
	1242	pragmaNormalizeRules(UNORM_NFD);
	1243	return p;
	1244	}
	1245
4388f060	1246	p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFC_RULES, -1), NULL);
b75a7d8f A	1247	if (p >= 0) {
	1248	pragmaNormalizeRules(UNORM_NFC);
	1249	return p;
	1250	}
	1251
	1252	// Syntax error: unable to parse pragma
	1253	return -1;
	1254	}
	1255
	1256	/**
	1257	* MAIN PARSER. Parse the next rule in the given rule string, starting
	1258	* at pos. Return the index after the last character parsed. Do not
	1259	* parse characters at or after limit.
	1260	*
	1261	* Important: The character at pos must be a non-whitespace character
	1262	* that is not the comment character.
	1263	*
	1264	* This method handles quoting, escaping, and whitespace removal. It
	1265	* parses the end-of-rule character. It recognizes context and cursor
	1266	* indicators. Once it does a lexical breakdown of the rule at pos, it
	1267	* creates a rule object and adds it to our rule list.
	1268	*/
73c04bcf	1269	int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
b75a7d8f A	1270	// Locate the left side, operator, and right side
	1271	int32_t start = pos;
	1272	UChar op = 0;
	1273	int32_t i;
	1274
	1275	// Set up segments data
	1276	segmentStandins.truncate(0);
73c04bcf	1277	segmentObjects.removeAllElements();
b75a7d8f A	1278
	1279	// Use pointers to automatics to make swapping possible.
	1280	RuleHalf _left(this), _right(this);
	1281	RuleHalf* left = &_left;
	1282	RuleHalf* right = &_right;
	1283
	1284	undefinedVariableName.remove();
73c04bcf	1285	pos = left->parse(rule, pos, limit, status);
b75a7d8f A	1286	if (U_FAILURE(status)) {
	1287	return start;
	1288	}
	1289
	1290	if (pos == limit \|\| u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
73c04bcf	1291	return syntaxError(U_MISSING_OPERATOR, rule, start, status);
b75a7d8f A	1292	}
	1293	++pos;
	1294
	1295	// Found an operator char. Check for forward-reverse operator.
	1296	if (op == REVERSE_RULE_OP &&
	1297	(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
	1298	++pos;
	1299	op = FWDREV_RULE_OP;
	1300	}
	1301
	1302	// Translate alternate op characters.
	1303	switch (op) {
	1304	case ALT_FORWARD_RULE_OP:
	1305	op = FORWARD_RULE_OP;
	1306	break;
	1307	case ALT_REVERSE_RULE_OP:
	1308	op = REVERSE_RULE_OP;
	1309	break;
	1310	case ALT_FWDREV_RULE_OP:
	1311	op = FWDREV_RULE_OP;
	1312	break;
	1313	}
	1314
73c04bcf	1315	pos = right->parse(rule, pos, limit, status);
b75a7d8f A	1316	if (U_FAILURE(status)) {
	1317	return start;
	1318	}
	1319
	1320	if (pos < limit) {
	1321	if (rule.charAt(--pos) == END_OF_RULE) {
	1322	++pos;
	1323	} else {
	1324	// RuleHalf parser must have terminated at an operator
73c04bcf	1325	return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
b75a7d8f A	1326	}
	1327	}
	1328
	1329	if (op == VARIABLE_DEF_OP) {
	1330	// LHS is the name. RHS is a single character, either a literal
	1331	// or a set (already parsed). If RHS is longer than one
	1332	// character, it is either a multi-character string, or multiple
	1333	// sets, or a mixture of chars and sets -- syntax error.
	1334
	1335	// We expect to see a single undefined variable (the one being
	1336	// defined).
	1337	if (undefinedVariableName.length() == 0) {
	1338	// "Missing '$' or duplicate definition"
73c04bcf	1339	return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status);
b75a7d8f A	1340	}
	1341	if (left->text.length() != 1 \|\| left->text.charAt(0) != variableLimit) {
	1342	// "Malformed LHS"
73c04bcf	1343	return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
b75a7d8f A	1344	}
	1345	if (left->anchorStart \|\| left->anchorEnd \|\|
	1346	right->anchorStart \|\| right->anchorEnd) {
73c04bcf	1347	return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
b75a7d8f A	1348	}
	1349	// We allow anything on the right, including an empty string.
	1350	UnicodeString* value = new UnicodeString(right->text);
46f4442e A	1351	// NULL pointer check
	1352	if (value == NULL) {
	1353	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
	1354	}
73c04bcf	1355	variableNames.put(undefinedVariableName, value, status);
b75a7d8f A	1356	++variableLimit;
	1357	return pos;
	1358	}
	1359
	1360	// If this is not a variable definition rule, we shouldn't have
	1361	// any undefined variable names.
	1362	if (undefinedVariableName.length() != 0) {
	1363	return syntaxError(// "Undefined variable $" + undefinedVariableName,
	1364	U_UNDEFINED_VARIABLE,
73c04bcf	1365	rule, start, status);
b75a7d8f A	1366	}
	1367
	1368	// Verify segments
73c04bcf A	1369	if (segmentStandins.length() > segmentObjects.size()) {
73c04bcf A	1370	syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status);
b75a7d8f A	1371	}
	1372	for (i=0; i<segmentStandins.length(); ++i) {
	1373	if (segmentStandins.charAt(i) == 0) {
73c04bcf	1374	syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
b75a7d8f A	1375	}
b75a7d8f A	1376	}
73c04bcf A	1377	for (i=0; i<segmentObjects.size(); ++i) {
	1378	if (segmentObjects.elementAt(i) == NULL) {
	1379	syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
b75a7d8f A	1380	}
	1381	}
	1382
	1383	// If the direction we want doesn't match the rule
	1384	// direction, do nothing.
	1385	if (op != FWDREV_RULE_OP &&
	1386	((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) {
	1387	return pos;
	1388	}
	1389
	1390	// Transform the rule into a forward rule by swapping the
	1391	// sides if necessary.
	1392	if (direction == UTRANS_REVERSE) {
	1393	left = &_right;
	1394	right = &_left;
	1395	}
	1396
	1397	// Remove non-applicable elements in forward-reverse
	1398	// rules. Bidirectional rules ignore elements that do not
	1399	// apply.
	1400	if (op == FWDREV_RULE_OP) {
	1401	right->removeContext();
	1402	left->cursor = -1;
	1403	left->cursorOffset = 0;
	1404	}
	1405
	1406	// Normalize context
	1407	if (left->ante < 0) {
	1408	left->ante = 0;
	1409	}
	1410	if (left->post < 0) {
	1411	left->post = left->text.length();
	1412	}
	1413
	1414	// Context is only allowed on the input side. Cursors are only
	1415	// allowed on the output side. Segment delimiters can only appear
	1416	// on the left, and references on the right. Cursor offset
	1417	// cannot appear without an explicit cursor. Cursor offset
	1418	// cannot place the cursor outside the limits of the context.
	1419	// Anchors are only allowed on the input side.
	1420	if (right->ante >= 0 \|\| right->post >= 0 \|\| left->cursor >= 0 \|\|
	1421	(right->cursorOffset != 0 && right->cursor < 0) \|\|
	1422	// - The following two checks were used to ensure that the
	1423	// - the cursor offset stayed within the ante- or postcontext.
	1424	// - However, with the addition of quantifiers, we have to
	1425	// - allow arbitrary cursor offsets and do runtime checking.
	1426	//(right->cursorOffset > (left->text.length() - left->post)) \|\|
	1427	//(-right->cursorOffset > left->ante) \|\|
	1428	right->anchorStart \|\| right->anchorEnd \|\|
	1429	!left->isValidInput(this) \|\| !right->isValidOutput(this) \|\|
	1430	left->ante > left->post) {
	1431
73c04bcf	1432	return syntaxError(U_MALFORMED_RULE, rule, start, status);
b75a7d8f A	1433	}
	1434
	1435	// Flatten segment objects vector to an array
	1436	UnicodeFunctor** segmentsArray = NULL;
73c04bcf A	1437	if (segmentObjects.size() > 0) {
73c04bcf A	1438	segmentsArray = (UnicodeFunctor *)uprv_malloc(segmentObjects.size() sizeof(UnicodeFunctor *));
46f4442e A	1439	// Null pointer check
	1440	if (segmentsArray == NULL) {
	1441	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
	1442	}
73c04bcf	1443	segmentObjects.toArray((void**) segmentsArray);
b75a7d8f	1444	}
46f4442e A	1445	TransliterationRule* temptr = new TransliterationRule(
	1446	left->text, left->ante, left->post,
	1447	right->text, right->cursor, right->cursorOffset,
	1448	segmentsArray,
	1449	segmentObjects.size(),
	1450	left->anchorStart, left->anchorEnd,
	1451	curData,
	1452	status);
	1453	//Null pointer check
	1454	if (temptr == NULL) {
	1455	uprv_free(segmentsArray);
	1456	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
	1457	}
b75a7d8f	1458
46f4442e	1459	curData->ruleSet.addRule(temptr, status);
b75a7d8f A	1460
	1461	return pos;
	1462	}
	1463
	1464	/**
	1465	* Called by main parser upon syntax error. Search the rule string
	1466	* for the probable end of the rule. Of course, if the error is that
	1467	* the end of rule marker is missing, then the rule end will not be found.
	1468	* In any case the rule start will be correctly reported.
	1469	* @param msg error description
	1470	* @param rule pattern string
	1471	* @param start position of first character of current rule
	1472	*/
	1473	int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
73c04bcf A	1474	const UnicodeString& rule,
	1475	int32_t pos,
	1476	UErrorCode& status)
	1477	{
b75a7d8f A	1478	parseError.offset = pos;
	1479	parseError.line = 0 ; /* we are not using line numbers */
	1480
	1481	// for pre-context
	1482	const int32_t LEN = U_PARSE_CONTEXT_LEN - 1;
	1483	int32_t start = uprv_max(pos - LEN, 0);
	1484	int32_t stop = pos;
	1485
	1486	rule.extract(start,stop-start,parseError.preContext);
	1487	//null terminate the buffer
	1488	parseError.preContext[stop-start] = 0;
	1489
	1490	//for post-context
	1491	start = pos;
	1492	stop = uprv_min(pos + LEN, rule.length());
	1493
	1494	rule.extract(start,stop-start,parseError.postContext);
	1495	//null terminate the buffer
	1496	parseError.postContext[stop-start]= 0;
	1497
	1498	status = (UErrorCode)parseErrorCode;
	1499	return pos;
	1500
	1501	}
	1502
	1503	/**
	1504	* Parse a UnicodeSet out, store it, and return the stand-in character
	1505	* used to represent it.
	1506	*/
	1507	UChar TransliteratorParser::parseSet(const UnicodeString& rule,
73c04bcf A	1508	ParsePosition& pos,
73c04bcf A	1509	UErrorCode& status) {
374ca955	1510	UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status);
46f4442e A	1511	// Null pointer check
	1512	if (set == NULL) {
	1513	status = U_MEMORY_ALLOCATION_ERROR;
	1514	return (UChar)0x0000; // Return empty character with error.
	1515	}
b75a7d8f	1516	set->compact();
73c04bcf	1517	return generateStandInFor(set, status);
b75a7d8f A	1518	}
	1519
	1520	/**
	1521	* Generate and return a stand-in for a new UnicodeFunctor. Store
	1522	* the matcher (adopt it).
	1523	*/
73c04bcf	1524	UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) {
b75a7d8f A	1525	// assert(obj != null);
	1526
	1527	// Look up previous stand-in, if any. This is a short list
	1528	// (typical n is 0, 1, or 2); linear search is optimal.
73c04bcf A	1529	for (int32_t i=0; i<variablesVector.size(); ++i) {
	1530	if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison
	1531	return (UChar) (curData->variablesBase + i);
b75a7d8f A	1532	}
	1533	}
	1534
	1535	if (variableNext >= variableLimit) {
	1536	delete adopted;
	1537	status = U_VARIABLE_RANGE_EXHAUSTED;
	1538	return 0;
	1539	}
73c04bcf	1540	variablesVector.addElement(adopted, status);
b75a7d8f A	1541	return variableNext++;
	1542	}
	1543
	1544	/**
	1545	* Return the standin for segment seg (1-based).
	1546	*/
73c04bcf	1547	UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) {
b75a7d8f	1548	// Special character used to indicate an empty spot
73c04bcf	1549	UChar empty = curData->variablesBase - 1;
b75a7d8f A	1550	while (segmentStandins.length() < seg) {
	1551	segmentStandins.append(empty);
	1552	}
	1553	UChar c = segmentStandins.charAt(seg-1);
	1554	if (c == empty) {
	1555	if (variableNext >= variableLimit) {
	1556	status = U_VARIABLE_RANGE_EXHAUSTED;
	1557	return 0;
	1558	}
	1559	c = variableNext++;
	1560	// Set a placeholder in the master variables vector that will be
	1561	// filled in later by setSegmentObject(). We know that we will get
	1562	// called first because setSegmentObject() will call us.
73c04bcf	1563	variablesVector.addElement((void*) NULL, status);
b75a7d8f A	1564	segmentStandins.setCharAt(seg-1, c);
	1565	}
	1566	return c;
	1567	}
	1568
	1569	/**
	1570	* Set the object for segment seg (1-based).
	1571	*/
73c04bcf	1572	void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) {
b75a7d8f A	1573	// Since we call parseSection() recursively, nested
	1574	// segments will result in segment i+1 getting parsed
	1575	// and stored before segment i; be careful with the
	1576	// vector handling here.
73c04bcf	1577	if (segmentObjects.size() < seg) {
46f4442e	1578	segmentObjects.setSize(seg, status);
b75a7d8f	1579	}
73c04bcf A	1580	int32_t index = getSegmentStandin(seg, status) - curData->variablesBase;
	1581	if (segmentObjects.elementAt(seg-1) != NULL \|\|
	1582	variablesVector.elementAt(index) != NULL) {
b75a7d8f A	1583	// should never happen
	1584	status = U_INTERNAL_TRANSLITERATOR_ERROR;
	1585	return;
	1586	}
73c04bcf A	1587	segmentObjects.setElementAt(adopted, seg-1);
73c04bcf A	1588	variablesVector.setElementAt(adopted, index);
b75a7d8f A	1589	}
	1590
	1591	/**
	1592	* Return the stand-in for the dot set. It is allocated the first
	1593	* time and reused thereafter.
	1594	*/
73c04bcf	1595	UChar TransliteratorParser::getDotStandIn(UErrorCode& status) {
b75a7d8f	1596	if (dotStandIn == (UChar) -1) {
4388f060	1597	UnicodeSet* tempus = new UnicodeSet(UnicodeString(TRUE, DOT_SET, -1), status);
46f4442e A	1598	// Null pointer check.
	1599	if (tempus == NULL) {
	1600	status = U_MEMORY_ALLOCATION_ERROR;
	1601	return (UChar)0x0000;
	1602	}
	1603	dotStandIn = generateStandInFor(tempus, status);
b75a7d8f A	1604	}
	1605	return dotStandIn;
	1606	}
	1607
	1608	/**
	1609	* Append the value of the given variable name to the given
	1610	* UnicodeString.
	1611	*/
	1612	void TransliteratorParser::appendVariableDef(const UnicodeString& name,
73c04bcf A	1613	UnicodeString& buf,
	1614	UErrorCode& status) {
	1615	const UnicodeString* s = (const UnicodeString*) variableNames.get(name);
b75a7d8f A	1616	if (s == NULL) {
	1617	// We allow one undefined variable so that variable definition
	1618	// statements work. For the first undefined variable we return
	1619	// the special placeholder variableLimit-1, and save the variable
	1620	// name.
	1621	if (undefinedVariableName.length() == 0) {
	1622	undefinedVariableName = name;
	1623	if (variableNext >= variableLimit) {
	1624	// throw new RuntimeException("Private use variables exhausted");
	1625	status = U_ILLEGAL_ARGUMENT_ERROR;
	1626	return;
	1627	}
	1628	buf.append((UChar) --variableLimit);
	1629	} else {
	1630	//throw new IllegalArgumentException("Undefined variable $"
	1631	// + name);
	1632	status = U_ILLEGAL_ARGUMENT_ERROR;
	1633	return;
	1634	}
	1635	} else {
	1636	buf.append(*s);
	1637	}
	1638	}
	1639
	1640	/**
	1641	* Glue method to get around access restrictions in C++.
	1642	*/
/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    return Transliterator::createBasicInstance(id, canonID);
}*/
b75a7d8f A	1646
	1647	U_NAMESPACE_END
	1648
73c04bcf A	1649	U_CAPI int32_t
73c04bcf A	1650	utrans_stripRules(const UChar source, int32_t sourceLen, UChar target, UErrorCode *status) {
46f4442e A	1651	U_NAMESPACE_USE
46f4442e A	1652
73c04bcf A	1653	//const UChar *sourceStart = source;
	1654	const UChar *targetStart = target;
	1655	const UChar *sourceLimit = source+sourceLen;
	1656	UChar *targetLimit = target+sourceLen;
	1657	UChar32 c = 0;
	1658	UBool quoted = FALSE;
	1659	int32_t index;
	1660
	1661	uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR);
	1662
	1663	/* read the rules into the buffer */
	1664	while (source < sourceLimit)
	1665	{
	1666	index=0;
	1667	U16_NEXT_UNSAFE(source, index, c);
	1668	source+=index;
	1669	if(c == QUOTE) {
	1670	quoted = (UBool)!quoted;
	1671	}
	1672	else if (!quoted) {
	1673	if (c == RULE_COMMENT_CHAR) {
	1674	/* skip comments and all preceding spaces */
	1675	while (targetStart < target && *(target - 1) == 0x0020) {
	1676	target--;
	1677	}
	1678	do {
2ca993e8 A	1679	if (source == sourceLimit) {
	1680	c = U_SENTINEL;
	1681	break;
	1682	}
73c04bcf A	1683	c = *(source++);
	1684	}
	1685	while (c != CR && c != LF);
2ca993e8 A	1686	if (c < 0) {
	1687	break;
	1688	}
73c04bcf	1689	}
2ca993e8	1690	else if (c == ESCAPE && source < sourceLimit) {
73c04bcf A	1691	UChar32 c2 = *source;
	1692	if (c2 == CR \|\| c2 == LF) {
	1693	/* A backslash at the end of a line. */
	1694	/* Since we're stripping lines, ignore the backslash. */
	1695	source++;
	1696	continue;
	1697	}
	1698	if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */
	1699	int32_t escapeOffset = 0;
	1700	UnicodeString escapedStr(source, 5);
	1701	c2 = escapedStr.unescapeAt(escapeOffset);
	1702
	1703	if (c2 == (UChar32)0xFFFFFFFF \|\| escapeOffset == 0)
	1704	{
	1705	*status = U_PARSE_ERROR;
	1706	return 0;
	1707	}
4388f060	1708	if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
73c04bcf A	1709	/* It was escaped for a reason. Write what it was suppose to be. */
	1710	source+=5;
	1711	c = c2;
	1712	}
	1713	}
	1714	else if (c2 == QUOTE) {
	1715	/* \' seen. Make sure we don't do anything when we see it again. */
	1716	quoted = (UBool)!quoted;
	1717	}
	1718	}
	1719	}
	1720	if (c == CR \|\| c == LF)
	1721	{
	1722	/* ignore spaces carriage returns, and all leading spaces on the next line.
	1723	* and line feed unless in the form \uXXXX
	1724	*/
	1725	quoted = FALSE;
	1726	while (source < sourceLimit) {
	1727	c = *(source);
	1728	if (c != CR && c != LF && c != 0x0020) {
	1729	break;
	1730	}
	1731	source++;
	1732	}
	1733	continue;
	1734	}
	1735
	1736	/* Append UChar * after dissembling if c > 0xffff*/
	1737	index=0;
	1738	U16_APPEND_UNSAFE(target, index, c);
	1739	target+=index;
	1740	}
	1741	if (target < targetLimit) {
	1742	*target = 0;
	1743	}
	1744	return (int32_t)(target-targetStart);
	1745	}
	1746
b75a7d8f	1747	#endif /* #if !UCONFIG_NO_TRANSLITERATION */