]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/regexcmp.cpp
ICU-400.39.tar.gz
[apple/icu.git] / icuSources / i18n / regexcmp.cpp
CommitLineData
b75a7d8f
A
1
2//
3// file: regexcmp.cpp
4//
46f4442e 5// Copyright (C) 2002-2008 International Business Machines Corporation and others.
b75a7d8f
A
6// All Rights Reserved.
7//
8// This file contains the ICU regular expression compiler, which is responsible
9// for processing a regular expression pattern into the compiled form that
10// is used by the match finding engine.
11//
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/unistr.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/uchriter.h"
21#include "unicode/parsepos.h"
22#include "unicode/parseerr.h"
23#include "unicode/regex.h"
374ca955 24#include "util.h"
b75a7d8f
A
25#include "cmemory.h"
26#include "cstring.h"
27#include "uvectr32.h"
28#include "uassert.h"
29#include "ucln_in.h"
46f4442e 30#include "uinvchar.h"
b75a7d8f
A
31
32#include "regeximp.h"
33#include "regexcst.h" // Contains state table for the regex pattern parser.
34 // generated by a Perl script.
35#include "regexcmp.h"
36#include "regexst.h"
37
38
39
40U_NAMESPACE_BEGIN
41
42
73c04bcf 43//------------------------------------------------------------------------------
b75a7d8f
A
44//
45// Constructor.
46//
73c04bcf 47//------------------------------------------------------------------------------
46f4442e
A
48RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
49 fParenStack(status), fSetStack(status), fSetOpStack(status)
b75a7d8f
A
50{
51 fStatus = &status;
52
53 fRXPat = rxp;
54 fScanIndex = 0;
55 fNextIndex = 0;
56 fPeekChar = -1;
57 fLineNum = 1;
58 fCharNum = 0;
59 fQuoteMode = FALSE;
60 fInBackslashQuote = FALSE;
73c04bcf 61 fModeFlags = fRXPat->fFlags | 0x80000000;
b75a7d8f
A
62 fEOLComments = TRUE;
63
64 fMatchOpenParen = -1;
65 fMatchCloseParen = -1;
66 fStringOpStart = -1;
67
68 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
69 status = rxp->fDeferredStatus;
70 }
71}
72
46f4442e
A
73static const UChar chAmp = 0x26; // '&'
74static const UChar chDash = 0x2d; // '-'
b75a7d8f
A
75
76
73c04bcf 77//------------------------------------------------------------------------------
b75a7d8f
A
78//
79// Destructor
80//
73c04bcf 81//------------------------------------------------------------------------------
b75a7d8f
A
82RegexCompile::~RegexCompile() {
83}
84
46f4442e
A
85static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
86 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
87}
88
73c04bcf 89//------------------------------------------------------------------------------
b75a7d8f
A
90//
91// Compile regex pattern. The state machine for rexexp pattern parsing is here.
92// The state tables are hand-written in the file regexcst.txt,
93// and converted to the form used here by a perl
94// script regexcst.pl
95//
73c04bcf 96//------------------------------------------------------------------------------
b75a7d8f
A
97void RegexCompile::compile(
98 const UnicodeString &pat, // Source pat to be compiled.
99 UParseError &pp, // Error position info
100 UErrorCode &e) // Error Code
101{
102 fStatus = &e;
103 fParseErr = &pp;
104 fStackPtr = 0;
105 fStack[fStackPtr] = 0;
106
107 if (U_FAILURE(*fStatus)) {
108 return;
109 }
110
111 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
112 U_ASSERT(fRXPat->fPattern.length() == 0);
113
114 // Prepare the RegexPattern object to receive the compiled pattern.
b75a7d8f
A
115 fRXPat->fPattern = pat;
116 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
117 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
118
119
120 // Initialize the pattern scanning state machine
121 fPatternLength = pat.length();
122 uint16_t state = 1;
123 const RegexTableEl *tableEl;
124 nextChar(fC); // Fetch the first char from the pattern string.
125
126 //
127 // Main loop for the regex pattern parsing state machine.
128 // Runs once per state transition.
129 // Each time through optionally performs, depending on the state table,
130 // - an advance to the the next pattern char
131 // - an action to be performed.
132 // - pushing or popping a state to/from the local state return stack.
133 // file regexcst.txt is the source for the state table. The logic behind
134 // recongizing the pattern syntax is there, not here.
135 //
136 for (;;) {
137 // Bail out if anything has gone wrong.
138 // Regex pattern parsing stops on the first error encountered.
139 if (U_FAILURE(*fStatus)) {
140 break;
141 }
142
143 U_ASSERT(state != 0);
144
145 // Find the state table element that matches the input char from the pattern, or the
146 // class of the input character. Start with the first table row for this
147 // state, then linearly scan forward until we find a row that matches the
148 // character. The last row for each state always matches all characters, so
149 // the search will stop there, if not before.
150 //
151 tableEl = &gRuleParseStateTable[state];
46f4442e 152 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
374ca955 153 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
b75a7d8f
A
154
155 for (;;) { // loop through table rows belonging to this state, looking for one
156 // that matches the current input char.
374ca955 157 REGEX_SCAN_DEBUG_PRINTF(("."));
b75a7d8f
A
158 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
159 // Table row specified an individual character, not a set, and
160 // the input character is not quoted, and
161 // the input character matched it.
162 break;
163 }
164 if (tableEl->fCharClass == 255) {
165 // Table row specified default, match anything character class.
166 break;
167 }
168 if (tableEl->fCharClass == 254 && fC.fQuoted) {
169 // Table row specified "quoted" and the char was quoted.
170 break;
171 }
172 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
173 // Table row specified eof and we hit eof on the input.
174 break;
175 }
176
177 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
178 fC.fQuoted == FALSE && // char is not escaped &&
179 fC.fChar != (UChar32)-1) { // char is not EOF
46f4442e 180 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
b75a7d8f
A
181 // Table row specified a character class, or set of characters,
182 // and the current char matches it.
183 break;
184 }
185 }
186
187 // No match on this row, advance to the next row for this state,
188 tableEl++;
189 }
374ca955 190 REGEX_SCAN_DEBUG_PRINTF(("\n"));
b75a7d8f
A
191
192 //
193 // We've found the row of the state table that matches the current input
194 // character from the rules string.
195 // Perform any action specified by this row in the state table.
46f4442e 196 if (doParseActions(tableEl->fAction) == FALSE) {
b75a7d8f
A
197 // Break out of the state machine loop if the
198 // the action signalled some kind of error, or
199 // the action was to exit, occurs on normal end-of-rules-input.
200 break;
201 }
202
203 if (tableEl->fPushState != 0) {
204 fStackPtr++;
205 if (fStackPtr >= kStackSize) {
206 error(U_REGEX_INTERNAL_ERROR);
374ca955 207 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
b75a7d8f
A
208 fStackPtr--;
209 }
210 fStack[fStackPtr] = tableEl->fPushState;
211 }
212
213 //
214 // NextChar. This is where characters are actually fetched from the pattern.
215 // Happens under control of the 'n' tag in the state table.
216 //
217 if (tableEl->fNextChar) {
218 nextChar(fC);
219 }
220
221 // Get the next state from the table entry, or from the
222 // state stack if the next state was specified as "pop".
223 if (tableEl->fNextState != 255) {
224 state = tableEl->fNextState;
225 } else {
226 state = fStack[fStackPtr];
227 fStackPtr--;
228 if (fStackPtr < 0) {
229 // state stack underflow
230 // This will occur if the user pattern has mis-matched parentheses,
231 // with extra close parens.
46f4442e 232 //
b75a7d8f
A
233 fStackPtr++;
234 error(U_REGEX_MISMATCHED_PAREN);
235 }
236 }
237
238 }
239
46f4442e
A
240 if (U_FAILURE(*fStatus)) {
241 // Bail out if the pattern had errors.
242 // Set stack cleanup: a successful compile would have left it empty,
243 // but errors can leave temporary sets hanging around.
244 while (!fSetStack.empty()) {
245 delete (UnicodeSet *)fSetStack.pop();
246 }
247 return;
248 }
249
b75a7d8f
A
250 //
251 // The pattern has now been read and processed, and the compiled code generated.
252 //
253
254 // Back-reference fixup
255 //
256 int32_t loc;
257 for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
258 int32_t op = fRXPat->fCompiledPat->elementAti(loc);
259 int32_t opType = URX_TYPE(op);
260 if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
261 int32_t where = URX_VAL(op);
262 if (where > fRXPat->fGroupMap->size()) {
263 error(U_REGEX_INVALID_BACK_REF);
264 break;
265 }
266 where = fRXPat->fGroupMap->elementAti(where-1);
267 op = URX_BUILD(opType, where);
268 fRXPat->fCompiledPat->setElementAt(op, loc);
269 }
270 }
271
272
273 //
274 // Compute the number of digits requried for the largest capture group number.
275 //
276 fRXPat->fMaxCaptureDigits = 1;
277 int32_t n = 10;
278 for (;;) {
279 if (n > fRXPat->fGroupMap->size()) {
280 break;
281 }
282 fRXPat->fMaxCaptureDigits++;
283 n *= 10;
284 }
285
286 //
287 // The pattern's fFrameSize so far has accumulated the requirements for
288 // storage for capture parentheses, counters, etc. that are encountered
289 // in the pattern. Add space for the two variables that are always
290 // present in the saved state: the input string position and the
291 // position in the compiled pattern.
292 //
293 fRXPat->fFrameSize+=2;
294
295 //
296 // Get bounds for the minimum and maximum length of a string that this
297 // pattern can match. Used to avoid looking for matches in strings that
298 // are too short.
299 //
300 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
301
302 //
303 // Optimization passes
46f4442e
A
304 //
305 matchStartType();
b75a7d8f
A
306 stripNOPs();
307
308 //
309 // Set up fast latin-1 range sets
310 //
311 int32_t numSets = fRXPat->fSets->size();
312 fRXPat->fSets8 = new Regex8BitSet[numSets];
46f4442e
A
313 // Null pointer check.
314 if (fRXPat->fSets8 == NULL) {
315 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
316 return;
317 }
b75a7d8f
A
318 int32_t i;
319 for (i=0; i<numSets; i++) {
320 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
321 fRXPat->fSets8[i].init(s);
322 }
323
b75a7d8f
A
324}
325
326
327
328
329
73c04bcf 330//------------------------------------------------------------------------------
b75a7d8f
A
331//
332// doParseAction Do some action during regex pattern parsing.
333// Called by the parse state machine.
334//
335// Generation of the match engine PCode happens here, or
336// in functions called from the parse actions defined here.
337//
338//
73c04bcf 339//------------------------------------------------------------------------------
46f4442e 340UBool RegexCompile::doParseActions(int32_t action)
b75a7d8f
A
341{
342 UBool returnVal = TRUE;
343
344 switch ((Regex_PatternParseAction)action) {
345
346 case doPatStart:
347 // Start of pattern compiles to:
348 //0 SAVE 2 Fall back to position of FAIL
349 //1 jmp 3
350 //2 FAIL Stop if we ever reach here.
351 //3 NOP Dummy, so start of pattern looks the same as
352 // the start of an ( grouping.
353 //4 NOP Resreved, will be replaced by a save if there are
354 // OR | operators at the top level
355 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
356 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
357 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
b75a7d8f 358
73c04bcf
A
359 // Standard open nonCapture paren action emits the two NOPs and
360 // sets up the paren stack frame.
46f4442e 361 doParseActions(doOpenNonCaptureParen);
b75a7d8f
A
362 break;
363
364 case doPatFinish:
365 // We've scanned to the end of the pattern
366 // The end of pattern compiles to:
367 // URX_END
368 // which will stop the runtime match engine.
369 // Encountering end of pattern also behaves like a close paren,
370 // and forces fixups of the State Save at the beginning of the compiled pattern
371 // and of any OR operations at the top level.
372 //
373 handleCloseParen();
374 if (fParenStack.size() > 0) {
375 // Missing close paren in pattern.
376 error(U_REGEX_MISMATCHED_PAREN);
377 }
378
379 // add the END operation to the compiled pattern.
380 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
381
382 // Terminate the pattern compilation state machine.
383 returnVal = FALSE;
384 break;
385
386
387
388 case doOrOperator:
389 // Scanning a '|', as in (A|B)
390 {
391 // Insert a SAVE operation at the start of the pattern section preceding
392 // this OR at this level. This SAVE will branch the match forward
393 // to the right hand side of the OR in the event that the left hand
394 // side fails to match and backtracks. Locate the position for the
395 // save from the location on the top of the parentheses stack.
396 int32_t savePosition = fParenStack.popi();
397 int32_t op = fRXPat->fCompiledPat->elementAti(savePosition);
398 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
399 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
400 fRXPat->fCompiledPat->setElementAt(op, savePosition);
401
402 // Append an JMP operation into the compiled pattern. The operand for
403 // the JMP will eventually be the location following the ')' for the
404 // group. This will be patched in later, when the ')' is encountered.
405 op = URX_BUILD(URX_JMP, 0);
406 fRXPat->fCompiledPat->addElement(op, *fStatus);
407
408 // Push the position of the newly added JMP op onto the parentheses stack.
409 // This registers if for fixup when this block's close paren is encountered.
410 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
411
412 // Append a NOP to the compiled pattern. This is the slot reserved
413 // for a SAVE in the event that there is yet another '|' following
414 // this one.
415 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
416 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
417 }
418 break;
419
420
421 case doOpenCaptureParen:
422 // Open Paren.
423 // Compile to a
424 // - NOP, which later may be replaced by a save-state if the
425 // parenthesized group gets a * quantifier, followed by
426 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
427 // - NOP, which may later be replaced by a save-state if there
428 // is an '|' alternation within the parens.
429 //
430 // Each capture group gets three slots in the save stack frame:
431 // 0: Capture Group start position (in input string being matched.)
432 // 1: Capture Group end positino.
433 // 2: Start of Match-in-progress.
434 // The first two locations are for a completed capture group, and are
435 // referred to by back references and the like.
436 // The third location stores the capture start position when an START_CAPTURE is
437 // encountered. This will be promoted to a completed capture when (and if) the corresponding
438 // END_CAPure is encountered.
439 {
440 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
441 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
442 fRXPat->fFrameSize += 3;
443 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
444 fRXPat->fCompiledPat->addElement(cop, *fStatus);
445 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
446
447 // On the Parentheses stack, start a new frame and add the postions
448 // of the two NOPs. Depending on what follows in the pattern, the
449 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
450 // address of the end of the parenthesized group.
451 fParenStack.push(fModeFlags, *fStatus); // Match mode state
452 fParenStack.push(capturing, *fStatus); // Frame type.
453 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
454 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
455
456 // Save the mapping from group number to stack frame variable position.
457 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
458 }
459 break;
460
461 case doOpenNonCaptureParen:
462 // Open non-caputuring (grouping only) Paren.
463 // Compile to a
464 // - NOP, which later may be replaced by a save-state if the
465 // parenthesized group gets a * quantifier, followed by
466 // - NOP, which may later be replaced by a save-state if there
467 // is an '|' alternation within the parens.
468 {
469 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
470 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
471
472 // On the Parentheses stack, start a new frame and add the postions
473 // of the two NOPs.
474 fParenStack.push(fModeFlags, *fStatus); // Match mode state
475 fParenStack.push(plain, *fStatus); // Begin a new frame.
476 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
477 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
478 }
479 break;
480
481
482 case doOpenAtomicParen:
483 // Open Atomic Paren. (?>
484 // Compile to a
46f4442e 485 // - NOP, which later may be replaced if the parenthesized group
b75a7d8f
A
486 // has a quantifier, followed by
487 // - STO_SP save state stack position, so it can be restored at the ")"
488 // - NOP, which may later be replaced by a save-state if there
489 // is an '|' alternation within the parens.
490 {
491 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
492 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
493 fRXPat->fDataSize += 1; // state stack ptr.
494 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
495 fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
496 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
497
498 // On the Parentheses stack, start a new frame and add the postions
499 // of the two NOPs. Depending on what follows in the pattern, the
500 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
501 // address of the end of the parenthesized group.
502 fParenStack.push(fModeFlags, *fStatus); // Match mode state
503 fParenStack.push(atomic, *fStatus); // Frame type.
504 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
505 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
506 }
507 break;
508
509
510 case doOpenLookAhead:
511 // Positive Look-ahead (?= stuff )
46f4442e
A
512 //
513 // Note: Addition of transparent input regions, with the need to
514 // restore the original regions when failing out of a lookahead
515 // block, complicated this sequence. Some conbined opcodes
516 // might make sense - or might not, lookahead aren't that common.
517 //
518 // Caution: min match length optimization knows about this
519 // sequence; don't change without making updates there too.
520 //
b75a7d8f 521 // Compiles to
46f4442e
A
522 // 1 START_LA dataLoc Saves SP, Input Pos
523 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
524 // 3 JMP 6 continue ...
525 //
526 // 4. LA_END Look Ahead failed. Restore regions.
527 // 5. BACKTRACK and back track again.
528 //
529 // 6. NOP reserved for use by quantifiers on the block.
b75a7d8f
A
530 // Look-ahead can't have quantifiers, but paren stack
531 // compile time conventions require the slot anyhow.
46f4442e
A
532 // 7. NOP may be replaced if there is are '|' ops in the block.
533 // 8. code for parenthesized stuff.
534 // 9. LA_END
535 //
b75a7d8f
A
536 // Two data slots are reserved, for saving the stack ptr and the input position.
537 {
538 int32_t dataLoc = fRXPat->fDataSize;
46f4442e 539 fRXPat->fDataSize += 2;
b75a7d8f
A
540 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
541 fRXPat->fCompiledPat->addElement(op, *fStatus);
542
46f4442e
A
543 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
544 fRXPat->fCompiledPat->addElement(op, *fStatus);
545
546 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
547 fRXPat->fCompiledPat->addElement(op, *fStatus);
548
549 op = URX_BUILD(URX_LA_END, dataLoc);
550 fRXPat->fCompiledPat->addElement(op, *fStatus);
551
552 op = URX_BUILD(URX_BACKTRACK, 0);
553 fRXPat->fCompiledPat->addElement(op, *fStatus);
554
b75a7d8f
A
555 op = URX_BUILD(URX_NOP, 0);
556 fRXPat->fCompiledPat->addElement(op, *fStatus);
557 fRXPat->fCompiledPat->addElement(op, *fStatus);
558
559 // On the Parentheses stack, start a new frame and add the postions
46f4442e 560 // of the NOPs.
b75a7d8f
A
561 fParenStack.push(fModeFlags, *fStatus); // Match mode state
562 fParenStack.push(lookAhead, *fStatus); // Frame type.
563 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
564 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
565 }
566 break;
567
568 case doOpenLookAheadNeg:
569 // Negated Lookahead. (?! stuff )
570 // Compiles to
571 // 1. START_LA dataloc
572 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
573 // // which continues with the match.
574 // 3. NOP // Std. Open Paren sequence, for possible '|'
575 // 4. code for parenthesized stuff.
576 // 5. END_LA // Cut back stack, remove saved state from step 2.
46f4442e
A
577 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
578 // 7. END_LA // Restore match region, in case look-ahead was using
579 // an alternate (transparent) region.
b75a7d8f
A
580 {
581 int32_t dataLoc = fRXPat->fDataSize;
46f4442e 582 fRXPat->fDataSize += 2;
b75a7d8f
A
583 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
584 fRXPat->fCompiledPat->addElement(op, *fStatus);
585
586 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later.
587 fRXPat->fCompiledPat->addElement(op, *fStatus);
588
589 op = URX_BUILD(URX_NOP, 0);
590 fRXPat->fCompiledPat->addElement(op, *fStatus);
591
592 // On the Parentheses stack, start a new frame and add the postions
46f4442e 593 // of the StateSave and NOP.
b75a7d8f 594 fParenStack.push(fModeFlags, *fStatus); // Match mode state
46f4442e 595 fParenStack.push(negLookAhead, *fStatus); // Frame type
b75a7d8f
A
596 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
597 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
46f4442e 598
b75a7d8f
A
599 // Instructions #5 and #6 will be added when the ')' is encountered.
600 }
601 break;
602
603 case doOpenLookBehind:
604 {
605 // Compile a (?<= look-behind open paren.
606 //
607 // Compiles to
608 // 0 URX_LB_START dataLoc
609 // 1 URX_LB_CONT dataLoc
610 // 2 MinMatchLen
611 // 3 MaxMatchLen
612 // 4 URX_NOP Standard '(' boilerplate.
613 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
614 // 6 <code for LookBehind expression>
615 // 7 URX_LB_END dataLoc # Check match len, restore input len
616 // 8 URX_LA_END dataLoc # Restore stack, input pos
617 //
618 // Allocate a block of matcher data, to contain (when running a match)
619 // 0: Stack ptr on entry
620 // 1: Input Index on entry
621 // 2: Start index of match current match attempt.
46f4442e 622 // 3: Original Input String len.
b75a7d8f
A
623
624 // Allocate data space
625 int32_t dataLoc = fRXPat->fDataSize;
46f4442e
A
626 fRXPat->fDataSize += 4;
627
b75a7d8f
A
628 // Emit URX_LB_START
629 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
630 fRXPat->fCompiledPat->addElement(op, *fStatus);
46f4442e 631
b75a7d8f
A
632 // Emit URX_LB_CONT
633 op = URX_BUILD(URX_LB_CONT, dataLoc);
634 fRXPat->fCompiledPat->addElement(op, *fStatus);
635 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
636 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
46f4442e 637
b75a7d8f
A
638 // Emit the NOP
639 op = URX_BUILD(URX_NOP, 0);
640 fRXPat->fCompiledPat->addElement(op, *fStatus);
641 fRXPat->fCompiledPat->addElement(op, *fStatus);
46f4442e 642
b75a7d8f 643 // On the Parentheses stack, start a new frame and add the postions
46f4442e 644 // of the URX_LB_CONT and the NOP.
b75a7d8f
A
645 fParenStack.push(fModeFlags, *fStatus); // Match mode state
646 fParenStack.push(lookBehind, *fStatus); // Frame type
647 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
648 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
46f4442e 649
b75a7d8f
A
650 // The final two instructions will be added when the ')' is encountered.
651 }
652
653 break;
654
655 case doOpenLookBehindNeg:
656 {
657 // Compile a (?<! negated look-behind open paren.
658 //
659 // Compiles to
660 // 0 URX_LB_START dataLoc # Save entry stack, input len
661 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
662 // 2 MinMatchLen
663 // 3 MaxMatchLen
664 // 4 continueLoc (9)
665 // 5 URX_NOP Standard '(' boilerplate.
666 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
667 // 7 <code for LookBehind expression>
668 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
669 // 9 ...
670 //
671 // Allocate a block of matcher data, to contain (when running a match)
672 // 0: Stack ptr on entry
673 // 1: Input Index on entry
674 // 2: Start index of match current match attempt.
46f4442e 675 // 3: Original Input String len.
b75a7d8f
A
676
677 // Allocate data space
678 int32_t dataLoc = fRXPat->fDataSize;
46f4442e
A
679 fRXPat->fDataSize += 4;
680
b75a7d8f
A
681 // Emit URX_LB_START
682 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
683 fRXPat->fCompiledPat->addElement(op, *fStatus);
46f4442e 684
b75a7d8f
A
685 // Emit URX_LBN_CONT
686 op = URX_BUILD(URX_LBN_CONT, dataLoc);
687 fRXPat->fCompiledPat->addElement(op, *fStatus);
688 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
689 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
690 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
46f4442e 691
b75a7d8f
A
692 // Emit the NOP
693 op = URX_BUILD(URX_NOP, 0);
694 fRXPat->fCompiledPat->addElement(op, *fStatus);
695 fRXPat->fCompiledPat->addElement(op, *fStatus);
46f4442e 696
b75a7d8f 697 // On the Parentheses stack, start a new frame and add the postions
46f4442e 698 // of the URX_LB_CONT and the NOP.
b75a7d8f
A
699 fParenStack.push(fModeFlags, *fStatus); // Match mode state
700 fParenStack.push(lookBehindN, *fStatus); // Frame type
701 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
702 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
46f4442e 703
b75a7d8f
A
704 // The final two instructions will be added when the ')' is encountered.
705 }
706 break;
707
708 case doConditionalExpr:
709 // Conditionals such as (?(1)a:b)
710 case doPerlInline:
711 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
712 error(U_REGEX_UNIMPLEMENTED);
713 break;
714
715
716 case doCloseParen:
717 handleCloseParen();
718 if (fParenStack.size() <= 0) {
719 // Extra close paren, or missing open paren.
720 error(U_REGEX_MISMATCHED_PAREN);
721 }
722 break;
723
724 case doNOP:
725 break;
726
727
728 case doBadOpenParenType:
729 case doRuleError:
730 error(U_REGEX_RULE_SYNTAX);
731 break;
732
733
734 case doMismatchedParenErr:
735 error(U_REGEX_MISMATCHED_PAREN);
736 break;
737
738 case doPlus:
739 // Normal '+' compiles to
740 // 1. stuff to be repeated (already built)
741 // 2. jmp-sav 1
742 // 3. ...
743 //
744 // Or, if the item to be repeated can match a zero length string,
745 // 1. STO_INP_LOC data-loc
746 // 2. body of stuff to be repeated
747 // 3. JMP_SAV_X 2
748 // 4. ...
749
750 //
751 // Or, if the item to be repeated is simple
752 // 1. Item to be repeated.
753 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
754 // 3. LOOP_C stack location
755 {
756 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
757 int32_t frameLoc;
758
759 // Check for simple constructs, which may get special optimized code.
760 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
761 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
762
763 if (URX_TYPE(repeatedOp) == URX_SETREF) {
764 // Emit optimized code for [char set]+
765 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
766 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
767 frameLoc = fRXPat->fFrameSize;
768 fRXPat->fFrameSize++;
769 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
770 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
771 break;
772 }
773
774 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
46f4442e
A
775 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
776 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
b75a7d8f
A
777 // Emit Optimized code for .+ operations.
778 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
779 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
46f4442e 780 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
b75a7d8f
A
781 loopOpI |= 1;
782 }
46f4442e
A
783 if (fModeFlags & UREGEX_UNIX_LINES) {
784 loopOpI |= 2;
785 }
b75a7d8f
A
786 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
787 frameLoc = fRXPat->fFrameSize;
788 fRXPat->fFrameSize++;
789 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
790 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
791 break;
792 }
793
794 }
795
796 // General case.
797
798 // Check for minimum match length of zero, which requires
799 // extra loop-breaking code.
800 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
801 // Zero length match is possible.
802 // Emit the code sequence that can handle it.
803 insertOp(topLoc);
804 frameLoc = fRXPat->fFrameSize;
805 fRXPat->fFrameSize++;
806
807 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
808 fRXPat->fCompiledPat->setElementAt(op, topLoc);
809
810 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
811 fRXPat->fCompiledPat->addElement(op, *fStatus);
812 } else {
813 // Simpler code when the repeated body must match something non-empty
814 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
815 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
816 }
817 }
818 break;
819
820 case doNGPlus:
821 // Non-greedy '+?' compiles to
822 // 1. stuff to be repeated (already built)
823 // 2. state-save 1
824 // 3. ...
825 {
826 int32_t topLoc = blockTopLoc(FALSE);
827 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
828 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
829 }
830 break;
831
832
833 case doOpt:
834 // Normal (greedy) ? quantifier.
835 // Compiles to
836 // 1. state save 3
837 // 2. body of optional block
838 // 3. ...
839 // Insert the state save into the compiled pattern, and we're done.
840 {
841 int32_t saveStateLoc = blockTopLoc(TRUE);
842 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
843 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
844 }
845 break;
846
847 case doNGOpt:
848 // Non-greedy ?? quantifier
849 // compiles to
850 // 1. jmp 4
851 // 2. body of optional block
852 // 3 jmp 5
853 // 4. state save 2
854 // 5 ...
855 // This code is less than ideal, with two jmps instead of one, because we can only
856 // insert one instruction at the top of the block being iterated.
857 {
858 int32_t jmp1_loc = blockTopLoc(TRUE);
859 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
860
861 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1);
862 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
863
864 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2);
865 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
866
867 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
868 fRXPat->fCompiledPat->addElement(save_op, *fStatus);
869 }
870 break;
871
872
873 case doStar:
874 // Normal (greedy) * quantifier.
875 // Compiles to
876 // 1. STATE_SAVE 4
877 // 2. body of stuff being iterated over
878 // 3. JMP_SAV 2
879 // 4. ...
880 //
881 // Or, if the body is a simple [Set],
882 // 1. LOOP_SR_I set number
883 // 2. LOOP_C stack location
884 // ...
885 //
46f4442e 886 // Or if this is a .*
b75a7d8f
A
887 // 1. LOOP_DOT_I (. matches all mode flag)
888 // 2. LOOP_C stack location
889 //
890 // Or, if the body can match a zero-length string, to inhibit infinite loops,
891 // 1. STATE_SAVE 5
892 // 2. STO_INP_LOC data-loc
893 // 3. body of stuff
894 // 4. JMP_SAV_X 2
895 // 5. ...
896 {
897 // location of item #1, the STATE_SAVE
898 int32_t topLoc = blockTopLoc(FALSE);
899 int32_t dataLoc = -1;
900
901 // Check for simple *, where the construct being repeated
902 // compiled to single opcode, and might be optimizable.
903 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
904 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
905
906 if (URX_TYPE(repeatedOp) == URX_SETREF) {
46f4442e 907 // Emit optimized code for a [char set]*
b75a7d8f
A
908 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
909 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
910 dataLoc = fRXPat->fFrameSize;
911 fRXPat->fFrameSize++;
912 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
913 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
914 break;
915 }
916
917 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
46f4442e
A
918 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
919 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
b75a7d8f
A
920 // Emit Optimized code for .* operations.
921 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
922 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
923 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
924 loopOpI |= 1;
925 }
46f4442e
A
926 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
927 loopOpI |= 2;
928 }
b75a7d8f
A
929 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
930 dataLoc = fRXPat->fFrameSize;
931 fRXPat->fFrameSize++;
932 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
933 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
934 break;
935 }
936 }
937
938 // Emit general case code for this *
939 // The optimizations did not apply.
940
941 int32_t saveStateLoc = blockTopLoc(TRUE);
942 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
943
944 // Check for minimum match length of zero, which requires
945 // extra loop-breaking code.
946 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
947 insertOp(saveStateLoc);
948 dataLoc = fRXPat->fFrameSize;
949 fRXPat->fFrameSize++;
950
951 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
952 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
953 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
954 }
46f4442e 955
b75a7d8f
A
956 // Locate the position in the compiled pattern where the match will continue
957 // after completing the *. (4 or 5 in the comment above)
958 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
959
960 // Put together the save state op store it into the compiled code.
961 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
962 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
963
964 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
965 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
966 }
967 break;
968
969 case doNGStar:
970 // Non-greedy *? quantifier
971 // compiles to
972 // 1. JMP 3
973 // 2. body of stuff being iterated over
974 // 3. STATE_SAVE 2
975 // 4 ...
976 {
977 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
978 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
979 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
980 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
981 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
982 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
983 }
984 break;
985
986
987 case doIntervalInit:
988 // The '{' opening an interval quantifier was just scanned.
989 // Init the counter varaiables that will accumulate the values as the digits
990 // are scanned.
991 fIntervalLow = 0;
992 fIntervalUpper = -1;
993 break;
994
995 case doIntevalLowerDigit:
996 // Scanned a digit from the lower value of an {lower,upper} interval
997 {
998 int32_t digitValue = u_charDigitValue(fC.fChar);
999 U_ASSERT(digitValue >= 0);
1000 fIntervalLow = fIntervalLow*10 + digitValue;
1001 if (fIntervalLow < 0) {
1002 error(U_REGEX_NUMBER_TOO_BIG);
1003 }
1004 }
1005 break;
1006
1007 case doIntervalUpperDigit:
1008 // Scanned a digit from the upper value of an {lower,upper} interval
1009 {
1010 if (fIntervalUpper < 0) {
1011 fIntervalUpper = 0;
1012 }
1013 int32_t digitValue = u_charDigitValue(fC.fChar);
1014 U_ASSERT(digitValue >= 0);
1015 fIntervalUpper = fIntervalUpper*10 + digitValue;
374ca955 1016 if (fIntervalUpper < 0) {
b75a7d8f
A
1017 error(U_REGEX_NUMBER_TOO_BIG);
1018 }
1019 }
1020 break;
1021
1022 case doIntervalSame:
1023 // Scanned a single value interval like {27}. Upper = Lower.
1024 fIntervalUpper = fIntervalLow;
1025 break;
1026
1027 case doInterval:
1028 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1029 if (compileInlineInterval() == FALSE) {
1030 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1031 }
1032 break;
1033
1034 case doPossessiveInterval:
1035 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1036 {
1037 // Remember the loc for the top of the block being looped over.
46f4442e
A
1038 // (Can not reserve a slot in the compiled pattern at this time, because
1039 // compileInterval needs to reserve also, and blockTopLoc can only reserve
b75a7d8f
A
1040 // once per block.)
1041 int32_t topLoc = blockTopLoc(FALSE);
1042
1043 // Produce normal looping code.
1044 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1045
1046 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1047 // just as if the loop was inclosed in atomic parentheses.
1048
1049 // First the STO_SP before the start of the loop
1050 insertOp(topLoc);
1051 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
1052 fRXPat->fDataSize += 1; // state stack ptr.
1053 int32_t op = URX_BUILD(URX_STO_SP, varLoc);
1054 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1055
1056 int32_t loopOp = fRXPat->fCompiledPat->popi();
1057 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1058 loopOp++; // point LoopOp after the just-inserted STO_SP
1059 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1060
1061 // Then the LD_SP after the end of the loop
1062 op = URX_BUILD(URX_LD_SP, varLoc);
1063 fRXPat->fCompiledPat->addElement(op, *fStatus);
1064 }
1065
1066 break;
1067
1068 case doNGInterval:
1069 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1070 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1071 break;
1072
1073 case doIntervalError:
1074 error(U_REGEX_BAD_INTERVAL);
1075 break;
1076
1077 case doLiteralChar:
46f4442e 1078 // We've just scanned a "normal" character from the pattern,
b75a7d8f
A
1079 literalChar(fC.fChar);
1080 break;
1081
1082
46f4442e
A
1083 case doEscapedLiteralChar:
1084 // We've just scanned an backslashed escaped character with no
1085 // special meaning. It represents itself.
1086 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1087 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1088 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1089 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1090 }
1091 literalChar(fC.fChar);
1092 break;
1093
b75a7d8f
A
1094
1095 case doDotAny:
1096 // scanned a ".", match any single character.
1097 {
1098 int32_t op;
1099 if (fModeFlags & UREGEX_DOTALL) {
1100 op = URX_BUILD(URX_DOTANY_ALL, 0);
46f4442e
A
1101 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1102 op = URX_BUILD(URX_DOTANY_UNIX, 0);
b75a7d8f
A
1103 } else {
1104 op = URX_BUILD(URX_DOTANY, 0);
1105 }
1106 fRXPat->fCompiledPat->addElement(op, *fStatus);
1107 }
1108 break;
1109
46f4442e 1110 case doCaret:
b75a7d8f 1111 {
46f4442e
A
1112 int32_t op = 0;
1113 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1114 op = URX_CARET;
1115 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1116 op = URX_CARET_M;
1117 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1118 op = URX_CARET; // Only testing true start of input.
1119 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1120 op = URX_CARET_M_UNIX;
1121 }
b75a7d8f
A
1122 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1123 }
1124 break;
1125
46f4442e 1126 case doDollar:
b75a7d8f 1127 {
46f4442e
A
1128 int32_t op = 0;
1129 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1130 op = URX_DOLLAR;
1131 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1132 op = URX_DOLLAR_M;
1133 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1134 op = URX_DOLLAR_D;
1135 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1136 op = URX_DOLLAR_MD;
1137 }
b75a7d8f
A
1138 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1139 }
1140 break;
1141
1142 case doBackslashA:
1143 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
1144 break;
1145
1146 case doBackslashB:
374ca955
A
1147 {
1148 #if UCONFIG_NO_BREAK_ITERATION==1
1149 if (fModeFlags & UREGEX_UWORD) {
1150 error(U_UNSUPPORTED_ERROR);
1151 }
1152 #endif
1153 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1154 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
1155 }
b75a7d8f
A
1156 break;
1157
1158 case doBackslashb:
374ca955
A
1159 {
1160 #if UCONFIG_NO_BREAK_ITERATION==1
1161 if (fModeFlags & UREGEX_UWORD) {
1162 error(U_UNSUPPORTED_ERROR);
1163 }
1164 #endif
1165 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1166 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1167 }
b75a7d8f
A
1168 break;
1169
1170 case doBackslashD:
1171 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
1172 break;
1173
1174 case doBackslashd:
1175 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
1176 break;
1177
1178 case doBackslashG:
1179 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
1180 break;
1181
1182 case doBackslashS:
1183 fRXPat->fCompiledPat->addElement(
1184 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
1185 break;
1186
1187 case doBackslashs:
1188 fRXPat->fCompiledPat->addElement(
1189 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
1190 break;
1191
1192 case doBackslashW:
1193 fRXPat->fCompiledPat->addElement(
1194 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
1195 break;
1196
1197 case doBackslashw:
1198 fRXPat->fCompiledPat->addElement(
1199 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
1200 break;
1201
1202 case doBackslashX:
1203 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
1204 break;
1205
1206
1207 case doBackslashZ:
1208 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
1209 break;
1210
1211 case doBackslashz:
1212 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
1213 break;
1214
1215 case doEscapeError:
1216 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1217 break;
1218
1219 case doExit:
1220 returnVal = FALSE;
1221 break;
1222
1223 case doProperty:
1224 {
1225 UnicodeSet *theSet = scanProp();
1226 compileSet(theSet);
1227 }
1228 break;
1229
46f4442e 1230 case doNamedChar:
b75a7d8f 1231 {
46f4442e
A
1232 UChar32 c = scanNamedChar();
1233 literalChar(c);
b75a7d8f
A
1234 }
1235 break;
46f4442e 1236
b75a7d8f
A
1237
1238 case doBackRef:
1239 // BackReference. Somewhat unusual in that the front-end can not completely parse
1240 // the regular expression, because the number of digits to be consumed
1241 // depends on the number of capture groups that have been defined. So
1242 // we have to do it here instead.
1243 {
1244 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1245 int32_t groupNum = 0;
1246 UChar32 c = fC.fChar;
1247
1248 for (;;) {
1249 // Loop once per digit, for max allowed number of digits in a back reference.
1250 int32_t digit = u_charDigitValue(c);
1251 groupNum = groupNum * 10 + digit;
1252 if (groupNum >= numCaptureGroups) {
1253 break;
1254 }
1255 c = peekCharLL();
46f4442e 1256 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
b75a7d8f
A
1257 break;
1258 }
1259 nextCharLL();
1260 }
1261
1262 // Scan of the back reference in the source regexp is complete. Now generate
46f4442e 1263 // the compiled code for it.
b75a7d8f
A
1264 // Because capture groups can be forward-referenced by back-references,
1265 // we fill the operand with the capture group number. At the end
374ca955 1266 // of compilation, it will be changed to the variable's location.
b75a7d8f
A
1267 U_ASSERT(groupNum > 0);
1268 int32_t op;
1269 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1270 op = URX_BUILD(URX_BACKREF_I, groupNum);
1271 } else {
1272 op = URX_BUILD(URX_BACKREF, groupNum);
1273 }
1274 fRXPat->fCompiledPat->addElement(op, *fStatus);
1275 }
1276 break;
1277
1278
b75a7d8f
A
1279 case doPossessivePlus:
1280 // Possessive ++ quantifier.
1281 // Compiles to
1282 // 1. STO_SP
1283 // 2. body of stuff being iterated over
1284 // 3. STATE_SAVE 5
1285 // 4. JMP 2
1286 // 5. LD_SP
1287 // 6. ...
1288 //
1289 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
46f4442e 1290 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
b75a7d8f
A
1291 //
1292 {
1293 // Emit the STO_SP
1294 int32_t topLoc = blockTopLoc(TRUE);
1295 int32_t stoLoc = fRXPat->fDataSize;
1296 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1297 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1298 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1299
1300 // Emit the STATE_SAVE
1301 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1302 fRXPat->fCompiledPat->addElement(op, *fStatus);
46f4442e 1303
b75a7d8f
A
1304 // Emit the JMP
1305 op = URX_BUILD(URX_JMP, topLoc+1);
1306 fRXPat->fCompiledPat->addElement(op, *fStatus);
1307
1308 // Emit the LD_SP
1309 op = URX_BUILD(URX_LD_SP, stoLoc);
1310 fRXPat->fCompiledPat->addElement(op, *fStatus);
1311 }
1312 break;
1313
1314 case doPossessiveStar:
1315 // Possessive *+ quantifier.
1316 // Compiles to
1317 // 1. STO_SP loc
1318 // 2. STATE_SAVE 5
1319 // 3. body of stuff being iterated over
1320 // 4. JMP 2
1321 // 5. LD_SP loc
1322 // 6 ...
1323 // TODO: do something to cut back the state stack each time through the loop.
1324 {
1325 // Reserve two slots at the top of the block.
1326 int32_t topLoc = blockTopLoc(TRUE);
1327 insertOp(topLoc);
1328
1329 // emit STO_SP loc
1330 int32_t stoLoc = fRXPat->fDataSize;
1331 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1332 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1333 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1334
1335 // Emit the SAVE_STATE 5
1336 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1337 op = URX_BUILD(URX_STATE_SAVE, L7);
1338 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1339
46f4442e 1340 // Append the JMP operation.
b75a7d8f
A
1341 op = URX_BUILD(URX_JMP, topLoc+1);
1342 fRXPat->fCompiledPat->addElement(op, *fStatus);
1343
1344 // Emit the LD_SP loc
1345 op = URX_BUILD(URX_LD_SP, stoLoc);
1346 fRXPat->fCompiledPat->addElement(op, *fStatus);
1347 }
1348 break;
1349
1350 case doPossessiveOpt:
1351 // Possessive ?+ quantifier.
1352 // Compiles to
1353 // 1. STO_SP loc
1354 // 2. SAVE_STATE 5
1355 // 3. body of optional block
1356 // 4. LD_SP loc
1357 // 5. ...
1358 //
1359 {
1360 // Reserve two slots at the top of the block.
1361 int32_t topLoc = blockTopLoc(TRUE);
1362 insertOp(topLoc);
1363
1364 // Emit the STO_SP
1365 int32_t stoLoc = fRXPat->fDataSize;
1366 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1367 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1368 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1369
1370 // Emit the SAVE_STATE
1371 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1372 op = URX_BUILD(URX_STATE_SAVE, continueLoc);
1373 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1374
1375 // Emit the LD_SP
1376 op = URX_BUILD(URX_LD_SP, stoLoc);
1377 fRXPat->fCompiledPat->addElement(op, *fStatus);
1378 }
1379 break;
1380
1381
1382 case doBeginMatchMode:
1383 fNewModeFlags = fModeFlags;
1384 fSetModeFlag = TRUE;
1385 break;
1386
1387 case doMatchMode: // (?i) and similar
1388 {
1389 int32_t bit = 0;
1390 switch (fC.fChar) {
1391 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
46f4442e 1392 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
b75a7d8f
A
1393 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1394 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
46f4442e 1395 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
374ca955 1396 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
b75a7d8f
A
1397 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1398 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1399 default:
1400 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
1401 // by the scanner.
1402 }
1403 if (fSetModeFlag) {
1404 fNewModeFlags |= bit;
1405 } else {
1406 fNewModeFlags &= ~bit;
1407 }
1408 }
1409 break;
1410
1411 case doSetMatchMode:
1412 // We've got a (?i) or similar. The match mode is being changed, but
1413 // the change is not scoped to a parenthesized block.
73c04bcf 1414 U_ASSERT(fNewModeFlags < 0);
b75a7d8f
A
1415 fModeFlags = fNewModeFlags;
1416
1417 // Prevent any string from spanning across the change of match mode.
46f4442e
A
1418 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
1419 fixLiterals();
b75a7d8f
A
1420 break;
1421
1422
1423 case doMatchModeParen:
1424 // We've got a (?i: or similar. Begin a parenthesized block, save old
1425 // mode flags so they can be restored at the close of the block.
1426 //
1427 // Compile to a
1428 // - NOP, which later may be replaced by a save-state if the
1429 // parenthesized group gets a * quantifier, followed by
1430 // - NOP, which may later be replaced by a save-state if there
1431 // is an '|' alternation within the parens.
1432 {
1433 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1434 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1435
1436 // On the Parentheses stack, start a new frame and add the postions
1437 // of the two NOPs (a normal non-capturing () frame, except for the
1438 // saving of the orignal mode flags.)
1439 fParenStack.push(fModeFlags, *fStatus);
1440 fParenStack.push(flags, *fStatus); // Frame Marker
1441 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1442 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1443
1444 // Set the current mode flags to the new values.
73c04bcf 1445 U_ASSERT(fNewModeFlags < 0);
b75a7d8f
A
1446 fModeFlags = fNewModeFlags;
1447 }
1448 break;
1449
374ca955
A
1450 case doBadModeFlag:
1451 error(U_REGEX_INVALID_FLAG);
1452 break;
1453
b75a7d8f
A
1454 case doSuppressComments:
1455 // We have just scanned a '(?'. We now need to prevent the character scanner from
1456 // treating a '#' as a to-the-end-of-line comment.
1457 // (This Perl compatibility just gets uglier and uglier to do...)
1458 fEOLComments = FALSE;
1459 break;
1460
1461
46f4442e
A
1462 case doSetAddAmp:
1463 {
1464 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1465 set->add(chAmp);
1466 }
1467 break;
1468
1469 case doSetAddDash:
1470 {
1471 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1472 set->add(chDash);
1473 }
1474 break;
1475
1476 case doSetBackslash_s:
1477 {
1478 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1479 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1480 break;
1481 }
1482
1483 case doSetBackslash_S:
1484 {
1485 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1486 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1487 SSet.complement();
1488 set->addAll(SSet);
1489 break;
1490 }
1491
1492 case doSetBackslash_d:
1493 {
1494 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1495 // TODO - make a static set, ticket 6058.
1496 addCategory(set, U_GC_ND_MASK, *fStatus);
1497 break;
1498 }
1499
1500 case doSetBackslash_D:
1501 {
1502 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1503 UnicodeSet digits;
1504 // TODO - make a static set, ticket 6058.
1505 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1506 digits.complement();
1507 set->addAll(digits);
1508 break;
1509 }
1510
1511 case doSetBackslash_w:
1512 {
1513 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1514 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1515 break;
1516 }
1517
1518 case doSetBackslash_W:
1519 {
1520 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1521 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1522 SSet.complement();
1523 set->addAll(SSet);
1524 break;
1525 }
1526
1527 case doSetBegin:
1528 fSetStack.push(new UnicodeSet(), *fStatus);
1529 fSetOpStack.push(setStart, *fStatus);
1530 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1531 fSetOpStack.push(setCaseClose, *fStatus);
1532 }
1533 break;
1534
1535 case doSetBeginDifference1:
1536 // We have scanned something like [[abc]-[
1537 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1538 // Push a Difference operator, which will cause the new set to be subtracted from what
1539 // went before once it is created.
1540 setPushOp(setDifference1);
1541 fSetOpStack.push(setStart, *fStatus);
1542 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1543 fSetOpStack.push(setCaseClose, *fStatus);
1544 }
1545 break;
1546
1547 case doSetBeginIntersection1:
1548 // We have scanned something like [[abc]&[
1549 // Need both the '&' operator and the open '[' operator.
1550 setPushOp(setIntersection1);
1551 fSetOpStack.push(setStart, *fStatus);
1552 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1553 fSetOpStack.push(setCaseClose, *fStatus);
1554 }
1555 break;
1556
1557 case doSetBeginUnion:
1558 // We have scanned something like [[abc][
1559 // Need to handle the union operation explicitly [[abc] | [
1560 setPushOp(setUnion);
1561 fSetOpStack.push(setStart, *fStatus);
1562 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1563 fSetOpStack.push(setCaseClose, *fStatus);
1564 }
1565 break;
1566
1567 case doSetDifference2:
1568 // We have scanned something like [abc--
1569 // Consider this to unambiguously be a set difference operator.
1570 setPushOp(setDifference2);
1571 break;
1572
1573 case doSetEnd:
1574 // Have encountered the ']' that closes a set.
1575 // Force the evaluation of any pending operations within this set,
1576 // leave the completed set on the top of the set stack.
1577 {
1578 setEval(setEnd);
1579 int32_t setOp = fSetOpStack.popi();
1580 U_ASSERT(setOp==setStart);
1581 break;
1582 }
1583
1584 case doSetFinish:
1585 {
1586 // Finished a complete set expression, including all nested sets.
1587 // The close bracket has already triggered clearing out pending set operators,
1588 // the operator stack should be empty and the operand stack should have just
1589 // one entry, the result set.
1590 U_ASSERT(fSetOpStack.empty());
1591 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1592 U_ASSERT(fSetStack.empty());
1593 compileSet(theSet);
1594 break;
1595 }
1596
1597 case doSetIntersection2:
1598 // Have scanned something like [abc&&
1599 setPushOp(setIntersection2);
1600 break;
1601
1602 case doSetLiteral:
1603 // Union the just-scanned literal character into the set being built.
1604 // This operation is the highest precedence set operation, so we can always do
1605 // it immediately, without waiting to see what follows. It is necessary to perform
1606 // any pending '-' or '&' operation first, because these have the same precedence
1607 // as union-ing in a literal'
1608 {
1609 setEval(setUnion);
1610 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1611 s->add(fC.fChar);
1612 fLastSetLiteral = fC.fChar;
1613 break;
1614 }
1615
1616 case doSetLiteralEscaped:
1617 // A back-slash escaped literal character was encountered.
1618 // Processing is the same as with setLiteral, above, with the addition of
1619 // the optional check for errors on escaped ASCII letters.
1620 {
1621 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1622 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1623 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1624 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1625 }
1626 setEval(setUnion);
1627 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1628 s->add(fC.fChar);
1629 fLastSetLiteral = fC.fChar;
1630 break;
1631 }
1632
1633 case doSetNamedChar:
1634 // Scanning a \N{UNICODE CHARACTER NAME}
1635 // Aside from the source of the character, the processing is identical to doSetLiteral,
1636 // above.
1637 {
1638 UChar32 c = scanNamedChar();
1639 setEval(setUnion);
1640 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1641 s->add(c);
1642 fLastSetLiteral = c;
1643 break;
1644 }
1645
1646 case doSetNamedRange:
1647 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1648 // The left character is already in the set, and is saved in fLastSetLiteral.
1649 // The right side needs to be picked up, the scan is at the 'N'.
1650 // Lower Limit > Upper limit being an error matches both Java
1651 // and ICU UnicodeSet behavior.
1652 {
1653 UChar32 c = scanNamedChar();
1654 if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
1655 error(U_REGEX_INVALID_RANGE);
1656 }
1657 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1658 s->add(fLastSetLiteral, c);
1659 fLastSetLiteral = c;
1660 break;
1661 }
1662
1663
1664 case doSetNegate:
1665 // Scanned a '^' at the start of a set.
1666 // Push the negation operator onto the set op stack.
1667 // A twist for case-insensitive matching:
1668 // the case closure operation must happen _before_ negation.
1669 // But the case closure operation will already be on the stack if it's required.
1670 // This requires checking for case closure, and swapping the stack order
1671 // if it is present.
1672 {
1673 int32_t tosOp = fSetOpStack.peeki();
1674 if (tosOp == setCaseClose) {
1675 fSetOpStack.popi();
1676 fSetOpStack.push(setNegation, *fStatus);
1677 fSetOpStack.push(setCaseClose, *fStatus);
1678 } else {
1679 fSetOpStack.push(setNegation, *fStatus);
1680 }
1681 }
1682 break;
1683
1684 case doSetNoCloseError:
1685 error(U_REGEX_MISSING_CLOSE_BRACKET);
1686 break;
1687
1688 case doSetOpError:
1689 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1690 break;
1691
1692 case doSetPosixProp:
1693 {
1694 UnicodeSet *s = scanPosixProp();
1695 if (s != NULL) {
1696 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1697 tos->addAll(*s);
1698 delete s;
1699 } // else error. scanProp() reported the error status already.
1700 }
1701 break;
1702
1703 case doSetProp:
1704 // Scanned a \p \P within [brackets].
1705 {
1706 UnicodeSet *s = scanProp();
1707 if (s != NULL) {
1708 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1709 tos->addAll(*s);
1710 delete s;
1711 } // else error. scanProp() reported the error status already.
1712 }
1713 break;
1714
1715
1716 case doSetRange:
1717 // We have scanned literal-literal. Add the range to the set.
1718 // The left character is already in the set, and is saved in fLastSetLiteral.
1719 // The right side is the current character.
1720 // Lower Limit > Upper limit being an error matches both Java
1721 // and ICU UnicodeSet behavior.
1722 {
1723 if (fLastSetLiteral > fC.fChar) {
1724 error(U_REGEX_INVALID_RANGE);
1725 }
1726 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1727 s->add(fLastSetLiteral, fC.fChar);
1728 break;
1729 }
1730
b75a7d8f
A
1731
1732 default:
1733 U_ASSERT(FALSE);
1734 error(U_REGEX_INTERNAL_ERROR);
1735 break;
1736 }
1737
1738 if (U_FAILURE(*fStatus)) {
1739 returnVal = FALSE;
1740 }
1741
1742 return returnVal;
73c04bcf 1743}
b75a7d8f
A
1744
1745
1746
1747//------------------------------------------------------------------------------
1748//
1749// literalChar We've encountered a literal character from the pattern,
1750// or an escape sequence that reduces to a character.
1751// Add it to the string containing all literal chars/strings from
1752// the pattern.
1753// If we are in a pattern string already, add the new char to it.
1754// If we aren't in a pattern string, begin one now.
1755//
1756//------------------------------------------------------------------------------
1757void RegexCompile::literalChar(UChar32 c) {
1758 int32_t op; // An operation in the compiled pattern.
1759 int32_t opType;
1760 int32_t patternLoc; // A position in the compiled pattern.
1761 int32_t stringLen;
1762
1763
1764 // If the last thing compiled into the pattern was not a literal char,
1765 // force this new literal char to begin a new string, and not append to the previous.
1766 op = fRXPat->fCompiledPat->lastElementi();
1767 opType = URX_TYPE(op);
1768 if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) {
1769 fixLiterals();
1770 }
1771
1772 if (fStringOpStart == -1) {
1773 // First char of a string in the pattern.
1774 // Emit a OneChar op into the compiled pattern.
1775 emitONE_CHAR(c);
1776
1777 // Also add it to the string pool, in case we get a second adjacent literal
1778 // and want to change form ONE_CHAR to STRING
1779 fStringOpStart = fRXPat->fLiteralText.length();
1780 fRXPat->fLiteralText.append(c);
1781 return;
1782 }
46f4442e 1783
b75a7d8f
A
1784 // We are adding onto an existing string
1785 fRXPat->fLiteralText.append(c);
1786
1787 op = fRXPat->fCompiledPat->lastElementi();
1788 opType = URX_TYPE(op);
1789 U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
1790
46f4442e 1791 // If the most recently emitted op is a URX_ONECHAR,
b75a7d8f
A
1792 if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
1793 if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) {
1794 // The most recently emitted op is a ONECHAR that was the first half
1795 // of a surrogate pair. Update the ONECHAR's operand to be the
1796 // supplementary code point resulting from both halves of the pair.
1797 c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c);
1798 op = URX_BUILD(opType, c);
1799 patternLoc = fRXPat->fCompiledPat->size() - 1;
1800 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1801 return;
1802 }
46f4442e 1803
b75a7d8f
A
1804 // The most recently emitted op is a ONECHAR.
1805 // We've now received another adjacent char. Change the ONECHAR op
1806 // to a string op.
1807 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1808 op = URX_BUILD(URX_STRING_I, fStringOpStart);
1809 } else {
1810 op = URX_BUILD(URX_STRING, fStringOpStart);
1811 }
1812 patternLoc = fRXPat->fCompiledPat->size() - 1;
1813 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1814 op = URX_BUILD(URX_STRING_LEN, 0);
1815 fRXPat->fCompiledPat->addElement(op, *fStatus);
1816 }
46f4442e 1817
b75a7d8f
A
1818 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the
1819 // string length to reflect the new char we just added to the string.
1820 stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
1821 op = URX_BUILD(URX_STRING_LEN, stringLen);
1822 patternLoc = fRXPat->fCompiledPat->size() - 1;
1823 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1824}
1825
1826
1827
1828//------------------------------------------------------------------------------
1829//
1830// emitONE_CHAR emit a ONE_CHAR op into the generated code.
1831// Choose cased or uncased version, depending on the
1832// match mode and whether the character itself is cased.
1833//
1834//------------------------------------------------------------------------------
1835void RegexCompile::emitONE_CHAR(UChar32 c) {
1836 int32_t op;
1837 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1838 u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
1839 // We have a cased character, and are in case insensitive matching mode.
1840 c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
1841 op = URX_BUILD(URX_ONECHAR_I, c);
1842 } else {
1843 // Uncased char, or case sensitive match mode.
1844 // Either way, just generate a literal compare of the char.
1845 op = URX_BUILD(URX_ONECHAR, c);
1846 }
1847 fRXPat->fCompiledPat->addElement(op, *fStatus);
1848}
1849
1850
1851//------------------------------------------------------------------------------
1852//
1853// fixLiterals When compiling something that can follow a literal
1854// string in a pattern, we need to "fix" any preceding
1855// string, which will cause any subsequent literals to
1856// begin a new string, rather than appending to the
1857// old one.
1858//
1859// Optionally, split the last char of the string off into
1860// a single "ONE_CHAR" operation, so that quantifiers can
1861// apply to that char alone. Example: abc*
1862// The * must apply to the 'c' only.
1863//
1864//------------------------------------------------------------------------------
1865void RegexCompile::fixLiterals(UBool split) {
1866 int32_t stringStart = fStringOpStart; // start index of the current literal string
1867 int32_t op; // An op from/for the compiled pattern.
1868 int32_t opType; // An opcode type from the compiled pattern.
1869 int32_t stringLastCharIdx;
1870 UChar32 lastChar;
1871 int32_t stringNextToLastCharIdx;
1872 UChar32 nextToLastChar;
1873 int32_t stringLen;
1874
46f4442e 1875 fStringOpStart = -1;
b75a7d8f
A
1876 if (!split) {
1877 return;
1878 }
1879
1880 // Split: We need to ensure that the last item in the compiled pattern does
1881 // not refer to a literal string of more than one char. If it does,
1882 // separate the last char from the rest of the string.
1883
1884 // If the last operation from the compiled pattern is not a string,
46f4442e 1885 // nothing needs to be done
b75a7d8f
A
1886 op = fRXPat->fCompiledPat->lastElementi();
1887 opType = URX_TYPE(op);
1888 if (opType != URX_STRING_LEN) {
1889 return;
1890 }
1891 stringLen = URX_VAL(op);
1892
1893 //
1894 // Find the position of the last code point in the string (might be a surrogate pair)
1895 //
1896 stringLastCharIdx = fRXPat->fLiteralText.length();
1897 stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
1898 lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx);
1899
1900 // The string should always be at least two code points long, meaning that there
1901 // should be something before the last char position that we just found.
1902 U_ASSERT(stringLastCharIdx > stringStart);
1903 stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
1904 U_ASSERT(stringNextToLastCharIdx >= stringStart);
1905 nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);
1906
1907 if (stringNextToLastCharIdx > stringStart) {
1908 // The length of string remaining after removing one char is two or more.
1909 // Leave the string in the compiled pattern, shorten it by one char,
1910 // and append a URX_ONECHAR op for the last char.
1911 stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
1912 op = URX_BUILD(URX_STRING_LEN, stringLen);
1913 fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
1914 emitONE_CHAR(lastChar);
1915 } else {
1916 // The original string consisted of exactly two characters. Replace
1917 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
1918 // of URX_ONECHARs.
1919 fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
1920 emitONE_CHAR(nextToLastChar);
1921 emitONE_CHAR(lastChar);
1922 }
1923}
1924
1925
1926
1927
1928
1929
1930//------------------------------------------------------------------------------
1931//
1932// insertOp() Insert a slot for a new opcode into the already
1933// compiled pattern code.
1934//
1935// Fill the slot with a NOP. Our caller will replace it
1936// with what they really wanted.
1937//
1938//------------------------------------------------------------------------------
1939void RegexCompile::insertOp(int32_t where) {
1940 UVector32 *code = fRXPat->fCompiledPat;
1941 U_ASSERT(where>0 && where < code->size());
1942
1943 int32_t nop = URX_BUILD(URX_NOP, 0);
1944 code->insertElementAt(nop, where, *fStatus);
1945
1946 // Walk through the pattern, looking for any ops with targets that
1947 // were moved down by the insert. Fix them.
1948 int32_t loc;
1949 for (loc=0; loc<code->size(); loc++) {
1950 int32_t op = code->elementAti(loc);
1951 int32_t opType = URX_TYPE(op);
1952 int32_t opValue = URX_VAL(op);
1953 if ((opType == URX_JMP ||
1954 opType == URX_JMPX ||
1955 opType == URX_STATE_SAVE ||
1956 opType == URX_CTR_LOOP ||
1957 opType == URX_CTR_LOOP_NG ||
1958 opType == URX_JMP_SAV ||
1959 opType == URX_RELOC_OPRND) && opValue > where) {
1960 // Target location for this opcode is after the insertion point and
1961 // needs to be incremented to adjust for the insertion.
1962 opValue++;
1963 op = URX_BUILD(opType, opValue);
1964 code->setElementAt(op, loc);
1965 }
1966 }
1967
1968 // Now fix up the parentheses stack. All positive values in it are locations in
1969 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
1970 for (loc=0; loc<fParenStack.size(); loc++) {
1971 int32_t x = fParenStack.elementAti(loc);
73c04bcf 1972 U_ASSERT(x < code->size());
b75a7d8f
A
1973 if (x>where) {
1974 x++;
1975 fParenStack.setElementAt(x, loc);
1976 }
1977 }
1978
1979 if (fMatchCloseParen > where) {
1980 fMatchCloseParen++;
1981 }
1982 if (fMatchOpenParen > where) {
1983 fMatchOpenParen++;
1984 }
1985}
1986
1987
1988
1989//------------------------------------------------------------------------------
1990//
1991// blockTopLoc() Find or create a location in the compiled pattern
1992// at the start of the operation or block that has
1993// just been compiled. Needed when a quantifier (* or
1994// whatever) appears, and we need to add an operation
1995// at the start of the thing being quantified.
1996//
1997// (Parenthesized Blocks) have a slot with a NOP that
1998// is reserved for this purpose. .* or similar don't
1999// and a slot needs to be added.
2000//
2001// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2002// at the returned location.
46f4442e 2003// FALSE - just return the address,
b75a7d8f
A
2004// do not reserve a location there.
2005//
2006//------------------------------------------------------------------------------
2007int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2008 int32_t theLoc;
2009 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2010 {
2011 // The item just processed is a parenthesized block.
2012 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2013 U_ASSERT(theLoc > 0);
374ca955 2014 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
b75a7d8f
A
2015 }
2016 else {
2017 // Item just compiled is a single thing, a ".", or a single char, or a set reference.
2018 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2019 // We need to make space now.
2020 fixLiterals(TRUE); // If last item was a string, separate the last char.
2021 theLoc = fRXPat->fCompiledPat->size()-1;
2022 if (reserveLoc) {
374ca955 2023 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/
b75a7d8f
A
2024 int32_t nop = URX_BUILD(URX_NOP, 0);
2025 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2026 }
2027 }
2028 return theLoc;
2029}
2030
2031
2032
2033//------------------------------------------------------------------------------
2034//
2035// handleCloseParen When compiling a close paren, we need to go back
2036// and fix up any JMP or SAVE operations within the
2037// parenthesized block that need to target the end
2038// of the block. The locations of these are kept on
2039// the paretheses stack.
2040//
2041// This function is called both when encountering a
2042// real ) and at the end of the pattern.
2043//
73c04bcf 2044//------------------------------------------------------------------------------
b75a7d8f
A
2045void RegexCompile::handleCloseParen() {
2046 int32_t patIdx;
2047 int32_t patOp;
2048 if (fParenStack.size() <= 0) {
2049 error(U_REGEX_MISMATCHED_PAREN);
2050 return;
2051 }
2052
2053 // Force any literal chars that may follow the close paren to start a new string,
2054 // and not attach to any preceding it.
2055 fixLiterals(FALSE);
2056
2057 // Fixup any operations within the just-closed parenthesized group
2058 // that need to reference the end of the (block).
2059 // (The first one popped from the stack is an unused slot for
2060 // alternation (OR) state save, but applying the fixup to it does no harm.)
2061 for (;;) {
2062 patIdx = fParenStack.popi();
2063 if (patIdx < 0) {
2064 // value < 0 flags the start of the frame on the paren stack.
2065 break;
2066 }
2067 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2068 patOp = fRXPat->fCompiledPat->elementAti(patIdx);
2069 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2070 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2071 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2072 fMatchOpenParen = patIdx;
2073 }
2074
2075 // At the close of any parenthesized block, restore the match mode flags to
2076 // the value they had at the open paren. Saved value is
46f4442e 2077 // at the top of the paren stack.
b75a7d8f 2078 fModeFlags = fParenStack.popi();
73c04bcf 2079 U_ASSERT(fModeFlags < 0);
46f4442e 2080
b75a7d8f
A
2081 // DO any additional fixups, depending on the specific kind of
2082 // parentesized grouping this is
2083
2084 switch (patIdx) {
2085 case plain:
2086 case flags:
2087 // No additional fixups required.
2088 // (Grouping-only parentheses)
2089 break;
2090 case capturing:
2091 // Capturing Parentheses.
2092 // Insert a End Capture op into the pattern.
2093 // The frame offset of the variables for this cg is obtained from the
2094 // start capture op and put it into the end-capture op.
2095 {
2096 int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2097 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2098
2099 int32_t frameVarLocation = URX_VAL(captureOp);
2100 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
2101 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
2102 }
2103 break;
2104 case atomic:
2105 // Atomic Parenthesis.
2106 // Insert a LD_SP operation to restore the state stack to the position
2107 // it was when the atomic parens were entered.
2108 {
2109 int32_t stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2110 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2111 int32_t stoLoc = URX_VAL(stoOp);
2112 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
2113 fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
2114 }
2115 break;
2116
2117 case lookAhead:
2118 {
46f4442e 2119 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
b75a7d8f
A
2120 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2121 int32_t dataLoc = URX_VAL(startOp);
2122 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
2123 fRXPat->fCompiledPat->addElement(op, *fStatus);
2124 }
2125 break;
2126
2127 case negLookAhead:
2128 {
2129 // See comment at doOpenLookAheadNeg
2130 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2131 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2132 int32_t dataLoc = URX_VAL(startOp);
2133 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
2134 fRXPat->fCompiledPat->addElement(op, *fStatus);
46f4442e
A
2135 op = URX_BUILD(URX_BACKTRACK, 0);
2136 fRXPat->fCompiledPat->addElement(op, *fStatus);
2137 op = URX_BUILD(URX_LA_END, 0);
b75a7d8f
A
2138 fRXPat->fCompiledPat->addElement(op, *fStatus);
2139
2140 // Patch the URX_SAVE near the top of the block.
46f4442e 2141 // The destination of the SAVE is the final LA_END that was just added.
b75a7d8f
A
2142 int32_t saveOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2143 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
46f4442e 2144 int32_t dest = fRXPat->fCompiledPat->size()-1;
b75a7d8f
A
2145 saveOp = URX_BUILD(URX_STATE_SAVE, dest);
2146 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2147 }
2148 break;
2149
2150 case lookBehind:
2151 {
2152 // See comment at doOpenLookBehind.
46f4442e 2153
b75a7d8f
A
2154 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2155 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2156 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2157 int32_t dataLoc = URX_VAL(startOp);
2158 int32_t op = URX_BUILD(URX_LB_END, dataLoc);
2159 fRXPat->fCompiledPat->addElement(op, *fStatus);
2160 op = URX_BUILD(URX_LA_END, dataLoc);
2161 fRXPat->fCompiledPat->addElement(op, *fStatus);
2162
2163 // Determine the min and max bounds for the length of the
2164 // string that the pattern can match.
2165 // An unbounded upper limit is an error.
2166 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2167 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2168 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2169 if (maxML == INT32_MAX) {
2170 error(U_REGEX_LOOK_BEHIND_LIMIT);
2171 break;
2172 }
2173 U_ASSERT(minML <= maxML);
2174
2175 // Insert the min and max match len bounds into the URX_LB_CONT op that
2176 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2177 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2178 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2179
2180 }
2181 break;
2182
2183
2184
2185 case lookBehindN:
2186 {
2187 // See comment at doOpenLookBehindNeg.
46f4442e 2188
b75a7d8f
A
2189 // Append the URX_LBN_END to the compiled pattern.
2190 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2191 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2192 int32_t dataLoc = URX_VAL(startOp);
2193 int32_t op = URX_BUILD(URX_LBN_END, dataLoc);
2194 fRXPat->fCompiledPat->addElement(op, *fStatus);
2195
2196 // Determine the min and max bounds for the length of the
2197 // string that the pattern can match.
2198 // An unbounded upper limit is an error.
2199 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2200 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2201 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2202 if (maxML == INT32_MAX) {
2203 error(U_REGEX_LOOK_BEHIND_LIMIT);
2204 break;
2205 }
2206 U_ASSERT(minML <= maxML);
2207
2208 // Insert the min and max match len bounds into the URX_LB_CONT op that
2209 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2210 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2211 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2212
2213 // Insert the pattern location to continue at after a successful match
2214 // as the last operand of the URX_LBN_CONT
2215 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2216 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2217 }
2218 break;
2219
2220
2221
2222 default:
2223 U_ASSERT(FALSE);
2224 }
2225
2226 // remember the next location in the compiled pattern.
2227 // The compilation of Quantifiers will look at this to see whether its looping
2228 // over a parenthesized block or a single item
2229 fMatchCloseParen = fRXPat->fCompiledPat->size();
2230}
2231
2232
2233
73c04bcf 2234//------------------------------------------------------------------------------
b75a7d8f
A
2235//
2236// compileSet Compile the pattern operations for a reference to a
2237// UnicodeSet.
2238//
73c04bcf 2239//------------------------------------------------------------------------------
b75a7d8f
A
2240void RegexCompile::compileSet(UnicodeSet *theSet)
2241{
2242 if (theSet == NULL) {
2243 return;
2244 }
46f4442e
A
2245 // Remove any strings from the set.
2246 // There shoudn't be any, but just in case.
2247 // (Case Closure can add them; if we had a simple case closure avaialble that
2248 // ignored strings, that would be better.)
2249 theSet->removeAllStrings();
b75a7d8f
A
2250 int32_t setSize = theSet->size();
2251 UChar32 firstSetChar = theSet->charAt(0);
b75a7d8f
A
2252
2253 switch (setSize) {
46f4442e 2254 case 0:
b75a7d8f 2255 {
46f4442e 2256 // Set of no elements. Always fails to match.
b75a7d8f
A
2257 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
2258 delete theSet;
2259 }
2260 break;
46f4442e 2261
b75a7d8f
A
2262 case 1:
2263 {
2264 // The set contains only a single code point. Put it into
2265 // the compiled pattern as a single char operation rather
2266 // than a set, and discard the set itself.
2267 literalChar(firstSetChar);
2268 delete theSet;
2269 }
2270 break;
46f4442e
A
2271
2272 default:
b75a7d8f
A
2273 {
2274 // The set contains two or more chars. (the normal case)
2275 // Put it into the compiled pattern as a set.
2276 int32_t setNumber = fRXPat->fSets->size();
2277 fRXPat->fSets->addElement(theSet, *fStatus);
2278 int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
2279 fRXPat->fCompiledPat->addElement(setOp, *fStatus);
2280 }
2281 }
2282}
2283
2284
73c04bcf 2285//------------------------------------------------------------------------------
b75a7d8f
A
2286//
2287// compileInterval Generate the code for a {min, max} style interval quantifier.
2288// Except for the specific opcodes used, the code is the same
2289// for all three types (greedy, non-greedy, possessive) of
2290// intervals. The opcodes are supplied as parameters.
2291//
2292// The code for interval loops has this form:
2293// 0 CTR_INIT counter loc (in stack frame)
2294// 1 5 patt address of CTR_LOOP at bottom of block
2295// 2 min count
2296// 3 max count (-1 for unbounded)
2297// 4 ... block to be iterated over
46f4442e
A
2298// 5 CTR_LOOP
2299//
2300// In
73c04bcf 2301//------------------------------------------------------------------------------
b75a7d8f
A
2302void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2303{
2304 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2305 // four slots in the compiled code. Reserve them.
2306 int32_t topOfBlock = blockTopLoc(TRUE);
2307 insertOp(topOfBlock);
2308 insertOp(topOfBlock);
2309 insertOp(topOfBlock);
2310
2311 // The operands for the CTR_INIT opcode include the index in the matcher data
2312 // of the counter. Allocate it now.
2313 int32_t counterLoc = fRXPat->fFrameSize;
2314 fRXPat->fFrameSize++;
2315
2316 int32_t op = URX_BUILD(InitOp, counterLoc);
2317 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2318
2319 // The second operand of CTR_INIT is the location following the end of the loop.
2320 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2321 // compilation of something later on causes the code to grow and the target
2322 // position to move.
2323 int32_t loopEnd = fRXPat->fCompiledPat->size();
2324 op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
2325 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2326
2327 // Followed by the min and max counts.
2328 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2329 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2330
2331 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2332 // Goes at end of the block being looped over, so just append to the code so far.
2333 op = URX_BUILD(LoopOp, topOfBlock);
2334 fRXPat->fCompiledPat->addElement(op, *fStatus);
2335
374ca955
A
2336 if ((fIntervalLow & 0xff000000) != 0 ||
2337 fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0) {
2338 error(U_REGEX_NUMBER_TOO_BIG);
2339 }
2340
b75a7d8f
A
2341 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2342 error(U_REGEX_MAX_LT_MIN);
2343 }
b75a7d8f
A
2344}
2345
2346
2347
2348UBool RegexCompile::compileInlineInterval() {
2349 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2350 // Too big to inline. Fail, which will cause looping code to be generated.
2351 // (Upper < Lower picks up unbounded upper and errors, both.)
2352 return FALSE;
2353 }
2354
2355 int32_t topOfBlock = blockTopLoc(FALSE);
2356 if (fIntervalUpper == 0) {
2357 // Pathological case. Attempt no matches, as if the block doesn't exist.
2358 fRXPat->fCompiledPat->setSize(topOfBlock);
2359 return TRUE;
2360 }
2361
2362 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2363 // The thing being repeated is not a single op, but some
2364 // more complex block. Do it as a loop, not inlines.
2365 // Note that things "repeated" a max of once are handled as inline, because
2366 // the one copy of the code already generated is just fine.
2367 return FALSE;
2368 }
2369
2370 // Pick up the opcode that is to be repeated
2371 //
2372 int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock);
2373
46f4442e 2374 // Compute the pattern location where the inline sequence
b75a7d8f 2375 // will end, and set up the state save op that will be needed.
46f4442e 2376 //
b75a7d8f
A
2377 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2378 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2379 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
2380 if (fIntervalLow == 0) {
2381 insertOp(topOfBlock);
2382 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2383 }
2384
2385
2386
2387 // Loop, emitting the op for the thing being repeated each time.
2388 // Loop starts at 1 because one instance of the op already exists in the pattern,
2389 // it was put there when it was originally encountered.
2390 int32_t i;
2391 for (i=1; i<fIntervalUpper; i++ ) {
2392 if (i == fIntervalLow) {
2393 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2394 }
2395 if (i > fIntervalLow) {
2396 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2397 }
2398 fRXPat->fCompiledPat->addElement(op, *fStatus);
2399 }
2400 return TRUE;
2401}
2402
2403
2404
73c04bcf 2405//------------------------------------------------------------------------------
b75a7d8f
A
2406//
2407// matchStartType Determine how a match can start.
2408// Used to optimize find() operations.
2409//
2410// Operation is very similar to minMatchLength(). Walk the compiled
2411// pattern, keeping an on-going minimum-match-length. For any
2412// op where the min match coming in is zero, add that ops possible
2413// starting matches to the possible starts for the overall pattern.
2414//
73c04bcf 2415//------------------------------------------------------------------------------
b75a7d8f
A
2416void RegexCompile::matchStartType() {
2417 if (U_FAILURE(*fStatus)) {
2418 return;
2419 }
2420
2421
2422 int32_t loc; // Location in the pattern of the current op being processed.
2423 int32_t op; // The op being processed
2424 int32_t opType; // The opcode type of the op
2425 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2426 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2427
2428 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2429 // could have advanced the position in a match.
2430 // (Maximum match length so far == 0)
2431
2432 // forwardedLength is a vector holding minimum-match-length values that
2433 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2434 // It must be one longer than the pattern being checked because some ops
2435 // will jmp to a end-of-block+1 location from within a block, and we must
2436 // count those when checking the block.
2437 int32_t end = fRXPat->fCompiledPat->size();
2438 UVector32 forwardedLength(end+1, *fStatus);
2439 forwardedLength.setSize(end+1);
2440 for (loc=3; loc<end; loc++) {
2441 forwardedLength.setElementAt(INT32_MAX, loc);
2442 }
2443
2444 for (loc = 3; loc<end; loc++) {
2445 op = fRXPat->fCompiledPat->elementAti(loc);
2446 opType = URX_TYPE(op);
2447
2448 // The loop is advancing linearly through the pattern.
2449 // If the op we are now at was the destination of a branch in the pattern,
2450 // and that path has a shorter minimum length than the current accumulated value,
2451 // replace the current accumulated value.
b75a7d8f
A
2452 if (forwardedLength.elementAti(loc) < currentLen) {
2453 currentLen = forwardedLength.elementAti(loc);
2454 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2455 }
2456
2457 switch (opType) {
2458 // Ops that don't change the total length matched
2459 case URX_RESERVED_OP:
2460 case URX_END:
46f4442e 2461 case URX_FAIL:
b75a7d8f
A
2462 case URX_STRING_LEN:
2463 case URX_NOP:
2464 case URX_START_CAPTURE:
2465 case URX_END_CAPTURE:
2466 case URX_BACKSLASH_B:
374ca955 2467 case URX_BACKSLASH_BU:
b75a7d8f
A
2468 case URX_BACKSLASH_G:
2469 case URX_BACKSLASH_Z:
2470 case URX_DOLLAR:
46f4442e
A
2471 case URX_DOLLAR_M:
2472 case URX_DOLLAR_D:
2473 case URX_DOLLAR_MD:
b75a7d8f
A
2474 case URX_RELOC_OPRND:
2475 case URX_STO_INP_LOC:
b75a7d8f
A
2476 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2477 case URX_BACKREF_I:
2478
2479 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2480 case URX_LD_SP:
2481 break;
46f4442e 2482
b75a7d8f
A
2483 case URX_CARET:
2484 if (atStart) {
2485 fRXPat->fStartType = START_START;
2486 }
2487 break;
2488
2489 case URX_CARET_M:
46f4442e 2490 case URX_CARET_M_UNIX:
b75a7d8f
A
2491 if (atStart) {
2492 fRXPat->fStartType = START_LINE;
2493 }
2494 break;
46f4442e 2495
b75a7d8f
A
2496 case URX_ONECHAR:
2497 if (currentLen == 0) {
2498 // This character could appear at the start of a match.
2499 // Add it to the set of possible starting characters.
2500 fRXPat->fInitialChars->add(URX_VAL(op));
2501 numInitialStrings += 2;
2502 }
2503 currentLen++;
2504 atStart = FALSE;
2505 break;
b75a7d8f 2506
46f4442e
A
2507
2508 case URX_SETREF:
b75a7d8f
A
2509 if (currentLen == 0) {
2510 int32_t sn = URX_VAL(op);
2511 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2512 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2513 fRXPat->fInitialChars->addAll(*s);
2514 numInitialStrings += 2;
2515 }
2516 currentLen++;
2517 atStart = FALSE;
2518 break;
2519
2520 case URX_LOOP_SR_I:
2521 // [Set]*, like a SETREF, above, in what it can match,
2522 // but may not match at all, so currentLen is not incremented.
2523 if (currentLen == 0) {
2524 int32_t sn = URX_VAL(op);
2525 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2526 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2527 fRXPat->fInitialChars->addAll(*s);
2528 numInitialStrings += 2;
2529 }
2530 atStart = FALSE;
2531 break;
2532
2533 case URX_LOOP_DOT_I:
2534 if (currentLen == 0) {
2535 // .* at the start of a pattern.
2536 // Any character can begin the match.
2537 fRXPat->fInitialChars->clear();
2538 fRXPat->fInitialChars->complement();
2539 numInitialStrings += 2;
2540 }
2541 atStart = FALSE;
2542 break;
2543
2544
46f4442e 2545 case URX_STATIC_SETREF:
b75a7d8f
A
2546 if (currentLen == 0) {
2547 int32_t sn = URX_VAL(op);
2548 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2549 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2550 fRXPat->fInitialChars->addAll(*s);
2551 numInitialStrings += 2;
2552 }
2553 currentLen++;
2554 atStart = FALSE;
2555 break;
2556
2557
2558
46f4442e 2559 case URX_STAT_SETREF_N:
b75a7d8f
A
2560 if (currentLen == 0) {
2561 int32_t sn = URX_VAL(op);
2562 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2563 UnicodeSet sc(*s);
2564 sc.complement();
2565 fRXPat->fInitialChars->addAll(sc);
2566 numInitialStrings += 2;
2567 }
2568 currentLen++;
2569 atStart = FALSE;
2570 break;
2571
2572
2573
2574 case URX_BACKSLASH_D:
2575 // Digit Char
2576 if (currentLen == 0) {
46f4442e 2577 UnicodeSet s;
b75a7d8f
A
2578 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2579 if (URX_VAL(op) != 0) {
2580 s.complement();
2581 }
2582 fRXPat->fInitialChars->addAll(s);
2583 numInitialStrings += 2;
2584 }
2585 currentLen++;
2586 atStart = FALSE;
2587 break;
2588
2589
2590 case URX_ONECHAR_I:
2591 // Case Insensitive Single Character.
2592 if (currentLen == 0) {
2593 UChar32 c = URX_VAL(op);
2594 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2595 // character may have distinct cased forms. Add all of them
2596 // to the set of possible starting match chars.
2597 UnicodeSet s(c, c);
73c04bcf 2598 s.closeOver(USET_CASE_INSENSITIVE);
b75a7d8f
A
2599 fRXPat->fInitialChars->addAll(s);
2600 } else {
2601 // Char has no case variants. Just add it as-is to the
2602 // set of possible starting chars.
2603 fRXPat->fInitialChars->add(c);
2604 }
2605 numInitialStrings += 2;
2606 }
2607 currentLen++;
2608 atStart = FALSE;
2609 break;
2610
2611
2612 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2613 case URX_DOTANY_ALL: // . matches one or two.
2614 case URX_DOTANY:
46f4442e 2615 case URX_DOTANY_UNIX:
b75a7d8f
A
2616 if (currentLen == 0) {
2617 // These constructs are all bad news when they appear at the start
2618 // of a match. Any character can begin the match.
2619 fRXPat->fInitialChars->clear();
2620 fRXPat->fInitialChars->complement();
2621 numInitialStrings += 2;
2622 }
2623 currentLen++;
2624 atStart = FALSE;
2625 break;
2626
2627
2628 case URX_JMPX:
2629 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2630 case URX_JMP:
2631 {
2632 int32_t jmpDest = URX_VAL(op);
2633 if (jmpDest < loc) {
2634 // Loop of some kind. Can safely ignore, the worst that will happen
2635 // is that we understate the true minimum length
2636 currentLen = forwardedLength.elementAti(loc+1);
46f4442e 2637
b75a7d8f
A
2638 } else {
2639 // Forward jump. Propagate the current min length to the target loc of the jump.
2640 U_ASSERT(jmpDest <= end+1);
2641 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2642 forwardedLength.setElementAt(currentLen, jmpDest);
2643 }
2644 }
2645 }
2646 atStart = FALSE;
2647 break;
2648
2649 case URX_JMP_SAV:
2650 case URX_JMP_SAV_X:
2651 // Combo of state save to the next loc, + jmp backwards.
2652 // Net effect on min. length computation is nothing.
2653 atStart = FALSE;
2654 break;
2655
46f4442e 2656 case URX_BACKTRACK:
b75a7d8f
A
2657 // Fails are kind of like a branch, except that the min length was
2658 // propagated already, by the state save.
2659 currentLen = forwardedLength.elementAti(loc+1);
2660 atStart = FALSE;
2661 break;
2662
2663
2664 case URX_STATE_SAVE:
2665 {
2666 // State Save, for forward jumps, propagate the current minimum.
2667 // of the state save.
2668 int32_t jmpDest = URX_VAL(op);
2669 if (jmpDest > loc) {
2670 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2671 forwardedLength.setElementAt(currentLen, jmpDest);
2672 }
46f4442e 2673 }
b75a7d8f
A
2674 }
2675 atStart = FALSE;
2676 break;
46f4442e 2677
b75a7d8f
A
2678
2679
2680
2681 case URX_STRING:
2682 {
2683 loc++;
2684 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2685 int32_t stringLen = URX_VAL(stringLenOp);
2686 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2687 U_ASSERT(stringLenOp >= 2);
2688 if (currentLen == 0) {
2689 // Add the starting character of this string to the set of possible starting
2690 // characters for this pattern.
2691 int32_t stringStartIdx = URX_VAL(op);
2692 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2693 fRXPat->fInitialChars->add(c);
2694
2695 // Remember this string. After the entire pattern has been checked,
2696 // if nothing else is identified that can start a match, we'll use it.
2697 numInitialStrings++;
2698 fRXPat->fInitialStringIdx = stringStartIdx;
2699 fRXPat->fInitialStringLen = stringLen;
2700 }
46f4442e 2701
b75a7d8f
A
2702 currentLen += stringLen;
2703 atStart = FALSE;
2704 }
2705 break;
2706
2707 case URX_STRING_I:
2708 {
2709 // Case-insensitive string. Unlike exact-match strings, we won't
2710 // attempt a string search for possible match positions. But we
2711 // do update the set of possible starting characters.
2712 loc++;
2713 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2714 int32_t stringLen = URX_VAL(stringLenOp);
2715 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2716 U_ASSERT(stringLenOp >= 2);
2717 if (currentLen == 0) {
2718 // Add the starting character of this string to the set of possible starting
2719 // characters for this pattern.
2720 int32_t stringStartIdx = URX_VAL(op);
2721 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2722 UnicodeSet s(c, c);
73c04bcf 2723 s.closeOver(USET_CASE_INSENSITIVE);
b75a7d8f
A
2724 fRXPat->fInitialChars->addAll(s);
2725 numInitialStrings += 2; // Matching on an initial string not possible.
2726 }
2727 currentLen += stringLen;
2728 atStart = FALSE;
2729 }
2730 break;
2731
2732 case URX_CTR_INIT:
2733 case URX_CTR_INIT_NG:
2734 {
2735 // Loop Init Ops. These don't change the min length, but they are 4 word ops
2736 // so location must be updated accordingly.
46f4442e 2737 // Loop Init Ops.
b75a7d8f
A
2738 // If the min loop count == 0
2739 // move loc forwards to the end of the loop, skipping over the body.
46f4442e 2740 // If the min count is > 0,
b75a7d8f
A
2741 // continue normal processing of the body of the loop.
2742 int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
2743 loopEndLoc = URX_VAL(loopEndLoc);
2744 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
2745 if (minLoopCount == 0) {
374ca955
A
2746 // Min Loop Count of 0, treat like a forward branch and
2747 // move the current minimum length up to the target
2748 // (end of loop) location.
2749 U_ASSERT(loopEndLoc <= end+1);
2750 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
2751 forwardedLength.setElementAt(currentLen, loopEndLoc);
2752 }
46f4442e 2753 }
374ca955 2754 loc+=3; // Skips over operands of CTR_INIT
b75a7d8f
A
2755 }
2756 atStart = FALSE;
2757 break;
2758
2759
2760 case URX_CTR_LOOP:
2761 case URX_CTR_LOOP_NG:
46f4442e 2762 // Loop ops.
b75a7d8f
A
2763 // The jump is conditional, backwards only.
2764 atStart = FALSE;
2765 break;
46f4442e 2766
b75a7d8f
A
2767 case URX_LOOP_C:
2768 // More loop ops. These state-save to themselves.
2769 // don't change the minimum match
2770 atStart = FALSE;
2771 break;
46f4442e 2772
b75a7d8f
A
2773
2774 case URX_LA_START:
2775 case URX_LB_START:
2776 {
2777 // Look-around. Scan forward until the matching look-ahead end,
2778 // without processing the look-around block. This is overly pessimistic.
46f4442e
A
2779
2780 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
2781 // lookahead contains two LA_END instructions, so count goes up by two
2782 // for each LA_START.
2783 int32_t depth = (opType == URX_LA_START? 2: 1);
b75a7d8f
A
2784 for (;;) {
2785 loc++;
2786 op = fRXPat->fCompiledPat->elementAti(loc);
46f4442e
A
2787 if (URX_TYPE(op) == URX_LA_START) {
2788 depth+=2;
2789 }
2790 if (URX_TYPE(op) == URX_LB_START) {
b75a7d8f
A
2791 depth++;
2792 }
2793 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
46f4442e 2794 depth--;
b75a7d8f
A
2795 if (depth == 0) {
2796 break;
2797 }
b75a7d8f
A
2798 }
2799 if (URX_TYPE(op) == URX_STATE_SAVE) {
2800 // Need this because neg lookahead blocks will FAIL to outside
2801 // of the block.
2802 int32_t jmpDest = URX_VAL(op);
2803 if (jmpDest > loc) {
2804 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2805 forwardedLength.setElementAt(currentLen, jmpDest);
2806 }
2807 }
2808 }
46f4442e 2809 U_ASSERT(loc <= end);
b75a7d8f
A
2810 }
2811 }
2812 break;
46f4442e 2813
b75a7d8f
A
2814 case URX_LA_END:
2815 case URX_LB_CONT:
2816 case URX_LB_END:
2817 case URX_LBN_CONT:
2818 case URX_LBN_END:
46f4442e 2819 U_ASSERT(FALSE); // Shouldn't get here. These ops should be
b75a7d8f
A
2820 // consumed by the scan in URX_LA_START and LB_START
2821
2822 break;
46f4442e 2823
b75a7d8f
A
2824 default:
2825 U_ASSERT(FALSE);
2826 }
46f4442e 2827
b75a7d8f
A
2828 }
2829
2830
2831 // We have finished walking through the ops. Check whether some forward jump
2832 // propagated a shorter length to location end+1.
2833 if (forwardedLength.elementAti(end+1) < currentLen) {
2834 currentLen = forwardedLength.elementAti(end+1);
2835 }
2836
2837
2838 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
2839
2840
2841 // Sort out what we should check for when looking for candidate match start positions.
2842 // In order of preference,
2843 // 1. Start of input text buffer.
2844 // 2. A literal string.
2845 // 3. Start of line in multi-line mode.
2846 // 4. A single literal character.
2847 // 5. A character from a set of characters.
2848 //
2849 if (fRXPat->fStartType == START_START) {
2850 // Match only at the start of an input text string.
2851 // start type is already set. We're done.
2852 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
2853 // Match beginning only with a literal string.
2854 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
2855 U_ASSERT(fRXPat->fInitialChars->contains(c));
2856 fRXPat->fStartType = START_STRING;
2857 fRXPat->fInitialChar = c;
2858 } else if (fRXPat->fStartType == START_LINE) {
374ca955 2859 // Match at start of line in Multi-Line mode.
b75a7d8f
A
2860 // Nothing to do here; everything is already set.
2861 } else if (fRXPat->fMinMatchLen == 0) {
2862 // Zero length match possible. We could start anywhere.
2863 fRXPat->fStartType = START_NO_INFO;
2864 } else if (fRXPat->fInitialChars->size() == 1) {
2865 // All matches begin with the same char.
2866 fRXPat->fStartType = START_CHAR;
2867 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
2868 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
2869 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
2870 fRXPat->fMinMatchLen > 0) {
2871 // Matches start with a set of character smaller than the set of all chars.
2872 fRXPat->fStartType = START_SET;
2873 } else {
2874 // Matches can start with anything
2875 fRXPat->fStartType = START_NO_INFO;
2876 }
2877
2878 return;
2879}
2880
2881
2882
73c04bcf 2883//------------------------------------------------------------------------------
b75a7d8f
A
2884//
2885// minMatchLength Calculate the length of the shortest string that could
46f4442e 2886// match the specified pattern.
b75a7d8f
A
2887// Length is in 16 bit code units, not code points.
2888//
2889// The calculated length may not be exact. The returned
2890// value may be shorter than the actual minimum; it must
2891// never be longer.
2892//
2893// start and end are the range of p-code operations to be
2894// examined. The endpoints are included in the range.
2895//
73c04bcf 2896//------------------------------------------------------------------------------
b75a7d8f
A
2897int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
2898 if (U_FAILURE(*fStatus)) {
2899 return 0;
2900 }
2901
2902 U_ASSERT(start <= end);
2903 U_ASSERT(end < fRXPat->fCompiledPat->size());
2904
2905
2906 int32_t loc;
2907 int32_t op;
2908 int32_t opType;
2909 int32_t currentLen = 0;
2910
2911
2912 // forwardedLength is a vector holding minimum-match-length values that
2913 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2914 // It must be one longer than the pattern being checked because some ops
2915 // will jmp to a end-of-block+1 location from within a block, and we must
2916 // count those when checking the block.
2917 UVector32 forwardedLength(end+2, *fStatus);
2918 forwardedLength.setSize(end+2);
2919 for (loc=start; loc<=end+1; loc++) {
2920 forwardedLength.setElementAt(INT32_MAX, loc);
2921 }
2922
2923 for (loc = start; loc<=end; loc++) {
2924 op = fRXPat->fCompiledPat->elementAti(loc);
2925 opType = URX_TYPE(op);
2926
2927 // The loop is advancing linearly through the pattern.
2928 // If the op we are now at was the destination of a branch in the pattern,
2929 // and that path has a shorter minimum length than the current accumulated value,
2930 // replace the current accumulated value.
46f4442e
A
2931 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
2932 // no-match-possible cases.
b75a7d8f
A
2933 if (forwardedLength.elementAti(loc) < currentLen) {
2934 currentLen = forwardedLength.elementAti(loc);
2935 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2936 }
2937
2938 switch (opType) {
2939 // Ops that don't change the total length matched
2940 case URX_RESERVED_OP:
2941 case URX_END:
2942 case URX_STRING_LEN:
2943 case URX_NOP:
2944 case URX_START_CAPTURE:
2945 case URX_END_CAPTURE:
2946 case URX_BACKSLASH_B:
374ca955 2947 case URX_BACKSLASH_BU:
b75a7d8f
A
2948 case URX_BACKSLASH_G:
2949 case URX_BACKSLASH_Z:
2950 case URX_CARET:
2951 case URX_DOLLAR:
46f4442e
A
2952 case URX_DOLLAR_M:
2953 case URX_DOLLAR_D:
2954 case URX_DOLLAR_MD:
b75a7d8f
A
2955 case URX_RELOC_OPRND:
2956 case URX_STO_INP_LOC:
b75a7d8f 2957 case URX_CARET_M:
46f4442e 2958 case URX_CARET_M_UNIX:
b75a7d8f
A
2959 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2960 case URX_BACKREF_I:
2961
2962 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2963 case URX_LD_SP:
2964
2965 case URX_JMP_SAV:
2966 case URX_JMP_SAV_X:
2967 break;
46f4442e 2968
b75a7d8f
A
2969
2970 // Ops that match a minimum of one character (one or two 16 bit code units.)
46f4442e 2971 //
b75a7d8f
A
2972 case URX_ONECHAR:
2973 case URX_STATIC_SETREF:
2974 case URX_STAT_SETREF_N:
2975 case URX_SETREF:
2976 case URX_BACKSLASH_D:
2977 case URX_ONECHAR_I:
2978 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2979 case URX_DOTANY_ALL: // . matches one or two.
2980 case URX_DOTANY:
46f4442e 2981 case URX_DOTANY_UNIX:
b75a7d8f
A
2982 currentLen++;
2983 break;
2984
2985
2986 case URX_JMPX:
2987 loc++; // URX_JMPX has an extra operand, ignored here,
2988 // otherwise processed identically to URX_JMP.
2989 case URX_JMP:
2990 {
2991 int32_t jmpDest = URX_VAL(op);
2992 if (jmpDest < loc) {
2993 // Loop of some kind. Can safely ignore, the worst that will happen
2994 // is that we understate the true minimum length
2995 currentLen = forwardedLength.elementAti(loc+1);
2996 } else {
2997 // Forward jump. Propagate the current min length to the target loc of the jump.
2998 U_ASSERT(jmpDest <= end+1);
2999 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3000 forwardedLength.setElementAt(currentLen, jmpDest);
3001 }
3002 }
3003 }
3004 break;
3005
46f4442e 3006 case URX_BACKTRACK:
b75a7d8f 3007 {
46f4442e 3008 // Back-tracks are kind of like a branch, except that the min length was
b75a7d8f
A
3009 // propagated already, by the state save.
3010 currentLen = forwardedLength.elementAti(loc+1);
b75a7d8f
A
3011 }
3012 break;
3013
3014
3015 case URX_STATE_SAVE:
3016 {
3017 // State Save, for forward jumps, propagate the current minimum.
3018 // of the state save.
3019 int32_t jmpDest = URX_VAL(op);
3020 if (jmpDest > loc) {
3021 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3022 forwardedLength.setElementAt(currentLen, jmpDest);
3023 }
46f4442e 3024 }
b75a7d8f
A
3025 }
3026 break;
46f4442e 3027
b75a7d8f
A
3028
3029 case URX_STRING:
3030 case URX_STRING_I:
3031 {
3032 loc++;
3033 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
3034 currentLen += URX_VAL(stringLenOp);
3035 }
3036 break;
3037
3038
3039 case URX_CTR_INIT:
3040 case URX_CTR_INIT_NG:
3041 {
46f4442e 3042 // Loop Init Ops.
b75a7d8f
A
3043 // If the min loop count == 0
3044 // move loc forwards to the end of the loop, skipping over the body.
46f4442e 3045 // If the min count is > 0,
b75a7d8f
A
3046 // continue normal processing of the body of the loop.
3047 int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
3048 loopEndLoc = URX_VAL(loopEndLoc);
3049 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
3050 if (minLoopCount == 0) {
3051 loc = loopEndLoc;
3052 } else {
3053 loc+=3; // Skips over operands of CTR_INIT
3054 }
3055 }
3056 break;
3057
3058
3059 case URX_CTR_LOOP:
3060 case URX_CTR_LOOP_NG:
46f4442e 3061 // Loop ops.
b75a7d8f
A
3062 // The jump is conditional, backwards only.
3063 break;
46f4442e 3064
b75a7d8f
A
3065 case URX_LOOP_SR_I:
3066 case URX_LOOP_DOT_I:
3067 case URX_LOOP_C:
3068 // More loop ops. These state-save to themselves.
3069 // don't change the minimum match - could match nothing at all.
3070 break;
46f4442e 3071
b75a7d8f
A
3072
3073 case URX_LA_START:
3074 case URX_LB_START:
3075 {
3076 // Look-around. Scan forward until the matching look-ahead end,
46f4442e
A
3077 // without processing the look-around block. This is overly pessimistic for look-ahead,
3078 // it assumes that the look-ahead match might be zero-length.
b75a7d8f 3079 // TODO: Positive lookahead could recursively do the block, then continue
46f4442e
A
3080 // with the longer of the block or the value coming in. Ticket 6060
3081 int32_t depth = (opType == URX_LA_START? 2: 1);;
b75a7d8f
A
3082 for (;;) {
3083 loc++;
3084 op = fRXPat->fCompiledPat->elementAti(loc);
46f4442e
A
3085 if (URX_TYPE(op) == URX_LA_START) {
3086 // The boilerplate for look-ahead includes two LA_END insturctions,
3087 // Depth will be decremented by each one when it is seen.
3088 depth += 2;
3089 }
3090 if (URX_TYPE(op) == URX_LB_START) {
b75a7d8f
A
3091 depth++;
3092 }
46f4442e
A
3093 if (URX_TYPE(op) == URX_LA_END) {
3094 depth--;
b75a7d8f
A
3095 if (depth == 0) {
3096 break;
3097 }
46f4442e
A
3098 }
3099 if (URX_TYPE(op)==URX_LBN_END) {
b75a7d8f 3100 depth--;
46f4442e
A
3101 if (depth == 0) {
3102 break;
3103 }
b75a7d8f
A
3104 }
3105 if (URX_TYPE(op) == URX_STATE_SAVE) {
3106 // Need this because neg lookahead blocks will FAIL to outside
3107 // of the block.
3108 int32_t jmpDest = URX_VAL(op);
3109 if (jmpDest > loc) {
3110 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3111 forwardedLength.setElementAt(currentLen, jmpDest);
3112 }
3113 }
3114 }
46f4442e 3115 U_ASSERT(loc <= end);
b75a7d8f
A
3116 }
3117 }
3118 break;
46f4442e 3119
b75a7d8f
A
3120 case URX_LA_END:
3121 case URX_LB_CONT:
3122 case URX_LB_END:
3123 case URX_LBN_CONT:
3124 case URX_LBN_END:
3125 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3126 // range being sized, which happens when measuring size of look-behind blocks.
3127 break;
46f4442e 3128
b75a7d8f
A
3129 default:
3130 U_ASSERT(FALSE);
3131 }
46f4442e 3132
b75a7d8f
A
3133 }
3134
3135 // We have finished walking through the ops. Check whether some forward jump
3136 // propagated a shorter length to location end+1.
3137 if (forwardedLength.elementAti(end+1) < currentLen) {
3138 currentLen = forwardedLength.elementAti(end+1);
3139 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3140 }
46f4442e 3141
b75a7d8f
A
3142 return currentLen;
3143}
3144
3145
3146
73c04bcf 3147//------------------------------------------------------------------------------
b75a7d8f
A
3148//
3149// maxMatchLength Calculate the length of the longest string that could
46f4442e 3150// match the specified pattern.
b75a7d8f
A
3151// Length is in 16 bit code units, not code points.
3152//
3153// The calculated length may not be exact. The returned
3154// value may be longer than the actual maximum; it must
3155// never be shorter.
3156//
73c04bcf 3157//------------------------------------------------------------------------------
b75a7d8f
A
3158int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3159 if (U_FAILURE(*fStatus)) {
3160 return 0;
3161 }
3162 U_ASSERT(start <= end);
3163 U_ASSERT(end < fRXPat->fCompiledPat->size());
3164
3165
3166 int32_t loc;
3167 int32_t op;
3168 int32_t opType;
3169 int32_t currentLen = 0;
3170 UVector32 forwardedLength(end+1, *fStatus);
3171 forwardedLength.setSize(end+1);
3172
3173 for (loc=start; loc<=end; loc++) {
3174 forwardedLength.setElementAt(0, loc);
3175 }
3176
3177 for (loc = start; loc<=end; loc++) {
3178 op = fRXPat->fCompiledPat->elementAti(loc);
3179 opType = URX_TYPE(op);
3180
3181 // The loop is advancing linearly through the pattern.
3182 // If the op we are now at was the destination of a branch in the pattern,
3183 // and that path has a longer maximum length than the current accumulated value,
3184 // replace the current accumulated value.
3185 if (forwardedLength.elementAti(loc) > currentLen) {
3186 currentLen = forwardedLength.elementAti(loc);
3187 }
3188
3189 switch (opType) {
3190 // Ops that don't change the total length matched
3191 case URX_RESERVED_OP:
3192 case URX_END:
3193 case URX_STRING_LEN:
3194 case URX_NOP:
3195 case URX_START_CAPTURE:
3196 case URX_END_CAPTURE:
3197 case URX_BACKSLASH_B:
374ca955 3198 case URX_BACKSLASH_BU:
b75a7d8f
A
3199 case URX_BACKSLASH_G:
3200 case URX_BACKSLASH_Z:
3201 case URX_CARET:
3202 case URX_DOLLAR:
46f4442e
A
3203 case URX_DOLLAR_M:
3204 case URX_DOLLAR_D:
3205 case URX_DOLLAR_MD:
b75a7d8f
A
3206 case URX_RELOC_OPRND:
3207 case URX_STO_INP_LOC:
b75a7d8f 3208 case URX_CARET_M:
46f4442e 3209 case URX_CARET_M_UNIX:
b75a7d8f
A
3210
3211 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3212 case URX_LD_SP:
3213
3214 case URX_LB_END:
3215 case URX_LB_CONT:
3216 case URX_LBN_CONT:
3217 case URX_LBN_END:
3218 break;
46f4442e 3219
b75a7d8f
A
3220
3221 // Ops that increase that cause an unbounded increase in the length
3222 // of a matched string, or that increase it a hard to characterize way.
3223 // Call the max length unbounded, and stop further checking.
3224 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3225 case URX_BACKREF_I:
3226 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
b75a7d8f
A
3227 currentLen = INT32_MAX;
3228 break;
3229
3230
3231 // Ops that match a max of one character (possibly two 16 bit code units.)
46f4442e 3232 //
b75a7d8f
A
3233 case URX_STATIC_SETREF:
3234 case URX_STAT_SETREF_N:
3235 case URX_SETREF:
3236 case URX_BACKSLASH_D:
3237 case URX_ONECHAR_I:
46f4442e 3238 case URX_DOTANY_ALL:
b75a7d8f 3239 case URX_DOTANY:
46f4442e 3240 case URX_DOTANY_UNIX:
b75a7d8f
A
3241 currentLen+=2;
3242 break;
3243
3244 // Single literal character. Increase current max length by one or two,
3245 // depending on whether the char is in the supplementary range.
3246 case URX_ONECHAR:
3247 currentLen++;
3248 if (URX_VAL(op) > 0x10000) {
3249 currentLen++;
3250 }
3251 break;
3252
46f4442e 3253 // Jumps.
b75a7d8f
A
3254 //
3255 case URX_JMP:
3256 case URX_JMPX:
3257 case URX_JMP_SAV:
3258 case URX_JMP_SAV_X:
3259 {
3260 int32_t jmpDest = URX_VAL(op);
3261 if (jmpDest < loc) {
3262 // Loop of some kind. Max match length is unbounded.
3263 currentLen = INT32_MAX;
3264 } else {
3265 // Forward jump. Propagate the current min length to the target loc of the jump.
3266 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3267 forwardedLength.setElementAt(currentLen, jmpDest);
3268 }
3269 currentLen = 0;
3270 }
3271 }
3272 break;
3273
46f4442e
A
3274 case URX_BACKTRACK:
3275 // back-tracks are kind of like a branch, except that the max length was
b75a7d8f
A
3276 // propagated already, by the state save.
3277 currentLen = forwardedLength.elementAti(loc+1);
3278 break;
3279
3280
3281 case URX_STATE_SAVE:
3282 {
3283 // State Save, for forward jumps, propagate the current minimum.
3284 // of the state save.
3285 // For backwards jumps, they create a loop, maximum
3286 // match length is unbounded.
3287 int32_t jmpDest = URX_VAL(op);
3288 if (jmpDest > loc) {
3289 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3290 forwardedLength.setElementAt(currentLen, jmpDest);
3291 }
3292 } else {
3293 currentLen = INT32_MAX;
3294 }
3295 }
3296 break;
46f4442e 3297
b75a7d8f
A
3298
3299
3300
3301 case URX_STRING:
3302 case URX_STRING_I:
3303 {
3304 loc++;
3305 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
3306 currentLen += URX_VAL(stringLenOp);
3307 }
3308 break;
3309
3310
3311 case URX_CTR_INIT:
3312 case URX_CTR_INIT_NG:
3313 case URX_CTR_LOOP:
3314 case URX_CTR_LOOP_NG:
3315 case URX_LOOP_SR_I:
3316 case URX_LOOP_DOT_I:
3317 case URX_LOOP_C:
3318 // For anything to do with loops, make the match length unbounded.
3319 // Note: INIT instructions are multi-word. Can ignore because
3320 // INT32_MAX length will stop the per-instruction loop.
3321 currentLen = INT32_MAX;
3322 break;
46f4442e
A
3323
3324
b75a7d8f
A
3325
3326 case URX_LA_START:
3327 case URX_LA_END:
3328 // Look-ahead. Just ignore, treat the look-ahead block as if
3329 // it were normal pattern. Gives a too-long match length,
3330 // but good enough for now.
3331 break;
46f4442e 3332
b75a7d8f
A
3333 // End of look-ahead ops should always be consumed by the processing at
3334 // the URX_LA_START op.
73c04bcf
A
3335 // U_ASSERT(FALSE);
3336 // break;
46f4442e 3337
b75a7d8f
A
3338 case URX_LB_START:
3339 {
3340 // Look-behind. Scan forward until the matching look-around end,
46f4442e 3341 // without processing the look-behind block.
b75a7d8f
A
3342 int32_t depth = 0;
3343 for (;;) {
3344 loc++;
3345 op = fRXPat->fCompiledPat->elementAti(loc);
3346 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3347 depth++;
3348 }
3349 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3350 if (depth == 0) {
3351 break;
3352 }
3353 depth--;
3354 }
46f4442e 3355 U_ASSERT(loc < end);
b75a7d8f
A
3356 }
3357 }
3358 break;
3359
3360 default:
3361 U_ASSERT(FALSE);
3362 }
3363
46f4442e 3364
b75a7d8f
A
3365 if (currentLen == INT32_MAX) {
3366 // The maximum length is unbounded.
3367 // Stop further processing of the pattern.
3368 break;
3369 }
46f4442e 3370
b75a7d8f
A
3371 }
3372 return currentLen;
46f4442e 3373
b75a7d8f
A
3374}
3375
3376
73c04bcf 3377//------------------------------------------------------------------------------
b75a7d8f
A
3378//
3379// stripNOPs Remove any NOP operations from the compiled pattern code.
3380// Extra NOPs are inserted for some constructs during the initial
3381// code generation to provide locations that may be patched later.
3382// Many end up unneeded, and are removed by this function.
3383//
73c04bcf 3384//------------------------------------------------------------------------------
b75a7d8f
A
3385void RegexCompile::stripNOPs() {
3386
3387 if (U_FAILURE(*fStatus)) {
3388 return;
3389 }
3390
3391 int32_t end = fRXPat->fCompiledPat->size();
3392 UVector32 deltas(end, *fStatus);
3393
3394 // Make a first pass over the code, computing the amount that things
3395 // will be offset at each location in the original code.
3396 int32_t loc;
3397 int32_t d = 0;
3398 for (loc=0; loc<end; loc++) {
3399 deltas.addElement(d, *fStatus);
3400 int32_t op = fRXPat->fCompiledPat->elementAti(loc);
3401 if (URX_TYPE(op) == URX_NOP) {
3402 d++;
3403 }
3404 }
3405
3406 // Make a second pass over the code, removing the NOPs by moving following
3407 // code up, and patching operands that refer to code locations that
3408 // are being moved. The array of offsets from the first step is used
3409 // to compute the new operand values.
3410 int32_t src;
3411 int32_t dst = 0;
3412 for (src=0; src<end; src++) {
3413 int32_t op = fRXPat->fCompiledPat->elementAti(src);
3414 int32_t opType = URX_TYPE(op);
3415 switch (opType) {
3416 case URX_NOP:
3417 break;
3418
3419 case URX_STATE_SAVE:
3420 case URX_JMP:
3421 case URX_CTR_LOOP:
3422 case URX_CTR_LOOP_NG:
3423 case URX_RELOC_OPRND:
3424 case URX_JMPX:
3425 case URX_JMP_SAV:
3426 case URX_JMP_SAV_X:
3427 // These are instructions with operands that refer to code locations.
3428 {
3429 int32_t operandAddress = URX_VAL(op);
3430 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3431 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3432 op = URX_BUILD(opType, fixedOperandAddress);
3433 fRXPat->fCompiledPat->setElementAt(op, dst);
3434 dst++;
3435 break;
3436 }
3437
3438 case URX_RESERVED_OP:
3439 case URX_RESERVED_OP_N:
3440 case URX_BACKTRACK:
3441 case URX_END:
3442 case URX_ONECHAR:
3443 case URX_STRING:
3444 case URX_STRING_LEN:
3445 case URX_START_CAPTURE:
3446 case URX_END_CAPTURE:
3447 case URX_STATIC_SETREF:
3448 case URX_STAT_SETREF_N:
3449 case URX_SETREF:
3450 case URX_DOTANY:
3451 case URX_FAIL:
3452 case URX_BACKSLASH_B:
374ca955 3453 case URX_BACKSLASH_BU:
b75a7d8f
A
3454 case URX_BACKSLASH_G:
3455 case URX_BACKSLASH_X:
3456 case URX_BACKSLASH_Z:
3457 case URX_DOTANY_ALL:
b75a7d8f
A
3458 case URX_BACKSLASH_D:
3459 case URX_CARET:
3460 case URX_DOLLAR:
3461 case URX_CTR_INIT:
3462 case URX_CTR_INIT_NG:
46f4442e 3463 case URX_DOTANY_UNIX:
b75a7d8f
A
3464 case URX_STO_SP:
3465 case URX_LD_SP:
3466 case URX_BACKREF:
3467 case URX_STO_INP_LOC:
3468 case URX_LA_START:
3469 case URX_LA_END:
3470 case URX_ONECHAR_I:
3471 case URX_STRING_I:
3472 case URX_BACKREF_I:
3473 case URX_DOLLAR_M:
3474 case URX_CARET_M:
46f4442e 3475 case URX_CARET_M_UNIX:
b75a7d8f
A
3476 case URX_LB_START:
3477 case URX_LB_CONT:
3478 case URX_LB_END:
3479 case URX_LBN_CONT:
3480 case URX_LBN_END:
3481 case URX_LOOP_SR_I:
3482 case URX_LOOP_DOT_I:
3483 case URX_LOOP_C:
46f4442e
A
3484 case URX_DOLLAR_D:
3485 case URX_DOLLAR_MD:
b75a7d8f
A
3486 // These instructions are unaltered by the relocation.
3487 fRXPat->fCompiledPat->setElementAt(op, dst);
3488 dst++;
3489 break;
3490
3491 default:
3492 // Some op is unaccounted for.
3493 U_ASSERT(FALSE);
3494 error(U_REGEX_INTERNAL_ERROR);
3495 }
3496 }
3497
3498 fRXPat->fCompiledPat->setSize(dst);
3499}
3500
3501
3502
3503
73c04bcf 3504//------------------------------------------------------------------------------
b75a7d8f
A
3505//
3506// Error Report a rule parse error.
3507// Only report it if no previous error has been recorded.
3508//
73c04bcf 3509//------------------------------------------------------------------------------
b75a7d8f
A
3510void RegexCompile::error(UErrorCode e) {
3511 if (U_SUCCESS(*fStatus)) {
3512 *fStatus = e;
3513 fParseErr->line = fLineNum;
3514 fParseErr->offset = fCharNum;
3515
3516 // Fill in the context.
3517 // Note: extractBetween() pins supplied indicies to the string bounds.
3518 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3519 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3520 fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
3521 fParseErr->preContext, 0);
3522 fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
3523 fParseErr->postContext, 0);
3524 }
3525}
3526
3527
3528//
3529// Assorted Unicode character constants.
3530// Numeric because there is no portable way to enter them as literals.
3531// (Think EBCDIC).
3532//
3533static const UChar chCR = 0x0d; // New lines, for terminating comments.
46f4442e 3534static const UChar chLF = 0x0a; // Line Feed
b75a7d8f 3535static const UChar chPound = 0x23; // '#', introduces a comment.
46f4442e
A
3536static const UChar chDigit0 = 0x30; // '0'
3537static const UChar chDigit7 = 0x37; // '9'
3538static const UChar chColon = 0x3A; // ':'
b75a7d8f 3539static const UChar chE = 0x45; // 'E'
46f4442e
A
3540static const UChar chQ = 0x51; // 'Q'
3541static const UChar chN = 0x4E; // 'N'
3542static const UChar chP = 0x50; // 'P'
b75a7d8f 3543static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
46f4442e
A
3544static const UChar chLBracket = 0x5b; // '['
3545static const UChar chRBracket = 0x5d; // ']'
3546static const UChar chUp = 0x5e; // '^'
3547static const UChar chLowerP = 0x70;
3548static const UChar chLBrace = 0x7b; // '{'
3549static const UChar chRBrace = 0x7d; // '}'
3550static const UChar chNEL = 0x85; // NEL newline variant
3551static const UChar chLS = 0x2028; // Unicode Line Separator
b75a7d8f
A
3552
3553
73c04bcf 3554//------------------------------------------------------------------------------
b75a7d8f
A
3555//
3556// nextCharLL Low Level Next Char from the regex pattern.
3557// Get a char from the string, keep track of input position
3558// for error reporting.
3559//
73c04bcf 3560//------------------------------------------------------------------------------
b75a7d8f
A
3561UChar32 RegexCompile::nextCharLL() {
3562 UChar32 ch;
3563 UnicodeString &pattern = fRXPat->fPattern;
3564
3565 if (fPeekChar != -1) {
3566 ch = fPeekChar;
3567 fPeekChar = -1;
3568 return ch;
3569 }
3570 if (fPatternLength==0 || fNextIndex >= fPatternLength) {
3571 return (UChar32)-1;
3572 }
3573 ch = pattern.char32At(fNextIndex);
3574 fNextIndex = pattern.moveIndex32(fNextIndex, 1);
3575
3576 if (ch == chCR ||
3577 ch == chNEL ||
3578 ch == chLS ||
3579 ch == chLF && fLastChar != chCR) {
3580 // Character is starting a new line. Bump up the line number, and
3581 // reset the column to 0.
3582 fLineNum++;
3583 fCharNum=0;
b75a7d8f
A
3584 }
3585 else {
3586 // Character is not starting a new line. Except in the case of a
3587 // LF following a CR, increment the column position.
3588 if (ch != chLF) {
3589 fCharNum++;
3590 }
3591 }
3592 fLastChar = ch;
3593 return ch;
3594}
3595
73c04bcf 3596//------------------------------------------------------------------------------
b75a7d8f
A
3597//
3598// peekCharLL Low Level Character Scanning, sneak a peek at the next
3599// character without actually getting it.
3600//
73c04bcf 3601//------------------------------------------------------------------------------
b75a7d8f
A
3602UChar32 RegexCompile::peekCharLL() {
3603 if (fPeekChar == -1) {
3604 fPeekChar = nextCharLL();
3605 }
3606 return fPeekChar;
3607}
3608
3609
73c04bcf 3610//------------------------------------------------------------------------------
b75a7d8f
A
3611//
3612// nextChar for pattern scanning. At this level, we handle stripping
3613// out comments and processing some backslash character escapes.
3614// The rest of the pattern grammar is handled at the next level up.
3615//
73c04bcf 3616//------------------------------------------------------------------------------
b75a7d8f
A
3617void RegexCompile::nextChar(RegexPatternChar &c) {
3618
3619 fScanIndex = fNextIndex;
3620 c.fChar = nextCharLL();
3621 c.fQuoted = FALSE;
3622
3623 if (fQuoteMode) {
3624 c.fQuoted = TRUE;
3625 if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
3626 fQuoteMode = FALSE; // Exit quote mode,
3627 nextCharLL(); // discard the E
3628 nextChar(c); // recurse to get the real next char
3629 }
3630 }
3631 else if (fInBackslashQuote) {
3632 // The current character immediately follows a '\'
3633 // Don't check for any further escapes, just return it as-is.
3634 // Don't set c.fQuoted, because that would prevent the state machine from
3635 // dispatching on the character.
3636 fInBackslashQuote = FALSE;
3637 }
3638 else
3639 {
3640 // We are not in a \Q quoted region \E of the source.
3641 //
3642 if (fModeFlags & UREGEX_COMMENTS) {
3643 //
3644 // We are in free-spacing and comments mode.
46f4442e 3645 // Scan through any white space and comments, until we
b75a7d8f
A
3646 // reach a significant character or the end of inut.
3647 for (;;) {
3648 if (c.fChar == (UChar32)-1) {
3649 break; // End of Input
3650 }
3651 if (c.fChar == chPound && fEOLComments == TRUE) {
3652 // Start of a comment. Consume the rest of it, until EOF or a new line
3653 for (;;) {
3654 c.fChar = nextCharLL();
3655 if (c.fChar == (UChar32)-1 || // EOF
3656 c.fChar == chCR ||
3657 c.fChar == chLF ||
3658 c.fChar == chNEL ||
3659 c.fChar == chLS) {
3660 break;
3661 }
3662 }
3663 }
46f4442e 3664 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
b75a7d8f
A
3665 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
3666 break;
3667 }
3668 c.fChar = nextCharLL();
3669 }
3670 }
3671
3672 //
3673 // check for backslash escaped characters.
3674 //
b75a7d8f 3675 if (c.fChar == chBackSlash) {
46f4442e
A
3676 int32_t startX = fNextIndex; // start and end positions of the
3677 int32_t endX = fNextIndex; // sequence following the '\'
3678 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
b75a7d8f
A
3679 //
3680 // A '\' sequence that is handled by ICU's standard unescapeAt function.
3681 // Includes \uxxxx, \n, \r, many others.
3682 // Return the single equivalent character.
3683 //
3684 nextCharLL(); // get & discard the peeked char.
3685 c.fQuoted = TRUE;
3686 c.fChar = fRXPat->fPattern.unescapeAt(endX);
3687 if (startX == endX) {
3688 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3689 }
3690 fCharNum += endX - startX;
3691 fNextIndex = endX;
3692 }
46f4442e
A
3693 else if (peekCharLL() == chDigit0) {
3694 // Octal Escape, using Java Regexp Conventions
3695 // which are \0 followed by 1-3 octal digits.
3696 // Different from ICU Unescape handling of Octal, which does not
3697 // require the leading 0.
3698 // Java also has the convention of only consuning 2 octal digits if
3699 // the three digit number would be > 0xff
3700 //
3701 c.fChar = 0;
3702 nextCharLL(); // Consume the initial 0.
3703 int index;
3704 for (index=0; index<3; index++) {
3705 int32_t ch = peekCharLL();
3706 if (ch<chDigit0 || ch>chDigit7) {
3707 if (index==0) {
3708 // \0 is not followed by any octal digits.
3709 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3710 }
3711 break;
3712 }
3713 c.fChar <<= 3;
3714 c.fChar += ch&7;
3715 if (c.fChar <= 255) {
3716 nextCharLL();
3717 } else {
3718 // The last digit made the number too big. Forget we saw it.
3719 c.fChar >>= 3;
3720 }
3721 }
3722 c.fQuoted = TRUE;
3723 }
3724 else if (peekCharLL() == chQ) {
3725 // "\Q" enter quote mode, which will continue until "\E"
3726 fQuoteMode = TRUE;
3727 nextCharLL(); // discard the 'Q'.
3728 nextChar(c); // recurse to get the real next char.
3729 }
b75a7d8f
A
3730 else
3731 {
3732 // We are in a '\' escape that will be handled by the state table scanner.
3733 // Just return the backslash, but remember that the following char is to
46f4442e 3734 // be taken literally.
b75a7d8f
A
3735 fInBackslashQuote = TRUE;
3736 }
3737 }
3738 }
3739
3740 // re-enable # to end-of-line comments, in case they were disabled.
3741 // They are disabled by the parser upon seeing '(?', but this lasts for
3742 // the fetching of the next character only.
3743 fEOLComments = TRUE;
3744
3745 // putc(c.fChar, stdout);
3746}
3747
3748
3749
73c04bcf 3750//------------------------------------------------------------------------------
b75a7d8f 3751//
46f4442e
A
3752// scanNamedChar
3753 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
3754//
3755// The scan position will be at the 'N'. On return
3756// the scan position should be just after the '}'
b75a7d8f 3757//
46f4442e 3758// Return the UChar32
b75a7d8f 3759//
73c04bcf 3760//------------------------------------------------------------------------------
46f4442e 3761UChar32 RegexCompile::scanNamedChar() {
b75a7d8f 3762 if (U_FAILURE(*fStatus)) {
46f4442e 3763 return 0;
b75a7d8f
A
3764 }
3765
46f4442e
A
3766 nextChar(fC);
3767 if (fC.fChar != chLBrace) {
3768 error(U_REGEX_PROPERTY_SYNTAX);
3769 return 0;
b75a7d8f 3770 }
46f4442e
A
3771
3772 UnicodeString charName;
b75a7d8f 3773 for (;;) {
46f4442e
A
3774 nextChar(fC);
3775 if (fC.fChar == chRBrace) {
b75a7d8f
A
3776 break;
3777 }
46f4442e
A
3778 if (fC.fChar == -1) {
3779 error(U_REGEX_PROPERTY_SYNTAX);
3780 return 0;
3781 }
3782 charName.append(fC.fChar);
b75a7d8f 3783 }
46f4442e
A
3784
3785 char name[100];
3786 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
3787 (uint32_t)charName.length()>=sizeof(name)) {
3788 // All Unicode character names have only invariant characters.
3789 // The API to get a character, given a name, accepts only char *, forcing us to convert,
3790 // which requires this error check
3791 error(U_REGEX_PROPERTY_SYNTAX);
3792 return 0;
3793 }
3794 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
b75a7d8f 3795
46f4442e
A
3796 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
3797 if (U_FAILURE(*fStatus)) {
3798 error(U_REGEX_PROPERTY_SYNTAX);
3799 }
b75a7d8f 3800
46f4442e
A
3801 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
3802 return theChar;
3803}
b75a7d8f 3804
73c04bcf 3805//------------------------------------------------------------------------------
b75a7d8f
A
3806//
3807// scanProp Construct a UnicodeSet from the text at the current scan
3808// position, which will be of the form \p{whaterver}
3809//
3810// The scan position will be at the 'p' or 'P'. On return
3811// the scan position should be just after the '}'
3812//
3813// Return a UnicodeSet, constructed from the \P pattern,
3814// or NULL if the pattern is invalid.
3815//
73c04bcf 3816//------------------------------------------------------------------------------
b75a7d8f
A
3817UnicodeSet *RegexCompile::scanProp() {
3818 UnicodeSet *uset = NULL;
3819
3820 if (U_FAILURE(*fStatus)) {
3821 return NULL;
3822 }
46f4442e
A
3823 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
3824 UBool negated = (fC.fChar == chP);
b75a7d8f 3825
46f4442e
A
3826 UnicodeString propertyName;
3827 nextChar(fC);
3828 if (fC.fChar != chLBrace) {
3829 error(U_REGEX_PROPERTY_SYNTAX);
3830 return NULL;
3831 }
b75a7d8f 3832 for (;;) {
46f4442e 3833 nextChar(fC);
b75a7d8f
A
3834 if (fC.fChar == chRBrace) {
3835 break;
3836 }
b75a7d8f
A
3837 if (fC.fChar == -1) {
3838 // Hit the end of the input string without finding the closing '}'
3839 error(U_REGEX_PROPERTY_SYNTAX);
3840 return NULL;
3841 }
46f4442e
A
3842 propertyName.append(fC.fChar);
3843 }
3844 uset = createSetForProperty(propertyName, negated);
3845 nextChar(fC); // Move input scan to position following the closing '}'
3846 return uset;
3847}
3848
3849//------------------------------------------------------------------------------
3850//
3851// scanPosixProp Construct a UnicodeSet from the text at the current scan
3852// position, which is expected be of the form [:property expression:]
3853//
3854// The scan position will be at the opening ':'. On return
3855// the scan position must be on the closing ']'
3856//
3857// Return a UnicodeSet constructed from the pattern,
3858// or NULL if this is not a valid POSIX-style set expression.
3859// If not a property expression, restore the initial scan position
3860// (to the opening ':')
3861//
3862// Note: the opening '[:' is not sufficient to guarantee that
3863// this is a [:property:] expression.
3864// [:'+=,] is a perfectly good ordinary set expression that
3865// happens to include ':' as one of its characters.
3866//
3867//------------------------------------------------------------------------------
3868UnicodeSet *RegexCompile::scanPosixProp() {
3869 UnicodeSet *uset = NULL;
3870
3871 if (U_FAILURE(*fStatus)) {
3872 return NULL;
b75a7d8f 3873 }
b75a7d8f 3874
46f4442e
A
3875 U_ASSERT(fC.fChar == chColon);
3876
3877 // Save the scanner state.
3878 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
3879 int32_t savedScanIndex = fScanIndex;
3880 int32_t savedNextIndex = fNextIndex;
3881 UBool savedQuoteMode = fQuoteMode;
3882 UBool savedInBackslashQuote = fInBackslashQuote;
3883 UBool savedEOLComments = fEOLComments;
3884 int32_t savedLineNum = fLineNum;
3885 int32_t savedCharNum = fCharNum;
3886 UChar32 savedLastChar = fLastChar;
3887 UChar32 savedPeekChar = fPeekChar;
3888 RegexPatternChar savedfC = fC;
3889
3890 // Scan for a closing ]. A little tricky because there are some perverse
3891 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
3892 // ending on the second closing ].
3893
3894 UnicodeString propName;
3895 UBool negated = FALSE;
3896
3897 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
3898 nextChar(fC);
3899 if (fC.fChar == chUp) {
3900 negated = TRUE;
3901 nextChar(fC);
3902 }
3903
3904 // Scan for the closing ":]", collecting the property name along the way.
3905 UBool sawPropSetTerminator = FALSE;
3906 for (;;) {
3907 propName.append(fC.fChar);
3908 nextChar(fC);
3909 if (fC.fQuoted || fC.fChar == -1) {
3910 // Escaped characters or end of input - either says this isn't a [:Property:]
3911 break;
3912 }
3913 if (fC.fChar == chColon) {
3914 nextChar(fC);
3915 if (fC.fChar == chRBracket) {
3916 sawPropSetTerminator = TRUE;
3917 }
3918 break;
3919 }
3920 }
3921
3922 if (sawPropSetTerminator) {
3923 uset = createSetForProperty(propName, negated);
3924 }
3925 else
3926 {
3927 // No closing ":]".
3928 // Restore the original scan position.
3929 // The main scanner will retry the input as a normal set expression,
3930 // not a [:Property:] expression.
3931 fScanIndex = savedScanIndex;
3932 fNextIndex = savedNextIndex;
3933 fQuoteMode = savedQuoteMode;
3934 fInBackslashQuote = savedInBackslashQuote;
3935 fEOLComments = savedEOLComments;
3936 fLineNum = savedLineNum;
3937 fCharNum = savedCharNum;
3938 fLastChar = savedLastChar;
3939 fPeekChar = savedPeekChar;
3940 fC = savedfC;
3941 }
3942 return uset;
3943}
3944
3945static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
3946 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
3947 addCategory(set, U_GC_CF_MASK, ec);
3948}
3949
3950//
3951// Create a Unicode Set from a Unicode Property expression.
3952// This is common code underlying both \p{...} ane [:...:] expressions.
3953// Includes trying the Java "properties" that aren't supported as
3954// normal ICU UnicodeSet properties
3955//
3956static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
3957static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
3958UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
3959 UnicodeString setExpr;
3960 UnicodeSet *set;
3961 uint32_t usetFlags = 0;
3962
3963 if (U_FAILURE(*fStatus)) {
3964 return NULL;
3965 }
3966
3967 //
3968 // First try the property as we received it
3969 //
3970 if (negated) {
3971 setExpr.append(negSetPrefix, -1);
3972 } else {
3973 setExpr.append(posSetPrefix, -1);
3974 }
3975 setExpr.append(propName);
3976 setExpr.append(chRBrace);
3977 setExpr.append(chRBracket);
b75a7d8f
A
3978 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
3979 usetFlags |= USET_CASE_INSENSITIVE;
3980 }
46f4442e
A
3981 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
3982 if (U_SUCCESS(*fStatus)) {
3983 return set;
3984 }
3985 delete set;
3986 set = NULL;
3987
3988 //
3989 // The property as it was didn't work.
3990 // Do emergency fixes -
3991 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
3992 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
3993 //
3994 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
3995 // is accepted by Java. The property part of the name is compared
3996 // case-insenstively. The spaces must be exactly as shown, either
3997 // all there, or all omitted, with exactly one at each position
3998 // if they are present. From checking against JDK 1.6
3999 //
4000 // This code should be removed when ICU properties support the Java compatibility names
4001 // (ICU 4.0?)
4002 //
4003 UnicodeString mPropName = propName;
4004 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4005 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4006 }
4007 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4008 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4009 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4010 }
4011 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4012 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4013 }
4014
4015 // See if the property looks like a Java "InBlockName", which
4016 // we will recast as "Block=BlockName"
4017 //
4018 static const UChar IN[] = {0x49, 0x6E, 0}; // "In"
4019 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block="
4020 if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4021 setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
4022 setExpr.append(BLOCK, -1);
4023 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed.
4024 setExpr.append(chRBrace);
4025 setExpr.append(chRBracket);
4026 *fStatus = U_ZERO_ERROR;
4027 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4028 if (U_SUCCESS(*fStatus)) {
4029 return set;
4030 }
4031 delete set;
4032 set = NULL;
b75a7d8f
A
4033 }
4034
46f4442e
A
4035 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4036 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4037 {
4038 UErrorCode localStatus = U_ZERO_ERROR;
4039 //setExpr.remove();
4040 set = new UnicodeSet();
4041 //
4042 // Try the various Java specific properties.
4043 // These all begin with "java"
4044 //
4045 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4046 addCategory(set, U_GC_CN_MASK, localStatus);
4047 set->complement();
4048 }
4049 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4050 addCategory(set, U_GC_ND_MASK, localStatus);
4051 }
4052 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4053 addIdentifierIgnorable(set, localStatus);
4054 }
4055 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4056 set->add(0, 0x1F).add(0x7F, 0x9F);
4057 }
4058 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4059 addCategory(set, U_GC_L_MASK, localStatus);
4060 addCategory(set, U_GC_SC_MASK, localStatus);
4061 addCategory(set, U_GC_PC_MASK, localStatus);
4062 addCategory(set, U_GC_ND_MASK, localStatus);
4063 addCategory(set, U_GC_NL_MASK, localStatus);
4064 addCategory(set, U_GC_MC_MASK, localStatus);
4065 addCategory(set, U_GC_MN_MASK, localStatus);
4066 addIdentifierIgnorable(set, localStatus);
4067 }
4068 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4069 addCategory(set, U_GC_L_MASK, localStatus);
4070 addCategory(set, U_GC_NL_MASK, localStatus);
4071 addCategory(set, U_GC_SC_MASK, localStatus);
4072 addCategory(set, U_GC_PC_MASK, localStatus);
4073 }
4074 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4075 addCategory(set, U_GC_L_MASK, localStatus);
4076 }
4077 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4078 addCategory(set, U_GC_L_MASK, localStatus);
4079 addCategory(set, U_GC_ND_MASK, localStatus);
4080 }
4081 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4082 addCategory(set, U_GC_LL_MASK, localStatus);
4083 }
4084 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4085 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4086 }
4087 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4088 addCategory(set, U_GC_Z_MASK, localStatus);
4089 }
4090 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4091 set->add(0x10000, UnicodeSet::MAX_VALUE);
4092 }
4093 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4094 addCategory(set, U_GC_LT_MASK, localStatus);
4095 }
4096 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4097 addCategory(set, U_GC_L_MASK, localStatus);
4098 addCategory(set, U_GC_NL_MASK, localStatus);
4099 }
4100 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4101 addCategory(set, U_GC_L_MASK, localStatus);
4102 addCategory(set, U_GC_PC_MASK, localStatus);
4103 addCategory(set, U_GC_ND_MASK, localStatus);
4104 addCategory(set, U_GC_NL_MASK, localStatus);
4105 addCategory(set, U_GC_MC_MASK, localStatus);
4106 addCategory(set, U_GC_MN_MASK, localStatus);
4107 addIdentifierIgnorable(set, localStatus);
4108 }
4109 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4110 addCategory(set, U_GC_LU_MASK, localStatus);
4111 }
4112 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4113 set->add(0, UnicodeSet::MAX_VALUE);
4114 }
4115 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4116 addCategory(set, U_GC_Z_MASK, localStatus);
4117 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4118 set->add(9, 0x0d).add(0x1c, 0x1f);
4119 }
4120 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4121 set->add(0, UnicodeSet::MAX_VALUE);
4122 }
4123
4124 if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4125 *fStatus = U_ZERO_ERROR;
4126 if (usetFlags & USET_CASE_INSENSITIVE) {
4127 set->closeOver(USET_CASE_INSENSITIVE);
4128 }
4129 if (negated) {
4130 set->complement();
4131 }
4132 return set;
4133 }
4134 delete set;
4135 set = NULL;
b75a7d8f 4136 }
46f4442e
A
4137 error(*fStatus);
4138 return NULL;
4139}
b75a7d8f 4140
46f4442e
A
4141
4142
4143//
4144// SetEval Part of the evaluation of [set expressions].
4145// Perform any pending (stacked) operations with precedence
4146// equal or greater to that of the next operator encountered
4147// in the expression.
4148//
4149void RegexCompile::setEval(int32_t nextOp) {
4150 UnicodeSet *rightOperand = NULL;
4151 UnicodeSet *leftOperand = NULL;
4152 for (;;) {
4153 U_ASSERT(fSetOpStack.empty()==FALSE);
4154 int32_t pendingSetOperation = fSetOpStack.peeki();
4155 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4156 break;
4157 }
4158 fSetOpStack.popi();
4159 U_ASSERT(fSetStack.empty() == FALSE);
4160 rightOperand = (UnicodeSet *)fSetStack.peek();
4161 switch (pendingSetOperation) {
4162 case setNegation:
4163 rightOperand->complement();
4164 break;
4165 case setCaseClose:
4166 // TODO: need a simple close function. Ticket 6065
4167 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4168 rightOperand->removeAllStrings();
4169 break;
4170 case setDifference1:
4171 case setDifference2:
4172 fSetStack.pop();
4173 leftOperand = (UnicodeSet *)fSetStack.peek();
4174 leftOperand->removeAll(*rightOperand);
4175 delete rightOperand;
4176 break;
4177 case setIntersection1:
4178 case setIntersection2:
4179 fSetStack.pop();
4180 leftOperand = (UnicodeSet *)fSetStack.peek();
4181 leftOperand->retainAll(*rightOperand);
4182 delete rightOperand;
4183 break;
4184 case setUnion:
4185 fSetStack.pop();
4186 leftOperand = (UnicodeSet *)fSetStack.peek();
4187 leftOperand->addAll(*rightOperand);
4188 delete rightOperand;
4189 break;
4190 default:
4191 U_ASSERT(FALSE);
4192 break;
4193 }
4194 }
4195 }
4196
4197void RegexCompile::setPushOp(int32_t op) {
4198 setEval(op);
4199 fSetOpStack.push(op, *fStatus);
4200 fSetStack.push(new UnicodeSet(), *fStatus);
73c04bcf 4201}
b75a7d8f
A
4202
4203U_NAMESPACE_END
4204#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf 4205