]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/regexcmp.cpp
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / i18n / regexcmp.cpp
CommitLineData
b75a7d8f
A
1
2//
3// file: regexcmp.cpp
4//
374ca955 5// Copyright (C) 2002-2004 International Business Machines Corporation and others.
b75a7d8f
A
6// All Rights Reserved.
7//
8// This file contains the ICU regular expression compiler, which is responsible
9// for processing a regular expression pattern into the compiled form that
10// is used by the match finding engine.
11//
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/unistr.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/uchriter.h"
21#include "unicode/parsepos.h"
22#include "unicode/parseerr.h"
23#include "unicode/regex.h"
374ca955 24#include "util.h"
b75a7d8f
A
25#include "cmemory.h"
26#include "cstring.h"
27#include "uvectr32.h"
28#include "uassert.h"
29#include "ucln_in.h"
30#include "mutex.h"
31
32#include "regeximp.h"
33#include "regexcst.h" // Contains state table for the regex pattern parser.
34 // generated by a Perl script.
35#include "regexcmp.h"
36#include "regexst.h"
37
38
39
40U_NAMESPACE_BEGIN
41
42
43
44
45
46//----------------------------------------------------------------------------------------
47//
48// Constructor.
49//
50//----------------------------------------------------------------------------------------
51RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
52{
53 fStatus = &status;
54
55 fRXPat = rxp;
56 fScanIndex = 0;
57 fNextIndex = 0;
58 fPeekChar = -1;
59 fLineNum = 1;
60 fCharNum = 0;
61 fQuoteMode = FALSE;
62 fInBackslashQuote = FALSE;
63 fModeFlags = fRXPat->fFlags;
64 fEOLComments = TRUE;
65
66 fMatchOpenParen = -1;
67 fMatchCloseParen = -1;
68 fStringOpStart = -1;
69
70 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
71 status = rxp->fDeferredStatus;
72 }
73}
74
75
76
77//----------------------------------------------------------------------------------------
78//
79// Destructor
80//
81//----------------------------------------------------------------------------------------
82RegexCompile::~RegexCompile() {
83}
84
b75a7d8f
A
85//---------------------------------------------------------------------------------
86//
87// Compile regex pattern. The state machine for rexexp pattern parsing is here.
88// The state tables are hand-written in the file regexcst.txt,
89// and converted to the form used here by a perl
90// script regexcst.pl
91//
92//---------------------------------------------------------------------------------
93void RegexCompile::compile(
94 const UnicodeString &pat, // Source pat to be compiled.
95 UParseError &pp, // Error position info
96 UErrorCode &e) // Error Code
97{
98 fStatus = &e;
99 fParseErr = &pp;
100 fStackPtr = 0;
101 fStack[fStackPtr] = 0;
102
103 if (U_FAILURE(*fStatus)) {
104 return;
105 }
106
107 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
108 U_ASSERT(fRXPat->fPattern.length() == 0);
109
110 // Prepare the RegexPattern object to receive the compiled pattern.
111 // TODO: remove per-instance field, and just use globals directly. (But check perf)
112 fRXPat->fPattern = pat;
113 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
114 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
115
116
117 // Initialize the pattern scanning state machine
118 fPatternLength = pat.length();
119 uint16_t state = 1;
120 const RegexTableEl *tableEl;
121 nextChar(fC); // Fetch the first char from the pattern string.
122
123 //
124 // Main loop for the regex pattern parsing state machine.
125 // Runs once per state transition.
126 // Each time through optionally performs, depending on the state table,
127 // - an advance to the the next pattern char
128 // - an action to be performed.
129 // - pushing or popping a state to/from the local state return stack.
130 // file regexcst.txt is the source for the state table. The logic behind
131 // recongizing the pattern syntax is there, not here.
132 //
133 for (;;) {
134 // Bail out if anything has gone wrong.
135 // Regex pattern parsing stops on the first error encountered.
136 if (U_FAILURE(*fStatus)) {
137 break;
138 }
139
140 U_ASSERT(state != 0);
141
142 // Find the state table element that matches the input char from the pattern, or the
143 // class of the input character. Start with the first table row for this
144 // state, then linearly scan forward until we find a row that matches the
145 // character. The last row for each state always matches all characters, so
146 // the search will stop there, if not before.
147 //
148 tableEl = &gRuleParseStateTable[state];
374ca955
A
149 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
150 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
b75a7d8f
A
151
152 for (;;) { // loop through table rows belonging to this state, looking for one
153 // that matches the current input char.
374ca955 154 REGEX_SCAN_DEBUG_PRINTF(("."));
b75a7d8f
A
155 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
156 // Table row specified an individual character, not a set, and
157 // the input character is not quoted, and
158 // the input character matched it.
159 break;
160 }
161 if (tableEl->fCharClass == 255) {
162 // Table row specified default, match anything character class.
163 break;
164 }
165 if (tableEl->fCharClass == 254 && fC.fQuoted) {
166 // Table row specified "quoted" and the char was quoted.
167 break;
168 }
169 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
170 // Table row specified eof and we hit eof on the input.
171 break;
172 }
173
174 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
175 fC.fQuoted == FALSE && // char is not escaped &&
176 fC.fChar != (UChar32)-1) { // char is not EOF
177 UnicodeSet *uniset = RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128];
178 if (uniset->contains(fC.fChar)) {
179 // Table row specified a character class, or set of characters,
180 // and the current char matches it.
181 break;
182 }
183 }
184
185 // No match on this row, advance to the next row for this state,
186 tableEl++;
187 }
374ca955 188 REGEX_SCAN_DEBUG_PRINTF(("\n"));
b75a7d8f
A
189
190 //
191 // We've found the row of the state table that matches the current input
192 // character from the rules string.
193 // Perform any action specified by this row in the state table.
194 if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
195 // Break out of the state machine loop if the
196 // the action signalled some kind of error, or
197 // the action was to exit, occurs on normal end-of-rules-input.
198 break;
199 }
200
201 if (tableEl->fPushState != 0) {
202 fStackPtr++;
203 if (fStackPtr >= kStackSize) {
204 error(U_REGEX_INTERNAL_ERROR);
374ca955 205 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
b75a7d8f
A
206 fStackPtr--;
207 }
208 fStack[fStackPtr] = tableEl->fPushState;
209 }
210
211 //
212 // NextChar. This is where characters are actually fetched from the pattern.
213 // Happens under control of the 'n' tag in the state table.
214 //
215 if (tableEl->fNextChar) {
216 nextChar(fC);
217 }
218
219 // Get the next state from the table entry, or from the
220 // state stack if the next state was specified as "pop".
221 if (tableEl->fNextState != 255) {
222 state = tableEl->fNextState;
223 } else {
224 state = fStack[fStackPtr];
225 fStackPtr--;
226 if (fStackPtr < 0) {
227 // state stack underflow
228 // This will occur if the user pattern has mis-matched parentheses,
229 // with extra close parens.
230 //
231 fStackPtr++;
232 error(U_REGEX_MISMATCHED_PAREN);
233 }
234 }
235
236 }
237
238 //
239 // The pattern has now been read and processed, and the compiled code generated.
240 //
241
242 // Back-reference fixup
243 //
244 int32_t loc;
245 for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
246 int32_t op = fRXPat->fCompiledPat->elementAti(loc);
247 int32_t opType = URX_TYPE(op);
248 if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
249 int32_t where = URX_VAL(op);
250 if (where > fRXPat->fGroupMap->size()) {
251 error(U_REGEX_INVALID_BACK_REF);
252 break;
253 }
254 where = fRXPat->fGroupMap->elementAti(where-1);
255 op = URX_BUILD(opType, where);
256 fRXPat->fCompiledPat->setElementAt(op, loc);
257 }
258 }
259
260
261 //
262 // Compute the number of digits requried for the largest capture group number.
263 //
264 fRXPat->fMaxCaptureDigits = 1;
265 int32_t n = 10;
266 for (;;) {
267 if (n > fRXPat->fGroupMap->size()) {
268 break;
269 }
270 fRXPat->fMaxCaptureDigits++;
271 n *= 10;
272 }
273
274 //
275 // The pattern's fFrameSize so far has accumulated the requirements for
276 // storage for capture parentheses, counters, etc. that are encountered
277 // in the pattern. Add space for the two variables that are always
278 // present in the saved state: the input string position and the
279 // position in the compiled pattern.
280 //
281 fRXPat->fFrameSize+=2;
282
283 //
284 // Get bounds for the minimum and maximum length of a string that this
285 // pattern can match. Used to avoid looking for matches in strings that
286 // are too short.
287 //
288 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
289
290 //
291 // Optimization passes
292 //
293 matchStartType();
294 OptDotStar();
295 stripNOPs();
296
297 //
298 // Set up fast latin-1 range sets
299 //
300 int32_t numSets = fRXPat->fSets->size();
301 fRXPat->fSets8 = new Regex8BitSet[numSets];
302 int32_t i;
303 for (i=0; i<numSets; i++) {
304 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
305 fRXPat->fSets8[i].init(s);
306 }
307
b75a7d8f
A
308}
309
310
311
312
313
314//----------------------------------------------------------------------------------------
315//
316// doParseAction Do some action during regex pattern parsing.
317// Called by the parse state machine.
318//
319// Generation of the match engine PCode happens here, or
320// in functions called from the parse actions defined here.
321//
322//
323//----------------------------------------------------------------------------------------
324UBool RegexCompile::doParseActions(EParseAction action)
325{
326 UBool returnVal = TRUE;
327
328 switch ((Regex_PatternParseAction)action) {
329
330 case doPatStart:
331 // Start of pattern compiles to:
332 //0 SAVE 2 Fall back to position of FAIL
333 //1 jmp 3
334 //2 FAIL Stop if we ever reach here.
335 //3 NOP Dummy, so start of pattern looks the same as
336 // the start of an ( grouping.
337 //4 NOP Resreved, will be replaced by a save if there are
338 // OR | operators at the top level
339 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
340 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
341 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
342 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
343 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
344
345 fParenStack.push(-1, *fStatus); // Begin a Paren Stack Frame
346 fParenStack.push( 3, *fStatus); // Push location of first NOP
347 break;
348
349 case doPatFinish:
350 // We've scanned to the end of the pattern
351 // The end of pattern compiles to:
352 // URX_END
353 // which will stop the runtime match engine.
354 // Encountering end of pattern also behaves like a close paren,
355 // and forces fixups of the State Save at the beginning of the compiled pattern
356 // and of any OR operations at the top level.
357 //
358 handleCloseParen();
359 if (fParenStack.size() > 0) {
360 // Missing close paren in pattern.
361 error(U_REGEX_MISMATCHED_PAREN);
362 }
363
364 // add the END operation to the compiled pattern.
365 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
366
367 // Terminate the pattern compilation state machine.
368 returnVal = FALSE;
369 break;
370
371
372
373 case doOrOperator:
374 // Scanning a '|', as in (A|B)
375 {
376 // Insert a SAVE operation at the start of the pattern section preceding
377 // this OR at this level. This SAVE will branch the match forward
378 // to the right hand side of the OR in the event that the left hand
379 // side fails to match and backtracks. Locate the position for the
380 // save from the location on the top of the parentheses stack.
381 int32_t savePosition = fParenStack.popi();
382 int32_t op = fRXPat->fCompiledPat->elementAti(savePosition);
383 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
384 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
385 fRXPat->fCompiledPat->setElementAt(op, savePosition);
386
387 // Append an JMP operation into the compiled pattern. The operand for
388 // the JMP will eventually be the location following the ')' for the
389 // group. This will be patched in later, when the ')' is encountered.
390 op = URX_BUILD(URX_JMP, 0);
391 fRXPat->fCompiledPat->addElement(op, *fStatus);
392
393 // Push the position of the newly added JMP op onto the parentheses stack.
394 // This registers if for fixup when this block's close paren is encountered.
395 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
396
397 // Append a NOP to the compiled pattern. This is the slot reserved
398 // for a SAVE in the event that there is yet another '|' following
399 // this one.
400 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
401 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
402 }
403 break;
404
405
406 case doOpenCaptureParen:
407 // Open Paren.
408 // Compile to a
409 // - NOP, which later may be replaced by a save-state if the
410 // parenthesized group gets a * quantifier, followed by
411 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
412 // - NOP, which may later be replaced by a save-state if there
413 // is an '|' alternation within the parens.
414 //
415 // Each capture group gets three slots in the save stack frame:
416 // 0: Capture Group start position (in input string being matched.)
417 // 1: Capture Group end positino.
418 // 2: Start of Match-in-progress.
419 // The first two locations are for a completed capture group, and are
420 // referred to by back references and the like.
421 // The third location stores the capture start position when an START_CAPTURE is
422 // encountered. This will be promoted to a completed capture when (and if) the corresponding
423 // END_CAPure is encountered.
424 {
425 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
426 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
427 fRXPat->fFrameSize += 3;
428 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
429 fRXPat->fCompiledPat->addElement(cop, *fStatus);
430 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
431
432 // On the Parentheses stack, start a new frame and add the postions
433 // of the two NOPs. Depending on what follows in the pattern, the
434 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
435 // address of the end of the parenthesized group.
436 fParenStack.push(fModeFlags, *fStatus); // Match mode state
437 fParenStack.push(capturing, *fStatus); // Frame type.
438 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
439 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
440
441 // Save the mapping from group number to stack frame variable position.
442 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
443 }
444 break;
445
446 case doOpenNonCaptureParen:
447 // Open non-caputuring (grouping only) Paren.
448 // Compile to a
449 // - NOP, which later may be replaced by a save-state if the
450 // parenthesized group gets a * quantifier, followed by
451 // - NOP, which may later be replaced by a save-state if there
452 // is an '|' alternation within the parens.
453 {
454 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
455 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
456
457 // On the Parentheses stack, start a new frame and add the postions
458 // of the two NOPs.
459 fParenStack.push(fModeFlags, *fStatus); // Match mode state
460 fParenStack.push(plain, *fStatus); // Begin a new frame.
461 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
462 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
463 }
464 break;
465
466
467 case doOpenAtomicParen:
468 // Open Atomic Paren. (?>
469 // Compile to a
470 // - NOP, which later may be replaced if the parenthesized group
471 // has a quantifier, followed by
472 // - STO_SP save state stack position, so it can be restored at the ")"
473 // - NOP, which may later be replaced by a save-state if there
474 // is an '|' alternation within the parens.
475 {
476 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
477 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
478 fRXPat->fDataSize += 1; // state stack ptr.
479 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
480 fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
481 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
482
483 // On the Parentheses stack, start a new frame and add the postions
484 // of the two NOPs. Depending on what follows in the pattern, the
485 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
486 // address of the end of the parenthesized group.
487 fParenStack.push(fModeFlags, *fStatus); // Match mode state
488 fParenStack.push(atomic, *fStatus); // Frame type.
489 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
490 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
491 }
492 break;
493
494
495 case doOpenLookAhead:
496 // Positive Look-ahead (?= stuff )
497 // Compiles to
498 // 1 START_LA dataLoc
499 // 2. NOP reserved for use by quantifiers on the block.
500 // Look-ahead can't have quantifiers, but paren stack
501 // compile time conventions require the slot anyhow.
502 // 3. NOP may be replaced if there is are '|' ops in the block.
503 // 4. code for parenthesized stuff.
504 // 5. ENDLA
505 //
506 // Two data slots are reserved, for saving the stack ptr and the input position.
507 {
508 int32_t dataLoc = fRXPat->fDataSize;
509 fRXPat->fDataSize += 2;
510 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
511 fRXPat->fCompiledPat->addElement(op, *fStatus);
512
513 op = URX_BUILD(URX_NOP, 0);
514 fRXPat->fCompiledPat->addElement(op, *fStatus);
515 fRXPat->fCompiledPat->addElement(op, *fStatus);
516
517 // On the Parentheses stack, start a new frame and add the postions
518 // of the NOPs.
519 fParenStack.push(fModeFlags, *fStatus); // Match mode state
520 fParenStack.push(lookAhead, *fStatus); // Frame type.
521 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
522 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
523 }
524 break;
525
526 case doOpenLookAheadNeg:
527 // Negated Lookahead. (?! stuff )
528 // Compiles to
529 // 1. START_LA dataloc
530 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
531 // // which continues with the match.
532 // 3. NOP // Std. Open Paren sequence, for possible '|'
533 // 4. code for parenthesized stuff.
534 // 5. END_LA // Cut back stack, remove saved state from step 2.
535 // 6. FAIL // code in block succeeded, so neg. lookahead fails.
536 // 7. ...
537 {
538 int32_t dataLoc = fRXPat->fDataSize;
539 fRXPat->fDataSize += 2;
540 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
541 fRXPat->fCompiledPat->addElement(op, *fStatus);
542
543 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later.
544 fRXPat->fCompiledPat->addElement(op, *fStatus);
545
546 op = URX_BUILD(URX_NOP, 0);
547 fRXPat->fCompiledPat->addElement(op, *fStatus);
548
549 // On the Parentheses stack, start a new frame and add the postions
550 // of the StateSave and NOP.
551 fParenStack.push(fModeFlags, *fStatus); // Match mode state
552 fParenStack.push( negLookAhead, *fStatus); // Frame type
553 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
554 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
555
556 // Instructions #5 and #6 will be added when the ')' is encountered.
557 }
558 break;
559
560 case doOpenLookBehind:
561 {
562 // Compile a (?<= look-behind open paren.
563 //
564 // Compiles to
565 // 0 URX_LB_START dataLoc
566 // 1 URX_LB_CONT dataLoc
567 // 2 MinMatchLen
568 // 3 MaxMatchLen
569 // 4 URX_NOP Standard '(' boilerplate.
570 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
571 // 6 <code for LookBehind expression>
572 // 7 URX_LB_END dataLoc # Check match len, restore input len
573 // 8 URX_LA_END dataLoc # Restore stack, input pos
574 //
575 // Allocate a block of matcher data, to contain (when running a match)
576 // 0: Stack ptr on entry
577 // 1: Input Index on entry
578 // 2: Start index of match current match attempt.
579 // 3: Original Input String len.
580
581 // Allocate data space
582 int32_t dataLoc = fRXPat->fDataSize;
583 fRXPat->fDataSize += 4;
584
585 // Emit URX_LB_START
586 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
587 fRXPat->fCompiledPat->addElement(op, *fStatus);
588
589 // Emit URX_LB_CONT
590 op = URX_BUILD(URX_LB_CONT, dataLoc);
591 fRXPat->fCompiledPat->addElement(op, *fStatus);
592 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
593 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
594
595 // Emit the NOP
596 op = URX_BUILD(URX_NOP, 0);
597 fRXPat->fCompiledPat->addElement(op, *fStatus);
598 fRXPat->fCompiledPat->addElement(op, *fStatus);
599
600 // On the Parentheses stack, start a new frame and add the postions
601 // of the URX_LB_CONT and the NOP.
602 fParenStack.push(fModeFlags, *fStatus); // Match mode state
603 fParenStack.push(lookBehind, *fStatus); // Frame type
604 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
605 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
606
607 // The final two instructions will be added when the ')' is encountered.
608 }
609
610 break;
611
612 case doOpenLookBehindNeg:
613 {
614 // Compile a (?<! negated look-behind open paren.
615 //
616 // Compiles to
617 // 0 URX_LB_START dataLoc # Save entry stack, input len
618 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
619 // 2 MinMatchLen
620 // 3 MaxMatchLen
621 // 4 continueLoc (9)
622 // 5 URX_NOP Standard '(' boilerplate.
623 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
624 // 7 <code for LookBehind expression>
625 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
626 // 9 ...
627 //
628 // Allocate a block of matcher data, to contain (when running a match)
629 // 0: Stack ptr on entry
630 // 1: Input Index on entry
631 // 2: Start index of match current match attempt.
632 // 3: Original Input String len.
633
634 // Allocate data space
635 int32_t dataLoc = fRXPat->fDataSize;
636 fRXPat->fDataSize += 4;
637
638 // Emit URX_LB_START
639 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
640 fRXPat->fCompiledPat->addElement(op, *fStatus);
641
642 // Emit URX_LBN_CONT
643 op = URX_BUILD(URX_LBN_CONT, dataLoc);
644 fRXPat->fCompiledPat->addElement(op, *fStatus);
645 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
646 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
647 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
648
649 // Emit the NOP
650 op = URX_BUILD(URX_NOP, 0);
651 fRXPat->fCompiledPat->addElement(op, *fStatus);
652 fRXPat->fCompiledPat->addElement(op, *fStatus);
653
654 // On the Parentheses stack, start a new frame and add the postions
655 // of the URX_LB_CONT and the NOP.
656 fParenStack.push(fModeFlags, *fStatus); // Match mode state
657 fParenStack.push(lookBehindN, *fStatus); // Frame type
658 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
659 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
660
661 // The final two instructions will be added when the ')' is encountered.
662 }
663 break;
664
665 case doConditionalExpr:
666 // Conditionals such as (?(1)a:b)
667 case doPerlInline:
668 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
669 error(U_REGEX_UNIMPLEMENTED);
670 break;
671
672
673 case doCloseParen:
674 handleCloseParen();
675 if (fParenStack.size() <= 0) {
676 // Extra close paren, or missing open paren.
677 error(U_REGEX_MISMATCHED_PAREN);
678 }
679 break;
680
681 case doNOP:
682 break;
683
684
685 case doBadOpenParenType:
686 case doRuleError:
687 error(U_REGEX_RULE_SYNTAX);
688 break;
689
690
691 case doMismatchedParenErr:
692 error(U_REGEX_MISMATCHED_PAREN);
693 break;
694
695 case doPlus:
696 // Normal '+' compiles to
697 // 1. stuff to be repeated (already built)
698 // 2. jmp-sav 1
699 // 3. ...
700 //
701 // Or, if the item to be repeated can match a zero length string,
702 // 1. STO_INP_LOC data-loc
703 // 2. body of stuff to be repeated
704 // 3. JMP_SAV_X 2
705 // 4. ...
706
707 //
708 // Or, if the item to be repeated is simple
709 // 1. Item to be repeated.
710 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
711 // 3. LOOP_C stack location
712 {
713 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
714 int32_t frameLoc;
715
716 // Check for simple constructs, which may get special optimized code.
717 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
718 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
719
720 if (URX_TYPE(repeatedOp) == URX_SETREF) {
721 // Emit optimized code for [char set]+
722 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
723 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
724 frameLoc = fRXPat->fFrameSize;
725 fRXPat->fFrameSize++;
726 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
727 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
728 break;
729 }
730
731 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
732 URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
733 // Emit Optimized code for .+ operations.
734 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
735 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
736 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
737 loopOpI |= 1;
738 }
739 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
740 frameLoc = fRXPat->fFrameSize;
741 fRXPat->fFrameSize++;
742 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
743 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
744 break;
745 }
746
747 }
748
749 // General case.
750
751 // Check for minimum match length of zero, which requires
752 // extra loop-breaking code.
753 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
754 // Zero length match is possible.
755 // Emit the code sequence that can handle it.
756 insertOp(topLoc);
757 frameLoc = fRXPat->fFrameSize;
758 fRXPat->fFrameSize++;
759
760 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
761 fRXPat->fCompiledPat->setElementAt(op, topLoc);
762
763 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
764 fRXPat->fCompiledPat->addElement(op, *fStatus);
765 } else {
766 // Simpler code when the repeated body must match something non-empty
767 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
768 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
769 }
770 }
771 break;
772
773 case doNGPlus:
774 // Non-greedy '+?' compiles to
775 // 1. stuff to be repeated (already built)
776 // 2. state-save 1
777 // 3. ...
778 {
779 int32_t topLoc = blockTopLoc(FALSE);
780 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
781 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
782 }
783 break;
784
785
786 case doOpt:
787 // Normal (greedy) ? quantifier.
788 // Compiles to
789 // 1. state save 3
790 // 2. body of optional block
791 // 3. ...
792 // Insert the state save into the compiled pattern, and we're done.
793 {
794 int32_t saveStateLoc = blockTopLoc(TRUE);
795 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
796 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
797 }
798 break;
799
800 case doNGOpt:
801 // Non-greedy ?? quantifier
802 // compiles to
803 // 1. jmp 4
804 // 2. body of optional block
805 // 3 jmp 5
806 // 4. state save 2
807 // 5 ...
808 // This code is less than ideal, with two jmps instead of one, because we can only
809 // insert one instruction at the top of the block being iterated.
810 {
811 int32_t jmp1_loc = blockTopLoc(TRUE);
812 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
813
814 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1);
815 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
816
817 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2);
818 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
819
820 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
821 fRXPat->fCompiledPat->addElement(save_op, *fStatus);
822 }
823 break;
824
825
826 case doStar:
827 // Normal (greedy) * quantifier.
828 // Compiles to
829 // 1. STATE_SAVE 4
830 // 2. body of stuff being iterated over
831 // 3. JMP_SAV 2
832 // 4. ...
833 //
834 // Or, if the body is a simple [Set],
835 // 1. LOOP_SR_I set number
836 // 2. LOOP_C stack location
837 // ...
838 //
839 // Or if this is a .*
840 // 1. LOOP_DOT_I (. matches all mode flag)
841 // 2. LOOP_C stack location
842 //
843 // Or, if the body can match a zero-length string, to inhibit infinite loops,
844 // 1. STATE_SAVE 5
845 // 2. STO_INP_LOC data-loc
846 // 3. body of stuff
847 // 4. JMP_SAV_X 2
848 // 5. ...
849 {
850 // location of item #1, the STATE_SAVE
851 int32_t topLoc = blockTopLoc(FALSE);
852 int32_t dataLoc = -1;
853
854 // Check for simple *, where the construct being repeated
855 // compiled to single opcode, and might be optimizable.
856 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
857 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
858
859 if (URX_TYPE(repeatedOp) == URX_SETREF) {
860 // Emit optimized code for a [char set]*
861 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
862 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
863 dataLoc = fRXPat->fFrameSize;
864 fRXPat->fFrameSize++;
865 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
866 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
867 break;
868 }
869
870 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
871 URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
872 // Emit Optimized code for .* operations.
873 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
874 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
875 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
876 loopOpI |= 1;
877 }
878 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
879 dataLoc = fRXPat->fFrameSize;
880 fRXPat->fFrameSize++;
881 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
882 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
883 break;
884 }
885 }
886
887 // Emit general case code for this *
888 // The optimizations did not apply.
889
890 int32_t saveStateLoc = blockTopLoc(TRUE);
891 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
892
893 // Check for minimum match length of zero, which requires
894 // extra loop-breaking code.
895 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
896 insertOp(saveStateLoc);
897 dataLoc = fRXPat->fFrameSize;
898 fRXPat->fFrameSize++;
899
900 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
901 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
902 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
903 }
904
905 // Locate the position in the compiled pattern where the match will continue
906 // after completing the *. (4 or 5 in the comment above)
907 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
908
909 // Put together the save state op store it into the compiled code.
910 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
911 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
912
913 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
914 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
915 }
916 break;
917
918 case doNGStar:
919 // Non-greedy *? quantifier
920 // compiles to
921 // 1. JMP 3
922 // 2. body of stuff being iterated over
923 // 3. STATE_SAVE 2
924 // 4 ...
925 {
926 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
927 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
928 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
929 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
930 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
931 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
932 }
933 break;
934
935
936 case doIntervalInit:
937 // The '{' opening an interval quantifier was just scanned.
938 // Init the counter varaiables that will accumulate the values as the digits
939 // are scanned.
940 fIntervalLow = 0;
941 fIntervalUpper = -1;
942 break;
943
944 case doIntevalLowerDigit:
945 // Scanned a digit from the lower value of an {lower,upper} interval
946 {
947 int32_t digitValue = u_charDigitValue(fC.fChar);
948 U_ASSERT(digitValue >= 0);
949 fIntervalLow = fIntervalLow*10 + digitValue;
950 if (fIntervalLow < 0) {
951 error(U_REGEX_NUMBER_TOO_BIG);
952 }
953 }
954 break;
955
956 case doIntervalUpperDigit:
957 // Scanned a digit from the upper value of an {lower,upper} interval
958 {
959 if (fIntervalUpper < 0) {
960 fIntervalUpper = 0;
961 }
962 int32_t digitValue = u_charDigitValue(fC.fChar);
963 U_ASSERT(digitValue >= 0);
964 fIntervalUpper = fIntervalUpper*10 + digitValue;
374ca955 965 if (fIntervalUpper < 0) {
b75a7d8f
A
966 error(U_REGEX_NUMBER_TOO_BIG);
967 }
968 }
969 break;
970
971 case doIntervalSame:
972 // Scanned a single value interval like {27}. Upper = Lower.
973 fIntervalUpper = fIntervalLow;
974 break;
975
976 case doInterval:
977 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
978 if (compileInlineInterval() == FALSE) {
979 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
980 }
981 break;
982
983 case doPossessiveInterval:
984 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
985 {
986 // Remember the loc for the top of the block being looped over.
987 // (Can not reserve a slot in the compiled pattern at this time, becuase
988 // compileInterval needs to reserve also, and blockTopLoc can only reserve
989 // once per block.)
990 int32_t topLoc = blockTopLoc(FALSE);
991
992 // Produce normal looping code.
993 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
994
995 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
996 // just as if the loop was inclosed in atomic parentheses.
997
998 // First the STO_SP before the start of the loop
999 insertOp(topLoc);
1000 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
1001 fRXPat->fDataSize += 1; // state stack ptr.
1002 int32_t op = URX_BUILD(URX_STO_SP, varLoc);
1003 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1004
1005 int32_t loopOp = fRXPat->fCompiledPat->popi();
1006 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1007 loopOp++; // point LoopOp after the just-inserted STO_SP
1008 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1009
1010 // Then the LD_SP after the end of the loop
1011 op = URX_BUILD(URX_LD_SP, varLoc);
1012 fRXPat->fCompiledPat->addElement(op, *fStatus);
1013 }
1014
1015 break;
1016
1017 case doNGInterval:
1018 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1019 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1020 break;
1021
1022 case doIntervalError:
1023 error(U_REGEX_BAD_INTERVAL);
1024 break;
1025
1026 case doLiteralChar:
1027 // We've just scanned a "normal" character from the pattern,
1028 literalChar(fC.fChar);
1029 break;
1030
1031
1032
1033 case doDotAny:
1034 // scanned a ".", match any single character.
1035 {
1036 int32_t op;
1037 if (fModeFlags & UREGEX_DOTALL) {
1038 op = URX_BUILD(URX_DOTANY_ALL, 0);
1039 } else {
1040 op = URX_BUILD(URX_DOTANY, 0);
1041 }
1042 fRXPat->fCompiledPat->addElement(op, *fStatus);
1043 }
1044 break;
1045
1046 case doCaret:
1047 {
1048 int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET;
1049 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1050 }
1051 break;
1052
1053
1054 case doDollar:
1055 {
1056 int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR;
1057 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1058 }
1059 break;
1060
1061 case doBackslashA:
1062 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
1063 break;
1064
1065 case doBackslashB:
374ca955
A
1066 {
1067 #if UCONFIG_NO_BREAK_ITERATION==1
1068 if (fModeFlags & UREGEX_UWORD) {
1069 error(U_UNSUPPORTED_ERROR);
1070 }
1071 #endif
1072 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1073 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
1074 }
b75a7d8f
A
1075 break;
1076
1077 case doBackslashb:
374ca955
A
1078 {
1079 #if UCONFIG_NO_BREAK_ITERATION==1
1080 if (fModeFlags & UREGEX_UWORD) {
1081 error(U_UNSUPPORTED_ERROR);
1082 }
1083 #endif
1084 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1085 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1086 }
b75a7d8f
A
1087 break;
1088
1089 case doBackslashD:
1090 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
1091 break;
1092
1093 case doBackslashd:
1094 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
1095 break;
1096
1097 case doBackslashG:
1098 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
1099 break;
1100
1101 case doBackslashS:
1102 fRXPat->fCompiledPat->addElement(
1103 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
1104 break;
1105
1106 case doBackslashs:
1107 fRXPat->fCompiledPat->addElement(
1108 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
1109 break;
1110
1111 case doBackslashW:
1112 fRXPat->fCompiledPat->addElement(
1113 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
1114 break;
1115
1116 case doBackslashw:
1117 fRXPat->fCompiledPat->addElement(
1118 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
1119 break;
1120
1121 case doBackslashX:
1122 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
1123 break;
1124
1125
1126 case doBackslashZ:
1127 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
1128 break;
1129
1130 case doBackslashz:
1131 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
1132 break;
1133
1134 case doEscapeError:
1135 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1136 break;
1137
1138 case doExit:
1139 returnVal = FALSE;
1140 break;
1141
1142 case doProperty:
1143 {
1144 UnicodeSet *theSet = scanProp();
1145 compileSet(theSet);
1146 }
1147 break;
1148
1149
1150 case doScanUnicodeSet:
1151 {
1152 UnicodeSet *theSet = scanSet();
1153 compileSet(theSet);
1154 }
1155 break;
1156
1157 case doEnterQuoteMode:
1158 // Just scanned a \Q. Put character scanner into quote mode.
1159 fQuoteMode = TRUE;
1160 break;
1161
1162 case doBackRef:
1163 // BackReference. Somewhat unusual in that the front-end can not completely parse
1164 // the regular expression, because the number of digits to be consumed
1165 // depends on the number of capture groups that have been defined. So
1166 // we have to do it here instead.
1167 {
1168 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1169 int32_t groupNum = 0;
1170 UChar32 c = fC.fChar;
1171
1172 for (;;) {
1173 // Loop once per digit, for max allowed number of digits in a back reference.
1174 int32_t digit = u_charDigitValue(c);
1175 groupNum = groupNum * 10 + digit;
1176 if (groupNum >= numCaptureGroups) {
1177 break;
1178 }
1179 c = peekCharLL();
1180 if (RegexStaticSets::gStaticSets->fRuleDigits->contains(c) == FALSE) {
1181 break;
1182 }
1183 nextCharLL();
1184 }
1185
1186 // Scan of the back reference in the source regexp is complete. Now generate
1187 // the compiled code for it.
1188 // Because capture groups can be forward-referenced by back-references,
1189 // we fill the operand with the capture group number. At the end
374ca955 1190 // of compilation, it will be changed to the variable's location.
b75a7d8f
A
1191 U_ASSERT(groupNum > 0);
1192 int32_t op;
1193 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1194 op = URX_BUILD(URX_BACKREF_I, groupNum);
1195 } else {
1196 op = URX_BUILD(URX_BACKREF, groupNum);
1197 }
1198 fRXPat->fCompiledPat->addElement(op, *fStatus);
1199 }
1200 break;
1201
1202
b75a7d8f
A
1203 case doPossessivePlus:
1204 // Possessive ++ quantifier.
1205 // Compiles to
1206 // 1. STO_SP
1207 // 2. body of stuff being iterated over
1208 // 3. STATE_SAVE 5
1209 // 4. JMP 2
1210 // 5. LD_SP
1211 // 6. ...
1212 //
1213 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1214 // then unconditionally discarded. Perhaps introduce a new opcode
1215 //
1216 {
1217 // Emit the STO_SP
1218 int32_t topLoc = blockTopLoc(TRUE);
1219 int32_t stoLoc = fRXPat->fDataSize;
1220 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1221 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1222 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1223
1224 // Emit the STATE_SAVE
1225 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1226 fRXPat->fCompiledPat->addElement(op, *fStatus);
1227
1228 // Emit the JMP
1229 op = URX_BUILD(URX_JMP, topLoc+1);
1230 fRXPat->fCompiledPat->addElement(op, *fStatus);
1231
1232 // Emit the LD_SP
1233 op = URX_BUILD(URX_LD_SP, stoLoc);
1234 fRXPat->fCompiledPat->addElement(op, *fStatus);
1235 }
1236 break;
1237
1238 case doPossessiveStar:
1239 // Possessive *+ quantifier.
1240 // Compiles to
1241 // 1. STO_SP loc
1242 // 2. STATE_SAVE 5
1243 // 3. body of stuff being iterated over
1244 // 4. JMP 2
1245 // 5. LD_SP loc
1246 // 6 ...
1247 // TODO: do something to cut back the state stack each time through the loop.
1248 {
1249 // Reserve two slots at the top of the block.
1250 int32_t topLoc = blockTopLoc(TRUE);
1251 insertOp(topLoc);
1252
1253 // emit STO_SP loc
1254 int32_t stoLoc = fRXPat->fDataSize;
1255 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1256 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1257 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1258
1259 // Emit the SAVE_STATE 5
1260 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1261 op = URX_BUILD(URX_STATE_SAVE, L7);
1262 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1263
1264 // Append the JMP operation.
1265 op = URX_BUILD(URX_JMP, topLoc+1);
1266 fRXPat->fCompiledPat->addElement(op, *fStatus);
1267
1268 // Emit the LD_SP loc
1269 op = URX_BUILD(URX_LD_SP, stoLoc);
1270 fRXPat->fCompiledPat->addElement(op, *fStatus);
1271 }
1272 break;
1273
1274 case doPossessiveOpt:
1275 // Possessive ?+ quantifier.
1276 // Compiles to
1277 // 1. STO_SP loc
1278 // 2. SAVE_STATE 5
1279 // 3. body of optional block
1280 // 4. LD_SP loc
1281 // 5. ...
1282 //
1283 {
1284 // Reserve two slots at the top of the block.
1285 int32_t topLoc = blockTopLoc(TRUE);
1286 insertOp(topLoc);
1287
1288 // Emit the STO_SP
1289 int32_t stoLoc = fRXPat->fDataSize;
1290 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1291 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1292 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1293
1294 // Emit the SAVE_STATE
1295 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1296 op = URX_BUILD(URX_STATE_SAVE, continueLoc);
1297 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1298
1299 // Emit the LD_SP
1300 op = URX_BUILD(URX_LD_SP, stoLoc);
1301 fRXPat->fCompiledPat->addElement(op, *fStatus);
1302 }
1303 break;
1304
1305
1306 case doBeginMatchMode:
1307 fNewModeFlags = fModeFlags;
1308 fSetModeFlag = TRUE;
1309 break;
1310
1311 case doMatchMode: // (?i) and similar
1312 {
1313 int32_t bit = 0;
1314 switch (fC.fChar) {
1315 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1316 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1317 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
374ca955 1318 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
b75a7d8f
A
1319 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1320 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1321 default:
1322 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
1323 // by the scanner.
1324 }
1325 if (fSetModeFlag) {
1326 fNewModeFlags |= bit;
1327 } else {
1328 fNewModeFlags &= ~bit;
1329 }
1330 }
1331 break;
1332
1333 case doSetMatchMode:
1334 // We've got a (?i) or similar. The match mode is being changed, but
1335 // the change is not scoped to a parenthesized block.
1336 fModeFlags = fNewModeFlags;
1337
1338 // Prevent any string from spanning across the change of match mode.
1339 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
1340 fixLiterals();
1341 break;
1342
1343
1344 case doMatchModeParen:
1345 // We've got a (?i: or similar. Begin a parenthesized block, save old
1346 // mode flags so they can be restored at the close of the block.
1347 //
1348 // Compile to a
1349 // - NOP, which later may be replaced by a save-state if the
1350 // parenthesized group gets a * quantifier, followed by
1351 // - NOP, which may later be replaced by a save-state if there
1352 // is an '|' alternation within the parens.
1353 {
1354 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1355 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1356
1357 // On the Parentheses stack, start a new frame and add the postions
1358 // of the two NOPs (a normal non-capturing () frame, except for the
1359 // saving of the orignal mode flags.)
1360 fParenStack.push(fModeFlags, *fStatus);
1361 fParenStack.push(flags, *fStatus); // Frame Marker
1362 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1363 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1364
1365 // Set the current mode flags to the new values.
1366 fModeFlags = fNewModeFlags;
1367 }
1368 break;
1369
374ca955
A
1370 case doBadModeFlag:
1371 error(U_REGEX_INVALID_FLAG);
1372 break;
1373
b75a7d8f
A
1374 case doSuppressComments:
1375 // We have just scanned a '(?'. We now need to prevent the character scanner from
1376 // treating a '#' as a to-the-end-of-line comment.
1377 // (This Perl compatibility just gets uglier and uglier to do...)
1378 fEOLComments = FALSE;
1379 break;
1380
1381
1382
1383 default:
1384 U_ASSERT(FALSE);
1385 error(U_REGEX_INTERNAL_ERROR);
1386 break;
1387 }
1388
1389 if (U_FAILURE(*fStatus)) {
1390 returnVal = FALSE;
1391 }
1392
1393 return returnVal;
1394};
1395
1396
1397
1398//------------------------------------------------------------------------------
1399//
1400// literalChar We've encountered a literal character from the pattern,
1401// or an escape sequence that reduces to a character.
1402// Add it to the string containing all literal chars/strings from
1403// the pattern.
1404// If we are in a pattern string already, add the new char to it.
1405// If we aren't in a pattern string, begin one now.
1406//
1407//------------------------------------------------------------------------------
1408void RegexCompile::literalChar(UChar32 c) {
1409 int32_t op; // An operation in the compiled pattern.
1410 int32_t opType;
1411 int32_t patternLoc; // A position in the compiled pattern.
1412 int32_t stringLen;
1413
1414
1415 // If the last thing compiled into the pattern was not a literal char,
1416 // force this new literal char to begin a new string, and not append to the previous.
1417 op = fRXPat->fCompiledPat->lastElementi();
1418 opType = URX_TYPE(op);
1419 if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) {
1420 fixLiterals();
1421 }
1422
1423 if (fStringOpStart == -1) {
1424 // First char of a string in the pattern.
1425 // Emit a OneChar op into the compiled pattern.
1426 emitONE_CHAR(c);
1427
1428 // Also add it to the string pool, in case we get a second adjacent literal
1429 // and want to change form ONE_CHAR to STRING
1430 fStringOpStart = fRXPat->fLiteralText.length();
1431 fRXPat->fLiteralText.append(c);
1432 return;
1433 }
1434
1435 // We are adding onto an existing string
1436 fRXPat->fLiteralText.append(c);
1437
1438 op = fRXPat->fCompiledPat->lastElementi();
1439 opType = URX_TYPE(op);
1440 U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
1441
1442 // If the most recently emitted op is a URX_ONECHAR,
1443 if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
1444 if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) {
1445 // The most recently emitted op is a ONECHAR that was the first half
1446 // of a surrogate pair. Update the ONECHAR's operand to be the
1447 // supplementary code point resulting from both halves of the pair.
1448 c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c);
1449 op = URX_BUILD(opType, c);
1450 patternLoc = fRXPat->fCompiledPat->size() - 1;
1451 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1452 return;
1453 }
1454
1455 // The most recently emitted op is a ONECHAR.
1456 // We've now received another adjacent char. Change the ONECHAR op
1457 // to a string op.
1458 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1459 op = URX_BUILD(URX_STRING_I, fStringOpStart);
1460 } else {
1461 op = URX_BUILD(URX_STRING, fStringOpStart);
1462 }
1463 patternLoc = fRXPat->fCompiledPat->size() - 1;
1464 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1465 op = URX_BUILD(URX_STRING_LEN, 0);
1466 fRXPat->fCompiledPat->addElement(op, *fStatus);
1467 }
1468
1469 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the
1470 // string length to reflect the new char we just added to the string.
1471 stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
1472 op = URX_BUILD(URX_STRING_LEN, stringLen);
1473 patternLoc = fRXPat->fCompiledPat->size() - 1;
1474 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1475}
1476
1477
1478
1479//------------------------------------------------------------------------------
1480//
1481// emitONE_CHAR emit a ONE_CHAR op into the generated code.
1482// Choose cased or uncased version, depending on the
1483// match mode and whether the character itself is cased.
1484//
1485//------------------------------------------------------------------------------
1486void RegexCompile::emitONE_CHAR(UChar32 c) {
1487 int32_t op;
1488 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1489 u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
1490 // We have a cased character, and are in case insensitive matching mode.
1491 c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
1492 op = URX_BUILD(URX_ONECHAR_I, c);
1493 } else {
1494 // Uncased char, or case sensitive match mode.
1495 // Either way, just generate a literal compare of the char.
1496 op = URX_BUILD(URX_ONECHAR, c);
1497 }
1498 fRXPat->fCompiledPat->addElement(op, *fStatus);
1499}
1500
1501
1502//------------------------------------------------------------------------------
1503//
1504// fixLiterals When compiling something that can follow a literal
1505// string in a pattern, we need to "fix" any preceding
1506// string, which will cause any subsequent literals to
1507// begin a new string, rather than appending to the
1508// old one.
1509//
1510// Optionally, split the last char of the string off into
1511// a single "ONE_CHAR" operation, so that quantifiers can
1512// apply to that char alone. Example: abc*
1513// The * must apply to the 'c' only.
1514//
1515//------------------------------------------------------------------------------
1516void RegexCompile::fixLiterals(UBool split) {
1517 int32_t stringStart = fStringOpStart; // start index of the current literal string
1518 int32_t op; // An op from/for the compiled pattern.
1519 int32_t opType; // An opcode type from the compiled pattern.
1520 int32_t stringLastCharIdx;
1521 UChar32 lastChar;
1522 int32_t stringNextToLastCharIdx;
1523 UChar32 nextToLastChar;
1524 int32_t stringLen;
1525
1526 fStringOpStart = -1;
1527 if (!split) {
1528 return;
1529 }
1530
1531 // Split: We need to ensure that the last item in the compiled pattern does
1532 // not refer to a literal string of more than one char. If it does,
1533 // separate the last char from the rest of the string.
1534
1535 // If the last operation from the compiled pattern is not a string,
1536 // nothing needs to be done
1537 op = fRXPat->fCompiledPat->lastElementi();
1538 opType = URX_TYPE(op);
1539 if (opType != URX_STRING_LEN) {
1540 return;
1541 }
1542 stringLen = URX_VAL(op);
1543
1544 //
1545 // Find the position of the last code point in the string (might be a surrogate pair)
1546 //
1547 stringLastCharIdx = fRXPat->fLiteralText.length();
1548 stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
1549 lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx);
1550
1551 // The string should always be at least two code points long, meaning that there
1552 // should be something before the last char position that we just found.
1553 U_ASSERT(stringLastCharIdx > stringStart);
1554 stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
1555 U_ASSERT(stringNextToLastCharIdx >= stringStart);
1556 nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);
1557
1558 if (stringNextToLastCharIdx > stringStart) {
1559 // The length of string remaining after removing one char is two or more.
1560 // Leave the string in the compiled pattern, shorten it by one char,
1561 // and append a URX_ONECHAR op for the last char.
1562 stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
1563 op = URX_BUILD(URX_STRING_LEN, stringLen);
1564 fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
1565 emitONE_CHAR(lastChar);
1566 } else {
1567 // The original string consisted of exactly two characters. Replace
1568 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
1569 // of URX_ONECHARs.
1570 fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
1571 emitONE_CHAR(nextToLastChar);
1572 emitONE_CHAR(lastChar);
1573 }
1574}
1575
1576
1577
1578
1579
1580
1581//------------------------------------------------------------------------------
1582//
1583// insertOp() Insert a slot for a new opcode into the already
1584// compiled pattern code.
1585//
1586// Fill the slot with a NOP. Our caller will replace it
1587// with what they really wanted.
1588//
1589//------------------------------------------------------------------------------
1590void RegexCompile::insertOp(int32_t where) {
1591 UVector32 *code = fRXPat->fCompiledPat;
1592 U_ASSERT(where>0 && where < code->size());
1593
1594 int32_t nop = URX_BUILD(URX_NOP, 0);
1595 code->insertElementAt(nop, where, *fStatus);
1596
1597 // Walk through the pattern, looking for any ops with targets that
1598 // were moved down by the insert. Fix them.
1599 int32_t loc;
1600 for (loc=0; loc<code->size(); loc++) {
1601 int32_t op = code->elementAti(loc);
1602 int32_t opType = URX_TYPE(op);
1603 int32_t opValue = URX_VAL(op);
1604 if ((opType == URX_JMP ||
1605 opType == URX_JMPX ||
1606 opType == URX_STATE_SAVE ||
1607 opType == URX_CTR_LOOP ||
1608 opType == URX_CTR_LOOP_NG ||
1609 opType == URX_JMP_SAV ||
1610 opType == URX_RELOC_OPRND) && opValue > where) {
1611 // Target location for this opcode is after the insertion point and
1612 // needs to be incremented to adjust for the insertion.
1613 opValue++;
1614 op = URX_BUILD(opType, opValue);
1615 code->setElementAt(op, loc);
1616 }
1617 }
1618
1619 // Now fix up the parentheses stack. All positive values in it are locations in
1620 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
1621 for (loc=0; loc<fParenStack.size(); loc++) {
1622 int32_t x = fParenStack.elementAti(loc);
1623 if (x>where) {
1624 x++;
1625 fParenStack.setElementAt(x, loc);
1626 }
1627 }
1628
1629 if (fMatchCloseParen > where) {
1630 fMatchCloseParen++;
1631 }
1632 if (fMatchOpenParen > where) {
1633 fMatchOpenParen++;
1634 }
1635}
1636
1637
1638
1639//------------------------------------------------------------------------------
1640//
1641// blockTopLoc() Find or create a location in the compiled pattern
1642// at the start of the operation or block that has
1643// just been compiled. Needed when a quantifier (* or
1644// whatever) appears, and we need to add an operation
1645// at the start of the thing being quantified.
1646//
1647// (Parenthesized Blocks) have a slot with a NOP that
1648// is reserved for this purpose. .* or similar don't
1649// and a slot needs to be added.
1650//
1651// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
1652// at the returned location.
1653// FALSE - just return the address,
1654// do not reserve a location there.
1655//
1656//------------------------------------------------------------------------------
1657int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
1658 int32_t theLoc;
1659 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
1660 {
1661 // The item just processed is a parenthesized block.
1662 theLoc = fMatchOpenParen; // A slot is already reserved for us.
1663 U_ASSERT(theLoc > 0);
374ca955 1664 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
b75a7d8f
A
1665 }
1666 else {
1667 // Item just compiled is a single thing, a ".", or a single char, or a set reference.
1668 // No slot for STATE_SAVE was pre-reserved in the compiled code.
1669 // We need to make space now.
1670 fixLiterals(TRUE); // If last item was a string, separate the last char.
1671 theLoc = fRXPat->fCompiledPat->size()-1;
1672 if (reserveLoc) {
374ca955 1673 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/
b75a7d8f
A
1674 int32_t nop = URX_BUILD(URX_NOP, 0);
1675 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
1676 }
1677 }
1678 return theLoc;
1679}
1680
1681
1682
1683//------------------------------------------------------------------------------
1684//
1685// handleCloseParen When compiling a close paren, we need to go back
1686// and fix up any JMP or SAVE operations within the
1687// parenthesized block that need to target the end
1688// of the block. The locations of these are kept on
1689// the paretheses stack.
1690//
1691// This function is called both when encountering a
1692// real ) and at the end of the pattern.
1693//
1694//-------------------------------------------------------------------------------
1695void RegexCompile::handleCloseParen() {
1696 int32_t patIdx;
1697 int32_t patOp;
1698 if (fParenStack.size() <= 0) {
1699 error(U_REGEX_MISMATCHED_PAREN);
1700 return;
1701 }
1702
1703 // Force any literal chars that may follow the close paren to start a new string,
1704 // and not attach to any preceding it.
1705 fixLiterals(FALSE);
1706
1707 // Fixup any operations within the just-closed parenthesized group
1708 // that need to reference the end of the (block).
1709 // (The first one popped from the stack is an unused slot for
1710 // alternation (OR) state save, but applying the fixup to it does no harm.)
1711 for (;;) {
1712 patIdx = fParenStack.popi();
1713 if (patIdx < 0) {
1714 // value < 0 flags the start of the frame on the paren stack.
1715 break;
1716 }
1717 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
1718 patOp = fRXPat->fCompiledPat->elementAti(patIdx);
1719 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
1720 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
1721 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
1722 fMatchOpenParen = patIdx;
1723 }
1724
1725 // At the close of any parenthesized block, restore the match mode flags to
1726 // the value they had at the open paren. Saved value is
1727 // at the top of the paren stack.
1728 fModeFlags = fParenStack.popi();
1729
1730 // DO any additional fixups, depending on the specific kind of
1731 // parentesized grouping this is
1732
1733 switch (patIdx) {
1734 case plain:
1735 case flags:
1736 // No additional fixups required.
1737 // (Grouping-only parentheses)
1738 break;
1739 case capturing:
1740 // Capturing Parentheses.
1741 // Insert a End Capture op into the pattern.
1742 // The frame offset of the variables for this cg is obtained from the
1743 // start capture op and put it into the end-capture op.
1744 {
1745 int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
1746 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
1747
1748 int32_t frameVarLocation = URX_VAL(captureOp);
1749 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
1750 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
1751 }
1752 break;
1753 case atomic:
1754 // Atomic Parenthesis.
1755 // Insert a LD_SP operation to restore the state stack to the position
1756 // it was when the atomic parens were entered.
1757 {
1758 int32_t stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
1759 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
1760 int32_t stoLoc = URX_VAL(stoOp);
1761 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
1762 fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
1763 }
1764 break;
1765
1766 case lookAhead:
1767 {
1768 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
1769 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
1770 int32_t dataLoc = URX_VAL(startOp);
1771 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
1772 fRXPat->fCompiledPat->addElement(op, *fStatus);
1773 }
1774 break;
1775
1776 case negLookAhead:
1777 {
1778 // See comment at doOpenLookAheadNeg
1779 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
1780 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
1781 int32_t dataLoc = URX_VAL(startOp);
1782 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
1783 fRXPat->fCompiledPat->addElement(op, *fStatus);
1784 op = URX_BUILD(URX_FAIL, 0);
1785 fRXPat->fCompiledPat->addElement(op, *fStatus);
1786
1787 // Patch the URX_SAVE near the top of the block.
1788 int32_t saveOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
1789 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
1790 int32_t dest = fRXPat->fCompiledPat->size();
1791 saveOp = URX_BUILD(URX_STATE_SAVE, dest);
1792 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
1793 }
1794 break;
1795
1796 case lookBehind:
1797 {
1798 // See comment at doOpenLookBehind.
1799
1800 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
1801 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
1802 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
1803 int32_t dataLoc = URX_VAL(startOp);
1804 int32_t op = URX_BUILD(URX_LB_END, dataLoc);
1805 fRXPat->fCompiledPat->addElement(op, *fStatus);
1806 op = URX_BUILD(URX_LA_END, dataLoc);
1807 fRXPat->fCompiledPat->addElement(op, *fStatus);
1808
1809 // Determine the min and max bounds for the length of the
1810 // string that the pattern can match.
1811 // An unbounded upper limit is an error.
1812 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
1813 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
1814 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
1815 if (maxML == INT32_MAX) {
1816 error(U_REGEX_LOOK_BEHIND_LIMIT);
1817 break;
1818 }
1819 U_ASSERT(minML <= maxML);
1820
1821 // Insert the min and max match len bounds into the URX_LB_CONT op that
1822 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1823 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
1824 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
1825
1826 }
1827 break;
1828
1829
1830
1831 case lookBehindN:
1832 {
1833 // See comment at doOpenLookBehindNeg.
1834
1835 // Append the URX_LBN_END to the compiled pattern.
1836 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
1837 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
1838 int32_t dataLoc = URX_VAL(startOp);
1839 int32_t op = URX_BUILD(URX_LBN_END, dataLoc);
1840 fRXPat->fCompiledPat->addElement(op, *fStatus);
1841
1842 // Determine the min and max bounds for the length of the
1843 // string that the pattern can match.
1844 // An unbounded upper limit is an error.
1845 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
1846 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
1847 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
1848 if (maxML == INT32_MAX) {
1849 error(U_REGEX_LOOK_BEHIND_LIMIT);
1850 break;
1851 }
1852 U_ASSERT(minML <= maxML);
1853
1854 // Insert the min and max match len bounds into the URX_LB_CONT op that
1855 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1856 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
1857 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
1858
1859 // Insert the pattern location to continue at after a successful match
1860 // as the last operand of the URX_LBN_CONT
1861 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
1862 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
1863 }
1864 break;
1865
1866
1867
1868 default:
1869 U_ASSERT(FALSE);
1870 }
1871
1872 // remember the next location in the compiled pattern.
1873 // The compilation of Quantifiers will look at this to see whether its looping
1874 // over a parenthesized block or a single item
1875 fMatchCloseParen = fRXPat->fCompiledPat->size();
1876}
1877
1878
1879
1880//----------------------------------------------------------------------------------------
1881//
1882// compileSet Compile the pattern operations for a reference to a
1883// UnicodeSet.
1884//
1885//----------------------------------------------------------------------------------------
1886void RegexCompile::compileSet(UnicodeSet *theSet)
1887{
1888 if (theSet == NULL) {
1889 return;
1890 }
1891 int32_t setSize = theSet->size();
1892 UChar32 firstSetChar = theSet->charAt(0);
1893 if (firstSetChar == -1) {
1894 // Sets that contain only strings, but no individual chars,
1895 // will end up here.
1896 error(U_REGEX_SET_CONTAINS_STRING);
1897 setSize = 0;
1898 }
1899
1900 switch (setSize) {
1901 case 0:
1902 {
1903 // Set of no elements. Always fails to match.
1904 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
1905 delete theSet;
1906 }
1907 break;
1908
1909 case 1:
1910 {
1911 // The set contains only a single code point. Put it into
1912 // the compiled pattern as a single char operation rather
1913 // than a set, and discard the set itself.
1914 literalChar(firstSetChar);
1915 delete theSet;
1916 }
1917 break;
1918
1919 default:
1920 {
1921 // The set contains two or more chars. (the normal case)
1922 // Put it into the compiled pattern as a set.
1923 int32_t setNumber = fRXPat->fSets->size();
1924 fRXPat->fSets->addElement(theSet, *fStatus);
1925 int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
1926 fRXPat->fCompiledPat->addElement(setOp, *fStatus);
1927 }
1928 }
1929}
1930
1931
1932//----------------------------------------------------------------------------------------
1933//
1934// compileInterval Generate the code for a {min, max} style interval quantifier.
1935// Except for the specific opcodes used, the code is the same
1936// for all three types (greedy, non-greedy, possessive) of
1937// intervals. The opcodes are supplied as parameters.
1938//
1939// The code for interval loops has this form:
1940// 0 CTR_INIT counter loc (in stack frame)
1941// 1 5 patt address of CTR_LOOP at bottom of block
1942// 2 min count
1943// 3 max count (-1 for unbounded)
1944// 4 ... block to be iterated over
1945// 5 CTR_LOOP
1946//
1947// In
1948//----------------------------------------------------------------------------------------
1949void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
1950{
1951 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
1952 // four slots in the compiled code. Reserve them.
1953 int32_t topOfBlock = blockTopLoc(TRUE);
1954 insertOp(topOfBlock);
1955 insertOp(topOfBlock);
1956 insertOp(topOfBlock);
1957
1958 // The operands for the CTR_INIT opcode include the index in the matcher data
1959 // of the counter. Allocate it now.
1960 int32_t counterLoc = fRXPat->fFrameSize;
1961 fRXPat->fFrameSize++;
1962
1963 int32_t op = URX_BUILD(InitOp, counterLoc);
1964 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
1965
1966 // The second operand of CTR_INIT is the location following the end of the loop.
1967 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
1968 // compilation of something later on causes the code to grow and the target
1969 // position to move.
1970 int32_t loopEnd = fRXPat->fCompiledPat->size();
1971 op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
1972 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
1973
1974 // Followed by the min and max counts.
1975 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
1976 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
1977
1978 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
1979 // Goes at end of the block being looped over, so just append to the code so far.
1980 op = URX_BUILD(LoopOp, topOfBlock);
1981 fRXPat->fCompiledPat->addElement(op, *fStatus);
1982
374ca955
A
1983 if ((fIntervalLow & 0xff000000) != 0 ||
1984 fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0) {
1985 error(U_REGEX_NUMBER_TOO_BIG);
1986 }
1987
b75a7d8f
A
1988 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
1989 error(U_REGEX_MAX_LT_MIN);
1990 }
b75a7d8f
A
1991}
1992
1993
1994
1995UBool RegexCompile::compileInlineInterval() {
1996 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
1997 // Too big to inline. Fail, which will cause looping code to be generated.
1998 // (Upper < Lower picks up unbounded upper and errors, both.)
1999 return FALSE;
2000 }
2001
2002 int32_t topOfBlock = blockTopLoc(FALSE);
2003 if (fIntervalUpper == 0) {
2004 // Pathological case. Attempt no matches, as if the block doesn't exist.
2005 fRXPat->fCompiledPat->setSize(topOfBlock);
2006 return TRUE;
2007 }
2008
2009 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2010 // The thing being repeated is not a single op, but some
2011 // more complex block. Do it as a loop, not inlines.
2012 // Note that things "repeated" a max of once are handled as inline, because
2013 // the one copy of the code already generated is just fine.
2014 return FALSE;
2015 }
2016
2017 // Pick up the opcode that is to be repeated
2018 //
2019 int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock);
2020
2021 // Compute the pattern location where the inline sequence
2022 // will end, and set up the state save op that will be needed.
2023 //
2024 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2025 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2026 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
2027 if (fIntervalLow == 0) {
2028 insertOp(topOfBlock);
2029 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2030 }
2031
2032
2033
2034 // Loop, emitting the op for the thing being repeated each time.
2035 // Loop starts at 1 because one instance of the op already exists in the pattern,
2036 // it was put there when it was originally encountered.
2037 int32_t i;
2038 for (i=1; i<fIntervalUpper; i++ ) {
2039 if (i == fIntervalLow) {
2040 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2041 }
2042 if (i > fIntervalLow) {
2043 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2044 }
2045 fRXPat->fCompiledPat->addElement(op, *fStatus);
2046 }
2047 return TRUE;
2048}
2049
2050
2051
2052//----------------------------------------------------------------------------------------
2053//
2054// matchStartType Determine how a match can start.
2055// Used to optimize find() operations.
2056//
2057// Operation is very similar to minMatchLength(). Walk the compiled
2058// pattern, keeping an on-going minimum-match-length. For any
2059// op where the min match coming in is zero, add that ops possible
2060// starting matches to the possible starts for the overall pattern.
2061//
2062//----------------------------------------------------------------------------------------
2063void RegexCompile::matchStartType() {
2064 if (U_FAILURE(*fStatus)) {
2065 return;
2066 }
2067
2068
2069 int32_t loc; // Location in the pattern of the current op being processed.
2070 int32_t op; // The op being processed
2071 int32_t opType; // The opcode type of the op
2072 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2073 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2074
2075 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2076 // could have advanced the position in a match.
2077 // (Maximum match length so far == 0)
2078
2079 // forwardedLength is a vector holding minimum-match-length values that
2080 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2081 // It must be one longer than the pattern being checked because some ops
2082 // will jmp to a end-of-block+1 location from within a block, and we must
2083 // count those when checking the block.
2084 int32_t end = fRXPat->fCompiledPat->size();
2085 UVector32 forwardedLength(end+1, *fStatus);
2086 forwardedLength.setSize(end+1);
2087 for (loc=3; loc<end; loc++) {
2088 forwardedLength.setElementAt(INT32_MAX, loc);
2089 }
2090
2091 for (loc = 3; loc<end; loc++) {
2092 op = fRXPat->fCompiledPat->elementAti(loc);
2093 opType = URX_TYPE(op);
2094
2095 // The loop is advancing linearly through the pattern.
2096 // If the op we are now at was the destination of a branch in the pattern,
2097 // and that path has a shorter minimum length than the current accumulated value,
2098 // replace the current accumulated value.
2099 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2100 if (forwardedLength.elementAti(loc) < currentLen) {
2101 currentLen = forwardedLength.elementAti(loc);
2102 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2103 }
2104
2105 switch (opType) {
2106 // Ops that don't change the total length matched
2107 case URX_RESERVED_OP:
2108 case URX_END:
2109 case URX_STRING_LEN:
2110 case URX_NOP:
2111 case URX_START_CAPTURE:
2112 case URX_END_CAPTURE:
2113 case URX_BACKSLASH_B:
374ca955 2114 case URX_BACKSLASH_BU:
b75a7d8f
A
2115 case URX_BACKSLASH_G:
2116 case URX_BACKSLASH_Z:
2117 case URX_DOLLAR:
2118 case URX_RELOC_OPRND:
2119 case URX_STO_INP_LOC:
2120 case URX_DOLLAR_M:
2121 case URX_BACKTRACK:
2122 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2123 case URX_BACKREF_I:
2124
2125 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2126 case URX_LD_SP:
2127 break;
2128
2129 case URX_CARET:
2130 if (atStart) {
2131 fRXPat->fStartType = START_START;
2132 }
2133 break;
2134
2135 case URX_CARET_M:
2136 if (atStart) {
2137 fRXPat->fStartType = START_LINE;
2138 }
2139 break;
2140
2141 case URX_ONECHAR:
2142 if (currentLen == 0) {
2143 // This character could appear at the start of a match.
2144 // Add it to the set of possible starting characters.
2145 fRXPat->fInitialChars->add(URX_VAL(op));
2146 numInitialStrings += 2;
2147 }
2148 currentLen++;
2149 atStart = FALSE;
2150 break;
2151
2152
2153 case URX_SETREF:
2154 if (currentLen == 0) {
2155 int32_t sn = URX_VAL(op);
2156 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2157 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2158 fRXPat->fInitialChars->addAll(*s);
2159 numInitialStrings += 2;
2160 }
2161 currentLen++;
2162 atStart = FALSE;
2163 break;
2164
2165 case URX_LOOP_SR_I:
2166 // [Set]*, like a SETREF, above, in what it can match,
2167 // but may not match at all, so currentLen is not incremented.
2168 if (currentLen == 0) {
2169 int32_t sn = URX_VAL(op);
2170 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2171 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2172 fRXPat->fInitialChars->addAll(*s);
2173 numInitialStrings += 2;
2174 }
2175 atStart = FALSE;
2176 break;
2177
2178 case URX_LOOP_DOT_I:
2179 if (currentLen == 0) {
2180 // .* at the start of a pattern.
2181 // Any character can begin the match.
2182 fRXPat->fInitialChars->clear();
2183 fRXPat->fInitialChars->complement();
2184 numInitialStrings += 2;
2185 }
2186 atStart = FALSE;
2187 break;
2188
2189
2190 case URX_STATIC_SETREF:
2191 if (currentLen == 0) {
2192 int32_t sn = URX_VAL(op);
2193 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2194 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2195 fRXPat->fInitialChars->addAll(*s);
2196 numInitialStrings += 2;
2197 }
2198 currentLen++;
2199 atStart = FALSE;
2200 break;
2201
2202
2203
2204 case URX_STAT_SETREF_N:
2205 if (currentLen == 0) {
2206 int32_t sn = URX_VAL(op);
2207 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2208 UnicodeSet sc(*s);
2209 sc.complement();
2210 fRXPat->fInitialChars->addAll(sc);
2211 numInitialStrings += 2;
2212 }
2213 currentLen++;
2214 atStart = FALSE;
2215 break;
2216
2217
2218
2219 case URX_BACKSLASH_D:
2220 // Digit Char
2221 if (currentLen == 0) {
2222 UnicodeSet s;
2223 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2224 if (URX_VAL(op) != 0) {
2225 s.complement();
2226 }
2227 fRXPat->fInitialChars->addAll(s);
2228 numInitialStrings += 2;
2229 }
2230 currentLen++;
2231 atStart = FALSE;
2232 break;
2233
2234
2235 case URX_ONECHAR_I:
2236 // Case Insensitive Single Character.
2237 if (currentLen == 0) {
2238 UChar32 c = URX_VAL(op);
2239 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2240 // character may have distinct cased forms. Add all of them
2241 // to the set of possible starting match chars.
2242 UnicodeSet s(c, c);
2243 s.closeOver(USET_CASE);
2244 fRXPat->fInitialChars->addAll(s);
2245 } else {
2246 // Char has no case variants. Just add it as-is to the
2247 // set of possible starting chars.
2248 fRXPat->fInitialChars->add(c);
2249 }
2250 numInitialStrings += 2;
2251 }
2252 currentLen++;
2253 atStart = FALSE;
2254 break;
2255
2256
2257 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2258 case URX_DOTANY_ALL: // . matches one or two.
2259 case URX_DOTANY:
2260 case URX_DOTANY_ALL_PL:
2261 case URX_DOTANY_PL:
2262 if (currentLen == 0) {
2263 // These constructs are all bad news when they appear at the start
2264 // of a match. Any character can begin the match.
2265 fRXPat->fInitialChars->clear();
2266 fRXPat->fInitialChars->complement();
2267 numInitialStrings += 2;
2268 }
2269 currentLen++;
2270 atStart = FALSE;
2271 break;
2272
2273
2274 case URX_JMPX:
2275 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2276 case URX_JMP:
2277 {
2278 int32_t jmpDest = URX_VAL(op);
2279 if (jmpDest < loc) {
2280 // Loop of some kind. Can safely ignore, the worst that will happen
2281 // is that we understate the true minimum length
2282 currentLen = forwardedLength.elementAti(loc+1);
2283
2284 } else {
2285 // Forward jump. Propagate the current min length to the target loc of the jump.
2286 U_ASSERT(jmpDest <= end+1);
2287 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2288 forwardedLength.setElementAt(currentLen, jmpDest);
2289 }
2290 }
2291 }
2292 atStart = FALSE;
2293 break;
2294
2295 case URX_JMP_SAV:
2296 case URX_JMP_SAV_X:
2297 // Combo of state save to the next loc, + jmp backwards.
2298 // Net effect on min. length computation is nothing.
2299 atStart = FALSE;
2300 break;
2301
2302 case URX_FAIL:
2303 // Fails are kind of like a branch, except that the min length was
2304 // propagated already, by the state save.
2305 currentLen = forwardedLength.elementAti(loc+1);
2306 atStart = FALSE;
2307 break;
2308
2309
2310 case URX_STATE_SAVE:
2311 {
2312 // State Save, for forward jumps, propagate the current minimum.
2313 // of the state save.
2314 int32_t jmpDest = URX_VAL(op);
2315 if (jmpDest > loc) {
2316 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2317 forwardedLength.setElementAt(currentLen, jmpDest);
2318 }
2319 }
2320 }
2321 atStart = FALSE;
2322 break;
2323
2324
2325
2326
2327 case URX_STRING:
2328 {
2329 loc++;
2330 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2331 int32_t stringLen = URX_VAL(stringLenOp);
2332 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2333 U_ASSERT(stringLenOp >= 2);
2334 if (currentLen == 0) {
2335 // Add the starting character of this string to the set of possible starting
2336 // characters for this pattern.
2337 int32_t stringStartIdx = URX_VAL(op);
2338 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2339 fRXPat->fInitialChars->add(c);
2340
2341 // Remember this string. After the entire pattern has been checked,
2342 // if nothing else is identified that can start a match, we'll use it.
2343 numInitialStrings++;
2344 fRXPat->fInitialStringIdx = stringStartIdx;
2345 fRXPat->fInitialStringLen = stringLen;
2346 }
2347
2348 currentLen += stringLen;
2349 atStart = FALSE;
2350 }
2351 break;
2352
2353 case URX_STRING_I:
2354 {
2355 // Case-insensitive string. Unlike exact-match strings, we won't
2356 // attempt a string search for possible match positions. But we
2357 // do update the set of possible starting characters.
2358 loc++;
2359 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2360 int32_t stringLen = URX_VAL(stringLenOp);
2361 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2362 U_ASSERT(stringLenOp >= 2);
2363 if (currentLen == 0) {
2364 // Add the starting character of this string to the set of possible starting
2365 // characters for this pattern.
2366 int32_t stringStartIdx = URX_VAL(op);
2367 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2368 UnicodeSet s(c, c);
2369 s.closeOver(USET_CASE);
2370 fRXPat->fInitialChars->addAll(s);
2371 numInitialStrings += 2; // Matching on an initial string not possible.
2372 }
2373 currentLen += stringLen;
2374 atStart = FALSE;
2375 }
2376 break;
2377
2378 case URX_CTR_INIT:
2379 case URX_CTR_INIT_NG:
2380 {
2381 // Loop Init Ops. These don't change the min length, but they are 4 word ops
2382 // so location must be updated accordingly.
2383 // Loop Init Ops.
2384 // If the min loop count == 0
2385 // move loc forwards to the end of the loop, skipping over the body.
2386 // If the min count is > 0,
2387 // continue normal processing of the body of the loop.
2388 int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
2389 loopEndLoc = URX_VAL(loopEndLoc);
2390 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
2391 if (minLoopCount == 0) {
374ca955
A
2392 // Min Loop Count of 0, treat like a forward branch and
2393 // move the current minimum length up to the target
2394 // (end of loop) location.
2395 U_ASSERT(loopEndLoc <= end+1);
2396 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
2397 forwardedLength.setElementAt(currentLen, loopEndLoc);
2398 }
2399 }
2400 loc+=3; // Skips over operands of CTR_INIT
b75a7d8f
A
2401 }
2402 atStart = FALSE;
2403 break;
2404
2405
2406 case URX_CTR_LOOP:
2407 case URX_CTR_LOOP_NG:
2408 // Loop ops.
2409 // The jump is conditional, backwards only.
2410 atStart = FALSE;
2411 break;
2412
2413 case URX_LOOP_C:
2414 // More loop ops. These state-save to themselves.
2415 // don't change the minimum match
2416 atStart = FALSE;
2417 break;
2418
2419
2420 case URX_LA_START:
2421 case URX_LB_START:
2422 {
2423 // Look-around. Scan forward until the matching look-ahead end,
2424 // without processing the look-around block. This is overly pessimistic.
2425 int32_t depth = 0;
2426 for (;;) {
2427 loc++;
2428 op = fRXPat->fCompiledPat->elementAti(loc);
2429 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
2430 depth++;
2431 }
2432 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2433 if (depth == 0) {
2434 break;
2435 }
2436 depth--;
2437 }
2438 if (URX_TYPE(op) == URX_STATE_SAVE) {
2439 // Need this because neg lookahead blocks will FAIL to outside
2440 // of the block.
2441 int32_t jmpDest = URX_VAL(op);
2442 if (jmpDest > loc) {
2443 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2444 forwardedLength.setElementAt(currentLen, jmpDest);
2445 }
2446 }
2447 }
2448 U_ASSERT(loc <= end);
2449 }
2450 }
2451 break;
2452
2453 case URX_LA_END:
2454 case URX_LB_CONT:
2455 case URX_LB_END:
2456 case URX_LBN_CONT:
2457 case URX_LBN_END:
2458 U_ASSERT(FALSE); // Shouldn't get here. These ops should be
2459 // consumed by the scan in URX_LA_START and LB_START
2460
2461 break;
2462
2463 default:
2464 U_ASSERT(FALSE);
2465 }
2466
2467 }
2468
2469
2470 // We have finished walking through the ops. Check whether some forward jump
2471 // propagated a shorter length to location end+1.
2472 if (forwardedLength.elementAti(end+1) < currentLen) {
2473 currentLen = forwardedLength.elementAti(end+1);
2474 }
2475
2476
2477 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
2478
2479
2480 // Sort out what we should check for when looking for candidate match start positions.
2481 // In order of preference,
2482 // 1. Start of input text buffer.
2483 // 2. A literal string.
2484 // 3. Start of line in multi-line mode.
2485 // 4. A single literal character.
2486 // 5. A character from a set of characters.
2487 //
2488 if (fRXPat->fStartType == START_START) {
2489 // Match only at the start of an input text string.
2490 // start type is already set. We're done.
2491 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
2492 // Match beginning only with a literal string.
2493 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
2494 U_ASSERT(fRXPat->fInitialChars->contains(c));
2495 fRXPat->fStartType = START_STRING;
2496 fRXPat->fInitialChar = c;
2497 } else if (fRXPat->fStartType == START_LINE) {
374ca955 2498 // Match at start of line in Multi-Line mode.
b75a7d8f
A
2499 // Nothing to do here; everything is already set.
2500 } else if (fRXPat->fMinMatchLen == 0) {
2501 // Zero length match possible. We could start anywhere.
2502 fRXPat->fStartType = START_NO_INFO;
2503 } else if (fRXPat->fInitialChars->size() == 1) {
2504 // All matches begin with the same char.
2505 fRXPat->fStartType = START_CHAR;
2506 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
2507 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
2508 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
2509 fRXPat->fMinMatchLen > 0) {
2510 // Matches start with a set of character smaller than the set of all chars.
2511 fRXPat->fStartType = START_SET;
2512 } else {
2513 // Matches can start with anything
2514 fRXPat->fStartType = START_NO_INFO;
2515 }
2516
2517 return;
2518}
2519
2520
2521
2522//----------------------------------------------------------------------------------------
2523//
2524// minMatchLength Calculate the length of the shortest string that could
2525// match the specified pattern.
2526// Length is in 16 bit code units, not code points.
2527//
2528// The calculated length may not be exact. The returned
2529// value may be shorter than the actual minimum; it must
2530// never be longer.
2531//
2532// start and end are the range of p-code operations to be
2533// examined. The endpoints are included in the range.
2534//
2535//----------------------------------------------------------------------------------------
2536int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
2537 if (U_FAILURE(*fStatus)) {
2538 return 0;
2539 }
2540
2541 U_ASSERT(start <= end);
2542 U_ASSERT(end < fRXPat->fCompiledPat->size());
2543
2544
2545 int32_t loc;
2546 int32_t op;
2547 int32_t opType;
2548 int32_t currentLen = 0;
2549
2550
2551 // forwardedLength is a vector holding minimum-match-length values that
2552 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2553 // It must be one longer than the pattern being checked because some ops
2554 // will jmp to a end-of-block+1 location from within a block, and we must
2555 // count those when checking the block.
2556 UVector32 forwardedLength(end+2, *fStatus);
2557 forwardedLength.setSize(end+2);
2558 for (loc=start; loc<=end+1; loc++) {
2559 forwardedLength.setElementAt(INT32_MAX, loc);
2560 }
2561
2562 for (loc = start; loc<=end; loc++) {
2563 op = fRXPat->fCompiledPat->elementAti(loc);
2564 opType = URX_TYPE(op);
2565
2566 // The loop is advancing linearly through the pattern.
2567 // If the op we are now at was the destination of a branch in the pattern,
2568 // and that path has a shorter minimum length than the current accumulated value,
2569 // replace the current accumulated value.
2570 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2571 if (forwardedLength.elementAti(loc) < currentLen) {
2572 currentLen = forwardedLength.elementAti(loc);
2573 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2574 }
2575
2576 switch (opType) {
2577 // Ops that don't change the total length matched
2578 case URX_RESERVED_OP:
2579 case URX_END:
2580 case URX_STRING_LEN:
2581 case URX_NOP:
2582 case URX_START_CAPTURE:
2583 case URX_END_CAPTURE:
2584 case URX_BACKSLASH_B:
374ca955 2585 case URX_BACKSLASH_BU:
b75a7d8f
A
2586 case URX_BACKSLASH_G:
2587 case URX_BACKSLASH_Z:
2588 case URX_CARET:
2589 case URX_DOLLAR:
2590 case URX_RELOC_OPRND:
2591 case URX_STO_INP_LOC:
2592 case URX_DOLLAR_M:
2593 case URX_CARET_M:
2594 case URX_BACKTRACK:
2595 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2596 case URX_BACKREF_I:
2597
2598 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2599 case URX_LD_SP:
2600
2601 case URX_JMP_SAV:
2602 case URX_JMP_SAV_X:
2603 break;
2604
2605
2606 // Ops that match a minimum of one character (one or two 16 bit code units.)
2607 //
2608 case URX_ONECHAR:
2609 case URX_STATIC_SETREF:
2610 case URX_STAT_SETREF_N:
2611 case URX_SETREF:
2612 case URX_BACKSLASH_D:
2613 case URX_ONECHAR_I:
2614 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2615 case URX_DOTANY_ALL: // . matches one or two.
2616 case URX_DOTANY:
2617 case URX_DOTANY_PL:
2618 case URX_DOTANY_ALL_PL:
2619 currentLen++;
2620 break;
2621
2622
2623 case URX_JMPX:
2624 loc++; // URX_JMPX has an extra operand, ignored here,
2625 // otherwise processed identically to URX_JMP.
2626 case URX_JMP:
2627 {
2628 int32_t jmpDest = URX_VAL(op);
2629 if (jmpDest < loc) {
2630 // Loop of some kind. Can safely ignore, the worst that will happen
2631 // is that we understate the true minimum length
2632 currentLen = forwardedLength.elementAti(loc+1);
2633 } else {
2634 // Forward jump. Propagate the current min length to the target loc of the jump.
2635 U_ASSERT(jmpDest <= end+1);
2636 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2637 forwardedLength.setElementAt(currentLen, jmpDest);
2638 }
2639 }
2640 }
2641 break;
2642
2643 case URX_FAIL:
2644 {
2645 // Fails are kind of like a branch, except that the min length was
2646 // propagated already, by the state save.
2647 currentLen = forwardedLength.elementAti(loc+1);
2648 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2649 }
2650 break;
2651
2652
2653 case URX_STATE_SAVE:
2654 {
2655 // State Save, for forward jumps, propagate the current minimum.
2656 // of the state save.
2657 int32_t jmpDest = URX_VAL(op);
2658 if (jmpDest > loc) {
2659 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2660 forwardedLength.setElementAt(currentLen, jmpDest);
2661 }
2662 }
2663 }
2664 break;
2665
2666
2667 case URX_STRING:
2668 case URX_STRING_I:
2669 {
2670 loc++;
2671 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2672 currentLen += URX_VAL(stringLenOp);
2673 }
2674 break;
2675
2676
2677 case URX_CTR_INIT:
2678 case URX_CTR_INIT_NG:
2679 {
2680 // Loop Init Ops.
2681 // If the min loop count == 0
2682 // move loc forwards to the end of the loop, skipping over the body.
2683 // If the min count is > 0,
2684 // continue normal processing of the body of the loop.
2685 int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
2686 loopEndLoc = URX_VAL(loopEndLoc);
2687 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
2688 if (minLoopCount == 0) {
2689 loc = loopEndLoc;
2690 } else {
2691 loc+=3; // Skips over operands of CTR_INIT
2692 }
2693 }
2694 break;
2695
2696
2697 case URX_CTR_LOOP:
2698 case URX_CTR_LOOP_NG:
2699 // Loop ops.
2700 // The jump is conditional, backwards only.
2701 break;
2702
2703 case URX_LOOP_SR_I:
2704 case URX_LOOP_DOT_I:
2705 case URX_LOOP_C:
2706 // More loop ops. These state-save to themselves.
2707 // don't change the minimum match - could match nothing at all.
2708 break;
2709
2710
2711 case URX_LA_START:
2712 case URX_LB_START:
2713 {
2714 // Look-around. Scan forward until the matching look-ahead end,
2715 // without processing the look-around block. This is overly pessimistic.
2716 // TODO: Positive lookahead could recursively do the block, then continue
2717 // with the longer of the block or the value coming in.
2718 int32_t depth = 0;
2719 for (;;) {
2720 loc++;
2721 op = fRXPat->fCompiledPat->elementAti(loc);
2722 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
2723 depth++;
2724 }
2725 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2726 if (depth == 0) {
2727 break;
2728 }
2729 depth--;
2730 }
2731 if (URX_TYPE(op) == URX_STATE_SAVE) {
2732 // Need this because neg lookahead blocks will FAIL to outside
2733 // of the block.
2734 int32_t jmpDest = URX_VAL(op);
2735 if (jmpDest > loc) {
2736 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2737 forwardedLength.setElementAt(currentLen, jmpDest);
2738 }
2739 }
2740 }
2741
2742 U_ASSERT(loc <= end);
2743 }
2744 }
2745 break;
2746
2747 case URX_LA_END:
2748 case URX_LB_CONT:
2749 case URX_LB_END:
2750 case URX_LBN_CONT:
2751 case URX_LBN_END:
2752 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
2753 // range being sized, which happens when measuring size of look-behind blocks.
2754 break;
2755
2756 default:
2757 U_ASSERT(FALSE);
2758 }
2759
2760 }
2761
2762 // We have finished walking through the ops. Check whether some forward jump
2763 // propagated a shorter length to location end+1.
2764 if (forwardedLength.elementAti(end+1) < currentLen) {
2765 currentLen = forwardedLength.elementAti(end+1);
2766 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2767 }
2768
2769 return currentLen;
2770}
2771
2772
2773
2774//----------------------------------------------------------------------------------------
2775//
2776// maxMatchLength Calculate the length of the longest string that could
2777// match the specified pattern.
2778// Length is in 16 bit code units, not code points.
2779//
2780// The calculated length may not be exact. The returned
2781// value may be longer than the actual maximum; it must
2782// never be shorter.
2783//
2784//----------------------------------------------------------------------------------------
2785int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
2786 if (U_FAILURE(*fStatus)) {
2787 return 0;
2788 }
2789 U_ASSERT(start <= end);
2790 U_ASSERT(end < fRXPat->fCompiledPat->size());
2791
2792
2793 int32_t loc;
2794 int32_t op;
2795 int32_t opType;
2796 int32_t currentLen = 0;
2797 UVector32 forwardedLength(end+1, *fStatus);
2798 forwardedLength.setSize(end+1);
2799
2800 for (loc=start; loc<=end; loc++) {
2801 forwardedLength.setElementAt(0, loc);
2802 }
2803
2804 for (loc = start; loc<=end; loc++) {
2805 op = fRXPat->fCompiledPat->elementAti(loc);
2806 opType = URX_TYPE(op);
2807
2808 // The loop is advancing linearly through the pattern.
2809 // If the op we are now at was the destination of a branch in the pattern,
2810 // and that path has a longer maximum length than the current accumulated value,
2811 // replace the current accumulated value.
2812 if (forwardedLength.elementAti(loc) > currentLen) {
2813 currentLen = forwardedLength.elementAti(loc);
2814 }
2815
2816 switch (opType) {
2817 // Ops that don't change the total length matched
2818 case URX_RESERVED_OP:
2819 case URX_END:
2820 case URX_STRING_LEN:
2821 case URX_NOP:
2822 case URX_START_CAPTURE:
2823 case URX_END_CAPTURE:
2824 case URX_BACKSLASH_B:
374ca955 2825 case URX_BACKSLASH_BU:
b75a7d8f
A
2826 case URX_BACKSLASH_G:
2827 case URX_BACKSLASH_Z:
2828 case URX_CARET:
2829 case URX_DOLLAR:
2830 case URX_RELOC_OPRND:
2831 case URX_STO_INP_LOC:
2832 case URX_DOLLAR_M:
2833 case URX_CARET_M:
2834 case URX_BACKTRACK:
2835
2836 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2837 case URX_LD_SP:
2838
2839 case URX_LB_END:
2840 case URX_LB_CONT:
2841 case URX_LBN_CONT:
2842 case URX_LBN_END:
2843 break;
2844
2845
2846 // Ops that increase that cause an unbounded increase in the length
2847 // of a matched string, or that increase it a hard to characterize way.
2848 // Call the max length unbounded, and stop further checking.
2849 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2850 case URX_BACKREF_I:
2851 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2852 case URX_DOTANY_PL:
2853 case URX_DOTANY_ALL_PL:
2854 currentLen = INT32_MAX;
2855 break;
2856
2857
2858 // Ops that match a max of one character (possibly two 16 bit code units.)
2859 //
2860 case URX_STATIC_SETREF:
2861 case URX_STAT_SETREF_N:
2862 case URX_SETREF:
2863 case URX_BACKSLASH_D:
2864 case URX_ONECHAR_I:
2865 case URX_DOTANY_ALL:
2866 case URX_DOTANY:
2867 currentLen+=2;
2868 break;
2869
2870 // Single literal character. Increase current max length by one or two,
2871 // depending on whether the char is in the supplementary range.
2872 case URX_ONECHAR:
2873 currentLen++;
2874 if (URX_VAL(op) > 0x10000) {
2875 currentLen++;
2876 }
2877 break;
2878
2879 // Jumps.
2880 //
2881 case URX_JMP:
2882 case URX_JMPX:
2883 case URX_JMP_SAV:
2884 case URX_JMP_SAV_X:
2885 {
2886 int32_t jmpDest = URX_VAL(op);
2887 if (jmpDest < loc) {
2888 // Loop of some kind. Max match length is unbounded.
2889 currentLen = INT32_MAX;
2890 } else {
2891 // Forward jump. Propagate the current min length to the target loc of the jump.
2892 if (forwardedLength.elementAti(jmpDest) < currentLen) {
2893 forwardedLength.setElementAt(currentLen, jmpDest);
2894 }
2895 currentLen = 0;
2896 }
2897 }
2898 break;
2899
2900 case URX_FAIL:
2901 // Fails are kind of like a branch, except that the max length was
2902 // propagated already, by the state save.
2903 currentLen = forwardedLength.elementAti(loc+1);
2904 break;
2905
2906
2907 case URX_STATE_SAVE:
2908 {
2909 // State Save, for forward jumps, propagate the current minimum.
2910 // of the state save.
2911 // For backwards jumps, they create a loop, maximum
2912 // match length is unbounded.
2913 int32_t jmpDest = URX_VAL(op);
2914 if (jmpDest > loc) {
2915 if (currentLen > forwardedLength.elementAti(jmpDest)) {
2916 forwardedLength.setElementAt(currentLen, jmpDest);
2917 }
2918 } else {
2919 currentLen = INT32_MAX;
2920 }
2921 }
2922 break;
2923
2924
2925
2926
2927 case URX_STRING:
2928 case URX_STRING_I:
2929 {
2930 loc++;
2931 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2932 currentLen += URX_VAL(stringLenOp);
2933 }
2934 break;
2935
2936
2937 case URX_CTR_INIT:
2938 case URX_CTR_INIT_NG:
2939 case URX_CTR_LOOP:
2940 case URX_CTR_LOOP_NG:
2941 case URX_LOOP_SR_I:
2942 case URX_LOOP_DOT_I:
2943 case URX_LOOP_C:
2944 // For anything to do with loops, make the match length unbounded.
2945 // Note: INIT instructions are multi-word. Can ignore because
2946 // INT32_MAX length will stop the per-instruction loop.
2947 currentLen = INT32_MAX;
2948 break;
2949
2950
2951
2952 case URX_LA_START:
2953 case URX_LA_END:
2954 // Look-ahead. Just ignore, treat the look-ahead block as if
2955 // it were normal pattern. Gives a too-long match length,
2956 // but good enough for now.
2957 break;
2958
2959 // End of look-ahead ops should always be consumed by the processing at
2960 // the URX_LA_START op.
2961 U_ASSERT(FALSE);
2962 break;
2963
2964 case URX_LB_START:
2965 {
2966 // Look-behind. Scan forward until the matching look-around end,
2967 // without processing the look-behind block.
2968 int32_t depth = 0;
2969 for (;;) {
2970 loc++;
2971 op = fRXPat->fCompiledPat->elementAti(loc);
2972 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
2973 depth++;
2974 }
2975 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2976 if (depth == 0) {
2977 break;
2978 }
2979 depth--;
2980 }
2981 U_ASSERT(loc < end);
2982 }
2983 }
2984 break;
2985
2986 default:
2987 U_ASSERT(FALSE);
2988 }
2989
2990
2991 if (currentLen == INT32_MAX) {
2992 // The maximum length is unbounded.
2993 // Stop further processing of the pattern.
2994 break;
2995 }
2996
2997 }
2998 return currentLen;
2999
3000}
3001
3002
3003//----------------------------------------------------------------------------------------
3004//
3005// stripNOPs Remove any NOP operations from the compiled pattern code.
3006// Extra NOPs are inserted for some constructs during the initial
3007// code generation to provide locations that may be patched later.
3008// Many end up unneeded, and are removed by this function.
3009//
3010//----------------------------------------------------------------------------------------
3011void RegexCompile::stripNOPs() {
3012
3013 if (U_FAILURE(*fStatus)) {
3014 return;
3015 }
3016
3017 int32_t end = fRXPat->fCompiledPat->size();
3018 UVector32 deltas(end, *fStatus);
3019
3020 // Make a first pass over the code, computing the amount that things
3021 // will be offset at each location in the original code.
3022 int32_t loc;
3023 int32_t d = 0;
3024 for (loc=0; loc<end; loc++) {
3025 deltas.addElement(d, *fStatus);
3026 int32_t op = fRXPat->fCompiledPat->elementAti(loc);
3027 if (URX_TYPE(op) == URX_NOP) {
3028 d++;
3029 }
3030 }
3031
3032 // Make a second pass over the code, removing the NOPs by moving following
3033 // code up, and patching operands that refer to code locations that
3034 // are being moved. The array of offsets from the first step is used
3035 // to compute the new operand values.
3036 int32_t src;
3037 int32_t dst = 0;
3038 for (src=0; src<end; src++) {
3039 int32_t op = fRXPat->fCompiledPat->elementAti(src);
3040 int32_t opType = URX_TYPE(op);
3041 switch (opType) {
3042 case URX_NOP:
3043 break;
3044
3045 case URX_STATE_SAVE:
3046 case URX_JMP:
3047 case URX_CTR_LOOP:
3048 case URX_CTR_LOOP_NG:
3049 case URX_RELOC_OPRND:
3050 case URX_JMPX:
3051 case URX_JMP_SAV:
3052 case URX_JMP_SAV_X:
3053 // These are instructions with operands that refer to code locations.
3054 {
3055 int32_t operandAddress = URX_VAL(op);
3056 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3057 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3058 op = URX_BUILD(opType, fixedOperandAddress);
3059 fRXPat->fCompiledPat->setElementAt(op, dst);
3060 dst++;
3061 break;
3062 }
3063
3064 case URX_RESERVED_OP:
3065 case URX_RESERVED_OP_N:
3066 case URX_BACKTRACK:
3067 case URX_END:
3068 case URX_ONECHAR:
3069 case URX_STRING:
3070 case URX_STRING_LEN:
3071 case URX_START_CAPTURE:
3072 case URX_END_CAPTURE:
3073 case URX_STATIC_SETREF:
3074 case URX_STAT_SETREF_N:
3075 case URX_SETREF:
3076 case URX_DOTANY:
3077 case URX_FAIL:
3078 case URX_BACKSLASH_B:
374ca955 3079 case URX_BACKSLASH_BU:
b75a7d8f
A
3080 case URX_BACKSLASH_G:
3081 case URX_BACKSLASH_X:
3082 case URX_BACKSLASH_Z:
3083 case URX_DOTANY_ALL:
3084 case URX_DOTANY_ALL_PL:
3085 case URX_DOTANY_PL:
3086 case URX_BACKSLASH_D:
3087 case URX_CARET:
3088 case URX_DOLLAR:
3089 case URX_CTR_INIT:
3090 case URX_CTR_INIT_NG:
3091 case URX_STO_SP:
3092 case URX_LD_SP:
3093 case URX_BACKREF:
3094 case URX_STO_INP_LOC:
3095 case URX_LA_START:
3096 case URX_LA_END:
3097 case URX_ONECHAR_I:
3098 case URX_STRING_I:
3099 case URX_BACKREF_I:
3100 case URX_DOLLAR_M:
3101 case URX_CARET_M:
3102 case URX_LB_START:
3103 case URX_LB_CONT:
3104 case URX_LB_END:
3105 case URX_LBN_CONT:
3106 case URX_LBN_END:
3107 case URX_LOOP_SR_I:
3108 case URX_LOOP_DOT_I:
3109 case URX_LOOP_C:
3110 // These instructions are unaltered by the relocation.
3111 fRXPat->fCompiledPat->setElementAt(op, dst);
3112 dst++;
3113 break;
3114
3115 default:
3116 // Some op is unaccounted for.
3117 U_ASSERT(FALSE);
3118 error(U_REGEX_INTERNAL_ERROR);
3119 }
3120 }
3121
3122 fRXPat->fCompiledPat->setSize(dst);
3123}
3124
3125
3126
3127
3128//----------------------------------------------------------------------------------------
3129//
3130// OptDotStar Optimize patterns that end with a '.*' or '.+' to
3131// just advance the input to the end.
3132//
3133// Transform this compiled sequence
3134// [DOT_ANY | DOT_ANY_ALL]
3135// JMP_SAV to previous instruction
3136// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3137// END
3138//
3139// To
3140// NOP
3141// [DOT_ANY_PL | DOT_ANY_ALL_PL]
3142// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3143// END
3144//
3145//----------------------------------------------------------------------------------------
3146void RegexCompile::OptDotStar() {
3147 // Scan backwards in the pattern, looking for a JMP_SAV near the end.
3148 int32_t jmpLoc;
3149 int32_t op = 0;
3150 int32_t opType;
3151 for (jmpLoc=fRXPat->fCompiledPat->size(); jmpLoc--;) {
3152 U_ASSERT(jmpLoc>0);
3153 op = fRXPat->fCompiledPat->elementAti(jmpLoc);
3154 opType = URX_TYPE(op);
3155 switch(opType) {
3156
3157
3158 case URX_END:
3159 case URX_NOP:
3160 case URX_END_CAPTURE:
3161 case URX_DOLLAR_M:
3162 case URX_DOLLAR:
3163 case URX_BACKSLASH_Z:
3164 // These ops may follow the JMP_SAV without preventing us from
3165 // doing this optimization.
3166 continue;
3167
3168 case URX_JMP_SAV:
3169 // Got a trailing JMP_SAV that's a candidate for optimization.
3170 break;
3171
3172 default:
3173 // This optimization not possible.
3174 return;
3175 }
3176 break; // from the for loop.
3177 }
3178
3179 // We found in URX_JMP_SAV near the end that is a candidate for optimizing.
3180 // Is the target address the previous instruction?
3181 // Is the previous instruction a flavor of URX_DOTANY
3182 int32_t loopTopLoc = URX_VAL(op);
3183 if (loopTopLoc != jmpLoc-1) {
3184 return;
3185 }
3186 int32_t newOp;
3187 int32_t oldOp = fRXPat->fCompiledPat->elementAti(loopTopLoc);
3188 int32_t oldOpType = opType = URX_TYPE(oldOp);
3189 if (oldOpType == URX_DOTANY) {
3190 newOp = URX_BUILD(URX_DOTANY_PL, 0);
3191 }
3192 else if (oldOpType == URX_DOTANY_ALL) {
3193 newOp = URX_BUILD(URX_DOTANY_ALL_PL, 0);
3194 } else {
3195 return; // Sequence we were looking for isn't there.
3196 }
3197
3198 // Substitute the new instructions into the pattern.
3199 // The NOP will be removed in a later optimization step.
3200 fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_NOP, 0), loopTopLoc);
3201 fRXPat->fCompiledPat->setElementAt(newOp, jmpLoc);
3202}
3203
3204
3205//----------------------------------------------------------------------------------------
3206//
3207// Error Report a rule parse error.
3208// Only report it if no previous error has been recorded.
3209//
3210//----------------------------------------------------------------------------------------
3211void RegexCompile::error(UErrorCode e) {
3212 if (U_SUCCESS(*fStatus)) {
3213 *fStatus = e;
3214 fParseErr->line = fLineNum;
3215 fParseErr->offset = fCharNum;
3216
3217 // Fill in the context.
3218 // Note: extractBetween() pins supplied indicies to the string bounds.
3219 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3220 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3221 fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
3222 fParseErr->preContext, 0);
3223 fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
3224 fParseErr->postContext, 0);
3225 }
3226}
3227
3228
3229//
3230// Assorted Unicode character constants.
3231// Numeric because there is no portable way to enter them as literals.
3232// (Think EBCDIC).
3233//
3234static const UChar chCR = 0x0d; // New lines, for terminating comments.
3235static const UChar chLF = 0x0a;
3236static const UChar chNEL = 0x85; // NEL newline variant
3237static const UChar chLS = 0x2028; // Unicode Line Separator
3238static const UChar chApos = 0x27; // single quote, for quoted chars.
3239static const UChar chPound = 0x23; // '#', introduces a comment.
3240static const UChar chE = 0x45; // 'E'
3241static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3242static const UChar chLParen = 0x28;
3243static const UChar chRParen = 0x29;
3244static const UChar chLBracket = 0x5b;
3245static const UChar chRBracket = 0x5d;
3246static const UChar chRBrace = 0x7d;
3247static const UChar chUpperN = 0x4E;
3248static const UChar chLowerP = 0x70;
3249static const UChar chUpperP = 0x50;
3250
3251
3252//----------------------------------------------------------------------------------------
3253//
3254// nextCharLL Low Level Next Char from the regex pattern.
3255// Get a char from the string, keep track of input position
3256// for error reporting.
3257//
3258//----------------------------------------------------------------------------------------
3259UChar32 RegexCompile::nextCharLL() {
3260 UChar32 ch;
3261 UnicodeString &pattern = fRXPat->fPattern;
3262
3263 if (fPeekChar != -1) {
3264 ch = fPeekChar;
3265 fPeekChar = -1;
3266 return ch;
3267 }
3268 if (fPatternLength==0 || fNextIndex >= fPatternLength) {
3269 return (UChar32)-1;
3270 }
3271 ch = pattern.char32At(fNextIndex);
3272 fNextIndex = pattern.moveIndex32(fNextIndex, 1);
3273
3274 if (ch == chCR ||
3275 ch == chNEL ||
3276 ch == chLS ||
3277 ch == chLF && fLastChar != chCR) {
3278 // Character is starting a new line. Bump up the line number, and
3279 // reset the column to 0.
3280 fLineNum++;
3281 fCharNum=0;
3282 if (fQuoteMode) {
3283 error(U_REGEX_RULE_SYNTAX);
3284 fQuoteMode = FALSE;
3285 }
3286 }
3287 else {
3288 // Character is not starting a new line. Except in the case of a
3289 // LF following a CR, increment the column position.
3290 if (ch != chLF) {
3291 fCharNum++;
3292 }
3293 }
3294 fLastChar = ch;
3295 return ch;
3296}
3297
3298//---------------------------------------------------------------------------------
3299//
3300// peekCharLL Low Level Character Scanning, sneak a peek at the next
3301// character without actually getting it.
3302//
3303//---------------------------------------------------------------------------------
3304UChar32 RegexCompile::peekCharLL() {
3305 if (fPeekChar == -1) {
3306 fPeekChar = nextCharLL();
3307 }
3308 return fPeekChar;
3309}
3310
3311
3312//---------------------------------------------------------------------------------
3313//
3314// nextChar for pattern scanning. At this level, we handle stripping
3315// out comments and processing some backslash character escapes.
3316// The rest of the pattern grammar is handled at the next level up.
3317//
3318//---------------------------------------------------------------------------------
3319void RegexCompile::nextChar(RegexPatternChar &c) {
3320
3321 fScanIndex = fNextIndex;
3322 c.fChar = nextCharLL();
3323 c.fQuoted = FALSE;
3324
3325 if (fQuoteMode) {
3326 c.fQuoted = TRUE;
3327 if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
3328 fQuoteMode = FALSE; // Exit quote mode,
3329 nextCharLL(); // discard the E
3330 nextChar(c); // recurse to get the real next char
3331 }
3332 }
3333 else if (fInBackslashQuote) {
3334 // The current character immediately follows a '\'
3335 // Don't check for any further escapes, just return it as-is.
3336 // Don't set c.fQuoted, because that would prevent the state machine from
3337 // dispatching on the character.
3338 fInBackslashQuote = FALSE;
3339 }
3340 else
3341 {
3342 // We are not in a \Q quoted region \E of the source.
3343 //
3344 if (fModeFlags & UREGEX_COMMENTS) {
3345 //
3346 // We are in free-spacing and comments mode.
3347 // Scan through any white space and comments, until we
3348 // reach a significant character or the end of inut.
3349 for (;;) {
3350 if (c.fChar == (UChar32)-1) {
3351 break; // End of Input
3352 }
3353 if (c.fChar == chPound && fEOLComments == TRUE) {
3354 // Start of a comment. Consume the rest of it, until EOF or a new line
3355 for (;;) {
3356 c.fChar = nextCharLL();
3357 if (c.fChar == (UChar32)-1 || // EOF
3358 c.fChar == chCR ||
3359 c.fChar == chLF ||
3360 c.fChar == chNEL ||
3361 c.fChar == chLS) {
3362 break;
3363 }
3364 }
3365 }
3366 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
3367 break;
3368 }
3369 c.fChar = nextCharLL();
3370 }
3371 }
3372
3373 //
3374 // check for backslash escaped characters.
3375 //
3376 int32_t startX = fNextIndex; // start and end positions of the
3377 int32_t endX = fNextIndex; // sequence following the '\'
3378 if (c.fChar == chBackSlash) {
3379 if (RegexStaticSets::gStaticSets->fUnescapeCharSet->contains(peekCharLL())) {
3380 //
3381 // A '\' sequence that is handled by ICU's standard unescapeAt function.
3382 // Includes \uxxxx, \n, \r, many others.
3383 // Return the single equivalent character.
3384 //
3385 nextCharLL(); // get & discard the peeked char.
3386 c.fQuoted = TRUE;
3387 c.fChar = fRXPat->fPattern.unescapeAt(endX);
3388 if (startX == endX) {
3389 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3390 }
3391 fCharNum += endX - startX;
3392 fNextIndex = endX;
3393 }
3394 else
3395 {
3396 // We are in a '\' escape that will be handled by the state table scanner.
3397 // Just return the backslash, but remember that the following char is to
3398 // be taken literally. TODO: this is awkward, think about alternatives.
3399 fInBackslashQuote = TRUE;
3400 }
3401 }
3402 }
3403
3404 // re-enable # to end-of-line comments, in case they were disabled.
3405 // They are disabled by the parser upon seeing '(?', but this lasts for
3406 // the fetching of the next character only.
3407 fEOLComments = TRUE;
3408
3409 // putc(c.fChar, stdout);
3410}
3411
3412
3413
3414//---------------------------------------------------------------------------------
3415//
3416// scanSet Construct a UnicodeSet from the text at the current scan
3417// position. Advance the scan position to the first character
3418// after the set.
3419//
3420// The scan position is normally under the control of the state machine
3421// that controls pattern parsing. UnicodeSets, however, are parsed by
3422// the UnicodeSet constructor, not by the Regex pattern parser.
3423//
3424//---------------------------------------------------------------------------------
3425UnicodeSet *RegexCompile::scanSet() {
3426 UnicodeSet *uset = NULL;
3427 ParsePosition pos;
3428 int startPos;
3429 int i;
3430
3431 if (U_FAILURE(*fStatus)) {
3432 return NULL;
3433 }
3434
3435 pos.setIndex(fScanIndex);
3436 startPos = fScanIndex;
3437 UErrorCode localStatus = U_ZERO_ERROR;
3438 uint32_t usetFlags = 0;
3439 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
3440 usetFlags |= USET_CASE_INSENSITIVE;
3441 }
3442 if (fModeFlags & UREGEX_COMMENTS) {
3443 usetFlags |= USET_IGNORE_SPACE;
3444 }
3445
3446 uset = new UnicodeSet(fRXPat->fPattern, pos,
374ca955 3447 usetFlags, NULL, localStatus);
b75a7d8f
A
3448 if (U_FAILURE(localStatus)) {
3449 // TODO: Get more accurate position of the error from UnicodeSet's return info.
3450 // UnicodeSet appears to not be reporting correctly at this time.
374ca955 3451 REGEX_SCAN_DEBUG_PRINTF(("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()));
b75a7d8f
A
3452 error(localStatus);
3453 delete uset;
3454 return NULL;
3455 }
3456
3457 // Advance the current scan postion over the UnicodeSet.
3458 // Don't just set fScanIndex because the line/char positions maintained
3459 // for error reporting would be thrown off.
3460 i = pos.getIndex();
3461 for (;;) {
3462 if (fNextIndex >= i) {
3463 break;
3464 }
3465 nextCharLL();
3466 }
3467
3468 return uset;
3469};
3470
3471
3472//---------------------------------------------------------------------------------
3473//
3474// scanProp Construct a UnicodeSet from the text at the current scan
3475// position, which will be of the form \p{whaterver}
3476//
3477// The scan position will be at the 'p' or 'P'. On return
3478// the scan position should be just after the '}'
3479//
3480// Return a UnicodeSet, constructed from the \P pattern,
3481// or NULL if the pattern is invalid.
3482//
3483//---------------------------------------------------------------------------------
3484UnicodeSet *RegexCompile::scanProp() {
3485 UnicodeSet *uset = NULL;
3486
3487 if (U_FAILURE(*fStatus)) {
3488 return NULL;
3489 }
3490
3491 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP || fC.fChar == chUpperN);
3492
3493 // enclose the \p{property} from the regex pattern source in [brackets]
3494 UnicodeString setPattern;
3495 setPattern.append(chLBracket);
3496 setPattern.append(chBackSlash);
3497 for (;;) {
3498 setPattern.append(fC.fChar);
3499 if (fC.fChar == chRBrace) {
3500 break;
3501 }
3502 nextChar(fC);
3503 if (fC.fChar == -1) {
3504 // Hit the end of the input string without finding the closing '}'
3505 error(U_REGEX_PROPERTY_SYNTAX);
3506 return NULL;
3507 }
3508 }
3509 setPattern.append(chRBracket);
3510
3511 uint32_t usetFlags = 0;
3512 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
3513 usetFlags |= USET_CASE_INSENSITIVE;
3514 }
3515 if (fModeFlags & UREGEX_COMMENTS) {
3516 usetFlags |= USET_IGNORE_SPACE;
3517 }
3518
3519 // Build the UnicodeSet from the set pattern we just built up in a string.
374ca955 3520 uset = new UnicodeSet(setPattern, usetFlags, NULL, *fStatus);
b75a7d8f
A
3521 if (U_FAILURE(*fStatus)) {
3522 delete uset;
3523 uset = NULL;
3524 }
3525
3526 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
3527 return uset;
3528};
3529
3530U_NAMESPACE_END
3531#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS