]> git.saurik.com Git - apple/javascriptcore.git/blame - pcre/pcre_compile.cpp
JavaScriptCore-584.tar.gz
[apple/javascriptcore.git] / pcre / pcre_compile.cpp
CommitLineData
b37bf2e1
A
1/* This is JavaScriptCore's variant of the PCRE library. While this library
2started out as a copy of PCRE, many of the features of PCRE have been
3removed. This library now supports only the regular expression features
4required by the JavaScript language specification, and has only the functions
5needed by JavaScriptCore and the rest of WebKit.
6
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9dae56ea 9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
b37bf2e1
A
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains the external function jsRegExpExecute(), along with
42supporting internal functions that are not used by other modules. */
43
44#include "config.h"
45
46#include "pcre_internal.h"
47
48#include <string.h>
49#include <wtf/ASCIICType.h>
50#include <wtf/FastMalloc.h>
51
52using namespace WTF;
53
54/* Negative values for the firstchar and reqchar variables */
55
56#define REQ_UNSET (-2)
57#define REQ_NONE (-1)
58
59/*************************************************
60* Code parameters and static tables *
61*************************************************/
62
63/* Maximum number of items on the nested bracket stacks at compile time. This
64applies to the nesting of all kinds of parentheses. It does not limit
65un-nested, non-capturing parentheses. This number can be made bigger if
66necessary - it is used to dimension one int and one unsigned char vector at
67compile time. */
68
69#define BRASTACK_SIZE 200
70
71/* Table for handling escaped characters in the range '0'-'z'. Positive returns
72are simple data values; negative values are for special things like \d and so
73on. Zero means further processing is needed (for things like \x), or the escape
74is invalid. */
75
76static const short escapes[] = {
77 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
78 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
79 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
80 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
81 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
82 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
83 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
84 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
85 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
86 0, 0, 0 /* x - z */
87};
88
89/* Error code numbers. They are given names so that they can more easily be
90tracked. */
91
92enum ErrorCode {
93 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
94 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
95};
96
97/* The texts of compile-time error messages. These are "char *" because they
98are passed to the outside world. */
99
100static const char* errorText(ErrorCode code)
101{
102 static const char errorTexts[] =
103 /* 1 */
104 "\\ at end of pattern\0"
105 "\\c at end of pattern\0"
106 "character value in \\x{...} sequence is too large\0"
107 "numbers out of order in {} quantifier\0"
108 /* 5 */
109 "number too big in {} quantifier\0"
110 "missing terminating ] for character class\0"
111 "internal error: code overflow\0"
112 "range out of order in character class\0"
113 "nothing to repeat\0"
114 /* 10 */
115 "unmatched parentheses\0"
116 "internal error: unexpected repeat\0"
117 "unrecognized character after (?\0"
118 "failed to get memory\0"
119 "missing )\0"
120 /* 15 */
121 "reference to non-existent subpattern\0"
122 "regular expression too large\0"
123 "parentheses nested too deeply"
124 ;
125
126 int i = code;
127 const char* text = errorTexts;
128 while (i > 1)
129 i -= !*text++;
130 return text;
131}
132
133/* Structure for passing "static" information around between the functions
134doing the compiling. */
135
136struct CompileData {
137 CompileData() {
9dae56ea 138 topBackref = 0;
b37bf2e1 139 backrefMap = 0;
9dae56ea 140 reqVaryOpt = 0;
b37bf2e1
A
141 needOuterBracket = false;
142 numCapturingBrackets = 0;
143 }
9dae56ea 144 int topBackref; /* Maximum back reference */
b37bf2e1 145 unsigned backrefMap; /* Bitmap of low back refs */
9dae56ea 146 int reqVaryOpt; /* "After variable item" flag for reqByte */
b37bf2e1
A
147 bool needOuterBracket;
148 int numCapturingBrackets;
149};
150
151/* Definitions to allow mutual recursion */
152
153static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
154static bool bracketIsAnchored(const unsigned char* code);
155static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
156static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
157
158/*************************************************
159* Handle escapes *
160*************************************************/
161
162/* This function is called when a \ has been encountered. It either returns a
163positive value for a simple escape such as \n, or a negative value which
164encodes one of the more complicated things such as \d. When UTF-8 is enabled,
165a positive value greater than 255 may be returned. On entry, ptr is pointing at
166the \. On exit, it is on the final character of the escape sequence.
167
168Arguments:
9dae56ea
A
169 ptrPtr points to the pattern position pointer
170 errorCodePtr points to the errorcode variable
b37bf2e1
A
171 bracount number of previous extracting brackets
172 options the options bits
9dae56ea 173 isClass true if inside a character class
b37bf2e1
A
174
175Returns: zero or positive => a data character
176 negative => a special escape sequence
9dae56ea 177 on error, errorPtr is set
b37bf2e1
A
178*/
179
9dae56ea 180static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
b37bf2e1 181{
9dae56ea 182 const UChar* ptr = *ptrPtr + 1;
b37bf2e1
A
183
184 /* If backslash is at the end of the pattern, it's an error. */
185 if (ptr == patternEnd) {
9dae56ea
A
186 *errorCodePtr = ERR1;
187 *ptrPtr = ptr;
b37bf2e1
A
188 return 0;
189 }
190
191 int c = *ptr;
192
193 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
194 a table. A non-zero result is something that can be returned immediately.
195 Otherwise further processing may be required. */
196
197 if (c < '0' || c > 'z') { /* Not alphameric */
198 } else if (int escapeValue = escapes[c - '0']) {
199 c = escapeValue;
9dae56ea 200 if (isClass) {
b37bf2e1
A
201 if (-c == ESC_b)
202 c = '\b'; /* \b is backslash in a class */
203 else if (-c == ESC_B)
204 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
205 }
206 /* Escapes that need further processing, or are illegal. */
207
208 } else {
209 switch (c) {
210 case '1':
211 case '2':
212 case '3':
213 case '4':
214 case '5':
215 case '6':
216 case '7':
217 case '8':
218 case '9':
219 /* Escape sequences starting with a non-zero digit are backreferences,
220 unless there are insufficient brackets, in which case they are octal
221 escape sequences. Those sequences end on the first non-octal character
222 or when we overflow 0-255, whichever comes first. */
223
9dae56ea 224 if (!isClass) {
b37bf2e1
A
225 const UChar* oldptr = ptr;
226 c -= '0';
227 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
228 c = c * 10 + *(++ptr) - '0';
229 if (c <= bracount) {
230 c = -(ESC_REF + c);
231 break;
232 }
233 ptr = oldptr; /* Put the pointer back and fall through */
234 }
235
236 /* Handle an octal number following \. If the first digit is 8 or 9,
237 this is not octal. */
238
9dae56ea
A
239 if ((c = *ptr) >= '8') {
240 c = '\\';
241 ptr -= 1;
b37bf2e1 242 break;
9dae56ea 243 }
b37bf2e1
A
244
245 /* \0 always starts an octal number, but we may drop through to here with a
246 larger first octal digit. */
247
248 case '0': {
249 c -= '0';
250 int i;
251 for (i = 1; i <= 2; ++i) {
252 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
253 break;
254 int cc = c * 8 + ptr[i] - '0';
255 if (cc > 255)
256 break;
257 c = cc;
258 }
259 ptr += i - 1;
260 break;
261 }
262
263 case 'x': {
264 c = 0;
265 int i;
266 for (i = 1; i <= 2; ++i) {
267 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
268 c = 'x';
269 i = 1;
270 break;
271 }
272 int cc = ptr[i];
273 if (cc >= 'a')
274 cc -= 32; /* Convert to upper case */
275 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
276 }
277 ptr += i - 1;
278 break;
279 }
280
281 case 'u': {
282 c = 0;
283 int i;
284 for (i = 1; i <= 4; ++i) {
285 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
286 c = 'u';
287 i = 1;
288 break;
289 }
290 int cc = ptr[i];
291 if (cc >= 'a')
292 cc -= 32; /* Convert to upper case */
293 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
294 }
295 ptr += i - 1;
296 break;
297 }
298
299 case 'c':
300 if (++ptr == patternEnd) {
9dae56ea 301 *errorCodePtr = ERR2;
b37bf2e1
A
302 return 0;
303 }
b37bf2e1 304
9dae56ea
A
305 c = *ptr;
306
307 /* To match Firefox, inside a character class, we also accept
308 numbers and '_' as control characters */
309 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) {
310 c = '\\';
311 ptr -= 2;
312 break;
313 }
314
b37bf2e1
A
315 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
316 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
317 c = toASCIIUpper(c) ^ 0x40;
318 break;
319 }
320 }
321
9dae56ea 322 *ptrPtr = ptr;
b37bf2e1
A
323 return c;
324}
325
326/*************************************************
327* Check for counted repeat *
328*************************************************/
329
330/* This function is called when a '{' is encountered in a place where it might
331start a quantifier. It looks ahead to see if it really is a quantifier or not.
332It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
333where the ddds are digits.
334
335Arguments:
336 p pointer to the first char after '{'
337
338Returns: true or false
339*/
340
341static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
342{
343 if (p >= patternEnd || !isASCIIDigit(*p))
344 return false;
345 p++;
346 while (p < patternEnd && isASCIIDigit(*p))
347 p++;
348 if (p < patternEnd && *p == '}')
349 return true;
350
351 if (p >= patternEnd || *p++ != ',')
352 return false;
353 if (p < patternEnd && *p == '}')
354 return true;
355
356 if (p >= patternEnd || !isASCIIDigit(*p))
357 return false;
358 p++;
359 while (p < patternEnd && isASCIIDigit(*p))
360 p++;
361
362 return (p < patternEnd && *p == '}');
363}
364
365/*************************************************
366* Read repeat counts *
367*************************************************/
368
369/* Read an item of the form {n,m} and return the values. This is called only
370after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
371so the syntax is guaranteed to be correct, but we need to check the values.
372
373Arguments:
374 p pointer to first char after '{'
375 minp pointer to int for min
376 maxp pointer to int for max
377 returned as -1 if no max
9dae56ea 378 errorCodePtr points to error code variable
b37bf2e1
A
379
380Returns: pointer to '}' on success;
9dae56ea 381 current ptr on error, with errorCodePtr set non-zero
b37bf2e1
A
382*/
383
9dae56ea 384static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
b37bf2e1
A
385{
386 int min = 0;
387 int max = -1;
388
389 /* Read the minimum value and do a paranoid check: a negative value indicates
390 an integer overflow. */
391
392 while (isASCIIDigit(*p))
393 min = min * 10 + *p++ - '0';
394 if (min < 0 || min > 65535) {
9dae56ea 395 *errorCodePtr = ERR5;
b37bf2e1
A
396 return p;
397 }
398
399 /* Read the maximum value if there is one, and again do a paranoid on its size.
400 Also, max must not be less than min. */
401
402 if (*p == '}')
403 max = min;
404 else {
405 if (*(++p) != '}') {
406 max = 0;
407 while (isASCIIDigit(*p))
408 max = max * 10 + *p++ - '0';
409 if (max < 0 || max > 65535) {
9dae56ea 410 *errorCodePtr = ERR5;
b37bf2e1
A
411 return p;
412 }
413 if (max < min) {
9dae56ea 414 *errorCodePtr = ERR4;
b37bf2e1
A
415 return p;
416 }
417 }
418 }
419
420 /* Fill in the required variables, and pass back the pointer to the terminating
421 '}'. */
422
423 *minp = min;
424 *maxp = max;
425 return p;
426}
427
428/*************************************************
429* Find first significant op code *
430*************************************************/
431
432/* This is called by several functions that scan a compiled expression looking
433for a fixed first character, or an anchoring op code etc. It skips over things
434that do not influence this.
435
436Arguments:
437 code pointer to the start of the group
438Returns: pointer to the first significant opcode
439*/
440
441static const unsigned char* firstSignificantOpcode(const unsigned char* code)
442{
443 while (*code == OP_BRANUMBER)
444 code += 3;
445 return code;
446}
447
448static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
449{
450 while (true) {
451 switch (*code) {
452 case OP_ASSERT_NOT:
453 advanceToEndOfBracket(code);
454 code += 1 + LINK_SIZE;
455 break;
456 case OP_WORD_BOUNDARY:
457 case OP_NOT_WORD_BOUNDARY:
458 ++code;
459 break;
460 case OP_BRANUMBER:
461 code += 3;
462 break;
463 default:
464 return code;
465 }
466 }
467}
468
469/*************************************************
470* Get othercase range *
471*************************************************/
472
473/* This function is passed the start and end of a class range, in UTF-8 mode
474with UCP support. It searches up the characters, looking for internal ranges of
475characters in the "other" case. Each call returns the next one, updating the
476start address.
477
478Arguments:
479 cptr points to starting character value; updated
480 d end value
481 ocptr where to put start of othercase range
482 odptr where to put end of othercase range
483
484Yield: true when range returned; false when no more
485*/
486
487static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
488{
489 int c, othercase = 0;
490
491 for (c = *cptr; c <= d; c++) {
9dae56ea 492 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0)
b37bf2e1
A
493 break;
494 }
495
496 if (c > d)
497 return false;
498
499 *ocptr = othercase;
500 int next = othercase + 1;
501
502 for (++c; c <= d; c++) {
9dae56ea 503 if (jsc_pcre_ucp_othercase(c) != next)
b37bf2e1
A
504 break;
505 next++;
506 }
507
508 *odptr = next - 1;
509 *cptr = c;
510
511 return true;
512}
513
514/*************************************************
515 * Convert character value to UTF-8 *
516 *************************************************/
517
518/* This function takes an integer value in the range 0 - 0x7fffffff
519 and encodes it as a UTF-8 character in 0 to 6 bytes.
520
521 Arguments:
522 cvalue the character value
523 buffer pointer to buffer for result - at least 6 bytes long
524
525 Returns: number of characters placed in the buffer
526 */
527
528static int encodeUTF8(int cvalue, unsigned char *buffer)
529{
530 int i;
9dae56ea
A
531 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
532 if (cvalue <= jsc_pcre_utf8_table1[i])
b37bf2e1
A
533 break;
534 buffer += i;
535 for (int j = i; j > 0; j--) {
536 *buffer-- = 0x80 | (cvalue & 0x3f);
537 cvalue >>= 6;
538 }
9dae56ea 539 *buffer = jsc_pcre_utf8_table2[i] | cvalue;
b37bf2e1
A
540 return i + 1;
541}
542
543/*************************************************
544* Compile one branch *
545*************************************************/
546
547/* Scan the pattern, compiling it into the code vector.
548
549Arguments:
550 options the option bits
551 brackets points to number of extracting brackets used
9dae56ea
A
552 codePtr points to the pointer to the current code point
553 ptrPtr points to the current pattern pointer
554 errorCodePtr points to error code variable
b37bf2e1
A
555 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
556 reqbyteptr set to the last literal character required, else < 0
557 cd contains pointers to tables etc.
558
559Returns: true on success
9dae56ea 560 false, with *errorCodePtr set non-zero on error
b37bf2e1
A
561*/
562
563static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
564{
565 return ((ptr + 1 < patternEnd) && ptr[1] == expected);
566}
567
568static bool
9dae56ea
A
569compileBranch(int options, int* brackets, unsigned char** codePtr,
570 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
b37bf2e1
A
571 int* reqbyteptr, CompileData& cd)
572{
9dae56ea
A
573 int repeatType, opType;
574 int repeatMin = 0, repeat_max = 0; /* To please picky compilers */
b37bf2e1
A
575 int bravalue = 0;
576 int reqvary, tempreqvary;
577 int c;
9dae56ea 578 unsigned char* code = *codePtr;
b37bf2e1 579 unsigned char* tempcode;
9dae56ea
A
580 bool didGroupSetFirstByte = false;
581 const UChar* ptr = *ptrPtr;
b37bf2e1
A
582 const UChar* tempptr;
583 unsigned char* previous = NULL;
584 unsigned char classbits[32];
585
586 bool class_utf8;
587 unsigned char* class_utf8data;
588 unsigned char utf8_char[6];
589
590 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
591 matching encountered yet". It gets changed to REQ_NONE if we hit something that
9dae56ea 592 matches a non-fixed char first char; reqByte just remains unset if we never
b37bf2e1
A
593 find one.
594
595 When we hit a repeat whose minimum is zero, we may have to adjust these values
596 to take the zero repeat into account. This is implemented by setting them to
9dae56ea 597 zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
b37bf2e1
A
598 item types that can be repeated set these backoff variables appropriately. */
599
9dae56ea
A
600 int firstByte = REQ_UNSET;
601 int reqByte = REQ_UNSET;
602 int zeroReqByte = REQ_UNSET;
603 int zeroFirstByte = REQ_UNSET;
b37bf2e1 604
9dae56ea 605 /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
b37bf2e1 606 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
9dae56ea 607 value > 255. It is added into the firstByte or reqByte variables to record the
b37bf2e1
A
608 case status of the value. This is used only for ASCII characters. */
609
9dae56ea 610 int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
b37bf2e1
A
611
612 /* Switch on next character until the end of the branch */
613
614 for (;; ptr++) {
9dae56ea
A
615 bool negateClass;
616 bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
617 int classCharCount;
618 int classLastChar;
619 int skipBytes;
620 int subReqByte;
621 int subFirstByte;
622 int mcLength;
b37bf2e1
A
623 unsigned char mcbuffer[8];
624
625 /* Next byte in the pattern */
626
627 c = ptr < patternEnd ? *ptr : 0;
628
629 /* Fill in length of a previous callout, except when the next thing is
630 a quantifier. */
631
9dae56ea 632 bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
b37bf2e1
A
633
634 switch (c) {
635 /* The branch terminates at end of string, |, or ). */
636
637 case 0:
638 if (ptr < patternEnd)
639 goto NORMAL_CHAR;
640 // End of string; fall through
641 case '|':
642 case ')':
9dae56ea
A
643 *firstbyteptr = firstByte;
644 *reqbyteptr = reqByte;
645 *codePtr = code;
646 *ptrPtr = ptr;
b37bf2e1
A
647 return true;
648
649 /* Handle single-character metacharacters. In multiline mode, ^ disables
650 the setting of any following char as a first character. */
9dae56ea 651
b37bf2e1
A
652 case '^':
653 if (options & MatchAcrossMultipleLinesOption) {
9dae56ea
A
654 if (firstByte == REQ_UNSET)
655 firstByte = REQ_NONE;
656 *code++ = OP_BOL;
657 } else
658 *code++ = OP_CIRC;
b37bf2e1 659 previous = NULL;
b37bf2e1 660 break;
9dae56ea 661
b37bf2e1
A
662 case '$':
663 previous = NULL;
9dae56ea
A
664 if (options & MatchAcrossMultipleLinesOption)
665 *code++ = OP_EOL;
666 else
667 *code++ = OP_DOLL;
b37bf2e1 668 break;
9dae56ea 669
b37bf2e1 670 /* There can never be a first char if '.' is first, whatever happens about
9dae56ea
A
671 repeats. The value of reqByte doesn't change either. */
672
b37bf2e1 673 case '.':
9dae56ea
A
674 if (firstByte == REQ_UNSET)
675 firstByte = REQ_NONE;
676 zeroFirstByte = firstByte;
677 zeroReqByte = reqByte;
b37bf2e1
A
678 previous = code;
679 *code++ = OP_NOT_NEWLINE;
680 break;
681
682 /* Character classes. If the included characters are all < 256, we build a
683 32-byte bitmap of the permitted characters, except in the special case
684 where there is only one such character. For negated classes, we build the
685 map as usual, then invert it at the end. However, we use a different opcode
686 so that data characters > 255 can be handled correctly.
687
688 If the class contains characters outside the 0-255 range, a different
689 opcode is compiled. It may optionally have a bit map for characters < 256,
690 but those above are are explicitly listed afterwards. A flag byte tells
691 whether the bitmap is present, and whether this is a negated class or not.
692 */
693
694 case '[': {
695 previous = code;
9dae56ea 696 shouldFlipNegation = false;
b37bf2e1
A
697
698 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
699 they are encountered at the top level, so we'll do that too. */
700
701 /* If the first character is '^', set the negation flag and skip it. */
702
703 if (ptr + 1 >= patternEnd) {
9dae56ea 704 *errorCodePtr = ERR6;
b37bf2e1
A
705 return false;
706 }
707
708 if (ptr[1] == '^') {
9dae56ea 709 negateClass = true;
b37bf2e1
A
710 ++ptr;
711 } else
9dae56ea 712 negateClass = false;
b37bf2e1
A
713
714 /* Keep a count of chars with values < 256 so that we can optimize the case
715 of just a single character (as long as it's < 256). For higher valued UTF-8
716 characters, we don't yet do any optimization. */
717
9dae56ea
A
718 classCharCount = 0;
719 classLastChar = -1;
b37bf2e1
A
720
721 class_utf8 = false; /* No chars >= 256 */
722 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
723
724 /* Initialize the 32-char bit map to all zeros. We have to build the
725 map in a temporary bit of store, in case the class contains only 1
726 character (< 256), because in that case the compiled code doesn't use the
727 bit map. */
728
729 memset(classbits, 0, 32 * sizeof(unsigned char));
730
731 /* Process characters until ] is reached. The first pass
732 through the regex checked the overall syntax, so we don't need to be very
733 strict here. At the start of the loop, c contains the first byte of the
734 character. */
735
736 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
737 /* Backslash may introduce a single character, or it may introduce one
738 of the specials, which just set a flag. Escaped items are checked for
739 validity in the pre-compiling pass. The sequence \b is a special case.
740 Inside a class (and only there) it is treated as backspace. Elsewhere
741 it marks a word boundary. Other escapes have preset maps ready to
742 or into the one we are building. We assume they have more than one
9dae56ea 743 character in them, so set classCharCount bigger than one. */
b37bf2e1
A
744
745 if (c == '\\') {
9dae56ea 746 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
b37bf2e1 747 if (c < 0) {
9dae56ea 748 classCharCount += 2; /* Greater than 1 is what matters */
b37bf2e1
A
749 switch (-c) {
750 case ESC_d:
751 for (c = 0; c < 32; c++)
752 classbits[c] |= classBitmapForChar(c + cbit_digit);
753 continue;
754
755 case ESC_D:
9dae56ea 756 shouldFlipNegation = true;
b37bf2e1
A
757 for (c = 0; c < 32; c++)
758 classbits[c] |= ~classBitmapForChar(c + cbit_digit);
759 continue;
760
761 case ESC_w:
762 for (c = 0; c < 32; c++)
763 classbits[c] |= classBitmapForChar(c + cbit_word);
764 continue;
765
766 case ESC_W:
9dae56ea 767 shouldFlipNegation = true;
b37bf2e1
A
768 for (c = 0; c < 32; c++)
769 classbits[c] |= ~classBitmapForChar(c + cbit_word);
770 continue;
771
772 case ESC_s:
773 for (c = 0; c < 32; c++)
774 classbits[c] |= classBitmapForChar(c + cbit_space);
775 continue;
776
777 case ESC_S:
9dae56ea 778 shouldFlipNegation = true;
b37bf2e1
A
779 for (c = 0; c < 32; c++)
780 classbits[c] |= ~classBitmapForChar(c + cbit_space);
781 continue;
782
783 /* Unrecognized escapes are faulted if PCRE is running in its
784 strict mode. By default, for compatibility with Perl, they are
785 treated as literals. */
786
787 default:
788 c = *ptr; /* The final character */
9dae56ea 789 classCharCount -= 2; /* Undo the default count from above */
b37bf2e1
A
790 }
791 }
792
793 /* Fall through if we have a single character (c >= 0). This may be
794 > 256 in UTF-8 mode. */
795
796 } /* End of backslash handling */
797
798 /* A single character may be followed by '-' to form a range. However,
799 Perl does not permit ']' to be the end of the range. A '-' character
800 here is treated as a literal. */
801
802 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
803 ptr += 2;
804
805 int d = *ptr;
806
807 /* The second part of a range can be a single-character escape, but
808 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
809 in such circumstances. */
810
811 if (d == '\\') {
812 const UChar* oldptr = ptr;
9dae56ea 813 d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
b37bf2e1
A
814
815 /* \X is literal X; any other special means the '-' was literal */
816 if (d < 0) {
817 ptr = oldptr - 2;
818 goto LONE_SINGLE_CHARACTER; /* A few lines below */
819 }
820 }
821
822 /* The check that the two values are in the correct order happens in
823 the pre-pass. Optimize one-character ranges */
824
825 if (d == c)
826 goto LONE_SINGLE_CHARACTER; /* A few lines below */
827
828 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
829 matching, we have to use an XCLASS with extra data items. Caseless
830 matching for characters > 127 is available only if UCP support is
831 available. */
832
833 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
834 class_utf8 = true;
835
836 /* With UCP support, we can find the other case equivalents of
837 the relevant characters. There may be several ranges. Optimize how
838 they fit with the basic range. */
839
840 if (options & IgnoreCaseOption) {
841 int occ, ocd;
842 int cc = c;
843 int origd = d;
844 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
845 if (occ >= c && ocd <= d)
846 continue; /* Skip embedded ranges */
847
848 if (occ < c && ocd >= c - 1) /* Extend the basic range */
849 { /* if there is overlap, */
850 c = occ; /* noting that if occ < c */
851 continue; /* we can't have ocd > d */
852 } /* because a subrange is */
853 if (ocd > d && occ <= d + 1) /* always shorter than */
854 { /* the basic range. */
855 d = ocd;
856 continue;
857 }
858
859 if (occ == ocd)
860 *class_utf8data++ = XCL_SINGLE;
861 else {
862 *class_utf8data++ = XCL_RANGE;
863 class_utf8data += encodeUTF8(occ, class_utf8data);
864 }
865 class_utf8data += encodeUTF8(ocd, class_utf8data);
866 }
867 }
868
869 /* Now record the original range, possibly modified for UCP caseless
870 overlapping ranges. */
871
872 *class_utf8data++ = XCL_RANGE;
873 class_utf8data += encodeUTF8(c, class_utf8data);
874 class_utf8data += encodeUTF8(d, class_utf8data);
875
876 /* With UCP support, we are done. Without UCP support, there is no
877 caseless matching for UTF-8 characters > 127; we can use the bit map
878 for the smaller ones. */
879
880 continue; /* With next character in the class */
881 }
882
883 /* We use the bit map for all cases when not in UTF-8 mode; else
884 ranges that lie entirely within 0-127 when there is UCP support; else
885 for partial ranges without UCP support. */
886
887 for (; c <= d; c++) {
888 classbits[c/8] |= (1 << (c&7));
889 if (options & IgnoreCaseOption) {
890 int uc = flipCase(c);
891 classbits[uc/8] |= (1 << (uc&7));
892 }
9dae56ea
A
893 classCharCount++; /* in case a one-char range */
894 classLastChar = c;
b37bf2e1
A
895 }
896
897 continue; /* Go get the next char in the class */
898 }
899
900 /* Handle a lone single character - we can get here for a normal
901 non-escape char, or after \ that introduces a single character or for an
902 apparent range that isn't. */
903
904 LONE_SINGLE_CHARACTER:
905
906 /* Handle a character that cannot go in the bit map */
907
908 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
909 class_utf8 = true;
910 *class_utf8data++ = XCL_SINGLE;
911 class_utf8data += encodeUTF8(c, class_utf8data);
912
913 if (options & IgnoreCaseOption) {
914 int othercase;
9dae56ea 915 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) {
b37bf2e1
A
916 *class_utf8data++ = XCL_SINGLE;
917 class_utf8data += encodeUTF8(othercase, class_utf8data);
918 }
919 }
920 } else {
921 /* Handle a single-byte character */
922 classbits[c/8] |= (1 << (c&7));
923 if (options & IgnoreCaseOption) {
924 c = flipCase(c);
925 classbits[c/8] |= (1 << (c&7));
926 }
9dae56ea
A
927 classCharCount++;
928 classLastChar = c;
b37bf2e1
A
929 }
930 }
931
9dae56ea 932 /* If classCharCount is 1, we saw precisely one character whose value is
b37bf2e1
A
933 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
934 can optimize the negative case only if there were no characters >= 128
935 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
936 single-bytes only. This is an historical hangover. Maybe one day we can
937 tidy these opcodes to handle multi-byte characters.
938
939 The optimization throws away the bit map. We turn the item into a
940 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
941 that OP_NOT does not support multibyte characters. In the positive case, it
9dae56ea 942 can cause firstByte to be set. Otherwise, there can be no first char if
b37bf2e1 943 this item is first, whatever repeat count may follow. In the case of
9dae56ea 944 reqByte, save the previous value for reinstating. */
b37bf2e1 945
9dae56ea
A
946 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
947 zeroReqByte = reqByte;
b37bf2e1
A
948
949 /* The OP_NOT opcode works on one-byte characters only. */
950
9dae56ea
A
951 if (negateClass) {
952 if (firstByte == REQ_UNSET)
953 firstByte = REQ_NONE;
954 zeroFirstByte = firstByte;
b37bf2e1 955 *code++ = OP_NOT;
9dae56ea 956 *code++ = classLastChar;
b37bf2e1
A
957 break;
958 }
959
960 /* For a single, positive character, get the value into c, and
961 then we can handle this with the normal one-character code. */
962
9dae56ea 963 c = classLastChar;
b37bf2e1
A
964 goto NORMAL_CHAR;
965 } /* End of 1-char optimization */
966
967 /* The general case - not the one-char optimization. If this is the first
968 thing in the branch, there can be no first char setting, whatever the
9dae56ea 969 repeat count. Any reqByte setting must remain unchanged after any kind of
b37bf2e1
A
970 repeat. */
971
9dae56ea
A
972 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
973 zeroFirstByte = firstByte;
974 zeroReqByte = reqByte;
b37bf2e1
A
975
976 /* If there are characters with values > 255, we have to compile an
977 extended class, with its own opcode. If there are no characters < 256,
978 we can omit the bitmap. */
979
9dae56ea 980 if (class_utf8 && !shouldFlipNegation) {
b37bf2e1
A
981 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
982 *code++ = OP_XCLASS;
983 code += LINK_SIZE;
9dae56ea 984 *code = negateClass? XCL_NOT : 0;
b37bf2e1
A
985
986 /* If the map is required, install it, and move on to the end of
987 the extra data */
988
9dae56ea 989 if (classCharCount > 0) {
b37bf2e1
A
990 *code++ |= XCL_MAP;
991 memcpy(code, classbits, 32);
992 code = class_utf8data;
993 }
994
995 /* If the map is not required, slide down the extra data. */
996
997 else {
998 int len = class_utf8data - (code + 33);
999 memmove(code + 1, code + 33, len);
1000 code += len + 1;
1001 }
1002
1003 /* Now fill in the complete length of the item */
1004
1005 putLinkValue(previous + 1, code - previous);
1006 break; /* End of class handling */
1007 }
1008
1009 /* If there are no characters > 255, negate the 32-byte map if necessary,
1010 and copy it into the code vector. If this is the first thing in the branch,
9dae56ea 1011 there can be no first char setting, whatever the repeat count. Any reqByte
b37bf2e1
A
1012 setting must remain unchanged after any kind of repeat. */
1013
9dae56ea
A
1014 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
1015 if (negateClass)
b37bf2e1
A
1016 for (c = 0; c < 32; c++)
1017 code[c] = ~classbits[c];
1018 else
1019 memcpy(code, classbits, 32);
1020 code += 32;
1021 break;
1022 }
1023
1024 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1025 has been tested above. */
1026
1027 case '{':
9dae56ea 1028 if (!isQuantifier)
b37bf2e1 1029 goto NORMAL_CHAR;
9dae56ea
A
1030 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr);
1031 if (*errorCodePtr)
b37bf2e1
A
1032 goto FAILED;
1033 goto REPEAT;
1034
1035 case '*':
9dae56ea 1036 repeatMin = 0;
b37bf2e1
A
1037 repeat_max = -1;
1038 goto REPEAT;
1039
1040 case '+':
9dae56ea 1041 repeatMin = 1;
b37bf2e1
A
1042 repeat_max = -1;
1043 goto REPEAT;
1044
1045 case '?':
9dae56ea 1046 repeatMin = 0;
b37bf2e1
A
1047 repeat_max = 1;
1048
1049 REPEAT:
1050 if (!previous) {
9dae56ea 1051 *errorCodePtr = ERR9;
b37bf2e1
A
1052 goto FAILED;
1053 }
1054
9dae56ea
A
1055 if (repeatMin == 0) {
1056 firstByte = zeroFirstByte; /* Adjust for zero repeat */
1057 reqByte = zeroReqByte; /* Ditto */
b37bf2e1
A
1058 }
1059
1060 /* Remember whether this is a variable length repeat */
1061
9dae56ea 1062 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
b37bf2e1 1063
9dae56ea 1064 opType = 0; /* Default single-char op codes */
b37bf2e1
A
1065
1066 /* Save start of previous item, in case we have to move it up to make space
1067 for an inserted OP_ONCE for the additional '+' extension. */
1068 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
1069
1070 tempcode = previous;
1071
1072 /* If the next character is '+', we have a possessive quantifier. This
1073 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1074 If the next character is '?' this is a minimizing repeat, by default,
1075 but if PCRE_UNGREEDY is set, it works the other way round. We change the
1076 repeat type to the non-default. */
1077
1078 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
9dae56ea 1079 repeatType = 1;
b37bf2e1
A
1080 ptr++;
1081 } else
9dae56ea 1082 repeatType = 0;
b37bf2e1
A
1083
1084 /* If previous was a character match, abolish the item and generate a
1085 repeat item instead. If a char item has a minumum of more than one, ensure
9dae56ea
A
1086 that it is set in reqByte - it might not be if a sequence such as x{3} is
1087 the first thing in a branch because the x will have gone into firstByte
b37bf2e1
A
1088 instead. */
1089
1090 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1091 /* Deal with UTF-8 characters that take up more than one byte. It's
1092 easier to write this out separately than try to macrify it. Use c to
1093 hold the length of the character in bytes, plus 0x80 to flag that it's a
1094 length rather than a small character. */
1095
1096 if (code[-1] & 0x80) {
1097 unsigned char *lastchar = code - 1;
1098 while((*lastchar & 0xc0) == 0x80)
1099 lastchar--;
1100 c = code - lastchar; /* Length of UTF-8 character */
1101 memcpy(utf8_char, lastchar, c); /* Save the char */
1102 c |= 0x80; /* Flag c as a length */
1103 }
1104 else {
1105 c = code[-1];
9dae56ea
A
1106 if (repeatMin > 1)
1107 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
b37bf2e1
A
1108 }
1109
1110 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1111 }
1112
1113 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
1114 c = previous[1];
9dae56ea
A
1115 if (repeatMin > 1)
1116 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
b37bf2e1
A
1117 goto OUTPUT_SINGLE_REPEAT;
1118 }
1119
1120 /* If previous was a single negated character ([^a] or similar), we use
1121 one of the special opcodes, replacing it. The code is shared with single-
1122 character repeats by setting opt_type to add a suitable offset into
9dae56ea 1123 repeatType. OP_NOT is currently used only for single-byte chars. */
b37bf2e1
A
1124
1125 else if (*previous == OP_NOT) {
9dae56ea 1126 opType = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
b37bf2e1
A
1127 c = previous[1];
1128 goto OUTPUT_SINGLE_REPEAT;
1129 }
1130
1131 /* If previous was a character type match (\d or similar), abolish it and
1132 create a suitable repeat item. The code is shared with single-character
9dae56ea 1133 repeats by setting opType to add a suitable offset into repeatType. */
b37bf2e1
A
1134
1135 else if (*previous <= OP_NOT_NEWLINE) {
9dae56ea 1136 opType = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
b37bf2e1
A
1137 c = *previous;
1138
1139 OUTPUT_SINGLE_REPEAT:
1140 int prop_type = -1;
1141 int prop_value = -1;
1142
1143 unsigned char* oldcode = code;
1144 code = previous; /* Usually overwrite previous item */
1145
1146 /* If the maximum is zero then the minimum must also be zero; Perl allows
1147 this case, so we do too - by simply omitting the item altogether. */
1148
1149 if (repeat_max == 0)
1150 goto END_REPEAT;
1151
9dae56ea 1152 /* Combine the opType with the repeatType */
b37bf2e1 1153
9dae56ea 1154 repeatType += opType;
b37bf2e1
A
1155
1156 /* A minimum of zero is handled either as the special case * or ?, or as
1157 an UPTO, with the maximum given. */
1158
9dae56ea 1159 if (repeatMin == 0) {
b37bf2e1 1160 if (repeat_max == -1)
9dae56ea 1161 *code++ = OP_STAR + repeatType;
b37bf2e1 1162 else if (repeat_max == 1)
9dae56ea 1163 *code++ = OP_QUERY + repeatType;
b37bf2e1 1164 else {
9dae56ea 1165 *code++ = OP_UPTO + repeatType;
b37bf2e1
A
1166 put2ByteValueAndAdvance(code, repeat_max);
1167 }
1168 }
1169
1170 /* A repeat minimum of 1 is optimized into some special cases. If the
1171 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1172 left in place and, if the maximum is greater than 1, we use OP_UPTO with
1173 one less than the maximum. */
1174
9dae56ea 1175 else if (repeatMin == 1) {
b37bf2e1 1176 if (repeat_max == -1)
9dae56ea 1177 *code++ = OP_PLUS + repeatType;
b37bf2e1
A
1178 else {
1179 code = oldcode; /* leave previous item in place */
1180 if (repeat_max == 1)
1181 goto END_REPEAT;
9dae56ea 1182 *code++ = OP_UPTO + repeatType;
b37bf2e1
A
1183 put2ByteValueAndAdvance(code, repeat_max - 1);
1184 }
1185 }
1186
1187 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1188 handled as an EXACT followed by an UPTO. */
1189
1190 else {
9dae56ea
A
1191 *code++ = OP_EXACT + opType; /* NB EXACT doesn't have repeatType */
1192 put2ByteValueAndAdvance(code, repeatMin);
b37bf2e1
A
1193
1194 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1195 we have to insert the character for the previous code. For a repeated
1196 Unicode property match, there are two extra bytes that define the
1197 required property. In UTF-8 mode, long characters have their length in
1198 c, with the 0x80 bit as a flag. */
1199
1200 if (repeat_max < 0) {
1201 if (c >= 128) {
1202 memcpy(code, utf8_char, c & 7);
1203 code += c & 7;
1204 } else {
1205 *code++ = c;
1206 if (prop_type >= 0) {
1207 *code++ = prop_type;
1208 *code++ = prop_value;
1209 }
1210 }
9dae56ea 1211 *code++ = OP_STAR + repeatType;
b37bf2e1
A
1212 }
1213
1214 /* Else insert an UPTO if the max is greater than the min, again
1215 preceded by the character, for the previously inserted code. */
1216
9dae56ea 1217 else if (repeat_max != repeatMin) {
b37bf2e1
A
1218 if (c >= 128) {
1219 memcpy(code, utf8_char, c & 7);
1220 code += c & 7;
1221 } else
1222 *code++ = c;
1223 if (prop_type >= 0) {
1224 *code++ = prop_type;
1225 *code++ = prop_value;
1226 }
9dae56ea
A
1227 repeat_max -= repeatMin;
1228 *code++ = OP_UPTO + repeatType;
b37bf2e1
A
1229 put2ByteValueAndAdvance(code, repeat_max);
1230 }
1231 }
1232
1233 /* The character or character type itself comes last in all cases. */
1234
1235 if (c >= 128) {
1236 memcpy(code, utf8_char, c & 7);
1237 code += c & 7;
1238 } else
1239 *code++ = c;
1240
1241 /* For a repeated Unicode property match, there are two extra bytes that
1242 define the required property. */
1243
1244 if (prop_type >= 0) {
1245 *code++ = prop_type;
1246 *code++ = prop_value;
1247 }
1248 }
1249
1250 /* If previous was a character class or a back reference, we put the repeat
1251 stuff after it, but just skip the item if the repeat was {0,0}. */
1252
1253 else if (*previous == OP_CLASS ||
1254 *previous == OP_NCLASS ||
1255 *previous == OP_XCLASS ||
1256 *previous == OP_REF)
1257 {
1258 if (repeat_max == 0) {
1259 code = previous;
1260 goto END_REPEAT;
1261 }
1262
9dae56ea
A
1263 if (repeatMin == 0 && repeat_max == -1)
1264 *code++ = OP_CRSTAR + repeatType;
1265 else if (repeatMin == 1 && repeat_max == -1)
1266 *code++ = OP_CRPLUS + repeatType;
1267 else if (repeatMin == 0 && repeat_max == 1)
1268 *code++ = OP_CRQUERY + repeatType;
b37bf2e1 1269 else {
9dae56ea
A
1270 *code++ = OP_CRRANGE + repeatType;
1271 put2ByteValueAndAdvance(code, repeatMin);
b37bf2e1
A
1272 if (repeat_max == -1)
1273 repeat_max = 0; /* 2-byte encoding for max */
1274 put2ByteValueAndAdvance(code, repeat_max);
1275 }
1276 }
1277
1278 /* If previous was a bracket group, we may have to replicate it in certain
1279 cases. */
1280
1281 else if (*previous >= OP_BRA) {
1282 int ketoffset = 0;
1283 int len = code - previous;
1284 unsigned char* bralink = NULL;
1285
1286 /* If the maximum repeat count is unlimited, find the end of the bracket
1287 by scanning through from the start, and compute the offset back to it
1288 from the current code pointer. There may be an OP_OPT setting following
1289 the final KET, so we can't find the end just by going back from the code
1290 pointer. */
1291
1292 if (repeat_max == -1) {
1293 const unsigned char* ket = previous;
1294 advanceToEndOfBracket(ket);
1295 ketoffset = code - ket;
1296 }
1297
1298 /* The case of a zero minimum is special because of the need to stick
1299 OP_BRAZERO in front of it, and because the group appears once in the
1300 data, whereas in other cases it appears the minimum number of times. For
1301 this reason, it is simplest to treat this case separately, as otherwise
1302 the code gets far too messy. There are several special subcases when the
1303 minimum is zero. */
1304
9dae56ea 1305 if (repeatMin == 0) {
b37bf2e1
A
1306 /* If the maximum is also zero, we just omit the group from the output
1307 altogether. */
1308
1309 if (repeat_max == 0) {
1310 code = previous;
1311 goto END_REPEAT;
1312 }
1313
1314 /* If the maximum is 1 or unlimited, we just have to stick in the
1315 BRAZERO and do no more at this point. However, we do need to adjust
1316 any OP_RECURSE calls inside the group that refer to the group itself or
1317 any internal group, because the offset is from the start of the whole
1318 regex. Temporarily terminate the pattern while doing this. */
1319
1320 if (repeat_max <= 1) {
1321 *code = OP_END;
1322 memmove(previous+1, previous, len);
1323 code++;
9dae56ea 1324 *previous++ = OP_BRAZERO + repeatType;
b37bf2e1
A
1325 }
1326
1327 /* If the maximum is greater than 1 and limited, we have to replicate
1328 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1329 The first one has to be handled carefully because it's the original
1330 copy, which has to be moved up. The remainder can be handled by code
1331 that is common with the non-zero minimum case below. We have to
1332 adjust the value of repeat_max, since one less copy is required. */
1333
1334 else {
1335 *code = OP_END;
1336 memmove(previous + 2 + LINK_SIZE, previous, len);
1337 code += 2 + LINK_SIZE;
9dae56ea 1338 *previous++ = OP_BRAZERO + repeatType;
b37bf2e1
A
1339 *previous++ = OP_BRA;
1340
1341 /* We chain together the bracket offset fields that have to be
1342 filled in later when the ends of the brackets are reached. */
1343
1344 int offset = (!bralink) ? 0 : previous - bralink;
1345 bralink = previous;
1346 putLinkValueAllowZeroAndAdvance(previous, offset);
1347 }
1348
1349 repeat_max--;
1350 }
1351
1352 /* If the minimum is greater than zero, replicate the group as many
1353 times as necessary, and adjust the maximum to the number of subsequent
1354 copies that we need. If we set a first char from the group, and didn't
1355 set a required char, copy the latter from the former. */
1356
1357 else {
9dae56ea
A
1358 if (repeatMin > 1) {
1359 if (didGroupSetFirstByte && reqByte < 0)
1360 reqByte = firstByte;
1361 for (int i = 1; i < repeatMin; i++) {
b37bf2e1
A
1362 memcpy(code, previous, len);
1363 code += len;
1364 }
1365 }
1366 if (repeat_max > 0)
9dae56ea 1367 repeat_max -= repeatMin;
b37bf2e1
A
1368 }
1369
1370 /* This code is common to both the zero and non-zero minimum cases. If
1371 the maximum is limited, it replicates the group in a nested fashion,
1372 remembering the bracket starts on a stack. In the case of a zero minimum,
1373 the first one was set up above. In all cases the repeat_max now specifies
1374 the number of additional copies needed. */
1375
1376 if (repeat_max >= 0) {
1377 for (int i = repeat_max - 1; i >= 0; i--) {
9dae56ea 1378 *code++ = OP_BRAZERO + repeatType;
b37bf2e1
A
1379
1380 /* All but the final copy start a new nesting, maintaining the
1381 chain of brackets outstanding. */
1382
1383 if (i != 0) {
1384 *code++ = OP_BRA;
1385 int offset = (!bralink) ? 0 : code - bralink;
1386 bralink = code;
1387 putLinkValueAllowZeroAndAdvance(code, offset);
1388 }
1389
1390 memcpy(code, previous, len);
1391 code += len;
1392 }
1393
1394 /* Now chain through the pending brackets, and fill in their length
1395 fields (which are holding the chain links pro tem). */
1396
1397 while (bralink) {
1398 int offset = code - bralink + 1;
1399 unsigned char* bra = code - offset;
1400 int oldlinkoffset = getLinkValueAllowZero(bra + 1);
1401 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
1402 *code++ = OP_KET;
1403 putLinkValueAndAdvance(code, offset);
1404 putLinkValue(bra + 1, offset);
1405 }
1406 }
1407
1408 /* If the maximum is unlimited, set a repeater in the final copy. We
1409 can't just offset backwards from the current code point, because we
1410 don't know if there's been an options resetting after the ket. The
1411 correct offset was computed above. */
1412
1413 else
9dae56ea
A
1414 code[-ketoffset] = OP_KETRMAX + repeatType;
1415 }
1416
1417 // A quantifier after an assertion is mostly meaningless, but it
1418 // can nullify the assertion if it has a 0 minimum.
1419 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) {
1420 if (repeatMin == 0) {
1421 code = previous;
1422 goto END_REPEAT;
1423 }
b37bf2e1
A
1424 }
1425
1426 /* Else there's some kind of shambles */
1427
1428 else {
9dae56ea 1429 *errorCodePtr = ERR11;
b37bf2e1
A
1430 goto FAILED;
1431 }
1432
1433 /* In all case we no longer have a previous item. We also set the
1434 "follows varying string" flag for subsequently encountered reqbytes if
1435 it isn't already set and we have just passed a varying length item. */
1436
1437 END_REPEAT:
1438 previous = NULL;
9dae56ea 1439 cd.reqVaryOpt |= reqvary;
b37bf2e1
A
1440 break;
1441
1442 /* Start of nested bracket sub-expression, or comment or lookahead or
1443 lookbehind or option setting or condition. First deal with special things
1444 that can come after a bracket; all are introduced by ?, and the appearance
1445 of any of them means that this is not a referencing group. They were
1446 checked for validity in the first pass over the string, so we don't have to
1447 check for syntax errors here. */
1448
1449 case '(':
9dae56ea 1450 skipBytes = 0;
b37bf2e1
A
1451
1452 if (*(++ptr) == '?') {
1453 switch (*(++ptr)) {
1454 case ':': /* Non-extracting bracket */
1455 bravalue = OP_BRA;
1456 ptr++;
1457 break;
1458
1459 case '=': /* Positive lookahead */
1460 bravalue = OP_ASSERT;
1461 ptr++;
1462 break;
1463
1464 case '!': /* Negative lookahead */
1465 bravalue = OP_ASSERT_NOT;
1466 ptr++;
1467 break;
1468
1469 /* Character after (? not specially recognized */
1470
1471 default:
9dae56ea 1472 *errorCodePtr = ERR12;
b37bf2e1
A
1473 goto FAILED;
1474 }
1475 }
1476
1477 /* Else we have a referencing group; adjust the opcode. If the bracket
1478 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1479 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1480
1481 else {
1482 if (++(*brackets) > EXTRACT_BASIC_MAX) {
1483 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1484 code[1 + LINK_SIZE] = OP_BRANUMBER;
1485 put2ByteValue(code + 2 + LINK_SIZE, *brackets);
9dae56ea 1486 skipBytes = 3;
b37bf2e1
A
1487 }
1488 else
1489 bravalue = OP_BRA + *brackets;
1490 }
1491
9dae56ea
A
1492 /* Process nested bracketed re. We copy code into a non-variable
1493 in order to be able to pass its address because some compilers
1494 complain otherwise. Pass in a new setting for the ims options
1495 if they have changed. */
b37bf2e1 1496
9dae56ea 1497 previous = code;
b37bf2e1
A
1498 *code = bravalue;
1499 tempcode = code;
9dae56ea 1500 tempreqvary = cd.reqVaryOpt; /* Save value before bracket */
b37bf2e1
A
1501
1502 if (!compileBracket(
1503 options,
1504 brackets, /* Extracting bracket count */
1505 &tempcode, /* Where to put code (updated) */
1506 &ptr, /* Input pointer (updated) */
1507 patternEnd,
9dae56ea
A
1508 errorCodePtr, /* Where to put an error message */
1509 skipBytes, /* Skip over OP_BRANUMBER */
1510 &subFirstByte, /* For possible first char */
1511 &subReqByte, /* For possible last char */
b37bf2e1
A
1512 cd)) /* Tables block */
1513 goto FAILED;
1514
1515 /* At the end of compiling, code is still pointing to the start of the
1516 group, while tempcode has been updated to point past the end of the group
1517 and any option resetting that may follow it. The pattern pointer (ptr)
1518 is on the bracket. */
1519
1520 /* Handle updating of the required and first characters. Update for normal
1521 brackets of all kinds, and conditions with two branches (see code above).
1522 If the bracket is followed by a quantifier with zero repeat, we have to
9dae56ea 1523 back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
b37bf2e1
A
1524 main loop so that they can be accessed for the back off. */
1525
9dae56ea
A
1526 zeroReqByte = reqByte;
1527 zeroFirstByte = firstByte;
1528 didGroupSetFirstByte = false;
b37bf2e1
A
1529
1530 if (bravalue >= OP_BRA) {
9dae56ea 1531 /* If we have not yet set a firstByte in this branch, take it from the
b37bf2e1 1532 subpattern, remembering that it was set here so that a repeat of more
9dae56ea
A
1533 than one can replicate it as reqByte if necessary. If the subpattern has
1534 no firstByte, set "none" for the whole branch. In both cases, a zero
1535 repeat forces firstByte to "none". */
1536
1537 if (firstByte == REQ_UNSET) {
1538 if (subFirstByte >= 0) {
1539 firstByte = subFirstByte;
1540 didGroupSetFirstByte = true;
b37bf2e1
A
1541 }
1542 else
9dae56ea
A
1543 firstByte = REQ_NONE;
1544 zeroFirstByte = REQ_NONE;
b37bf2e1
A
1545 }
1546
9dae56ea
A
1547 /* If firstByte was previously set, convert the subpattern's firstByte
1548 into reqByte if there wasn't one, using the vary flag that was in
b37bf2e1
A
1549 existence beforehand. */
1550
9dae56ea
A
1551 else if (subFirstByte >= 0 && subReqByte < 0)
1552 subReqByte = subFirstByte | tempreqvary;
b37bf2e1
A
1553
1554 /* If the subpattern set a required byte (or set a first byte that isn't
1555 really the first byte - see above), set it. */
1556
9dae56ea
A
1557 if (subReqByte >= 0)
1558 reqByte = subReqByte;
b37bf2e1
A
1559 }
1560
9dae56ea 1561 /* For a forward assertion, we take the reqByte, if set. This can be
b37bf2e1 1562 helpful if the pattern that follows the assertion doesn't set a different
9dae56ea 1563 char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
b37bf2e1 1564 for an assertion, however because it leads to incorrect effect for patterns
9dae56ea
A
1565 such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
1566 of a firstByte. This is overcome by a scan at the end if there's no
1567 firstByte, looking for an asserted first char. */
b37bf2e1 1568
9dae56ea
A
1569 else if (bravalue == OP_ASSERT && subReqByte >= 0)
1570 reqByte = subReqByte;
b37bf2e1
A
1571
1572 /* Now update the main code pointer to the end of the group. */
1573
1574 code = tempcode;
1575
1576 /* Error if hit end of pattern */
1577
1578 if (ptr >= patternEnd || *ptr != ')') {
9dae56ea 1579 *errorCodePtr = ERR14;
b37bf2e1
A
1580 goto FAILED;
1581 }
1582 break;
1583
1584 /* Check \ for being a real metacharacter; if not, fall through and handle
1585 it as a data character at the start of a string. Escape items are checked
1586 for validity in the pre-compiling pass. */
1587
1588 case '\\':
1589 tempptr = ptr;
9dae56ea 1590 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false);
b37bf2e1
A
1591
1592 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1593 are arranged to be the negation of the corresponding OP_values. For the
1594 back references, the values are ESC_REF plus the reference number. Only
1595 back references and those types that consume a character may be repeated.
1596 We can test for values between ESC_b and ESC_w for the latter; this may
1597 have to change if any new ones are ever created. */
1598
1599 if (c < 0) {
1600 /* For metasequences that actually match a character, we disable the
1601 setting of a first character if it hasn't already been set. */
1602
9dae56ea
A
1603 if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1604 firstByte = REQ_NONE;
b37bf2e1
A
1605
1606 /* Set values to reset to if this is followed by a zero repeat. */
1607
9dae56ea
A
1608 zeroFirstByte = firstByte;
1609 zeroReqByte = reqByte;
b37bf2e1
A
1610
1611 /* Back references are handled specially */
1612
1613 if (-c >= ESC_REF) {
1614 int number = -c - ESC_REF;
1615 previous = code;
1616 *code++ = OP_REF;
1617 put2ByteValueAndAdvance(code, number);
1618 }
1619
1620 /* For the rest, we can obtain the OP value by negating the escape
1621 value */
1622
1623 else {
1624 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1625 *code++ = -c;
1626 }
1627 continue;
1628 }
1629
1630 /* Fall through. */
1631
1632 /* Handle a literal character. It is guaranteed not to be whitespace or #
1633 when the extended flag is set. If we are in UTF-8 mode, it may be a
1634 multi-byte literal character. */
1635
1636 default:
1637 NORMAL_CHAR:
1638
1639 previous = code;
1640
1641 if (c < 128) {
9dae56ea 1642 mcLength = 1;
b37bf2e1
A
1643 mcbuffer[0] = c;
1644
1645 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1646 *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1647 *code++ = c | 0x20;
1648 } else {
1649 *code++ = OP_ASCII_CHAR;
1650 *code++ = c;
1651 }
1652 } else {
9dae56ea 1653 mcLength = encodeUTF8(c, mcbuffer);
b37bf2e1
A
1654
1655 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
9dae56ea 1656 for (c = 0; c < mcLength; c++)
b37bf2e1
A
1657 *code++ = mcbuffer[c];
1658 }
1659
1660 /* Set the first and required bytes appropriately. If no previous first
1661 byte, set it from this character, but revert to none on a zero repeat.
9dae56ea 1662 Otherwise, leave the firstByte value alone, and don't change it on a zero
b37bf2e1
A
1663 repeat. */
1664
9dae56ea
A
1665 if (firstByte == REQ_UNSET) {
1666 zeroFirstByte = REQ_NONE;
1667 zeroReqByte = reqByte;
b37bf2e1 1668
9dae56ea 1669 /* If the character is more than one byte long, we can set firstByte
b37bf2e1
A
1670 only if it is not to be matched caselessly. */
1671
9dae56ea
A
1672 if (mcLength == 1 || reqCaseOpt == 0) {
1673 firstByte = mcbuffer[0] | reqCaseOpt;
1674 if (mcLength != 1)
1675 reqByte = code[-1] | cd.reqVaryOpt;
b37bf2e1
A
1676 }
1677 else
9dae56ea 1678 firstByte = reqByte = REQ_NONE;
b37bf2e1
A
1679 }
1680
9dae56ea 1681 /* firstByte was previously set; we can set reqByte only the length is
b37bf2e1
A
1682 1 or the matching is caseful. */
1683
1684 else {
9dae56ea
A
1685 zeroFirstByte = firstByte;
1686 zeroReqByte = reqByte;
1687 if (mcLength == 1 || reqCaseOpt == 0)
1688 reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
b37bf2e1
A
1689 }
1690
1691 break; /* End of literal character handling */
1692 }
1693 } /* end of big loop */
1694
1695 /* Control never reaches here by falling through, only by a goto for all the
1696 error states. Pass back the position in the pattern so that it can be displayed
1697 to the user for diagnosing the error. */
1698
1699FAILED:
9dae56ea 1700 *ptrPtr = ptr;
b37bf2e1
A
1701 return false;
1702}
1703
1704/*************************************************
1705* Compile sequence of alternatives *
1706*************************************************/
1707
1708/* On entry, ptr is pointing past the bracket character, but on return
1709it points to the closing bracket, or vertical bar, or end of string.
1710The code variable is pointing at the byte into which the BRA operator has been
1711stored. If the ims options are changed at the start (for a (?ims: group) or
1712during any branch, we need to insert an OP_OPT item at the start of every
1713following branch to ensure they get set correctly at run time, and also pass
1714the new options into every subsequent branch compile.
1715
1716Argument:
1717 options option bits, including any changes for this subpattern
1718 brackets -> int containing the number of extracting brackets used
9dae56ea
A
1719 codePtr -> the address of the current code pointer
1720 ptrPtr -> the address of the current pattern pointer
1721 errorCodePtr -> pointer to error code variable
1722 skipBytes skip this many bytes at start (for OP_BRANUMBER)
b37bf2e1
A
1723 firstbyteptr place to put the first required character, or a negative number
1724 reqbyteptr place to put the last required character, or a negative number
1725 cd points to the data block with tables pointers etc.
1726
1727Returns: true on success
1728*/
1729
1730static bool
9dae56ea
A
1731compileBracket(int options, int* brackets, unsigned char** codePtr,
1732 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
b37bf2e1
A
1733 int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1734{
9dae56ea
A
1735 const UChar* ptr = *ptrPtr;
1736 unsigned char* code = *codePtr;
1737 unsigned char* lastBranch = code;
b37bf2e1 1738 unsigned char* start_bracket = code;
9dae56ea
A
1739 int firstByte = REQ_UNSET;
1740 int reqByte = REQ_UNSET;
b37bf2e1
A
1741
1742 /* Offset is set zero to mark that this bracket is still open */
1743
1744 putLinkValueAllowZero(code + 1, 0);
9dae56ea 1745 code += 1 + LINK_SIZE + skipBytes;
b37bf2e1
A
1746
1747 /* Loop for each alternative branch */
1748
1749 while (true) {
1750 /* Now compile the branch */
1751
9dae56ea
A
1752 int branchFirstByte;
1753 int branchReqByte;
1754 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr,
1755 &branchFirstByte, &branchReqByte, cd)) {
1756 *ptrPtr = ptr;
b37bf2e1
A
1757 return false;
1758 }
1759
9dae56ea 1760 /* If this is the first branch, the firstByte and reqByte values for the
b37bf2e1
A
1761 branch become the values for the regex. */
1762
9dae56ea
A
1763 if (*lastBranch != OP_ALT) {
1764 firstByte = branchFirstByte;
1765 reqByte = branchReqByte;
b37bf2e1
A
1766 }
1767
9dae56ea 1768 /* If this is not the first branch, the first char and reqByte have to
b37bf2e1 1769 match the values from all the previous branches, except that if the previous
9dae56ea 1770 value for reqByte didn't have REQ_VARY set, it can still match, and we set
b37bf2e1
A
1771 REQ_VARY for the regex. */
1772
1773 else {
9dae56ea
A
1774 /* If we previously had a firstByte, but it doesn't match the new branch,
1775 we have to abandon the firstByte for the regex, but if there was previously
1776 no reqByte, it takes on the value of the old firstByte. */
b37bf2e1 1777
9dae56ea
A
1778 if (firstByte >= 0 && firstByte != branchFirstByte) {
1779 if (reqByte < 0)
1780 reqByte = firstByte;
1781 firstByte = REQ_NONE;
b37bf2e1
A
1782 }
1783
9dae56ea
A
1784 /* If we (now or from before) have no firstByte, a firstByte from the
1785 branch becomes a reqByte if there isn't a branch reqByte. */
b37bf2e1 1786
9dae56ea
A
1787 if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
1788 branchReqByte = branchFirstByte;
b37bf2e1
A
1789
1790 /* Now ensure that the reqbytes match */
1791
9dae56ea
A
1792 if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
1793 reqByte = REQ_NONE;
b37bf2e1 1794 else
9dae56ea 1795 reqByte |= branchReqByte; /* To "or" REQ_VARY */
b37bf2e1
A
1796 }
1797
1798 /* Reached end of expression, either ')' or end of pattern. Go back through
1799 the alternative branches and reverse the chain of offsets, with the field in
1800 the BRA item now becoming an offset to the first alternative. If there are
1801 no alternatives, it points to the end of the group. The length in the
1802 terminating ket is always the length of the whole bracketed item. If any of
1803 the ims options were changed inside the group, compile a resetting op-code
1804 following, except at the very end of the pattern. Return leaving the pointer
1805 at the terminating char. */
1806
1807 if (ptr >= patternEnd || *ptr != '|') {
9dae56ea 1808 int length = code - lastBranch;
b37bf2e1 1809 do {
9dae56ea
A
1810 int prevLength = getLinkValueAllowZero(lastBranch + 1);
1811 putLinkValue(lastBranch + 1, length);
1812 length = prevLength;
1813 lastBranch -= length;
b37bf2e1
A
1814 } while (length > 0);
1815
1816 /* Fill in the ket */
1817
1818 *code = OP_KET;
1819 putLinkValue(code + 1, code - start_bracket);
1820 code += 1 + LINK_SIZE;
1821
1822 /* Set values to pass back */
1823
9dae56ea
A
1824 *codePtr = code;
1825 *ptrPtr = ptr;
1826 *firstbyteptr = firstByte;
1827 *reqbyteptr = reqByte;
b37bf2e1
A
1828 return true;
1829 }
1830
1831 /* Another branch follows; insert an "or" node. Its length field points back
1832 to the previous branch while the bracket remains open. At the end the chain
1833 is reversed. It's done like this so that the start of the bracket has a
1834 zero offset until it is closed, making it possible to detect recursion. */
1835
1836 *code = OP_ALT;
9dae56ea
A
1837 putLinkValue(code + 1, code - lastBranch);
1838 lastBranch = code;
b37bf2e1
A
1839 code += 1 + LINK_SIZE;
1840 ptr++;
1841 }
1842 ASSERT_NOT_REACHED();
1843}
1844
1845/*************************************************
1846* Check for anchored expression *
1847*************************************************/
1848
1849/* Try to find out if this is an anchored regular expression. Consider each
1850alternative branch. If they all start OP_CIRC, or with a bracket
1851all of whose alternatives start OP_CIRC (recurse ad lib), then
1852it's anchored.
1853
1854Arguments:
1855 code points to start of expression (the bracket)
1856 captureMap a bitmap of which brackets we are inside while testing; this
1857 handles up to substring 31; all brackets after that share
1858 the zero bit
1859 backrefMap the back reference bitmap
1860*/
1861
1862static bool branchIsAnchored(const unsigned char* code)
1863{
1864 const unsigned char* scode = firstSignificantOpcode(code);
1865 int op = *scode;
1866
1867 /* Brackets */
1868 if (op >= OP_BRA || op == OP_ASSERT)
1869 return bracketIsAnchored(scode);
1870
1871 /* Check for explicit anchoring */
1872 return op == OP_CIRC;
1873}
1874
1875static bool bracketIsAnchored(const unsigned char* code)
1876{
1877 do {
1878 if (!branchIsAnchored(code + 1 + LINK_SIZE))
1879 return false;
1880 code += getLinkValue(code + 1);
1881 } while (*code == OP_ALT); /* Loop for each alternative */
1882 return true;
1883}
1884
1885/*************************************************
1886* Check for starting with ^ or .* *
1887*************************************************/
1888
1889/* This is called to find out if every branch starts with ^ or .* so that
1890"first char" processing can be done to speed things up in multiline
1891matching and for non-DOTALL patterns that start with .* (which must start at
1892the beginning or after \n)
1893
1894Except when the .* appears inside capturing parentheses, and there is a
1895subsequent back reference to those parentheses. By keeping a bitmap of the
1896first 31 back references, we can catch some of the more common cases more
1897precisely; all the greater back references share a single bit.
1898
1899Arguments:
1900 code points to start of expression (the bracket)
1901 captureMap a bitmap of which brackets we are inside while testing; this
1902 handles up to substring 31; all brackets after that share
1903 the zero bit
1904 backrefMap the back reference bitmap
1905*/
1906
1907static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1908{
1909 const unsigned char* scode = firstSignificantOpcode(code);
1910 int op = *scode;
1911
1912 /* Capturing brackets */
1913 if (op > OP_BRA) {
1914 int captureNum = op - OP_BRA;
1915 if (captureNum > EXTRACT_BASIC_MAX)
1916 captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
1917 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1918 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
1919 }
1920
1921 /* Other brackets */
1922 if (op == OP_BRA || op == OP_ASSERT)
1923 return bracketNeedsLineStart(scode, captureMap, backrefMap);
1924
1925 /* .* means "start at start or after \n" if it isn't in brackets that
1926 may be referenced. */
1927
1928 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1929 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1930
1931 /* Explicit ^ */
9dae56ea 1932 return op == OP_CIRC || op == OP_BOL;
b37bf2e1
A
1933}
1934
1935static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1936{
1937 do {
1938 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
1939 return false;
1940 code += getLinkValue(code + 1);
1941 } while (*code == OP_ALT); /* Loop for each alternative */
1942 return true;
1943}
1944
1945/*************************************************
1946* Check for asserted fixed first char *
1947*************************************************/
1948
1949/* During compilation, the "first char" settings from forward assertions are
1950discarded, because they can cause conflicts with actual literals that follow.
1951However, if we end up without a first char setting for an unanchored pattern,
1952it is worth scanning the regex to see if there is an initial asserted first
1953char. If all branches start with the same asserted char, or with a bracket all
1954of whose alternatives start with the same asserted char (recurse ad lib), then
1955we return that char, otherwise -1.
1956
1957Arguments:
1958 code points to start of expression (the bracket)
1959 options pointer to the options (used to check casing changes)
1960 inassert true if in an assertion
1961
1962Returns: -1 or the fixed first char
1963*/
1964
1965static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1966{
1967 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1968 int op = *scode;
1969
1970 if (op >= OP_BRA)
1971 op = OP_BRA;
1972
1973 switch (op) {
1974 default:
1975 return -1;
1976
1977 case OP_BRA:
1978 case OP_ASSERT:
1979 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
1980
1981 case OP_EXACT:
1982 scode += 2;
1983 /* Fall through */
1984
1985 case OP_CHAR:
1986 case OP_CHAR_IGNORING_CASE:
1987 case OP_ASCII_CHAR:
1988 case OP_ASCII_LETTER_IGNORING_CASE:
1989 case OP_PLUS:
1990 case OP_MINPLUS:
1991 if (!inassert)
1992 return -1;
1993 return scode[1];
1994 }
1995}
1996
1997static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1998{
1999 int c = -1;
2000 do {
2001 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
2002 if (d < 0)
2003 return -1;
2004 if (c < 0)
2005 c = d;
2006 else if (c != d)
2007 return -1;
2008 code += getLinkValue(code + 1);
2009 } while (*code == OP_ALT);
2010 return c;
2011}
2012
2013static inline int multiplyWithOverflowCheck(int a, int b)
2014{
2015 if (!a || !b)
2016 return 0;
2017 if (a > MAX_PATTERN_SIZE / b)
2018 return -1;
2019 return a * b;
2020}
2021
2022static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
2023 CompileData& cd, ErrorCode& errorcode)
2024{
2025 /* Make a pass over the pattern to compute the
2026 amount of store required to hold the compiled code. This does not have to be
2027 perfect as long as errors are overestimates. */
2028
2029 if (patternLength > MAX_PATTERN_SIZE) {
2030 errorcode = ERR16;
2031 return -1;
2032 }
2033
2034 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
2035 int branch_extra = 0;
2036 int lastitemlength = 0;
2037 unsigned brastackptr = 0;
2038 int brastack[BRASTACK_SIZE];
2039 unsigned char bralenstack[BRASTACK_SIZE];
2040 int bracount = 0;
2041
2042 const UChar* ptr = (const UChar*)(pattern - 1);
2043 const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2044
2045 while (++ptr < patternEnd) {
2046 int minRepeats = 0, maxRepeats = 0;
2047 int c = *ptr;
2048
2049 switch (c) {
2050 /* A backslashed item may be an escaped data character or it may be a
2051 character type. */
2052
2053 case '\\':
2054 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
2055 if (errorcode != 0)
2056 return -1;
2057
2058 lastitemlength = 1; /* Default length of last item for repeats */
2059
2060 if (c >= 0) { /* Data character */
2061 length += 2; /* For a one-byte character */
2062
2063 if (c > 127) {
2064 int i;
9dae56ea
A
2065 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2066 if (c <= jsc_pcre_utf8_table1[i]) break;
b37bf2e1
A
2067 length += i;
2068 lastitemlength += i;
2069 }
2070
2071 continue;
2072 }
2073
2074 /* Other escapes need one byte */
2075
2076 length++;
2077
2078 /* A back reference needs an additional 2 bytes, plus either one or 5
2079 bytes for a repeat. We also need to keep the value of the highest
2080 back reference. */
2081
2082 if (c <= -ESC_REF) {
2083 int refnum = -c - ESC_REF;
2084 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
9dae56ea
A
2085 if (refnum > cd.topBackref)
2086 cd.topBackref = refnum;
b37bf2e1
A
2087 length += 2; /* For single back reference */
2088 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2089 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2090 if (errorcode)
2091 return -1;
2092 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2093 (minRepeats == 1 && maxRepeats == -1))
2094 length++;
2095 else
2096 length += 5;
2097 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2098 ptr++;
2099 }
2100 }
2101 continue;
2102
2103 case '^': /* Single-byte metacharacters */
2104 case '.':
2105 case '$':
2106 length++;
2107 lastitemlength = 1;
2108 continue;
2109
2110 case '*': /* These repeats won't be after brackets; */
2111 case '+': /* those are handled separately */
2112 case '?':
2113 length++;
2114 goto POSSESSIVE;
2115
2116 /* This covers the cases of braced repeats after a single char, metachar,
2117 class, or back reference. */
2118
2119 case '{':
2120 if (!isCountedRepeat(ptr + 1, patternEnd))
2121 goto NORMAL_CHAR;
2122 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
2123 if (errorcode != 0)
2124 return -1;
2125
2126 /* These special cases just insert one extra opcode */
2127
2128 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2129 (minRepeats == 1 && maxRepeats == -1))
2130 length++;
2131
2132 /* These cases might insert additional copies of a preceding character. */
2133
2134 else {
2135 if (minRepeats != 1) {
2136 length -= lastitemlength; /* Uncount the original char or metachar */
2137 if (minRepeats > 0)
2138 length += 3 + lastitemlength;
2139 }
2140 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
2141 }
2142
2143 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2144 ptr++; /* Needs no extra length */
2145
2146 POSSESSIVE: /* Test for possessive quantifier */
2147 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2148 ptr++;
2149 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */
2150 }
2151 continue;
2152
2153 /* An alternation contains an offset to the next branch or ket. If any ims
2154 options changed in the previous branch(es), and/or if we are in a
2155 lookbehind assertion, extra space will be needed at the start of the
2156 branch. This is handled by branch_extra. */
2157
2158 case '|':
2159 if (brastackptr == 0)
2160 cd.needOuterBracket = true;
2161 length += 1 + LINK_SIZE + branch_extra;
2162 continue;
2163
2164 /* A character class uses 33 characters provided that all the character
2165 values are less than 256. Otherwise, it uses a bit map for low valued
2166 characters, and individual items for others. Don't worry about character
2167 types that aren't allowed in classes - they'll get picked up during the
2168 compile. A character class that contains only one single-byte character
2169 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2170 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2171
2172 case '[': {
2173 int class_optcount;
2174 if (*(++ptr) == '^') {
2175 class_optcount = 10; /* Greater than one */
2176 ptr++;
2177 }
2178 else
2179 class_optcount = 0;
2180
2181 bool class_utf8 = false;
2182
2183 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2184 /* Check for escapes */
2185
2186 if (*ptr == '\\') {
2187 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2188 if (errorcode != 0)
2189 return -1;
2190
2191 /* Handle escapes that turn into characters */
2192
2193 if (c >= 0)
2194 goto NON_SPECIAL_CHARACTER;
2195
2196 /* Escapes that are meta-things. The normal ones just affect the
2197 bit map, but Unicode properties require an XCLASS extended item. */
2198
2199 else
2200 class_optcount = 10; /* \d, \s etc; make sure > 1 */
2201 }
2202
2203 /* Anything else increments the possible optimization count. We have to
2204 detect ranges here so that we can compute the number of extra ranges for
2205 caseless wide characters when UCP support is available. If there are wide
2206 characters, we are going to have to use an XCLASS, even for single
2207 characters. */
2208
2209 else {
2210 c = *ptr;
2211
2212 /* Come here from handling \ above when it escapes to a char value */
2213
2214 NON_SPECIAL_CHARACTER:
2215 class_optcount++;
2216
2217 int d = -1;
2218 if (safelyCheckNextChar(ptr, patternEnd, '-')) {
ba379fdc 2219 const UChar* hyptr = ptr++;
b37bf2e1
A
2220 if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
2221 ptr++;
2222 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2223 if (errorcode != 0)
2224 return -1;
2225 }
2226 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
2227 d = *++ptr;
2228 if (d < 0)
2229 ptr = hyptr; /* go back to hyphen as data */
2230 }
2231
2232 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2233 127 for caseless matching, we will need to use an XCLASS. */
2234
2235 if (d >= 0) {
2236 class_optcount = 10; /* Ensure > 1 */
2237 if (d < c) {
2238 errorcode = ERR8;
2239 return -1;
2240 }
2241
2242 if ((d > 255 || (ignoreCase && d > 127))) {
2243 unsigned char buffer[6];
2244 if (!class_utf8) /* Allow for XCLASS overhead */
2245 {
2246 class_utf8 = true;
2247 length += LINK_SIZE + 2;
2248 }
2249
2250 /* If we have UCP support, find out how many extra ranges are
2251 needed to map the other case of characters within this range. We
2252 have to mimic the range optimization here, because extending the
2253 range upwards might push d over a boundary that makes it use
2254 another byte in the UTF-8 representation. */
2255
2256 if (ignoreCase) {
2257 int occ, ocd;
2258 int cc = c;
2259 int origd = d;
2260 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
2261 if (occ >= c && ocd <= d)
2262 continue; /* Skip embedded */
2263
2264 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2265 { /* if there is overlap, */
2266 c = occ; /* noting that if occ < c */
2267 continue; /* we can't have ocd > d */
2268 } /* because a subrange is */
2269 if (ocd > d && occ <= d + 1) /* always shorter than */
2270 { /* the basic range. */
2271 d = ocd;
2272 continue;
2273 }
2274
2275 /* An extra item is needed */
2276
2277 length += 1 + encodeUTF8(occ, buffer) +
2278 ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
2279 }
2280 }
2281
2282 /* The length of the (possibly extended) range */
2283
2284 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
2285 }
2286
2287 }
2288
2289 /* We have a single character. There is nothing to be done unless we
2290 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2291 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2292 support. */
2293
2294 else {
2295 if ((c > 255 || (ignoreCase && c > 127))) {
2296 unsigned char buffer[6];
2297 class_optcount = 10; /* Ensure > 1 */
2298 if (!class_utf8) /* Allow for XCLASS overhead */
2299 {
2300 class_utf8 = true;
2301 length += LINK_SIZE + 2;
2302 }
2303 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
2304 }
2305 }
2306 }
2307 }
2308
2309 if (ptr >= patternEnd) { /* Missing terminating ']' */
2310 errorcode = ERR6;
2311 return -1;
2312 }
2313
2314 /* We can optimize when there was only one optimizable character.
2315 Note that this does not detect the case of a negated single character.
2316 In that case we do an incorrect length computation, but it's not a serious
2317 problem because the computed length is too large rather than too small. */
2318
2319 if (class_optcount == 1)
2320 goto NORMAL_CHAR;
2321
2322 /* Here, we handle repeats for the class opcodes. */
2323 {
2324 length += 33;
2325
2326 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2327 we also need extra for wrapping the whole thing in a sub-pattern. */
2328
2329 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2330 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2331 if (errorcode != 0)
2332 return -1;
2333 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2334 (minRepeats == 1 && maxRepeats == -1))
2335 length++;
2336 else
2337 length += 5;
2338 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2339 ptr++;
2340 length += 2 + 2 * LINK_SIZE;
2341 } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
2342 ptr++;
2343 }
2344 }
2345 continue;
2346 }
2347
2348 /* Brackets may be genuine groups or special things */
2349
2350 case '(': {
2351 int branch_newextra = 0;
2352 int bracket_length = 1 + LINK_SIZE;
2353 bool capturing = false;
2354
2355 /* Handle special forms of bracket, which all start (? */
2356
2357 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
2358 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2359 /* Non-referencing groups and lookaheads just move the pointer on, and
2360 then behave like a non-special bracket, except that they don't increment
2361 the count of extracting brackets. Ditto for the "once only" bracket,
2362 which is in Perl from version 5.005. */
2363
2364 case ':':
2365 case '=':
2366 case '!':
2367 ptr += 2;
2368 break;
2369
2370 /* Else loop checking valid options until ) is met. Anything else is an
2371 error. If we are without any brackets, i.e. at top level, the settings
2372 act as if specified in the options, so massage the options immediately.
2373 This is for backward compatibility with Perl 5.004. */
2374
2375 default:
2376 errorcode = ERR12;
2377 return -1;
2378 }
2379 } else
2380 capturing = 1;
2381
2382 /* Capturing brackets must be counted so we can process escapes in a
2383 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2384 an additional 3 bytes of memory per capturing bracket. */
2385
2386 if (capturing) {
2387 bracount++;
2388 if (bracount > EXTRACT_BASIC_MAX)
2389 bracket_length += 3;
2390 }
2391
2392 /* Save length for computing whole length at end if there's a repeat that
2393 requires duplication of the group. Also save the current value of
2394 branch_extra, and start the new group with the new value. If non-zero, this
2395 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2396
2397 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2398 errorcode = ERR17;
2399 return -1;
2400 }
2401
2402 bralenstack[brastackptr] = branch_extra;
2403 branch_extra = branch_newextra;
2404
2405 brastack[brastackptr++] = length;
2406 length += bracket_length;
2407 continue;
2408 }
2409
2410 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2411 have to replicate this bracket up to that many times. If brastackptr is
2412 0 this is an unmatched bracket which will generate an error, but take care
2413 not to try to access brastack[-1] when computing the length and restoring
2414 the branch_extra value. */
2415
2416 case ')': {
2417 int duplength;
2418 length += 1 + LINK_SIZE;
2419 if (brastackptr > 0) {
2420 duplength = length - brastack[--brastackptr];
2421 branch_extra = bralenstack[brastackptr];
2422 }
2423 else
2424 duplength = 0;
2425
2426 /* Leave ptr at the final char; for readRepeatCounts this happens
2427 automatically; for the others we need an increment. */
2428
2429 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
2430 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2431 if (errorcode)
2432 return -1;
2433 } else if (c == '*') {
2434 minRepeats = 0;
2435 maxRepeats = -1;
2436 ptr++;
2437 } else if (c == '+') {
2438 minRepeats = 1;
2439 maxRepeats = -1;
2440 ptr++;
2441 } else if (c == '?') {
2442 minRepeats = 0;
2443 maxRepeats = 1;
2444 ptr++;
2445 } else {
2446 minRepeats = 1;
2447 maxRepeats = 1;
2448 }
2449
2450 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2451 group, and if the maximum is greater than zero, we have to replicate
2452 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2453 bracket set. */
2454
2455 int repeatsLength;
2456 if (minRepeats == 0) {
2457 length++;
2458 if (maxRepeats > 0) {
2459 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE);
2460 if (repeatsLength < 0) {
2461 errorcode = ERR16;
2462 return -1;
2463 }
2464 length += repeatsLength;
2465 if (length > MAX_PATTERN_SIZE) {
2466 errorcode = ERR16;
2467 return -1;
2468 }
2469 }
2470 }
2471
2472 /* When the minimum is greater than zero, we have to replicate up to
2473 minval-1 times, with no additions required in the copies. Then, if there
2474 is a limited maximum we have to replicate up to maxval-1 times allowing
2475 for a BRAZERO item before each optional copy and nesting brackets for all
2476 but one of the optional copies. */
2477
2478 else {
2479 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
2480 if (repeatsLength < 0) {
2481 errorcode = ERR16;
2482 return -1;
2483 }
2484 length += repeatsLength;
2485 if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
2486 repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE);
2487 if (repeatsLength < 0) {
2488 errorcode = ERR16;
2489 return -1;
2490 }
2491 length += repeatsLength - (2 + 2 * LINK_SIZE);
2492 }
2493 if (length > MAX_PATTERN_SIZE) {
2494 errorcode = ERR16;
2495 return -1;
2496 }
2497 }
2498
2499 /* Allow space for once brackets for "possessive quantifier" */
2500
2501 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2502 ptr++;
2503 length += 2 + 2 * LINK_SIZE;
2504 }
2505 continue;
2506 }
2507
2508 /* Non-special character. It won't be space or # in extended mode, so it is
2509 always a genuine character. If we are in a \Q...\E sequence, check for the
2510 end; if not, we have a literal. */
2511
2512 default:
2513 NORMAL_CHAR:
2514 length += 2; /* For a one-byte character */
2515 lastitemlength = 1; /* Default length of last item for repeats */
2516
2517 if (c > 127) {
2518 int i;
9dae56ea
A
2519 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2520 if (c <= jsc_pcre_utf8_table1[i])
b37bf2e1
A
2521 break;
2522 length += i;
2523 lastitemlength += i;
2524 }
2525
2526 continue;
2527 }
2528 }
2529
2530 length += 2 + LINK_SIZE; /* For final KET and END */
2531
2532 cd.numCapturingBrackets = bracount;
2533 return length;
2534}
2535
2536/*************************************************
2537* Compile a Regular Expression *
2538*************************************************/
2539
2540/* This function takes a string and returns a pointer to a block of store
2541holding a compiled version of the expression. The original API for this
2542function had no error code return variable; it is retained for backwards
2543compatibility. The new function is given a new name.
2544
2545Arguments:
2546 pattern the regular expression
2547 options various option bits
9dae56ea 2548 errorCodePtr pointer to error code variable (pcre_compile2() only)
b37bf2e1 2549 can be NULL if you don't want a code value
9dae56ea 2550 errorPtr pointer to pointer to error text
b37bf2e1
A
2551 erroroffset ptr offset in pattern where error was detected
2552 tables pointer to character tables or NULL
2553
2554Returns: pointer to compiled data block, or NULL on error,
9dae56ea 2555 with errorPtr and erroroffset set
b37bf2e1
A
2556*/
2557
9dae56ea 2558static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr)
b37bf2e1 2559{
9dae56ea 2560 *errorPtr = errorText(errorcode);
b37bf2e1
A
2561 return 0;
2562}
2563
2564JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2565 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
9dae56ea 2566 unsigned* numSubpatterns, const char** errorPtr)
b37bf2e1 2567{
9dae56ea 2568 /* We can't pass back an error message if errorPtr is NULL; I guess the best we
b37bf2e1 2569 can do is just return NULL, but we can set a code value if there is a code pointer. */
9dae56ea 2570 if (!errorPtr)
b37bf2e1 2571 return 0;
9dae56ea 2572 *errorPtr = NULL;
b37bf2e1
A
2573
2574 CompileData cd;
2575
2576 ErrorCode errorcode = ERR0;
2577 /* Call this once just to count the brackets. */
2578 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2579 /* Call it again to compute the length. */
2580 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2581 if (errorcode)
9dae56ea 2582 return returnError(errorcode, errorPtr);
b37bf2e1
A
2583
2584 if (length > MAX_PATTERN_SIZE)
9dae56ea 2585 return returnError(ERR16, errorPtr);
b37bf2e1
A
2586
2587 size_t size = length + sizeof(JSRegExp);
9dae56ea
A
2588#if REGEXP_HISTOGRAM
2589 size_t stringOffset = (size + sizeof(UChar) - 1) / sizeof(UChar) * sizeof(UChar);
2590 size = stringOffset + patternLength * sizeof(UChar);
2591#endif
b37bf2e1
A
2592 JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
2593
2594 if (!re)
9dae56ea 2595 return returnError(ERR13, errorPtr);
b37bf2e1
A
2596
2597 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
2598
2599 /* The starting points of the name/number translation table and of the code are
2600 passed around in the compile data block. */
2601
2602 const unsigned char* codeStart = (const unsigned char*)(re + 1);
2603
2604 /* Set up a starting, non-extracting bracket, then compile the expression. On
2605 error, errorcode will be set non-zero, so we don't need to look at the result
2606 of the function here. */
2607
2608 const UChar* ptr = (const UChar*)pattern;
2609 const UChar* patternEnd = pattern + patternLength;
9dae56ea
A
2610 unsigned char* code = const_cast<unsigned char*>(codeStart);
2611 int firstByte, reqByte;
b37bf2e1
A
2612 int bracketCount = 0;
2613 if (!cd.needOuterBracket)
9dae56ea 2614 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd);
b37bf2e1
A
2615 else {
2616 *code = OP_BRA;
9dae56ea 2617 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstByte, &reqByte, cd);
b37bf2e1 2618 }
9dae56ea
A
2619 re->topBracket = bracketCount;
2620 re->topBackref = cd.topBackref;
b37bf2e1
A
2621
2622 /* If not reached end of pattern on success, there's an excess bracket. */
2623
2624 if (errorcode == 0 && ptr < patternEnd)
2625 errorcode = ERR10;
2626
2627 /* Fill in the terminating state and check for disastrous overflow, but
2628 if debugging, leave the test till after things are printed out. */
2629
2630 *code++ = OP_END;
2631
2632 ASSERT(code - codeStart <= length);
2633 if (code - codeStart > length)
2634 errorcode = ERR7;
2635
2636 /* Give an error if there's back reference to a non-existent capturing
2637 subpattern. */
2638
9dae56ea 2639 if (re->topBackref > re->topBracket)
b37bf2e1
A
2640 errorcode = ERR15;
2641
2642 /* Failed to compile, or error while post-processing */
2643
2644 if (errorcode != ERR0) {
2645 delete [] reinterpret_cast<char*>(re);
9dae56ea 2646 return returnError(errorcode, errorPtr);
b37bf2e1
A
2647 }
2648
2649 /* If the anchored option was not passed, set the flag if we can determine that
2650 the pattern is anchored by virtue of ^ characters or \A or anything else (such
2651 as starting with .* when DOTALL is set).
2652
2653 Otherwise, if we know what the first character has to be, save it, because that
2654 speeds up unanchored matches no end. If not, see if we can set the
2655 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2656 start with ^. and also when all branches start with .* for non-DOTALL matches.
2657 */
2658
2659 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
2660 re->options |= IsAnchoredOption;
2661 else {
9dae56ea
A
2662 if (firstByte < 0) {
2663 firstByte = (cd.needOuterBracket
b37bf2e1
A
2664 ? bracketFindFirstAssertedCharacter(codeStart, false)
2665 : branchFindFirstAssertedCharacter(codeStart, false))
2666 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2667 }
9dae56ea
A
2668 if (firstByte >= 0) {
2669 int ch = firstByte & 255;
b37bf2e1 2670 if (ch < 127) {
9dae56ea 2671 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte;
b37bf2e1
A
2672 re->options |= UseFirstByteOptimizationOption;
2673 }
2674 } else {
2675 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
2676 re->options |= UseMultiLineFirstByteOptimizationOption;
2677 }
2678 }
2679
2680 /* For an anchored pattern, we use the "required byte" only if it follows a
2681 variable length item in the regex. Remove the caseless flag for non-caseable
2682 bytes. */
2683
9dae56ea
A
2684 if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
2685 int ch = reqByte & 255;
b37bf2e1 2686 if (ch < 127) {
9dae56ea 2687 re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
b37bf2e1
A
2688 re->options |= UseRequiredByteOptimizationOption;
2689 }
2690 }
2691
9dae56ea
A
2692#if REGEXP_HISTOGRAM
2693 re->stringOffset = stringOffset;
2694 re->stringLength = patternLength;
2695 memcpy(reinterpret_cast<char*>(re) + stringOffset, pattern, patternLength * 2);
2696#endif
2697
b37bf2e1 2698 if (numSubpatterns)
9dae56ea 2699 *numSubpatterns = re->topBracket;
b37bf2e1
A
2700 return re;
2701}
2702
2703void jsRegExpFree(JSRegExp* re)
2704{
2705 delete [] reinterpret_cast<char*>(re);
2706}