]> git.saurik.com Git - apple/libc.git/blob - regex/FreeBSD/regex.3
Libc-1439.100.3.tar.gz
[apple/libc.git] / regex / FreeBSD / regex.3
1 .\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
2 .\" Copyright (c) 1992, 1993, 1994
3 .\" The Regents of the University of California. All rights reserved.
4 .\"
5 .\" This code is derived from software contributed to Berkeley by
6 .\" Henry Spencer.
7 .\"
8 .\" Redistribution and use in source and binary forms, with or without
9 .\" modification, are permitted provided that the following conditions
10 .\" are met:
11 .\" 1. Redistributions of source code must retain the above copyright
12 .\" notice, this list of conditions and the following disclaimer.
13 .\" 2. Redistributions in binary form must reproduce the above copyright
14 .\" notice, this list of conditions and the following disclaimer in the
15 .\" documentation and/or other materials provided with the distribution.
16 .\" 4. Neither the name of the University nor the names of its contributors
17 .\" may be used to endorse or promote products derived from this software
18 .\" without specific prior written permission.
19 .\"
20 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 .\" SUCH DAMAGE.
31 .\"
32 .\" @(#)regex.3 8.4 (Berkeley) 3/20/94
33 .\" $FreeBSD: src/lib/libc/regex/regex.3,v 1.21 2007/01/09 00:28:04 imp Exp $
34 .\"
35 .Dd September 29, 2011
36 .Dt REGEX 3
37 .Os
38 .Sh NAME
39 .Nm regcomp ,
40 .Nm regcomp_l ,
41 .Nm regerror ,
42 .Nm regexec ,
43 .Nm regfree ,
44 .Nm regncomp ,
45 .Nm regncomp_l ,
46 .Nm regnexec ,
47 .Nm regwncomp ,
48 .Nm regwncomp_l ,
49 .Nm regwnexec ,
50 .Nm regwcomp ,
51 .Nm regwcomp_l ,
52 .Nm regwexec
53 .Nd regular-expression library
54 .Sh SYNOPSIS
55 .Sy (Standards-compliant APIs)
56 .Pp
57 .In regex.h
58 .Ft int
59 .Fo regcomp
60 .Fa "regex_t *restrict preg"
61 .Fa "const char *restrict pattern"
62 .Fa "int cflags"
63 .Fc
64 .Ft size_t
65 .Fo regerror
66 .Fa "int errcode"
67 .Fa "const regex_t *restrict preg"
68 .Fa "char *restrict errbuf"
69 .Fa "size_t errbuf_size"
70 .Fc
71 .Ft int
72 .Fo regexec
73 .Fa "const regex_t *restrict preg"
74 .Fa "const char *restrict string"
75 .Fa "size_t nmatch"
76 .Fa "regmatch_t pmatch[restrict]"
77 .Fa "int eflags"
78 .Fc
79 .Ft void
80 .Fo regfree
81 .Fa "regex_t *preg"
82 .Fc
83 .Pp
84 .Sy (Non-portable extensions)
85 .Ft int
86 .Fo regncomp
87 .Fa "regex_t *restrict preg"
88 .Fa "const char *restrict pattern"
89 .Fa "size_t len"
90 .Fa "int cflags"
91 .Fc
92 .Ft int
93 .Fo regnexec
94 .Fa "const regex_t *restrict preg"
95 .Fa "const char *restrict string"
96 .Fa "size_t len"
97 .Fa "size_t nmatch"
98 .Fa "regmatch_t pmatch[restrict]"
99 .Fa "int eflags"
100 .Fc
101 .Ft int
102 .Fo regwcomp
103 .Fa "regex_t *restrict preg"
104 .Fa "const wchar_t *restrict widepat"
105 .Fa "int cflags"
106 .Fc
107 .Ft int
108 .Fo regwexec
109 .Fa "const regex_t *restrict preg"
110 .Fa "const wchar_t *restrict widestr"
111 .Fa "size_t nmatch"
112 .Fa "regmatch_t pmatch[restrict]"
113 .Fa "int eflags"
114 .Fc
115 .Ft int
116 .Fo regwncomp
117 .Fa "regex_t *restrict preg"
118 .Fa "const wchar_t *restrict widepat"
119 .Fa "size_t len"
120 .Fa "int cflags"
121 .Fc
122 .Ft int
123 .Fo regwnexec
124 .Fa "const regex_t *restrict preg"
125 .Fa "const wchar_t *restrict widestr"
126 .Fa "size_t len"
127 .Fa "size_t nmatch"
128 .Fa "regmatch_t pmatch[restrict]"
129 .Fa "int eflags"
130 .Fc
131 .In regex.h
132 .In xlocale.h
133 .Ft int
134 .Fo regcomp_l
135 .Fa "regex_t *restrict preg"
136 .Fa "const char *restrict pattern"
137 .Fa "int cflags"
138 .Fa "locale_t restrict loc"
139 .Fc
140 .Ft int
141 .Fo regncomp_l
142 .Fa "regex_t *restrict preg"
143 .Fa "const char *restrict pattern"
144 .Fa "size_t len"
145 .Fa "int cflags"
146 .Fa "locale_t restrict loc"
147 .Fc
148 .Ft int
149 .Fo regwcomp_l
150 .Fa "regex_t *restrict preg"
151 .Fa "const wchar_t *restrict widepat"
152 .Fa "int cflags"
153 .Fa "locale_t restrict loc"
154 .Fc
155 .Ft int
156 .Fo regwncomp_l
157 .Fa "regex_t *restrict preg"
158 .Fa "const wchar_t *restrict widepat"
159 .Fa "size_t len"
160 .Fa "int cflags"
161 .Fa "locale_t restrict loc"
162 .Fc
163 .Sh DESCRIPTION
164 These routines implement
165 .St -p1003.2
166 regular expressions
167 .Pq Do RE Dc Ns s ;
168 see
169 .Xr re_format 7 .
170 The
171 .Fn regcomp
172 function
173 compiles an RE, written as a string, into an internal form.
174 .Fn regexec
175 matches that internal form against a string and reports results.
176 .Fn regerror
177 transforms error codes from either into human-readable messages.
178 .Fn regfree
179 frees any dynamically-allocated storage used by the internal form
180 of an RE.
181 .Pp
182 The header
183 .In regex.h
184 declares two structure types,
185 .Ft regex_t
186 and
187 .Ft regmatch_t ,
188 the former for compiled internal forms and the latter for match reporting.
189 It also declares the four functions,
190 a type
191 .Ft regoff_t ,
192 and a number of constants with names starting with
193 .Dq Dv REG_ .
194 .Pp
195 The
196 .Fn regcomp
197 function
198 compiles the regular expression contained in the
199 .Fa pattern
200 string,
201 subject to the flags in
202 .Fa cflags ,
203 and places the results in the
204 .Ft regex_t
205 structure pointed to by
206 .Fa preg .
207 The
208 .Fa cflags
209 argument
210 is the bitwise OR of zero or more of the following flags:
211 .Bl -tag -width REG_EXTENDED
212 .It Dv REG_EXTENDED
213 Compile modern
214 .Pq Dq extended
215 REs,
216 rather than the obsolete
217 .Pq Dq basic
218 REs that
219 are the default.
220 .It Dv REG_BASIC
221 This is a synonym for 0,
222 provided as a counterpart to
223 .Dv REG_EXTENDED
224 to improve readability.
225 .It Dv REG_NOSPEC
226 Compile with recognition of all special characters turned off.
227 All characters are thus considered ordinary,
228 so the
229 .Dq RE
230 is a literal string.
231 This is an extension,
232 compatible with but not specified by
233 .St -p1003.2 ,
234 and should be used with
235 caution in software intended to be portable to other systems.
236 .Dv REG_EXTENDED
237 and
238 .Dv REG_NOSPEC
239 may not be used
240 in the same call to
241 .Fn regcomp .
242 .It Dv REG_LITERAL
243 An alias of
244 .Dv REG_NOSPEC .
245 .It Dv REG_ICASE
246 Compile for matching that ignores upper/lower case distinctions.
247 See
248 .Xr re_format 7 .
249 .It Dv REG_NOSUB
250 Compile for matching that need only report success or failure,
251 not what was matched.
252 .It Dv REG_NEWLINE
253 Compile for newline-sensitive matching.
254 By default, newline is a completely ordinary character with no special
255 meaning in either REs or strings.
256 With this flag,
257 .Ql [^
258 bracket expressions and
259 .Ql .\&
260 never match newline,
261 a
262 .Ql ^\&
263 anchor matches the null string after any newline in the string
264 in addition to its normal function,
265 and the
266 .Ql $\&
267 anchor matches the null string before any newline in the
268 string in addition to its normal function.
269 .It Dv REG_PEND
270 (Note that
271 .Dv REG_PEND
272 is not recognized by any of the wide character or
273 .Dq Nm n
274 variants.
275 Besides, the
276 .Dq Nm n
277 variants can be used instead of
278 .Dv REG_PEND ;
279 see EXTENDED APIS below.)
280 The regular expression ends,
281 not at the first NUL,
282 but just before the character pointed to by the
283 .Va re_endp
284 member of the structure pointed to by
285 .Fa preg .
286 The
287 .Va re_endp
288 member is of type
289 .Ft "const char *" .
290 This flag permits inclusion of NULs in the RE;
291 they are considered ordinary characters.
292 This is an extension,
293 compatible with but not specified by
294 .St -p1003.2 ,
295 and should be used with
296 caution in software intended to be portable to other systems.
297 .It Dv REG_ENHANCED
298 Recognize enhanced regular expression features; see
299 .Xr re_format 7
300 for details.
301 This is an extension not specified by
302 .St -p1003.2 ,
303 and should be used with
304 caution in software intended to be portable to other systems.
305 .It Dv REG_MINIMAL
306 Use minimal (non-greedy) repetitions instead of the normal greedy ones; see
307 .Xr re_format 7
308 for details.
309 (This only applies when both
310 .Dv REG_ENHANCED
311 and
312 .Dv REG_EXTENDED
313 are also set.)
314 This is an extension not specified by
315 .St -p1003.2 ,
316 and should be used with
317 caution in software intended to be portable to other systems.
318 .It Dv REG_UNGREEDY
319 Alias of
320 .Dv REG_MINIMAL .
321 .El
322 .Pp
323 When successful,
324 .Fn regcomp
325 returns 0 and fills in the structure pointed to by
326 .Fa preg .
327 One member of that structure
328 (other than
329 .Va re_endp )
330 is publicized:
331 .Va re_nsub ,
332 of type
333 .Ft size_t ,
334 contains the number of parenthesized subexpressions within the RE
335 (except that the value of this member is undefined if the
336 .Dv REG_NOSUB
337 flag was used).
338 If
339 .Fn regcomp
340 fails, it returns a non-zero error code;
341 see
342 .Sx DIAGNOSTICS .
343 .Pp
344 The
345 .Fn regexec
346 function
347 matches the compiled RE pointed to by
348 .Fa preg
349 against the
350 .Fa string ,
351 subject to the flags in
352 .Fa eflags ,
353 and reports results using
354 .Fa nmatch ,
355 .Fa pmatch ,
356 and the returned value.
357 The RE must have been compiled by a previous invocation of
358 .Fn regcomp .
359 The compiled form is not altered during execution of
360 .Fn regexec ,
361 so a single compiled RE can be used simultaneously by multiple threads.
362 .Pp
363 By default,
364 the NUL-terminated string pointed to by
365 .Fa string
366 is considered to be the text of an entire line, minus any terminating
367 newline.
368 The
369 .Fa eflags
370 argument is the bitwise OR of zero or more of the following flags:
371 .Bl -tag -width REG_STARTEND
372 .It Dv REG_NOTBOL
373 The first character of
374 the string
375 is not the beginning of a line, so the
376 .Ql ^\&
377 anchor should not match before it.
378 This does not affect the behavior of newlines under
379 .Dv REG_NEWLINE .
380 .It Dv REG_NOTEOL
381 The NUL terminating
382 the string
383 does not end a line, so the
384 .Ql $\&
385 anchor should not match before it.
386 This does not affect the behavior of newlines under
387 .Dv REG_NEWLINE .
388 .It Dv REG_STARTEND
389 The string is considered to start at
390 .Fa string
391 +
392 .Fa pmatch Ns [0]. Ns Va rm_so
393 and to have a terminating NUL located at
394 .Fa string
395 +
396 .Fa pmatch Ns [0]. Ns Va rm_eo
397 (there need not actually be a NUL at that location),
398 regardless of the value of
399 .Fa nmatch .
400 See below for the definition of
401 .Fa pmatch
402 and
403 .Fa nmatch .
404 This is an extension,
405 compatible with but not specified by
406 .St -p1003.2 ,
407 and should be used with
408 caution in software intended to be portable to other systems.
409 Note that a non-zero
410 .Va rm_so
411 does not imply
412 .Dv REG_NOTBOL ;
413 .Dv REG_STARTEND
414 affects only the location of the string,
415 not how it is matched.
416 .El
417 .Pp
418 See
419 .Xr re_format 7
420 for a discussion of what is matched in situations where an RE or a
421 portion thereof could match any of several substrings of
422 .Fa string .
423 .Pp
424 Normally,
425 .Fn regexec
426 returns 0 for success and the non-zero code
427 .Dv REG_NOMATCH
428 for failure.
429 Other non-zero error codes may be returned in exceptional situations;
430 see
431 .Sx DIAGNOSTICS .
432 .Pp
433 If
434 .Dv REG_NOSUB
435 was specified in the compilation of the RE,
436 or if
437 .Fa nmatch
438 is 0,
439 .Fn regexec
440 ignores the
441 .Fa pmatch
442 argument (but see below for the case where
443 .Dv REG_STARTEND
444 is specified).
445 Otherwise,
446 .Fa pmatch
447 points to an array of
448 .Fa nmatch
449 structures of type
450 .Ft regmatch_t .
451 Such a structure has at least the members
452 .Va rm_so
453 and
454 .Va rm_eo ,
455 both of type
456 .Ft regoff_t
457 (a signed arithmetic type at least as large as an
458 .Ft off_t
459 and a
460 .Ft ssize_t ) ,
461 containing respectively the offset of the first character of a substring
462 and the offset of the first character after the end of the substring.
463 Offsets are measured from the beginning of the
464 .Fa string
465 argument given to
466 .Fn regexec .
467 An empty substring is denoted by equal offsets,
468 both indicating the character following the empty substring.
469 .Pp
470 The 0th member of the
471 .Fa pmatch
472 array is filled in to indicate what substring of
473 .Fa string
474 was matched by the entire RE.
475 Remaining members report what substring was matched by parenthesized
476 subexpressions within the RE;
477 member
478 .Va i
479 reports subexpression
480 .Va i ,
481 with subexpressions counted (starting at 1) by the order of their opening
482 parentheses in the RE, left to right.
483 Unused entries in the array (corresponding either to subexpressions that
484 did not participate in the match at all, or to subexpressions that do not
485 exist in the RE (that is,
486 .Va i
487 >
488 .Fa preg Ns -> Ns Va re_nsub ) )
489 have both
490 .Va rm_so
491 and
492 .Va rm_eo
493 set to -1.
494 If a subexpression participated in the match several times,
495 the reported substring is the last one it matched.
496 (Note, as an example in particular, that when the RE
497 .Ql "(b*)+"
498 matches
499 .Ql bbb ,
500 the parenthesized subexpression matches each of the three
501 .So Li b Sc Ns s
502 and then
503 an infinite number of empty strings following the last
504 .Ql b ,
505 so the reported substring is one of the empties.)
506 .Pp
507 If
508 .Dv REG_STARTEND
509 is specified,
510 .Fa pmatch
511 must point to at least one
512 .Ft regmatch_t
513 (even if
514 .Fa nmatch
515 is 0 or
516 .Dv REG_NOSUB
517 was specified),
518 to hold the input offsets for
519 .Dv REG_STARTEND .
520 Use for output is still entirely controlled by
521 .Fa nmatch ;
522 if
523 .Fa nmatch
524 is 0 or
525 .Dv REG_NOSUB
526 was specified,
527 the value of
528 .Fa pmatch Ns [0]
529 will not be changed by a successful
530 .Fn regexec .
531 .Pp
532 The
533 .Fn regerror
534 function
535 maps a non-zero
536 .Fa errcode
537 from either
538 .Fn regcomp
539 or
540 .Fn regexec
541 to a human-readable, printable message.
542 If
543 .Fa preg
544 is
545 .No non\- Ns Dv NULL ,
546 the error code should have arisen from use of
547 the
548 .Ft regex_t
549 pointed to by
550 .Fa preg ,
551 and if the error code came from
552 .Fn regcomp ,
553 it should have been the result from the most recent
554 .Fn regcomp
555 using that
556 .Ft regex_t .
558 .Fn ( regerror
559 may be able to supply a more detailed message using information
560 from the
561 .Ft regex_t . )
562 The
563 .Fn regerror
564 function
565 places the NUL-terminated message into the buffer pointed to by
566 .Fa errbuf ,
567 limiting the length (including the NUL) to at most
568 .Fa errbuf_size
569 bytes.
570 If the whole message will not fit,
571 as much of it as will fit before the terminating NUL is supplied.
572 In any case,
573 the returned value is the size of buffer needed to hold the whole
574 message (including terminating NUL).
575 If
576 .Fa errbuf_size
577 is 0,
578 .Fa errbuf
579 is ignored but the return value is still correct.
580 .Pp
581 If the
582 .Fa errcode
583 given to
584 .Fn regerror
585 is first ORed with
586 .Dv REG_ITOA ,
587 the
588 .Dq message
589 that results is the printable name of the error code,
590 e.g.\&
591 .Dq Dv REG_NOMATCH ,
592 rather than an explanation thereof.
593 If
594 .Fa errcode
595 is
596 .Dv REG_ATOI ,
597 then
598 .Fa preg
599 shall be
600 .No non\- Ns Dv NULL
601 and the
602 .Va re_endp
603 member of the structure it points to
604 must point to the printable name of an error code;
605 in this case, the result in
606 .Fa errbuf
607 is the decimal digits of
608 the numeric value of the error code
609 (0 if the name is not recognized).
610 .Dv REG_ITOA
611 and
612 .Dv REG_ATOI
613 are intended primarily as debugging facilities;
614 they are extensions,
615 compatible with but not specified by
616 .St -p1003.2 ,
617 and should be used with
618 caution in software intended to be portable to other systems.
619 Be warned also that they are considered experimental and changes are possible.
620 .Pp
621 The
622 .Fn regfree
623 function
624 frees any dynamically-allocated storage associated with the compiled RE
625 pointed to by
626 .Fa preg .
627 The remaining
628 .Ft regex_t
629 is no longer a valid compiled RE
630 and the effect of supplying it to
631 .Fn regexec
632 or
633 .Fn regerror
634 is undefined.
635 .Pp
636 None of these functions references global variables except for tables
637 of constants;
638 all are safe for use from multiple threads if the arguments are safe.
639 .Sh EXTENDED APIS
640 These extended APIs are available in Mac OS X 10.8 and beyond, when the
641 deployment target is 10.8 or later.
642 It should also be noted that any of the
643 .Fn regcomp
644 variants may be used to initialize a
645 .Ft regex_t
646 structure, that can then be passed to any of the
647 .Fn regexec
648 variants.
649 So it is quite legal to compile a wide character RE and use it to match a
650 multibyte character string, or vice versa.
651 .Pp
652 The
653 .Fn regncomp
654 routine compiles regular expressions like
655 .Fn regcomp ,
656 but the length of the regular expression string is specified, allowing a string
657 that is not NUL terminated and/or contains NUL characters.
658 This is a modern replacement for using
659 .Fn regcomp
660 with the
661 .Dv REG_PEND
662 option.
663 .Pp
664 Similarly, the
665 .Fn regnexec
666 routine is like
667 .Fn regexec ,
668 but the length of the string to match is specified, allowing a string
669 that is not NUL terminated and/or contains NUL characters.
670 .Pp
671 The
672 .Fn regwcomp
673 and
674 .Fn regwexec
675 variants take a wide-character
676 .Vt ( wchar_t )
677 string for the regular expression and string to match.
678 And
679 .Fn regwncomp
680 and
681 .Fn regwnexec
682 are variants that allow specifying the wide character string length, and
683 so allow wide character strings that are not NUL terminated and/or
684 contain NUL characters.
685 .Sh INTERACTION WITH THE LOCALE
686 When
687 .Fn regcomp
688 or one of its variants is run, the regular expression is compiled into an
689 internal form, which may include specific information about the locale currently
690 in effect, such as equivalence classes or multi-character collation symbols.
691 So a reference to the current locale is also stored with the internal form,
692 so that when
693 .Fn regexec
694 is run, it can use the same locale (even if the locale is changed in-between
695 the calls to
696 .Fn regcomp
697 and
698 .Fn regexec ) .
699 .Pp
700 To provide more direct control over which locale is used,
701 routines with
702 .Dq Nm _l
703 appended to their names are provided that work just like the variants
704 without the
705 .Dq Nm _l ,
706 except that a locale (via a
707 .Vt locale_t
708 variable type) is specified directly.
709 Note that only variants of
710 .Fn regcomp
711 have
712 .Dq Nm _l
713 variants, since the
714 .Fn regexec
715 variants just use the reference to the locale stored in the internal form.
716 .Sh IMPLEMENTATION CHOICES
717 The
718 .Nm regex
719 implementation in Mac OS X 10.8 and later is based on a heavily modified subset
720 of TRE (http://laurikari.net/tre/).
721 This provides improved performance, better conformance and additional features.
722 However, both API and binary compatibility have been maintained with previous
723 releases, so binaries
724 built on previous releases should work on 10.8 and later, and binaries built on
725 10.8 and later should be able to run on previous releases (as long as none of
726 the new variants or new features are used).
727 .Pp
728 There are a number of decisions that
729 .St -p1003.2
730 leaves up to the implementor,
731 either by explicitly saying
732 .Dq undefined
733 or by virtue of them being
734 forbidden by the RE grammar.
735 This implementation treats them as follows.
736 .Pp
737 See
738 .Xr re_format 7
739 for a discussion of the definition of case-independent matching.
740 .Pp
741 There is no particular limit on the length of REs,
742 except insofar as memory is limited.
743 Memory usage is approximately linear in RE size, and largely insensitive
744 to RE complexity, except for bounded repetitions.
745 See
746 .Sx BUGS
747 for one short RE using them
748 that will run almost any system out of memory.
749 .Pp
750 A backslashed character other than one specifically given a magic meaning
751 by
752 .St -p1003.2
753 (such magic meanings occur only in obsolete
754 .Bq Dq basic
755 REs)
756 is taken as an ordinary character.
757 .Pp
758 Any unmatched
759 .Ql [\&
760 is a
761 .Dv REG_EBRACK
762 error.
763 .Pp
764 Equivalence classes cannot begin or end bracket-expression ranges.
765 The endpoint of one range cannot begin another.
766 .Pp
767 .Dv RE_DUP_MAX ,
768 the limit on repetition counts in bounded repetitions, is 255.
769 .Pp
770 A repetition operator
771 .Ql ( ?\& ,
772 .Ql *\& ,
773 .Ql +\& ,
774 or bounds)
775 cannot follow another
776 repetition operator, except for the use of
777 .Ql ?\&
778 for minimal repetition (for enhanced extended REs; see
779 .Xr re_format 7
780 for details).
781 A repetition operator cannot begin an expression or subexpression
782 or follow
783 .Ql ^\&
784 or
785 .Ql |\& .
786 .Pp
787 .Ql |\&
788 cannot appear first or last in a (sub)expression or after another
789 .Ql |\& ,
790 i.e., an operand of
791 .Ql |\&
792 cannot be an empty subexpression.
793 An empty parenthesized subexpression,
794 .Ql "()" ,
795 is legal and matches an
796 empty (sub)string.
797 An empty string is not a legal RE.
798 .Pp
799 A
800 .Ql {\&
801 followed by a digit is considered the beginning of bounds for a
802 bounded repetition, which must then follow the syntax for bounds.
803 A
804 .Ql {\&
805 .Em not
806 followed by a digit is considered an ordinary character.
807 .Pp
808 .Ql ^\&
809 and
810 .Ql $\&
811 beginning and ending subexpressions in obsolete
812 .Pq Dq basic
813 REs are anchors, not ordinary characters.
814 .Sh DIAGNOSTICS
815 Non-zero error codes from
816 .Fn regcomp
817 and
818 .Fn regexec
819 include the following:
820 .Pp
821 .Bl -tag -width REG_ECOLLATE -compact
822 .It Dv REG_NOMATCH
823 The
824 .Fn regexec
825 function
826 failed to match
827 .It Dv REG_BADPAT
828 invalid regular expression
829 .It Dv REG_ECOLLATE
830 invalid collating element
831 .It Dv REG_ECTYPE
832 invalid character class
833 .It Dv REG_EESCAPE
834 .Ql \e
835 applied to unescapable character
836 .It Dv REG_ESUBREG
837 invalid backreference number
838 .It Dv REG_EBRACK
839 brackets
840 .Ql "[ ]"
841 not balanced
842 .It Dv REG_EPAREN
843 parentheses
844 .Ql "( )"
845 not balanced
846 .It Dv REG_EBRACE
847 braces
848 .Ql "{ }"
849 not balanced
850 .It Dv REG_BADBR
851 invalid repetition count(s) in
852 .Ql "{ }"
853 .It Dv REG_ERANGE
854 invalid character range in
855 .Ql "[ ]"
856 .It Dv REG_ESPACE
857 ran out of memory
858 .It Dv REG_BADRPT
859 .Ql ?\& ,
860 .Ql *\& ,
861 or
862 .Ql +\&
863 operand invalid
864 .It Dv REG_EMPTY
865 empty (sub)expression
866 .It Dv REG_ASSERT
867 cannot happen - you found a bug
868 .It Dv REG_INVARG
869 invalid argument, e.g.\& negative-length string
870 .It Dv REG_ILLSEQ
871 illegal byte sequence (bad multibyte character)
872 .El
873 .Sh SEE ALSO
874 .Xr grep 1 ,
875 .Xr re_format 7
876 .Pp
877 .St -p1003.2 ,
878 sections 2.8 (Regular Expression Notation)
879 and
880 B.5 (C Binding for Regular Expression Matching).
881 .Sh HISTORY
882 The
883 .Nm regex
884 implementation is based on a heavily modified subset of TRE
885 (http://laurikari.net/tre/), originally written by Ville Laurikari.
886 Previous releases used an implementation originally written by
887 .An Henry Spencer ,
888 and altered for inclusion in the
889 .Bx 4.4
890 distribution.
891 .Sh BUGS
892 The beginning-of-line and end-of-line anchors (
893 .Dq ^\&
894 and
895 .Dq $\& )
896 are currently implemented so that repetitions can not be applied to them.
897 The standards are unclear about whether this is legal, but other
898 .Nm regex
899 packages do support this case.
900 It is best to avoid this non-portable (and not really very useful) case.
901 .Pp
902 The back-reference code is subtle and doubts linger about its correctness
903 in complex cases.
904 .Pp
905 The
906 .Fn regexec
907 variants use one of two internal matching engines.
908 The normal one is linear worst-case time in the length of the text being
909 searched, and quadratic worst-case time in the length of the used regular
910 expression.
911 When back-references are used, a slower, backtracking engine is used.
912 While all backtracking matching engines suffer from extreme slowness for certain
913 pathological cases, the normal engine doesn't suffer from these cases.
914 It is advised to avoid back-references whenever possible.
915 .Pp
916 The
917 .Fn regcomp
918 variants
919 implement bounded repetitions by macro expansion,
920 which is costly in time and space if counts are large
921 or bounded repetitions are nested.
922 An RE like, say,
923 .Ql "((((a{1,100}){1,100}){1,100}){1,100}){1,100}"
924 will (eventually) run almost any existing machine out of swap space.
925 .Pp
926 Due to a mistake in
927 .St -p1003.2 ,
928 things like
929 .Ql "a)b"
930 are legal REs because
931 .Ql )\&
932 is
933 a special character only in the presence of a previous unmatched
934 .Ql (\& .
935 This cannot be fixed until the spec is fixed.
936 .Pp
937 The standard's definition of back references is vague.
938 For example, does
939 .Ql "a\e(\e(b\e)*\e2\e)*d"
940 match
941 .Ql "abbbd" ?
942 Until the standard is clarified,
943 behavior in such cases should not be relied on.