]>
Commit | Line | Data |
---|---|---|
5b2abdfb A |
1 | .\" Copyright (c) 1992, 1993, 1994 Henry Spencer. |
2 | .\" Copyright (c) 1992, 1993, 1994 | |
3 | .\" The Regents of the University of California. All rights reserved. | |
4 | .\" | |
5 | .\" This code is derived from software contributed to Berkeley by | |
6 | .\" Henry Spencer. | |
7 | .\" | |
8 | .\" Redistribution and use in source and binary forms, with or without | |
9 | .\" modification, are permitted provided that the following conditions | |
10 | .\" are met: | |
11 | .\" 1. Redistributions of source code must retain the above copyright | |
12 | .\" notice, this list of conditions and the following disclaimer. | |
13 | .\" 2. Redistributions in binary form must reproduce the above copyright | |
14 | .\" notice, this list of conditions and the following disclaimer in the | |
15 | .\" documentation and/or other materials provided with the distribution. | |
5b2abdfb A |
16 | .\" 4. Neither the name of the University nor the names of its contributors |
17 | .\" may be used to endorse or promote products derived from this software | |
18 | .\" without specific prior written permission. | |
19 | .\" | |
20 | .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
21 | .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
22 | .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
23 | .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
24 | .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
25 | .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
26 | .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
27 | .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
28 | .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
29 | .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
30 | .\" SUCH DAMAGE. | |
31 | .\" | |
32 | .\" @(#)regex.3 8.4 (Berkeley) 3/20/94 | |
1f2f436a | 33 | .\" $FreeBSD: src/lib/libc/regex/regex.3,v 1.21 2007/01/09 00:28:04 imp Exp $ |
5b2abdfb | 34 | .\" |
ad3c9f2a | 35 | .Dd September 29, 2011 |
5b2abdfb A |
36 | .Dt REGEX 3 |
37 | .Os | |
38 | .Sh NAME | |
39 | .Nm regcomp , | |
ad3c9f2a | 40 | .Nm regcomp_l , |
5b2abdfb | 41 | .Nm regerror , |
ad3c9f2a A |
42 | .Nm regexec , |
43 | .Nm regfree , | |
44 | .Nm regncomp , | |
45 | .Nm regncomp_l , | |
46 | .Nm regnexec , | |
47 | .Nm regwncomp , | |
48 | .Nm regwncomp_l , | |
49 | .Nm regwnexec , | |
50 | .Nm regwcomp , | |
51 | .Nm regwcomp_l , | |
52 | .Nm regwexec | |
5b2abdfb | 53 | .Nd regular-expression library |
5b2abdfb | 54 | .Sh SYNOPSIS |
ad3c9f2a A |
55 | .Sy (Standards-compliant APIs) |
56 | .Pp | |
5b2abdfb A |
57 | .In regex.h |
58 | .Ft int | |
9385eb3d | 59 | .Fo regcomp |
ad3c9f2a A |
60 | .Fa "regex_t *restrict preg" |
61 | .Fa "const char *restrict pattern" | |
62 | .Fa "int cflags" | |
5b2abdfb A |
63 | .Fc |
64 | .Ft size_t | |
65 | .Fo regerror | |
ad3c9f2a A |
66 | .Fa "int errcode" |
67 | .Fa "const regex_t *restrict preg" | |
68 | .Fa "char *restrict errbuf" | |
69 | .Fa "size_t errbuf_size" | |
70 | .Fc | |
71 | .Ft int | |
72 | .Fo regexec | |
73 | .Fa "const regex_t *restrict preg" | |
74 | .Fa "const char *restrict string" | |
75 | .Fa "size_t nmatch" | |
76 | .Fa "regmatch_t pmatch[restrict]" | |
77 | .Fa "int eflags" | |
5b2abdfb A |
78 | .Fc |
79 | .Ft void | |
ad3c9f2a A |
80 | .Fo regfree |
81 | .Fa "regex_t *preg" | |
82 | .Fc | |
83 | .Pp | |
84 | .Sy (Non-portable extensions) | |
85 | .Ft int | |
86 | .Fo regncomp | |
87 | .Fa "regex_t *restrict preg" | |
88 | .Fa "const char *restrict pattern" | |
89 | .Fa "size_t len" | |
90 | .Fa "int cflags" | |
91 | .Fc | |
92 | .Ft int | |
93 | .Fo regnexec | |
94 | .Fa "const regex_t *restrict preg" | |
95 | .Fa "const char *restrict string" | |
96 | .Fa "size_t len" | |
97 | .Fa "size_t nmatch" | |
98 | .Fa "regmatch_t pmatch[restrict]" | |
99 | .Fa "int eflags" | |
100 | .Fc | |
101 | .Ft int | |
102 | .Fo regwcomp | |
103 | .Fa "regex_t *restrict preg" | |
104 | .Fa "const wchar_t *restrict widepat" | |
105 | .Fa "int cflags" | |
106 | .Fc | |
107 | .Ft int | |
108 | .Fo regwexec | |
109 | .Fa "const regex_t *restrict preg" | |
110 | .Fa "const wchar_t *restrict widestr" | |
111 | .Fa "size_t nmatch" | |
112 | .Fa "regmatch_t pmatch[restrict]" | |
113 | .Fa "int eflags" | |
114 | .Fc | |
115 | .Ft int | |
116 | .Fo regwncomp | |
117 | .Fa "regex_t *restrict preg" | |
118 | .Fa "const wchar_t *restrict widepat" | |
119 | .Fa "size_t len" | |
120 | .Fa "int cflags" | |
121 | .Fc | |
122 | .Ft int | |
123 | .Fo regwnexec | |
124 | .Fa "const regex_t *restrict preg" | |
125 | .Fa "const wchar_t *restrict widestr" | |
126 | .Fa "size_t len" | |
127 | .Fa "size_t nmatch" | |
128 | .Fa "regmatch_t pmatch[restrict]" | |
129 | .Fa "int eflags" | |
130 | .Fc | |
131 | .In regex.h | |
132 | .In xlocale.h | |
133 | .Ft int | |
134 | .Fo regcomp_l | |
135 | .Fa "regex_t *restrict preg" | |
136 | .Fa "const char *restrict pattern" | |
137 | .Fa "int cflags" | |
138 | .Fa "locale_t restrict loc" | |
139 | .Fc | |
140 | .Ft int | |
141 | .Fo regncomp_l | |
142 | .Fa "regex_t *restrict preg" | |
143 | .Fa "const char *restrict pattern" | |
144 | .Fa "size_t len" | |
145 | .Fa "int cflags" | |
146 | .Fa "locale_t restrict loc" | |
147 | .Fc | |
148 | .Ft int | |
149 | .Fo regwcomp_l | |
150 | .Fa "regex_t *restrict preg" | |
151 | .Fa "const wchar_t *restrict widepat" | |
152 | .Fa "int cflags" | |
153 | .Fa "locale_t restrict loc" | |
154 | .Fc | |
155 | .Ft int | |
156 | .Fo regwncomp_l | |
157 | .Fa "regex_t *restrict preg" | |
158 | .Fa "const wchar_t *restrict widepat" | |
159 | .Fa "size_t len" | |
160 | .Fa "int cflags" | |
161 | .Fa "locale_t restrict loc" | |
162 | .Fc | |
5b2abdfb A |
163 | .Sh DESCRIPTION |
164 | These routines implement | |
165 | .St -p1003.2 | |
166 | regular expressions | |
167 | .Pq Do RE Dc Ns s ; | |
168 | see | |
169 | .Xr re_format 7 . | |
9385eb3d A |
170 | The |
171 | .Fn regcomp | |
172 | function | |
ad3c9f2a | 173 | compiles an RE, written as a string, into an internal form. |
5b2abdfb | 174 | .Fn regexec |
ad3c9f2a | 175 | matches that internal form against a string and reports results. |
5b2abdfb | 176 | .Fn regerror |
ad3c9f2a | 177 | transforms error codes from either into human-readable messages. |
5b2abdfb A |
178 | .Fn regfree |
179 | frees any dynamically-allocated storage used by the internal form | |
180 | of an RE. | |
181 | .Pp | |
182 | The header | |
3d9156a7 | 183 | .In regex.h |
5b2abdfb A |
184 | declares two structure types, |
185 | .Ft regex_t | |
186 | and | |
187 | .Ft regmatch_t , | |
188 | the former for compiled internal forms and the latter for match reporting. | |
189 | It also declares the four functions, | |
190 | a type | |
191 | .Ft regoff_t , | |
192 | and a number of constants with names starting with | |
193 | .Dq Dv REG_ . | |
194 | .Pp | |
9385eb3d A |
195 | The |
196 | .Fn regcomp | |
197 | function | |
5b2abdfb A |
198 | compiles the regular expression contained in the |
199 | .Fa pattern | |
200 | string, | |
201 | subject to the flags in | |
202 | .Fa cflags , | |
203 | and places the results in the | |
204 | .Ft regex_t | |
205 | structure pointed to by | |
206 | .Fa preg . | |
9385eb3d A |
207 | The |
208 | .Fa cflags | |
209 | argument | |
5b2abdfb A |
210 | is the bitwise OR of zero or more of the following flags: |
211 | .Bl -tag -width REG_EXTENDED | |
212 | .It Dv REG_EXTENDED | |
213 | Compile modern | |
214 | .Pq Dq extended | |
215 | REs, | |
216 | rather than the obsolete | |
217 | .Pq Dq basic | |
218 | REs that | |
219 | are the default. | |
220 | .It Dv REG_BASIC | |
221 | This is a synonym for 0, | |
222 | provided as a counterpart to | |
223 | .Dv REG_EXTENDED | |
224 | to improve readability. | |
225 | .It Dv REG_NOSPEC | |
226 | Compile with recognition of all special characters turned off. | |
227 | All characters are thus considered ordinary, | |
228 | so the | |
229 | .Dq RE | |
230 | is a literal string. | |
231 | This is an extension, | |
232 | compatible with but not specified by | |
233 | .St -p1003.2 , | |
234 | and should be used with | |
235 | caution in software intended to be portable to other systems. | |
236 | .Dv REG_EXTENDED | |
237 | and | |
238 | .Dv REG_NOSPEC | |
239 | may not be used | |
240 | in the same call to | |
241 | .Fn regcomp . | |
ad3c9f2a A |
242 | .It Dv REG_LITERAL |
243 | An alias of | |
244 | .Dv REG_NOSPEC . | |
5b2abdfb A |
245 | .It Dv REG_ICASE |
246 | Compile for matching that ignores upper/lower case distinctions. | |
247 | See | |
248 | .Xr re_format 7 . | |
249 | .It Dv REG_NOSUB | |
250 | Compile for matching that need only report success or failure, | |
251 | not what was matched. | |
252 | .It Dv REG_NEWLINE | |
253 | Compile for newline-sensitive matching. | |
254 | By default, newline is a completely ordinary character with no special | |
255 | meaning in either REs or strings. | |
256 | With this flag, | |
257 | .Ql [^ | |
258 | bracket expressions and | |
259 | .Ql .\& | |
260 | never match newline, | |
261 | a | |
262 | .Ql ^\& | |
263 | anchor matches the null string after any newline in the string | |
264 | in addition to its normal function, | |
265 | and the | |
266 | .Ql $\& | |
267 | anchor matches the null string before any newline in the | |
268 | string in addition to its normal function. | |
269 | .It Dv REG_PEND | |
ad3c9f2a A |
270 | (Note that |
271 | .Dv REG_PEND | |
272 | is not recognized by any of the wide character or | |
273 | .Dq Nm n | |
274 | variants. | |
275 | Besides, the | |
276 | .Dq Nm n | |
277 | variants can be used instead of | |
278 | .Dv REG_PEND ; | |
279 | see EXTENDED APIS below.) | |
5b2abdfb A |
280 | The regular expression ends, |
281 | not at the first NUL, | |
282 | but just before the character pointed to by the | |
283 | .Va re_endp | |
284 | member of the structure pointed to by | |
285 | .Fa preg . | |
286 | The | |
287 | .Va re_endp | |
288 | member is of type | |
289 | .Ft "const char *" . | |
290 | This flag permits inclusion of NULs in the RE; | |
291 | they are considered ordinary characters. | |
292 | This is an extension, | |
293 | compatible with but not specified by | |
294 | .St -p1003.2 , | |
295 | and should be used with | |
296 | caution in software intended to be portable to other systems. | |
ad3c9f2a A |
297 | .It Dv REG_ENHANCED |
298 | Recognize enhanced regular expression features; see | |
299 | .Xr re_format 7 | |
300 | for details. | |
301 | This is an extension not specified by | |
302 | .St -p1003.2 , | |
303 | and should be used with | |
304 | caution in software intended to be portable to other systems. | |
305 | .It Dv REG_MINIMAL | |
306 | Use minimal (non-greedy) repetitions instead of the normal greedy ones; see | |
307 | .Xr re_format 7 | |
308 | for details. | |
309 | (This only applies when both | |
310 | .Dv REG_ENHANCED | |
311 | and | |
312 | .Dv REG_EXTENDED | |
313 | are also set.) | |
314 | This is an extension not specified by | |
315 | .St -p1003.2 , | |
316 | and should be used with | |
317 | caution in software intended to be portable to other systems. | |
318 | .It Dv REG_UNGREEDY | |
319 | Alias of | |
320 | .Dv REG_MINIMAL . | |
5b2abdfb A |
321 | .El |
322 | .Pp | |
323 | When successful, | |
324 | .Fn regcomp | |
325 | returns 0 and fills in the structure pointed to by | |
326 | .Fa preg . | |
327 | One member of that structure | |
328 | (other than | |
329 | .Va re_endp ) | |
330 | is publicized: | |
331 | .Va re_nsub , | |
332 | of type | |
333 | .Ft size_t , | |
334 | contains the number of parenthesized subexpressions within the RE | |
335 | (except that the value of this member is undefined if the | |
336 | .Dv REG_NOSUB | |
337 | flag was used). | |
338 | If | |
339 | .Fn regcomp | |
340 | fails, it returns a non-zero error code; | |
341 | see | |
342 | .Sx DIAGNOSTICS . | |
343 | .Pp | |
9385eb3d A |
344 | The |
345 | .Fn regexec | |
346 | function | |
5b2abdfb A |
347 | matches the compiled RE pointed to by |
348 | .Fa preg | |
349 | against the | |
350 | .Fa string , | |
351 | subject to the flags in | |
352 | .Fa eflags , | |
353 | and reports results using | |
354 | .Fa nmatch , | |
355 | .Fa pmatch , | |
356 | and the returned value. | |
357 | The RE must have been compiled by a previous invocation of | |
358 | .Fn regcomp . | |
359 | The compiled form is not altered during execution of | |
360 | .Fn regexec , | |
361 | so a single compiled RE can be used simultaneously by multiple threads. | |
362 | .Pp | |
363 | By default, | |
364 | the NUL-terminated string pointed to by | |
365 | .Fa string | |
366 | is considered to be the text of an entire line, minus any terminating | |
367 | newline. | |
368 | The | |
369 | .Fa eflags | |
370 | argument is the bitwise OR of zero or more of the following flags: | |
371 | .Bl -tag -width REG_STARTEND | |
372 | .It Dv REG_NOTBOL | |
373 | The first character of | |
374 | the string | |
375 | is not the beginning of a line, so the | |
376 | .Ql ^\& | |
377 | anchor should not match before it. | |
378 | This does not affect the behavior of newlines under | |
379 | .Dv REG_NEWLINE . | |
380 | .It Dv REG_NOTEOL | |
381 | The NUL terminating | |
382 | the string | |
383 | does not end a line, so the | |
384 | .Ql $\& | |
385 | anchor should not match before it. | |
386 | This does not affect the behavior of newlines under | |
387 | .Dv REG_NEWLINE . | |
388 | .It Dv REG_STARTEND | |
389 | The string is considered to start at | |
390 | .Fa string | |
391 | + | |
392 | .Fa pmatch Ns [0]. Ns Va rm_so | |
393 | and to have a terminating NUL located at | |
394 | .Fa string | |
395 | + | |
396 | .Fa pmatch Ns [0]. Ns Va rm_eo | |
397 | (there need not actually be a NUL at that location), | |
398 | regardless of the value of | |
399 | .Fa nmatch . | |
400 | See below for the definition of | |
401 | .Fa pmatch | |
402 | and | |
403 | .Fa nmatch . | |
404 | This is an extension, | |
405 | compatible with but not specified by | |
406 | .St -p1003.2 , | |
407 | and should be used with | |
408 | caution in software intended to be portable to other systems. | |
409 | Note that a non-zero | |
410 | .Va rm_so | |
411 | does not imply | |
412 | .Dv REG_NOTBOL ; | |
413 | .Dv REG_STARTEND | |
414 | affects only the location of the string, | |
415 | not how it is matched. | |
416 | .El | |
417 | .Pp | |
418 | See | |
419 | .Xr re_format 7 | |
420 | for a discussion of what is matched in situations where an RE or a | |
421 | portion thereof could match any of several substrings of | |
422 | .Fa string . | |
423 | .Pp | |
424 | Normally, | |
425 | .Fn regexec | |
426 | returns 0 for success and the non-zero code | |
427 | .Dv REG_NOMATCH | |
428 | for failure. | |
429 | Other non-zero error codes may be returned in exceptional situations; | |
430 | see | |
431 | .Sx DIAGNOSTICS . | |
432 | .Pp | |
433 | If | |
434 | .Dv REG_NOSUB | |
435 | was specified in the compilation of the RE, | |
436 | or if | |
437 | .Fa nmatch | |
438 | is 0, | |
439 | .Fn regexec | |
440 | ignores the | |
441 | .Fa pmatch | |
442 | argument (but see below for the case where | |
443 | .Dv REG_STARTEND | |
444 | is specified). | |
445 | Otherwise, | |
446 | .Fa pmatch | |
447 | points to an array of | |
448 | .Fa nmatch | |
449 | structures of type | |
450 | .Ft regmatch_t . | |
451 | Such a structure has at least the members | |
452 | .Va rm_so | |
453 | and | |
454 | .Va rm_eo , | |
455 | both of type | |
456 | .Ft regoff_t | |
457 | (a signed arithmetic type at least as large as an | |
458 | .Ft off_t | |
459 | and a | |
460 | .Ft ssize_t ) , | |
461 | containing respectively the offset of the first character of a substring | |
462 | and the offset of the first character after the end of the substring. | |
463 | Offsets are measured from the beginning of the | |
464 | .Fa string | |
465 | argument given to | |
466 | .Fn regexec . | |
467 | An empty substring is denoted by equal offsets, | |
468 | both indicating the character following the empty substring. | |
469 | .Pp | |
470 | The 0th member of the | |
471 | .Fa pmatch | |
472 | array is filled in to indicate what substring of | |
473 | .Fa string | |
474 | was matched by the entire RE. | |
475 | Remaining members report what substring was matched by parenthesized | |
476 | subexpressions within the RE; | |
477 | member | |
478 | .Va i | |
479 | reports subexpression | |
480 | .Va i , | |
481 | with subexpressions counted (starting at 1) by the order of their opening | |
482 | parentheses in the RE, left to right. | |
483 | Unused entries in the array (corresponding either to subexpressions that | |
484 | did not participate in the match at all, or to subexpressions that do not | |
485 | exist in the RE (that is, | |
486 | .Va i | |
487 | > | |
488 | .Fa preg Ns -> Ns Va re_nsub ) ) | |
489 | have both | |
490 | .Va rm_so | |
491 | and | |
492 | .Va rm_eo | |
493 | set to -1. | |
494 | If a subexpression participated in the match several times, | |
495 | the reported substring is the last one it matched. | |
496 | (Note, as an example in particular, that when the RE | |
497 | .Ql "(b*)+" | |
498 | matches | |
499 | .Ql bbb , | |
500 | the parenthesized subexpression matches each of the three | |
501 | .So Li b Sc Ns s | |
502 | and then | |
503 | an infinite number of empty strings following the last | |
504 | .Ql b , | |
505 | so the reported substring is one of the empties.) | |
506 | .Pp | |
507 | If | |
508 | .Dv REG_STARTEND | |
509 | is specified, | |
510 | .Fa pmatch | |
511 | must point to at least one | |
512 | .Ft regmatch_t | |
513 | (even if | |
514 | .Fa nmatch | |
515 | is 0 or | |
516 | .Dv REG_NOSUB | |
517 | was specified), | |
518 | to hold the input offsets for | |
519 | .Dv REG_STARTEND . | |
520 | Use for output is still entirely controlled by | |
521 | .Fa nmatch ; | |
522 | if | |
523 | .Fa nmatch | |
524 | is 0 or | |
525 | .Dv REG_NOSUB | |
526 | was specified, | |
527 | the value of | |
528 | .Fa pmatch Ns [0] | |
529 | will not be changed by a successful | |
530 | .Fn regexec . | |
531 | .Pp | |
9385eb3d A |
532 | The |
533 | .Fn regerror | |
534 | function | |
5b2abdfb A |
535 | maps a non-zero |
536 | .Fa errcode | |
537 | from either | |
538 | .Fn regcomp | |
539 | or | |
540 | .Fn regexec | |
541 | to a human-readable, printable message. | |
542 | If | |
543 | .Fa preg | |
544 | is | |
545 | .No non\- Ns Dv NULL , | |
546 | the error code should have arisen from use of | |
547 | the | |
548 | .Ft regex_t | |
549 | pointed to by | |
550 | .Fa preg , | |
551 | and if the error code came from | |
552 | .Fn regcomp , | |
553 | it should have been the result from the most recent | |
554 | .Fn regcomp | |
555 | using that | |
556 | .Ft regex_t . | |
9385eb3d A |
557 | (The |
558 | .Fn regerror | |
5b2abdfb A |
559 | may be able to supply a more detailed message using information |
560 | from the | |
561 | .Ft regex_t . ) | |
9385eb3d A |
562 | The |
563 | .Fn regerror | |
564 | function | |
5b2abdfb A |
565 | places the NUL-terminated message into the buffer pointed to by |
566 | .Fa errbuf , | |
567 | limiting the length (including the NUL) to at most | |
568 | .Fa errbuf_size | |
569 | bytes. | |
1f2f436a | 570 | If the whole message will not fit, |
5b2abdfb A |
571 | as much of it as will fit before the terminating NUL is supplied. |
572 | In any case, | |
573 | the returned value is the size of buffer needed to hold the whole | |
574 | message (including terminating NUL). | |
575 | If | |
576 | .Fa errbuf_size | |
577 | is 0, | |
578 | .Fa errbuf | |
579 | is ignored but the return value is still correct. | |
580 | .Pp | |
581 | If the | |
582 | .Fa errcode | |
583 | given to | |
584 | .Fn regerror | |
585 | is first ORed with | |
586 | .Dv REG_ITOA , | |
587 | the | |
588 | .Dq message | |
589 | that results is the printable name of the error code, | |
590 | e.g.\& | |
591 | .Dq Dv REG_NOMATCH , | |
592 | rather than an explanation thereof. | |
593 | If | |
594 | .Fa errcode | |
595 | is | |
596 | .Dv REG_ATOI , | |
597 | then | |
598 | .Fa preg | |
599 | shall be | |
600 | .No non\- Ns Dv NULL | |
601 | and the | |
602 | .Va re_endp | |
603 | member of the structure it points to | |
604 | must point to the printable name of an error code; | |
605 | in this case, the result in | |
606 | .Fa errbuf | |
607 | is the decimal digits of | |
608 | the numeric value of the error code | |
609 | (0 if the name is not recognized). | |
610 | .Dv REG_ITOA | |
611 | and | |
612 | .Dv REG_ATOI | |
613 | are intended primarily as debugging facilities; | |
614 | they are extensions, | |
615 | compatible with but not specified by | |
616 | .St -p1003.2 , | |
617 | and should be used with | |
618 | caution in software intended to be portable to other systems. | |
619 | Be warned also that they are considered experimental and changes are possible. | |
620 | .Pp | |
9385eb3d A |
621 | The |
622 | .Fn regfree | |
623 | function | |
5b2abdfb A |
624 | frees any dynamically-allocated storage associated with the compiled RE |
625 | pointed to by | |
626 | .Fa preg . | |
627 | The remaining | |
628 | .Ft regex_t | |
629 | is no longer a valid compiled RE | |
630 | and the effect of supplying it to | |
631 | .Fn regexec | |
632 | or | |
633 | .Fn regerror | |
634 | is undefined. | |
635 | .Pp | |
636 | None of these functions references global variables except for tables | |
637 | of constants; | |
638 | all are safe for use from multiple threads if the arguments are safe. | |
ad3c9f2a A |
639 | .Sh EXTENDED APIS |
640 | These extended APIs are available in Mac OS X 10.8 and beyond, when the | |
641 | deployment target is 10.8 or later. | |
642 | It should also be noted that any of the | |
643 | .Fn regcomp | |
644 | variants may be used to initialize a | |
645 | .Ft regex_t | |
646 | structure, which can then be passed to any of the | |
647 | .Fn regexec | |
648 | variants. | |
649 | So it is quite legal to compile a wide character RE and use it to match a | |
650 | multibyte character string, or vice versa. | |
651 | .Pp | |
652 | The | |
653 | .Fn regncomp | |
654 | routine compiles regular expressions like | |
655 | .Fn regcomp , | |
656 | but the length of the regular expression string is specified, allowing a string | |
657 | that is not NUL terminated and/or contains NUL characters. | |
658 | This is a modern replacement for using | |
659 | .Fn regcomp | |
660 | with the | |
661 | .Dv REG_PEND | |
662 | option. | |
663 | .Pp | |
664 | Similarly, the | |
665 | .Fn regnexec | |
666 | routine is like | |
667 | .Fn regexec , | |
668 | but the length of the string to match is specified, allowing a string | |
669 | that is not NUL terminated and/or contains NUL characters. | |
670 | .Pp | |
671 | The | |
672 | .Fn regwcomp | |
673 | and | |
674 | .Fn regwexec | |
675 | variants take a wide-character | |
676 | .Vt ( wchar_t ) | |
677 | string for the regular expression and string to match. | |
678 | And | |
679 | .Fn regwncomp | |
680 | and | |
681 | .Fn regwnexec | |
682 | are variants that allow specifying the wide character string length, and | |
683 | so allow wide character strings that are not NUL terminated and/or | |
684 | contain NUL characters. | |
685 | .Sh INTERACTION WITH THE LOCALE | |
686 | When | |
687 | .Fn regcomp | |
688 | or one of its variants is run, the regular expression is compiled into an | |
689 | internal form, which may include specific information about the locale currently | |
690 | in effect, such as equivalence classes or multi-character collation symbols. | |
691 | So a reference to the current locale is also stored with the internal form, | |
692 | so that when | |
693 | .Fn regexec | |
694 | is run, it can use the same locale (even if the locale is changed in-between | |
695 | the calls to | |
696 | .Fn regcomp | |
697 | and | |
698 | .Fn regexec ) . | |
699 | .Pp | |
700 | To provide more direct control over which locale is used, | |
701 | routines with | |
702 | .Dq Nm _l | |
703 | appended to their names are provided that work just like the variants | |
704 | without the | |
705 | .Dq Nm _l , | |
706 | except that a locale (via a | |
707 | .Vt locale_t | |
708 | variable type) is specified directly. | |
709 | Note that only variants of | |
710 | .Fn regcomp | |
711 | have | |
712 | .Dq Nm _l | |
713 | variants, since the | |
714 | .Fn regexec | |
715 | variants just use the reference to the locale stored in the internal form. | |
5b2abdfb | 716 | .Sh IMPLEMENTATION CHOICES |
ad3c9f2a A |
717 | The |
718 | .Nm regex | |
719 | implementation in Mac OS X 10.8 and later is based on a heavily modified subset | |
720 | of TRE (http://laurikari.net/tre/). | |
721 | This provides improved performance, better conformance and additional features. | |
722 | However, both API and binary compatibility have been maintained with previous | |
723 | releases, so binaries | |
724 | built on previous releases should work on 10.8 and later, and binaries built on | |
725 | 10.8 and later should be able to run on previous releases (as long as none of | |
726 | the new variants or new features are used). | |
727 | .Pp | |
5b2abdfb A |
728 | There are a number of decisions that |
729 | .St -p1003.2 | |
730 | leaves up to the implementor, | |
731 | either by explicitly saying | |
732 | .Dq undefined | |
733 | or by virtue of them being | |
734 | forbidden by the RE grammar. | |
735 | This implementation treats them as follows. | |
736 | .Pp | |
737 | See | |
738 | .Xr re_format 7 | |
739 | for a discussion of the definition of case-independent matching. | |
740 | .Pp | |
741 | There is no particular limit on the length of REs, | |
742 | except insofar as memory is limited. | |
743 | Memory usage is approximately linear in RE size, and largely insensitive | |
744 | to RE complexity, except for bounded repetitions. | |
745 | See | |
746 | .Sx BUGS | |
747 | for one short RE using them | |
748 | that will run almost any system out of memory. | |
749 | .Pp | |
750 | A backslashed character other than one specifically given a magic meaning | |
751 | by | |
752 | .St -p1003.2 | |
753 | (such magic meanings occur only in obsolete | |
754 | .Bq Dq basic | |
755 | REs) | |
756 | is taken as an ordinary character. | |
757 | .Pp | |
758 | Any unmatched | |
759 | .Ql [\& | |
760 | is a | |
761 | .Dv REG_EBRACK | |
762 | error. | |
763 | .Pp | |
764 | Equivalence classes cannot begin or end bracket-expression ranges. | |
765 | The endpoint of one range cannot begin another. | |
766 | .Pp | |
767 | .Dv RE_DUP_MAX , | |
768 | the limit on repetition counts in bounded repetitions, is 255. | |
769 | .Pp | |
770 | A repetition operator | |
771 | .Ql ( ?\& , | |
772 | .Ql *\& , | |
773 | .Ql +\& , | |
774 | or bounds) | |
775 | cannot follow another | |
ad3c9f2a A |
776 | repetition operator, except for the use of |
777 | .Ql ?\& | |
778 | for minimal repetition (for enhanced extended REs; see | |
779 | .Xr re_format 7 | |
780 | for details). | |
5b2abdfb A |
781 | A repetition operator cannot begin an expression or subexpression |
782 | or follow | |
783 | .Ql ^\& | |
784 | or | |
785 | .Ql |\& . | |
786 | .Pp | |
787 | .Ql |\& | |
788 | cannot appear first or last in a (sub)expression or after another | |
789 | .Ql |\& , | |
3d9156a7 | 790 | i.e., an operand of |
5b2abdfb A |
791 | .Ql |\& |
792 | cannot be an empty subexpression. | |
793 | An empty parenthesized subexpression, | |
794 | .Ql "()" , | |
795 | is legal and matches an | |
796 | empty (sub)string. | |
797 | An empty string is not a legal RE. | |
798 | .Pp | |
799 | A | |
800 | .Ql {\& | |
801 | followed by a digit is considered the beginning of bounds for a | |
802 | bounded repetition, which must then follow the syntax for bounds. | |
803 | A | |
804 | .Ql {\& | |
805 | .Em not | |
806 | followed by a digit is considered an ordinary character. | |
807 | .Pp | |
808 | .Ql ^\& | |
809 | and | |
810 | .Ql $\& | |
811 | beginning and ending subexpressions in obsolete | |
812 | .Pq Dq basic | |
813 | REs are anchors, not ordinary characters. | |
5b2abdfb A |
814 | .Sh DIAGNOSTICS |
815 | Non-zero error codes from | |
816 | .Fn regcomp | |
817 | and | |
818 | .Fn regexec | |
819 | include the following: | |
820 | .Pp | |
821 | .Bl -tag -width REG_ECOLLATE -compact | |
822 | .It Dv REG_NOMATCH | |
9385eb3d | 823 | The |
5b2abdfb | 824 | .Fn regexec |
9385eb3d | 825 | function |
5b2abdfb A |
826 | failed to match |
827 | .It Dv REG_BADPAT | |
828 | invalid regular expression | |
829 | .It Dv REG_ECOLLATE | |
830 | invalid collating element | |
831 | .It Dv REG_ECTYPE | |
832 | invalid character class | |
833 | .It Dv REG_EESCAPE | |
834 | .Ql \e | |
835 | applied to unescapable character | |
836 | .It Dv REG_ESUBREG | |
837 | invalid backreference number | |
838 | .It Dv REG_EBRACK | |
839 | brackets | |
840 | .Ql "[ ]" | |
841 | not balanced | |
842 | .It Dv REG_EPAREN | |
843 | parentheses | |
844 | .Ql "( )" | |
845 | not balanced | |
846 | .It Dv REG_EBRACE | |
847 | braces | |
848 | .Ql "{ }" | |
849 | not balanced | |
850 | .It Dv REG_BADBR | |
851 | invalid repetition count(s) in | |
852 | .Ql "{ }" | |
853 | .It Dv REG_ERANGE | |
854 | invalid character range in | |
855 | .Ql "[ ]" | |
856 | .It Dv REG_ESPACE | |
857 | ran out of memory | |
858 | .It Dv REG_BADRPT | |
859 | .Ql ?\& , | |
860 | .Ql *\& , | |
861 | or | |
862 | .Ql +\& | |
863 | operand invalid | |
864 | .It Dv REG_EMPTY | |
865 | empty (sub)expression | |
866 | .It Dv REG_ASSERT | |
1f2f436a | 867 | cannot happen - you found a bug |
5b2abdfb | 868 | .It Dv REG_INVARG |
3d9156a7 A |
869 | invalid argument, e.g.\& negative-length string |
870 | .It Dv REG_ILLSEQ | |
871 | illegal byte sequence (bad multibyte character) | |
5b2abdfb | 872 | .El |
1f2f436a A |
873 | .Sh SEE ALSO |
874 | .Xr grep 1 , | |
875 | .Xr re_format 7 | |
876 | .Pp | |
877 | .St -p1003.2 , | |
878 | sections 2.8 (Regular Expression Notation) | |
879 | and | |
880 | B.5 (C Binding for Regular Expression Matching). | |
5b2abdfb | 881 | .Sh HISTORY |
ad3c9f2a A |
882 | The |
883 | .Nm regex | |
884 | implementation is based on a heavily modified subset of TRE | |
885 | (http://laurikari.net/tre/), originally written by Ville Laurikari. | |
886 | Previous releases used an implementation originally written by | |
887 | .An Henry Spencer , | |
888 | and altered for inclusion in the | |
5b2abdfb A |
889 | .Bx 4.4 |
890 | distribution. | |
891 | .Sh BUGS | |
ad3c9f2a A |
892 | The beginning-of-line and end-of-line anchors |
893 | .Dq ( ^\& | |
894 | and | |
895 | .Dq $\& ) | |
896 | are currently implemented so that repetitions cannot be applied to them. | |
897 | The standards are unclear about whether this is legal, but other | |
898 | .Nm regex | |
899 | packages do support this case. | |
900 | It is best to avoid this non-portable (and not really very useful) case. | |
5b2abdfb A |
901 | .Pp |
902 | The back-reference code is subtle and doubts linger about its correctness | |
903 | in complex cases. | |
904 | .Pp | |
9385eb3d A |
905 | The |
906 | .Fn regexec | |
ad3c9f2a A |
907 | variants use one of two internal matching engines. |
908 | The normal one is linear worst-case time in the length of the text being | |
909 | searched, and quadratic worst-case time in the length of the used regular | |
910 | expression. | |
911 | When back-references are used, a slower, backtracking engine is used. | |
912 | While all backtracking matching engines suffer from extreme slowness for certain | |
913 | pathological cases, the normal engine doesn't suffer from these cases. | |
914 | It is advised to avoid back-references whenever possible. | |
5b2abdfb | 915 | .Pp |
9385eb3d A |
916 | The |
917 | .Fn regcomp | |
ad3c9f2a | 918 | variants |
5b2abdfb A |
919 | implement bounded repetitions by macro expansion, |
920 | which is costly in time and space if counts are large | |
921 | or bounded repetitions are nested. | |
922 | An RE like, say, | |
923 | .Ql "((((a{1,100}){1,100}){1,100}){1,100}){1,100}" | |
924 | will (eventually) run almost any existing machine out of swap space. | |
925 | .Pp | |
5b2abdfb A |
926 | Due to a mistake in |
927 | .St -p1003.2 , | |
928 | things like | |
929 | .Ql "a)b" | |
930 | are legal REs because | |
931 | .Ql )\& | |
932 | is | |
933 | a special character only in the presence of a previous unmatched | |
934 | .Ql (\& . | |
1f2f436a | 935 | This cannot be fixed until the spec is fixed. |
5b2abdfb A |
936 | .Pp |
937 | The standard's definition of back references is vague. | |
938 | For example, does | |
939 | .Ql "a\e(\e(b\e)*\e2\e)*d" | |
940 | match | |
941 | .Ql "abbbd" ? | |
942 | Until the standard is clarified, | |
943 | behavior in such cases should not be relied on. |