git.saurik.com Git - wxWidgets.git/blame_incremental - docs/doxygen/overviews/resyntax.h

... / ...

Commit	Line	Data
	1	/////////////////////////////////////////////////////////////////////////////
	2	// Name: resyntax.h
	3	// Purpose: topic overview
	4	// Author: wxWidgets team
	5	// RCS-ID: $Id$
	6	// Licence: wxWindows licence
	7	/////////////////////////////////////////////////////////////////////////////
	8
	9	/**
	10
	11	@page overview_resyntax Regular Expressions
	12
	13	A <em>regular expression</em> describes strings of characters. It's a pattern
	14	that matches certain strings and doesn't match others.
	15
	16	@li @ref overview_resyntax_differentflavors
	17	@li @ref overview_resyntax_syntax
	18	@li @ref overview_resyntax_bracket
	19	@li @ref overview_resyntax_escapes
	20	@li @ref overview_resyntax_metasyntax
	21	@li @ref overview_resyntax_matching
	22	@li @ref overview_resyntax_limits
	23	@li @ref overview_resyntax_bre
	24	@li @ref overview_resyntax_characters
	25
	26	@see
	27
	28	@li wxRegEx
	29
	30
	31	<hr>
	32
	33
	34	@section overview_resyntax_differentflavors Different Flavors of Regular Expressions
	35
	36	Regular expressions (RE), as defined by POSIX, come in two flavors:
	37	<em>extended regular expressions</em> (ERE) and <em>basic regular
	38	expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
	39	while BREs are roughly those of the traditional @e ed. This implementation
	40	adds a third flavor: <em>advanced regular expressions</em> (ARE), basically
	41	EREs with some significant extensions.
	42
	43	This manual page primarily describes AREs. BREs mostly exist for backward
	44	compatibility in some old programs. POSIX EREs are almost an exact subset of
	45	AREs. Features of AREs that are not present in EREs will be indicated.
	46
	47
	48	@section overview_resyntax_syntax Regular Expression Syntax
	49
	50	These regular expressions are implemented using the package written by Henry
	51	Spencer, based on the 1003.2 spec and some (not quite all) of the Perl5
	52	extensions (thanks, Henry!). Much of the description of regular expressions
	53	below is copied verbatim from his manual entry.
	54
	55	An ARE is one or more @e branches, separated by "\|", matching anything that
	56	matches any of the branches.
	57
	58	A branch is zero or more @e constraints or @e quantified atoms, concatenated.
	59	It matches a match for the first, followed by a match for the second, etc; an
	60	empty branch matches the empty string.
	61
	62	A quantified atom is an @e atom possibly followed by a single @e quantifier.
	63	Without a quantifier, it matches a match for the atom. The quantifiers, and
	64	what a so-quantified atom matches, are:
	65
	66	@beginTable
	67	@row2col{ <tt>*</tt> ,
	68	A sequence of 0 or more matches of the atom. }
	69	@row2col{ <tt>+</tt> ,
	70	A sequence of 1 or more matches of the atom. }
	71	@row2col{ <tt>?</tt> ,
	72	A sequence of 0 or 1 matches of the atom. }
	73	@row2col{ <tt>{m}</tt> ,
	74	A sequence of exactly @e m matches of the atom. }
	75	@row2col{ <tt>{m\,}</tt> ,
	76	A sequence of @e m or more matches of the atom. }
	77	@row2col{ <tt>{m\,n}</tt> ,
	78	A sequence of @e m through @e n (inclusive) matches of the atom; @e m may
	79	not exceed @e n. }
	80	@row2col{ <tt>*? +? ?? {m}? {m\,}? {m\,n}?</tt> ,
	81	@e Non-greedy quantifiers, which match the same possibilities, but prefer
	82	the smallest number rather than the largest number of matches (see
	83	@ref overview_resyntax_matching). }
	84	@endTable
	85
	86	The forms using @b { and @b } are known as @e bounds. The numbers @e m and
	87	@e n are unsigned decimal integers with permissible values from 0 to 255
	88	inclusive. An atom is one of:
	89
	90	@beginTable
	91	@row2col{ <tt>(re)</tt> ,
	92	Where @e re is any regular expression, matches for @e re, with the match
	93	captured for possible reporting. }
	94	@row2col{ <tt>(?:re)</tt> ,
	95	As previous, but does no reporting (a "non-capturing" set of
	96	parentheses). }
	97	@row2col{ <tt>()</tt> ,
	98	Matches an empty string, captured for possible reporting. }
	99	@row2col{ <tt>(?:)</tt> ,
	100	Matches an empty string, without reporting. }
	101	@row2col{ <tt>[chars]</tt> ,
	102	A <em>bracket expression</em>, matching any one of the @e chars (see
	103	@ref overview_resyntax_bracket for more details). }
	104	@row2col{ <tt>.</tt> ,
	105	Matches any single character. }
	106	@row2col{ <tt>@\k</tt> ,
	107	Where @e k is a non-alphanumeric character, matches that character taken
	108	as an ordinary character, e.g. @\@\ matches a backslash character. }
	109	@row2col{ <tt>@\c</tt> ,
	110	Where @e c is alphanumeric (possibly followed by other characters), an
	111	@e escape (AREs only), see @ref overview_resyntax_escapes below. }
	112	@row2col{ <tt>@leftCurly</tt> ,
	113	When followed by a character other than a digit, matches the left-brace
	114	character "@leftCurly"; when followed by a digit, it is the beginning of a
	115	@e bound (see above). }
	116	@row2col{ <tt>x</tt> ,
	117	Where @e x is a single character with no other significance, matches that
	118	character. }
	119	@endTable
	120
	121	A @e constraint matches an empty string when specific conditions are met. A
	122	constraint may not be followed by a quantifier. The simple constraints are as
	123	follows; some more constraints are described later, under
	124	@ref overview_resyntax_escapes.
	125
	126	@beginTable
	127	@row2col{ <tt>^</tt> ,
	128	Matches at the beginning of a line. }
	129	@row2col{ <tt>@$</tt> ,
	130	Matches at the end of a line. }
	131	@row2col{ <tt>(?=re)</tt> ,
	132	@e Positive lookahead (AREs only), matches at any point where a substring
	133	matching @e re begins. }
	134	@row2col{ <tt>(?!re)</tt> ,
	135	@e Negative lookahead (AREs only), matches at any point where no substring
	136	matching @e re begins. }
	137	@endTable
	138
	139	The lookahead constraints may not contain back references (see later), and all
	140	parentheses within them are considered non-capturing. An RE may not end with
	141	"@\".
	142
	143
	144	@section overview_resyntax_bracket Bracket Expressions
	145
	146	A <em>bracket expression</em> is a list of characters enclosed in <tt>[]</tt>.
	147	It normally matches any single character from the list (but see below). If the
	148	list begins with @c ^, it matches any single character (but see below) @e not
	149	from the rest of the list.
	150
	151	If two characters in the list are separated by <tt>-</tt>, this is shorthand
	152	for the full @e range of characters between those two (inclusive) in the
	153	collating sequence, e.g. <tt>[0-9]</tt> in ASCII matches any decimal digit.
	154	Two ranges may not share an endpoint, so e.g. <tt>a-c-e</tt> is illegal.
	155	Ranges are very collating-sequence-dependent, and portable programs should
	156	avoid relying on them.
	157
	158	To include a literal <tt>]</tt> or <tt>-</tt> in the list, the simplest method
	159	is to enclose it in <tt>[.</tt> and <tt>.]</tt> to make it a collating element
	160	(see below). Alternatively, make it the first character (following a possible
	161	<tt>^</tt>), or (AREs only) precede it with <tt>@\</tt>. Alternatively, for
	162	<tt>-</tt>, make it the last character, or the second endpoint of a range. To
	163	use a literal <tt>-</tt> as the first endpoint of a range, make it a collating
	164	element or (AREs only) precede it with <tt>@\</tt>. With the exception of
	165	these, some combinations using <tt>[</tt> (see next paragraphs), and escapes,
	166	all other special characters lose their special significance within a bracket
	167	expression.
	168
	169	Within a bracket expression, a collating element (a character, a
	170	multi-character sequence that collates as if it were a single character, or a
	171	collating-sequence name for either) enclosed in <tt>[.</tt> and <tt>.]</tt>
	172	stands for the sequence of characters of that collating element.
	173
	174	@e wxWidgets: Currently no multi-character collating elements are defined. So
	175	in <tt>[.X.]</tt>, @c X can either be a single character literal or the name
	176	of a character. For example, the following are both identical:
	177	<tt>[[.0.]-[.9.]]</tt> and <tt>[[.zero.]-[.nine.]]</tt> and mean the same as
	178	<tt>[0-9]</tt>. See @ref overview_resyntax_characters.
	179
	180	Within a bracket expression, a collating element enclosed in <tt>[=</tt> and
	181	<tt>=]</tt> is an equivalence class, standing for the sequences of characters
	182	of all collating elements equivalent to that one, including itself. An
	183	equivalence class may not be an endpoint of a range.
	184
	185	@e wxWidgets: Currently no equivalence classes are defined, so <tt>[=X=]</tt>
	186	stands for just the single character @c X. @c X can either be a single
	187	character literal or the name of a character, see
	188	@ref overview_resyntax_characters.
	189
	190	Within a bracket expression, the name of a @e character class enclosed in
	191	<tt>[:</tt> and <tt>:]</tt> stands for the list of all characters (not all
	192	collating elements!) belonging to that class. Standard character classes are:
	193
	194	@beginTable
	195	@row2col{ <tt>alpha</tt> , A letter. }
	196	@row2col{ <tt>upper</tt> , An upper-case letter. }
	197	@row2col{ <tt>lower</tt> , A lower-case letter. }
	198	@row2col{ <tt>digit</tt> , A decimal digit. }
	199	@row2col{ <tt>xdigit</tt> , A hexadecimal digit. }
	200	@row2col{ <tt>alnum</tt> , An alphanumeric (letter or digit). }
	201	@row2col{ <tt>print</tt> , An alphanumeric (same as alnum). }
	202	@row2col{ <tt>blank</tt> , A space or tab character. }
	203	@row2col{ <tt>space</tt> , A character producing white space in displayed text. }
	204	@row2col{ <tt>punct</tt> , A punctuation character. }
	205	@row2col{ <tt>graph</tt> , A character with a visible representation. }
	206	@row2col{ <tt>cntrl</tt> , A control character. }
	207	@endTable
	208
	209	A character class may not be used as an endpoint of a range.
	210
	211	@e wxWidgets: In a non-Unicode build, these character classifications depend on
	212	the current locale, and correspond to the values returned by the ANSI C "is"
	213	functions: <tt>isalpha</tt>, <tt>isupper</tt>, etc. In Unicode mode they are
	214	based on Unicode classifications, and are not affected by the current locale.
	215
	216	There are two special cases of bracket expressions: the bracket expressions
	217	<tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> are constraints, matching empty strings at
	218	the beginning and end of a word respectively. A word is defined as a sequence
	219	of word characters that is neither preceded nor followed by word characters. A
	220	word character is an @e alnum character or an underscore (_). These special
	221	bracket expressions are deprecated; users of AREs should use constraint escapes
	222	instead (see escapes below).
	223
	224
	225	@section overview_resyntax_escapes Escapes
	226
	227	Escapes (AREs only), which begin with a <tt>@\</tt> followed by an alphanumeric
	228	character, come in several varieties: character entry, class shorthands,
	229	constraint escapes, and back references. A <tt>@\</tt> followed by an
	230	alphanumeric character but not constituting a valid escape is illegal in AREs.
	231	In EREs, there are no escapes: outside a bracket expression, a <tt>@\</tt>
	232	followed by an alphanumeric character merely stands for that character as an
	233	ordinary character, and inside a bracket expression, <tt>@\</tt> is an ordinary
	234	character. (The latter is the one actual incompatibility between EREs and
	235	AREs.)
	236
	237	Character-entry escapes (AREs only) exist to make it easier to specify
	238	non-printing and otherwise inconvenient characters in REs:
	239
	240	@beginTable
	241	@row2col{ <tt>@\a</tt> , Alert (bell) character, as in C. }
	242	@row2col{ <tt>@\b</tt> , Backspace, as in C. }
	243	@row2col{ <tt>@\B</tt> ,
	244	Synonym for <tt>@\</tt> to help reduce backslash doubling in some
	245	applications where there are multiple levels of backslash processing. }
	246	@row2col{ <tt>@\cX</tt> ,
	247	The character whose low-order 5 bits are the same as those of @e X, and
	248	whose other bits are all zero, where @e X is any character. }
	249	@row2col{ <tt>@\e</tt> ,
	250	The character whose collating-sequence name is @c ESC, or failing that,
	251	the character with octal value 033. }
	252	@row2col{ <tt>@\f</tt> , Formfeed, as in C. }
	253	@row2col{ <tt>@\n</tt> , Newline, as in C. }
	254	@row2col{ <tt>@\r</tt> , Carriage return, as in C. }
	255	@row2col{ <tt>@\t</tt> , Horizontal tab, as in C. }
	256	@row2col{ <tt>@\uwxyz</tt> ,
	257	The Unicode character <tt>U+wxyz</tt> in the local byte ordering, where
	258	@e wxyz is exactly four hexadecimal digits. }
	259	@row2col{ <tt>@\Ustuvwxyz</tt> ,
	260	Reserved for a somewhat-hypothetical Unicode extension to 32 bits, where
	261	@e stuvwxyz is exactly eight hexadecimal digits. }
	262	@row2col{ <tt>@\v</tt> , Vertical tab, as in C. }
	263	@row2col{ <tt>@\xhhh</tt> ,
	264	The single character whose hexadecimal value is @e 0xhhh, where @e hhh is
	265	any sequence of hexadecimal digits. }
	266	@row2col{ <tt>@\0</tt> , The character whose value is 0. }
	267	@row2col{ <tt>@\xy</tt> ,
	268	The character whose octal value is @e 0xy, where @e xy is exactly two octal
	269	digits, and is not a <em>back reference</em> (see below). }
	270	@row2col{ <tt>@\xyz</tt> ,
	271	The character whose octal value is @e 0xyz, where @e xyz is exactly three
	272	octal digits, and is not a <em>back reference</em> (see below). }
	273	@endTable
	274
	275	Hexadecimal digits are 0-9, a-f, and A-F. Octal digits are 0-7.
	276
	277	The character-entry escapes are always taken as ordinary characters. For
	278	example, <tt>@\135</tt> is <tt>]</tt> in ASCII, but <tt>@\135</tt> does not
	279	terminate a bracket expression. Beware, however, that some applications (e.g.,
	280	C compilers) interpret such sequences themselves before the regular-expression
	281	package gets to see them, which may require doubling (quadrupling, etc.) the
	282	'<tt>@\</tt>'.
	283
	284	Class-shorthand escapes (AREs only) provide shorthands for certain
	285	commonly-used character classes:
	286
	287	@beginTable
	288	@row2col{ <tt>@\d</tt> , <tt>[[:digit:]]</tt> }
	289	@row2col{ <tt>@\s</tt> , <tt>[[:space:]]</tt> }
	290	@row2col{ <tt>@\w</tt> , <tt>[[:alnum:]_]</tt> (note underscore) }
	291	@row2col{ <tt>@\D</tt> , <tt>[^[:digit:]]</tt> }
	292	@row2col{ <tt>@\S</tt> , <tt>[^[:space:]]</tt> }
	293	@row2col{ <tt>@\W</tt> , <tt>[^[:alnum:]_]</tt> (note underscore) }
	294	@endTable
	295
	296	Within bracket expressions, <tt>@\d</tt>, <tt>@\s</tt>, and <tt>@\w</tt> lose
	297	their outer brackets, and <tt>@\D</tt>, <tt>@\S</tt>, <tt>@\W</tt> are illegal.
	298	So, for example, <tt>[a-c@\d]</tt> is equivalent to <tt>[a-c[:digit:]]</tt>.
	299	Also, <tt>[a-c@\D]</tt>, which is equivalent to <tt>[a-c^[:digit:]]</tt>, is
	300	illegal.
	301
	302	A constraint escape (AREs only) is a constraint, matching the empty string if
	303	specific conditions are met, written as an escape:
	304
	305	@beginTable
	306	@row2col{ <tt>@\A</tt> , Matches only at the beginning of the string, see
	307	@ref overview_resyntax_matching for how this differs
	308	from <tt>^</tt>. }
	309	@row2col{ <tt>@\m</tt> , Matches only at the beginning of a word. }
	310	@row2col{ <tt>@\M</tt> , Matches only at the end of a word. }
	311	@row2col{ <tt>@\y</tt> , Matches only at the beginning or end of a word. }
	312	@row2col{ <tt>@\Y</tt> , Matches only at a point that is not the beginning or
	313	end of a word. }
	314	@row2col{ <tt>@\Z</tt> , Matches only at the end of the string, see
	315	@ref overview_resyntax_matching for how this differs
	316	from <tt>@$</tt>. }
	317	@row2col{ <tt>@\m</tt> , A <em>back reference</em>, where @e m is a non-zero
	318	digit. See below. }
	319	@row2col{ <tt>@\mnn</tt> ,
	320	A <em>back reference</em>, where @e m is a nonzero digit, and @e nn is some
	321	more digits, and the decimal value @e mnn is not greater than the number of
	322	closing capturing parentheses seen so far. See below. }
	323	@endTable
	324
	325	A word is defined as in the specification of <tt>[[:@<:]]</tt> and
	326	<tt>[[:@>:]]</tt> above. Constraint escapes are illegal within bracket
	327	expressions.
	328
	329	A back reference (AREs only) matches the same string matched by the
	330	parenthesized subexpression specified by the number. For example, "([bc])@\1"
	331	matches "bb" or "cc" but not "bc". The subexpression must entirely precede the
	332	back reference in the RE. Subexpressions are numbered in the order of their
	333	leading parentheses. Non-capturing parentheses do not define subexpressions.
	334
	335	There is an inherent historical ambiguity between octal character-entry escapes
	336	and back references, which is resolved by heuristics, as hinted at above. A
	337	leading zero always indicates an octal escape. A single non-zero digit, not
	338	followed by another digit, is always taken as a back reference. A multi-digit
	339	sequence not starting with a zero is taken as a back reference if it comes
	340	after a suitable subexpression (i.e. the number is in the legal range for a
	341	back reference), and otherwise is taken as octal.
	342
	343
	344	@section overview_resyntax_metasyntax Metasyntax
	345
	346	In addition to the main syntax described above, there are some special forms
	347	and miscellaneous syntactic facilities available.
	348
	349	Normally the flavor of RE being used is specified by application-dependent
	350	means. However, this can be overridden by a @e director. If an RE of any flavor
	351	begins with <tt>***:</tt>, the rest of the RE is an ARE. If an RE of any
	352	flavor begins with <tt>***=</tt>, the rest of the RE is taken to be a literal
	353	string, with all characters considered ordinary characters.
	354
	355	An ARE may begin with <em>embedded options</em>: a sequence <tt>(?xyz)</tt>
	356	(where @e xyz is one or more alphabetic characters) specifies options affecting
	357	the rest of the RE. These supplement, and can override, any options specified
	358	by the application. The available option letters are:
	359
	360	@beginTable
	361	@row2col{ <tt>b</tt> , Rest of RE is a BRE. }
	362	@row2col{ <tt>c</tt> , Case-sensitive matching (usual default). }
	363	@row2col{ <tt>e</tt> , Rest of RE is an ERE. }
	364	@row2col{ <tt>i</tt> , Case-insensitive matching (see
	365	@ref overview_resyntax_matching, below). }
	366	@row2col{ <tt>m</tt> , Historical synonym for @e n. }
	367	@row2col{ <tt>n</tt> , Newline-sensitive matching (see
	368	@ref overview_resyntax_matching, below). }
	369	@row2col{ <tt>p</tt> , Partial newline-sensitive matching (see
	370	@ref overview_resyntax_matching, below). }
	371	@row2col{ <tt>q</tt> , Rest of RE is a literal ("quoted") string, all ordinary
	372	characters. }
	373	@row2col{ <tt>s</tt> , Non-newline-sensitive matching (usual default). }
	374	@row2col{ <tt>t</tt> , Tight syntax (usual default; see below). }
	375	@row2col{ <tt>w</tt> , Inverse partial newline-sensitive ("weird") matching
	376	(see @ref overview_resyntax_matching, below). }
	377	@row2col{ <tt>x</tt> , Expanded syntax (see below). }
	378	@endTable
	379
	380	Embedded options take effect at the <tt>)</tt> terminating the sequence. They
	381	are available only at the start of an ARE, and may not be used later within it.
	382
	383	In addition to the usual (@e tight) RE syntax, in which all characters are
	384	significant, there is an @e expanded syntax, available in AREs with the
	385	embedded x option. In the expanded syntax, white-space characters are ignored
	386	and all characters between a <tt>@#</tt> and the following newline (or the end
	387	of the RE) are ignored, permitting paragraphing and commenting a complex RE.
	388	There are three exceptions to that basic rule:
	389
	390	@li A white-space character or <tt>@#</tt> preceded by <tt>@\</tt> is retained.
	391	@li White space or <tt>@#</tt> within a bracket expression is retained.
	392	@li White space and comments are illegal within multi-character symbols like
	393	the ARE <tt>(?:</tt> or the BRE <tt>@\(</tt>.
	394
	395	Expanded-syntax white-space characters are blank, tab, newline, and any
	396	character that belongs to the @e space character class.
	397
	398	Finally, in an ARE, outside bracket expressions, the sequence <tt>(?@#ttt)</tt>
	399	(where @e ttt is any text not containing a <tt>)</tt>) is a comment, completely
	400	ignored. Again, this is not allowed between the characters of multi-character
	401	symbols like <tt>(?:</tt>. Such comments are more a historical artifact than a
	402	useful facility, and their use is deprecated; use the expanded syntax instead.
	403
	404	@e None of these metasyntax extensions is available if the application (or an
	405	initial <tt>***=</tt> director) has specified that the user's input be treated
	406	as a literal string rather than as an RE.
	407
	408
	409	@section overview_resyntax_matching Matching
	410
	411	In the event that an RE could match more than one substring of a given string,
	412	the RE matches the one starting earliest in the string. If the RE could match
	413	more than one substring starting at that point, the choice is determined by
	414	its @e preference: either the longest substring, or the shortest.
	415
	416	Most atoms, and all constraints, have no preference. A parenthesized RE has the
	417	same preference (possibly none) as the RE. A quantified atom with quantifier
	418	<tt>{m}</tt> or <tt>{m}?</tt> has the same preference (possibly none) as the
	419	atom itself. A quantified atom with other normal quantifiers (including
	420	<tt>{m,n}</tt> with @e m equal to @e n) prefers longest match. A quantified
	421	atom with other non-greedy quantifiers (including <tt>{m,n}?</tt> with @e m
	422	equal to @e n) prefers shortest match. A branch has the same preference as the
	423	first quantified atom in it which has a preference. An RE consisting of two or
	424	more branches connected by the @c \| operator prefers longest match.
	425
	426	Subject to the constraints imposed by the rules for matching the whole RE,
	427	subexpressions also match the longest or shortest possible substrings, based on
	428	their preferences, with subexpressions starting earlier in the RE taking
	429	priority over ones starting later. Note that outer subexpressions thus take
	430	priority over their component subexpressions.
	431
	432	Note that the quantifiers <tt>{1,1}</tt> and <tt>{1,1}?</tt> can be used to
	433	force longest and shortest preference, respectively, on a subexpression or a
	434	whole RE.
	435
	436	Match lengths are measured in characters, not collating elements. An empty
	437	string is considered longer than no match at all. For example, <tt>bb*</tt>
	438	matches the three middle characters of "abbbc",
	439	<tt>(week\|wee)(night\|knights)</tt> matches all ten characters of "weeknights",
	440	when <tt>(.*).*</tt> is matched against "abc" the parenthesized subexpression
	441	matches all three characters, and when <tt>(a*)*</tt> is matched against "bc"
	442	both the whole RE and the parenthesized subexpression match an empty string.
	443
	444	If case-independent matching is specified, the effect is much as if all case
	445	distinctions had vanished from the alphabet. When an alphabetic that exists in
	446	multiple cases appears as an ordinary character outside a bracket expression,
	447	it is effectively transformed into a bracket expression containing both cases,
	448	so that @c x becomes @c [xX]. When it appears inside a bracket expression, all
	449	case counterparts of it are added to the bracket expression, so that @c [x]
	450	becomes @c [xX] and @c [^x] becomes @c [^xX].
	451
	452	If newline-sensitive matching is specified, "." and bracket expressions using
	453	"^" will never match the newline character (so that matches will never cross
	454	newlines unless the RE explicitly arranges it) and "^" and "$" will match the
	455	empty string after and before a newline respectively, in addition to matching
	456	at beginning and end of string respectively. ARE <tt>@\A</tt> and <tt>@\Z</tt>
	457	continue to match beginning or end of string @e only.
	458
	459	If partial newline-sensitive matching is specified, this affects "." and
	460	bracket expressions as with newline-sensitive matching, but not "^" and "$".
	461
	462	If inverse partial newline-sensitive matching is specified, this affects "^"
	463	and "$" as with newline-sensitive matching, but not "." and bracket
	464	expressions. This isn't very useful but is provided for symmetry.
	465
	466
	467	@section overview_resyntax_limits Limits and Compatibility
	468
	469	No particular limit is imposed on the length of REs. Programs intended to be
	470	highly portable should not employ REs longer than 256 bytes, as a
	471	POSIX-compliant implementation can refuse to accept such REs.
	472
	473	The only feature of AREs that is actually incompatible with POSIX EREs is that
	474	<tt>@\</tt> does not lose its special significance inside bracket expressions.
	475	All other ARE features use syntax which is illegal or has undefined or
	476	unspecified effects in POSIX EREs; the <tt>***</tt> syntax of directors
	477	likewise is outside the POSIX syntax for both BREs and EREs.
	478
	479	Many of the ARE extensions are borrowed from Perl, but some have been changed
	480	to clean them up, and a few Perl extensions are not present. Incompatibilities
	481	of note include <tt>@\b</tt>, <tt>@\B</tt>, the lack of special treatment for a
	482	trailing newline, the addition of complemented bracket expressions to the
	483	things affected by newline-sensitive matching, the restrictions on parentheses
	484	and back references in lookahead constraints, and the longest/shortest-match
	485	(rather than first-match) matching semantics.
	486
	487	The matching rules for REs containing both normal and non-greedy quantifiers
	488	have changed since early beta-test versions of this package. The new rules are
	489	much simpler and cleaner, but don't work as hard at guessing the user's real
	490	intentions.
	491
	492	Henry Spencer's original 1986 @e regexp package, still in widespread use,
	493	implemented an early version of today's EREs. There are four incompatibilities
	494	between @e regexp's near-EREs (RREs for short) and AREs. In roughly increasing
	495	order of significance:
	496
	497	@li In AREs, <tt>@\</tt> followed by an alphanumeric character is either an
	498	escape or an error, while in RREs, it was just another way of writing the
	499	alphanumeric. This should not be a problem because there was no reason to
	500	write such a sequence in RREs.
	501	@li @c { followed by a digit in an ARE is the beginning of a bound, while in
	502	RREs, @c { was always an ordinary character. Such sequences should be rare,
	503	and will often result in an error because following characters will not
	504	look like a valid bound.
	505	@li In AREs, @c @\ remains a special character within @c [], so a literal @c @\
	506	within @c [] must be written as <tt>@\@\</tt>. <tt>@\@\</tt> also gives a
	507	literal @c @\ within @c [] in RREs, but only truly paranoid programmers
	508	routinely doubled the backslash.
	509	@li AREs report the longest/shortest match for the RE, rather than the first
	510	found in a specified search order. This may affect some RREs which were
	511	written in the expectation that the first match would be reported. The
	512	careful crafting of RREs to optimize the search order for fast matching is
	513	obsolete (AREs examine all possible matches in parallel, and their
	514	performance is largely insensitive to their complexity) but cases where the
	515	search order was exploited to deliberately find a match which was @e not
	516	the longest/shortest will need rewriting.
	517
	518
	519	@section overview_resyntax_bre Basic Regular Expressions
	520
	521	BREs differ from EREs in several respects. @c \|, @c +, and @c ? are ordinary
	522	characters and there is no equivalent for their functionality. The delimiters
	523	for bounds are @c @\{ and @c @\}, with @c { and @c } by themselves ordinary
	524	characters. The parentheses for nested subexpressions are @c @\( and @c @\),
	525	with @c ( and @c ) by themselves ordinary characters. @c ^ is an ordinary
	526	character except at the beginning of the RE or the beginning of a parenthesized
	527	subexpression, @c $ is an ordinary character except at the end of the RE or the
	528	end of a parenthesized subexpression, and @c * is an ordinary character if it
	529	appears at the beginning of the RE or the beginning of a parenthesized
	530	subexpression (after a possible leading <tt>^</tt>). Finally, single-digit back
	531	references are available, and @c @\@< and @c @\@> are synonyms for
	532	<tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> respectively; no other escapes are
	533	available.
	534
	535
	536	@section overview_resyntax_characters Regular Expression Character Names
	537
	538	Note that the character names are case sensitive.
	539
	540	<center><table class='doctable' border='0' cellspacing='5' cellpadding='4'><tr>
	541
	542	<td>
	543	@beginTable
	544	@row2col{ <tt>NUL</tt> , @\0 }
	545	@row2col{ <tt>SOH</tt> , @\001 }
	546	@row2col{ <tt>STX</tt> , @\002 }
	547	@row2col{ <tt>ETX</tt> , @\003 }
	548	@row2col{ <tt>EOT</tt> , @\004 }
	549	@row2col{ <tt>ENQ</tt> , @\005 }
	550	@row2col{ <tt>ACK</tt> , @\006 }
	551	@row2col{ <tt>BEL</tt> , @\007 }
	552	@row2col{ <tt>alert</tt> , @\007 }
	553	@row2col{ <tt>BS</tt> , @\010 }
	554	@row2col{ <tt>backspace</tt> , @\b }
	555	@row2col{ <tt>HT</tt> , @\011 }
	556	@row2col{ <tt>tab</tt> , @\t }
	557	@row2col{ <tt>LF</tt> , @\012 }
	558	@row2col{ <tt>newline</tt> , @\n }
	559	@row2col{ <tt>VT</tt> , @\013 }
	560	@row2col{ <tt>vertical-tab</tt> , @\v }
	561	@row2col{ <tt>FF</tt> , @\014 }
	562	@row2col{ <tt>form-feed</tt> , @\f }
	563	@endTable
	564	</td>
	565
	566	<td>
	567	@beginTable
	568	@row2col{ <tt>CR</tt> , @\015 }
	569	@row2col{ <tt>carriage-return</tt> , @\r }
	570	@row2col{ <tt>SO</tt> , @\016 }
	571	@row2col{ <tt>SI</tt> , @\017 }
	572	@row2col{ <tt>DLE</tt> , @\020 }
	573	@row2col{ <tt>DC1</tt> , @\021 }
	574	@row2col{ <tt>DC2</tt> , @\022 }
	575	@row2col{ <tt>DC3</tt> , @\023 }
	576	@row2col{ <tt>DC4</tt> , @\024 }
	577	@row2col{ <tt>NAK</tt> , @\025 }
	578	@row2col{ <tt>SYN</tt> , @\026 }
	579	@row2col{ <tt>ETB</tt> , @\027 }
	580	@row2col{ <tt>CAN</tt> , @\030 }
	581	@row2col{ <tt>EM</tt> , @\031 }
	582	@row2col{ <tt>SUB</tt> , @\032 }
	583	@row2col{ <tt>ESC</tt> , @\033 }
	584	@row2col{ <tt>IS4</tt> , @\034 }
	585	@row2col{ <tt>FS</tt> , @\034 }
	586	@row2col{ <tt>IS3</tt> , @\035 }
	587	@endTable
	588	</td>
	589
	590	<td>
	591	@beginTable
	592	@row2col{ <tt>GS</tt> , @\035 }
	593	@row2col{ <tt>IS2</tt> , @\036 }
	594	@row2col{ <tt>RS</tt> , @\036 }
	595	@row2col{ <tt>IS1</tt> , @\037 }
	596	@row2col{ <tt>US</tt> , @\037 }
	597	@row2col{ <tt>space</tt> , " " (space) }
	598	@row2col{ <tt>exclamation-mark</tt> , ! }
	599	@row2col{ <tt>quotation-mark</tt> , " }
	600	@row2col{ <tt>number-sign</tt> , @# }
	601	@row2col{ <tt>dollar-sign</tt> , @$ }
	602	@row2col{ <tt>percent-sign</tt> , @% }
	603	@row2col{ <tt>ampersand</tt> , @& }
	604	@row2col{ <tt>apostrophe</tt> , ' }
	605	@row2col{ <tt>left-parenthesis</tt> , ( }
	606	@row2col{ <tt>right-parenthesis</tt> , ) }
	607	@row2col{ <tt>asterisk</tt> , * }
	608	@row2col{ <tt>plus-sign</tt> , + }
	609	@row2col{ <tt>comma</tt> , \, }
	610	@row2col{ <tt>hyphen</tt> , - }
	611	@endTable
	612	</td>
	613
	614	<td>
	615	@beginTable
	616	@row2col{ <tt>hyphen-minus</tt> , - }
	617	@row2col{ <tt>period</tt> , . }
	618	@row2col{ <tt>full-stop</tt> , . }
	619	@row2col{ <tt>slash</tt> , / }
	620	@row2col{ <tt>solidus</tt> , / }
	621	@row2col{ <tt>zero</tt> , 0 }
	622	@row2col{ <tt>one</tt> , 1 }
	623	@row2col{ <tt>two</tt> , 2 }
	624	@row2col{ <tt>three</tt> , 3 }
	625	@row2col{ <tt>four</tt> , 4 }
	626	@row2col{ <tt>five</tt> , 5 }
	627	@row2col{ <tt>six</tt> , 6 }
	628	@row2col{ <tt>seven</tt> , 7 }
	629	@row2col{ <tt>eight</tt> , 8 }
	630	@row2col{ <tt>nine</tt> , 9 }
	631	@row2col{ <tt>colon</tt> , : }
	632	@row2col{ <tt>semicolon</tt> , ; }
	633	@row2col{ <tt>less-than-sign</tt> , @< }
	634	@row2col{ <tt>equals-sign</tt> , = }
	635	@endTable
	636	</td>
	637
	638	<td>
	639	@beginTable
	640	@row2col{ <tt>greater-than-sign</tt> , @> }
	641	@row2col{ <tt>question-mark</tt> , ? }
	642	@row2col{ <tt>commercial-at</tt> , @@ }
	643	@row2col{ <tt>left-square-bracket</tt> , [ }
	644	@row2col{ <tt>backslash</tt> , @\ }
	645	@row2col{ <tt>reverse-solidus</tt> , @\ }
	646	@row2col{ <tt>right-square-bracket</tt> , ] }
	647	@row2col{ <tt>circumflex</tt> , ^ }
	648	@row2col{ <tt>circumflex-accent</tt> , ^ }
	649	@row2col{ <tt>underscore</tt> , _ }
	650	@row2col{ <tt>low-line</tt> , _ }
	651	@row2col{ <tt>grave-accent</tt> , ` }
	652	@row2col{ <tt>left-brace</tt> , @leftCurly }
	653	@row2col{ <tt>left-curly-bracket</tt> , @leftCurly }
	654	@row2col{ <tt>vertical-line</tt> , \| }
	655	@row2col{ <tt>right-brace</tt> , @rightCurly }
	656	@row2col{ <tt>right-curly-bracket</tt> , @rightCurly }
	657	@row2col{ <tt>tilde</tt> , ~ }
	658	@row2col{ <tt>DEL</tt> , @\177 }
	659	@endTable
	660	</td>
	661
	662	</tr></table></center>
	663
	664	*/
	665