git.saurik.com Git - wxWidgets.git/blame_incremental

... / ...

Commit	Line	Data
	1	% manual page source format generated by PolyglotMan v3.0.9,
	2	% available from http://polyglotman.sourceforge.net/
	3
	4	\section{Syntax of the builtin regular expression library}\label{wxresyn}
	5
	6	A {\it regular expression} describes strings of characters. It's a
	7	pattern that matches certain strings and doesn't match others.
	8
	9	\wxheading{See also}
	10
	11	\helpref{wxRegEx}{wxregex}
	12
	13	\subsection{Different Flavors of REs}\label{differentflavors}
	14
	15	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	16
	17	Regular expressions (``RE''s), as defined by POSIX, come in two
	18	flavors: {\it extended} REs (``EREs'') and {\it basic} REs (``BREs''). EREs are roughly those
	19	of the traditional {\it egrep}, while BREs are roughly those of the traditional
	20	{\it ed}. This implementation adds a third flavor, {\it advanced} REs (``AREs''), basically
	21	EREs with some significant extensions.
	22
	23	This manual page primarily describes
	24	AREs. BREs mostly exist for backward compatibility in some old programs;
	25	they will be discussed at the \helpref{end}{wxresynbre}. POSIX EREs are almost an exact subset
	26	of AREs. Features of AREs that are not present in EREs will be indicated.
	27
	28	\subsection{Regular Expression Syntax}\label{resyntax}
	29
	30	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	31
	32	These regular expressions are implemented using
	33	the package written by Henry Spencer, based on the 1003.2 spec and some
	34	(not quite all) of the Perl5 extensions (thanks, Henry!). Much of the description
	35	of regular expressions below is copied verbatim from his manual entry.
	36
	37	An ARE is one or more {\it branches}, separated by `{\bf $\|$}', matching anything that matches
	38	any of the branches.
	39
	40	A branch is zero or more {\it constraints} or {\it quantified
	41	atoms}, concatenated. It matches a match for the first, followed by a match
	42	for the second, etc; an empty branch matches the empty string.
	43
	44	A quantified atom is an {\it atom} possibly followed by a single {\it quantifier}. Without a quantifier,
	45	it matches a match for the atom. The quantifiers, and what a so-quantified
	46	atom matches, are:
	47
	48	\begin{twocollist}\twocolwidtha{4cm}
	49	\twocolitem{{\bf *}}{a sequence of 0 or more matches of the atom}
	50	\twocolitem{{\bf +}}{a sequence of 1 or more matches of the atom}
	51	\twocolitem{{\bf ?}}{a sequence of 0 or 1 matches of the atom}
	52	\twocolitem{{\bf \{m\}}}{a sequence of exactly {\it m} matches of the atom}
	53	\twocolitem{{\bf \{m,\}}}{a sequence of {\it m} or more matches of the atom}
	54	\twocolitem{{\bf \{m,n\}}}{a sequence of {\it m} through {\it n} (inclusive)
	55	matches of the atom; {\it m} may not exceed {\it n}}
	56	\twocolitem{{\bf *? +? ?? \{m\}? \{m,\}? \{m,n\}?}}{{\it non-greedy} quantifiers,
	57	which match the same possibilities, but prefer the
	58	smallest number rather than the largest number of matches (see \helpref{Matching}{wxresynmatching})}
	59	\end{twocollist}
	60
	61	The forms using {\bf \{} and {\bf \}} are known as {\it bound}s. The numbers {\it m} and {\it n} are unsigned
	62	decimal integers with permissible values from 0 to 255 inclusive.
	63	An atom is one of:
	64
	65	\begin{twocollist}\twocolwidtha{4cm}
	66	\twocolitem{{\bf (re)}}{(where {\it re} is any regular expression) matches a match for
	67	{\it re}, with the match noted for possible reporting}
	68	\twocolitem{{\bf (?:re)}}{as previous, but
	69	does no reporting (a ``non-capturing'' set of parentheses)}
	70	\twocolitem{{\bf ()}}{matches an empty
	71	string, noted for possible reporting}
	72	\twocolitem{{\bf (?:)}}{matches an empty string, without reporting}
	73	\twocolitem{{\bf $[chars]$}}{a {\it bracket expression}, matching any one of the {\it chars}
	74	(see \helpref{Bracket Expressions}{wxresynbracket} for more detail)}
	75	\twocolitem{{\bf .}}{matches any single character }
	76	\twocolitem{{\bf $\backslash$k}}{(where {\it k} is a non-alphanumeric character)
	77	matches that character taken as an ordinary character, e.g. $\backslash\backslash$ matches a backslash
	78	character}
	79	\twocolitem{{\bf $\backslash$c}}{where {\it c} is alphanumeric (possibly followed by other characters),
	80	an {\it escape} (AREs only), see \helpref{Escapes}{wxresynescapes} below}
	81	\twocolitem{{\bf \{}}{when followed by a character
	82	other than a digit, matches the left-brace character `{\bf \{}'; when followed by
	83	a digit, it is the beginning of a {\it bound} (see above)}
	84	\twocolitem{{\bf x}}{where {\it x} is a single
	85	character with no other significance, matches that character.}
	86	\end{twocollist}
	87
	88	A {\it constraint} matches an empty string when specific conditions are met. A constraint may
	89	not be followed by a quantifier. The simple constraints are as follows;
	90	some more constraints are described later, under \helpref{Escapes}{wxresynescapes}.
	91
	92	\begin{twocollist}\twocolwidtha{4cm}
	93	\twocolitem{{\bf \caret}}{matches at the beginning of a line}
	94	\twocolitem{{\bf \$}}{matches at the end of a line}
	95	\twocolitem{{\bf (?=re)}}{{\it positive lookahead}
	96	(AREs only), matches at any point where a substring matching {\it re} begins}
	97	\twocolitem{{\bf (?!re)}}{{\it negative lookahead} (AREs only),
	98	matches at any point where no substring matching {\it re} begins}
	99	\end{twocollist}
	100
	101	The lookahead constraints may not contain back references
	102	(see later), and all parentheses within them are considered non-capturing.
	103
	104	An RE may not end with `{\bf $\backslash$}'.
	105
	106	\subsection{Bracket Expressions}\label{wxresynbracket}
	107
	108	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	109
	110	A {\it bracket expression} is a list
	111	of characters enclosed in `{\bf $[]$}'. It normally matches any single character from
	112	the list (but see below). If the list begins with `{\bf \caret}', it matches any single
	113	character (but see below) {\it not} from the rest of the list.
	114
	115	If two characters
	116	in the list are separated by `{\bf -}', this is shorthand for the full {\it range} of
	117	characters between those two (inclusive) in the collating sequence, e.g.
	118	{\bf $[0-9]$} in ASCII matches any decimal digit. Two ranges may not share an endpoint,
	119	so e.g. {\bf a-c-e} is illegal. Ranges are very collating-sequence-dependent, and portable
	120	programs should avoid relying on them.
	121
	122	To include a literal {\bf $]$} or {\bf -} in the
	123	list, the simplest method is to enclose it in {\bf $[.$} and {\bf $.]$} to make it a collating
	124	element (see below). Alternatively, make it the first character (following
	125	a possible `{\bf \caret}'), or (AREs only) precede it with `{\bf $\backslash$}'.
	126	Alternatively, for `{\bf -}', make
	127	it the last character, or the second endpoint of a range. To use a literal
	128	{\bf -} as the first endpoint of a range, make it a collating element or (AREs
	129	only) precede it with `{\bf $\backslash$}'. With the exception of these, some combinations using
	130	{\bf $[$} (see next paragraphs), and escapes, all other special characters lose
	131	their special significance within a bracket expression.
	132
	133	Within a bracket
	134	expression, a collating element (a character, a multi-character sequence
	135	that collates as if it were a single character, or a collating-sequence
	136	name for either) enclosed in {\bf $[.$} and {\bf $.]$} stands for the
	137	sequence of characters of that collating element.
	138
	139	{\it wxWidgets}: Currently no multi-character collating elements are defined.
	140	So in {\bf $[.X.]$}, {\it X} can either be a single character literal or
	141	the name of a character. For example, the following two expressions are identical:
	142	{\bf $[[.0.]-[.9.]]$} and {\bf $[[.zero.]-[.nine.]]$}; both mean the same as
	143	{\bf $[0-9]$}.
	144	See \helpref{Character Names}{wxresynchars}.
	145
	146	%The sequence is a single element of the bracket
	147	%expression's list. A bracket expression in a locale that has multi-character
	148	%collating elements can thus match more than one character. So (insidiously),
	149	%a bracket expression that starts with {\bf \caret} can match multi-character collating
	150	%elements even if none of them appear in the bracket expression! ({\it Note:}
	151	%Tcl currently has no multi-character collating elements. This information
	152	%is only for illustration.)
	153	%
	154	%For example, assume the collating sequence includes
	155	%a {\bf ch} multi-character collating element. Then the RE {\bf $[[.ch.]]*c$} (zero or more
	156	% {\bf ch}'s followed by {\bf c}) matches the first five characters of `{\bf chchcc}'. Also, the
	157	%RE {\bf $[^c]b$} matches all of `{\bf chb}' (because {\bf $[^c]$} matches the multi-character {\bf ch}).
	158
	159	Within a bracket expression, a collating element enclosed in {\bf $[=$} and {\bf $=]$}
	160	is an equivalence class, standing for the sequences of characters of all
	161	collating elements equivalent to that one, including itself.
	162	%(If there are
	163	%no other equivalent collating elements, the treatment is as if the enclosing
	164	%delimiters were `{\bf $[.$}' and `{\bf $.]$}'.) For example, if {\bf o}
	165	%and {\bf \caret} are the members of an
	166	%equivalence class, then `{\bf $[[$=o=$]]$}', `{\bf $[[$=\caret=$]]$}',
	167	%and `{\bf $[o^]$}' are all synonymous.
	168	An equivalence class may not be an endpoint of a range.
	169
	170	%({\it Note:} Tcl currently
	171	%implements only the Unicode locale. It doesn't define any equivalence classes.
	172	%The examples above are just illustrations.)
	173
	174	{\it wxWidgets}: Currently no equivalence classes are defined, so
	175	{\bf $[=X=]$} stands for just the single character {\it X}.
	176	{\it X} can either be a single character literal or the name of a character,
	177	see \helpref{Character Names}{wxresynchars}.
	178
	179	Within a bracket expression,
	180	the name of a {\it character class} enclosed in {\bf $[:$} and {\bf $:]$} stands for the list
	181	of all characters (not all collating elements!) belonging to that class.
	182	Standard character classes are:
	183
	184	\begin{twocollist}\twocolwidtha{3cm}
	185	\twocolitem{{\bf alpha}}{A letter.}
	186	\twocolitem{{\bf upper}}{An upper-case letter.}
	187	\twocolitem{{\bf lower}}{A lower-case letter.}
	188	\twocolitem{{\bf digit}}{A decimal digit.}
	189	\twocolitem{{\bf xdigit}}{A hexadecimal digit.}
	190	\twocolitem{{\bf alnum}}{An alphanumeric (letter or digit).}
	191	\twocolitem{{\bf print}}{An alphanumeric (same as alnum).}
	192	\twocolitem{{\bf blank}}{A space or tab character.}
	193	\twocolitem{{\bf space}}{A character producing white space in displayed text.}
	194	\twocolitem{{\bf punct}}{A punctuation character.}
	195	\twocolitem{{\bf graph}}{A character with a visible representation.}
	196	\twocolitem{{\bf cntrl}}{A control character.}
	197	\end{twocollist}
	198
	199	%A locale may provide others. (Note that the current Tcl
	200	%implementation has only one locale: the Unicode locale.)
	201	A character class may not be used as an endpoint of a range.
	202
	203	{\it wxWidgets}: In a non-Unicode build, these character classifications depend on the
	204	current locale, and correspond to the values returned by the ANSI C 'is'
	205	functions: isalpha, isupper, etc. In Unicode mode they are based on
	206	Unicode classifications, and are not affected by the current locale.
	207
	208	There are two special cases of bracket expressions:
	209	the bracket expressions {\bf $[[:$<$:]]$} and {\bf $[[:$>$:]]$} are constraints, matching empty
	210	strings at the beginning and end of a word respectively. A word is defined
	211	as a sequence of word characters that is neither preceded nor followed
	212	by word characters. A word character is an {\it alnum} character or an underscore
	213	({\bf \_}). These special bracket expressions are deprecated; users of AREs should
	214	use constraint escapes instead (see \helpref{Escapes}{wxresynescapes} below).
	215
	216	\subsection{Escapes}\label{wxresynescapes}
	217
	218	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	219
	220	Escapes (AREs only),
	221	which begin with a {\bf $\backslash$} followed by an alphanumeric character, come in several
	222	varieties: character entry, class shorthands, constraint escapes, and back
	223	references. A {\bf $\backslash$} followed by an alphanumeric character but not constituting
	224	a valid escape is illegal in AREs. In EREs, there are no escapes: outside
	225	a bracket expression, a {\bf $\backslash$} followed by an alphanumeric character merely stands
	226	for that character as an ordinary character, and inside a bracket expression,
	227	{\bf $\backslash$} is an ordinary character. (The latter is the one actual incompatibility
	228	between EREs and AREs.)
	229
	230	Character-entry escapes (AREs only) exist to make
	231	it easier to specify non-printing and otherwise inconvenient characters
	232	in REs:
	233
	234	\begin{twocollist}\twocolwidtha{4cm}
	235	\twocolitem{{\bf $\backslash$a}}{alert (bell) character, as in C}
	236	\twocolitem{{\bf $\backslash$b}}{backspace, as in C}
	237	\twocolitem{{\bf $\backslash$B}}{synonym
	238	for {\bf $\backslash$} to help reduce backslash doubling in some applications where there
	239	are multiple levels of backslash processing}
	240	\twocolitem{{\bf $\backslash$c{\it X}}}{(where {\it X} is any character)
	241	the character whose low-order 5 bits are the same as those of {\it X}, and whose
	242	other bits are all zero}
	243	\twocolitem{{\bf $\backslash$e}}{the character whose collating-sequence name is
	244	`{\bf ESC}', or failing that, the character with octal value 033}
	245	\twocolitem{{\bf $\backslash$f}}{formfeed, as in C}
	246	\twocolitem{{\bf $\backslash$n}}{newline, as in C}
	247	\twocolitem{{\bf $\backslash$r}}{carriage return, as in C}
	248	\twocolitem{{\bf $\backslash$t}}{horizontal tab, as in C}
	249	\twocolitem{{\bf $\backslash$u{\it wxyz}}}{(where {\it wxyz} is exactly four hexadecimal digits)
	250	the Unicode
	251	character {\bf U+{\it wxyz}} in the local byte ordering}
	252	\twocolitem{{\bf $\backslash$U{\it stuvwxyz}}}{(where {\it stuvwxyz} is
	253	exactly eight hexadecimal digits) reserved for a somewhat-hypothetical Unicode
	254	extension to 32 bits}
	255	\twocolitem{{\bf $\backslash$v}}{vertical tab, as in C}
	256	\twocolitem{{\bf $\backslash$x{\it hhh}}}{(where
	257	{\it hhh} is any sequence of hexadecimal digits) the character whose hexadecimal
	258	value is {\bf 0x{\it hhh}} (a single character no matter how many hexadecimal digits
	259	are used).}
	260	\twocolitem{{\bf $\backslash$0}}{the character whose value is {\bf 0}}
	261	\twocolitem{{\bf $\backslash${\it xy}}}{(where {\it xy} is exactly two
	262	octal digits, and is not a {\it back reference} (see below)) the character whose
	263	octal value is {\bf 0{\it xy}}}
	264	\twocolitem{{\bf $\backslash${\it xyz}}}{(where {\it xyz} is exactly three octal digits, and is
	265	not a back reference (see below))
	266	the character whose octal value is {\bf 0{\it xyz}}}
	267	\end{twocollist}
	268
	269	Hexadecimal digits are `{\bf 0}'-`{\bf 9}', `{\bf a}'-`{\bf f}', and `{\bf A}'-`{\bf F}'. Octal
	270	digits are `{\bf 0}'-`{\bf 7}'.
	271
	272	The character-entry
	273	escapes are always taken as ordinary characters. For example, {\bf $\backslash$135} is {\bf ]} in
	274	ASCII, but {\bf $\backslash$135} does not terminate a bracket expression. Beware, however,
	275	that some applications (e.g., C compilers) interpret such sequences themselves
	276	before the regular-expression package gets to see them, which may require
	277	doubling (quadrupling, etc.) the `{\bf $\backslash$}'.
	278
	279	Class-shorthand escapes (AREs only) provide
	280	shorthands for certain commonly-used character classes:
	281
	282	\begin{twocollist}\twocolwidtha{4cm}
	283	\twocolitem{{\bf $\backslash$d}}{{\bf $[[:digit:]]$}}
	284	\twocolitem{{\bf $\backslash$s}}{{\bf $[[:space:]]$}}
	285	\twocolitem{{\bf $\backslash$w}}{{\bf $[[:alnum:]\_]$} (note underscore)}
	286	\twocolitem{{\bf $\backslash$D}}{{\bf $[$\caret$[:digit:]]$}}
	287	\twocolitem{{\bf $\backslash$S}}{{\bf $[$\caret$[:space:]]$}}
	288	\twocolitem{{\bf $\backslash$W}}{{\bf $[$\caret$[:alnum:]\_]$} (note underscore)}
	289	\end{twocollist}
	290
	291	Within bracket expressions, `{\bf $\backslash$d}', `{\bf $\backslash$s}', and
	292	`{\bf $\backslash$w}' lose their outer brackets, and `{\bf $\backslash$D}',
	293	`{\bf $\backslash$S}', and `{\bf $\backslash$W}' are illegal. (So, for example,
	294	{\bf $[$a-c$\backslash$d$]$} is equivalent to {\bf $[a-c[:digit:]]$}.
	295	Also, {\bf $[$a-c$\backslash$D$]$}, which is equivalent to
	296	{\bf $[a-c$\caret$[:digit:]]$}, is illegal.)
	297
	298	A constraint escape (AREs only) is a constraint,
	299	matching the empty string if specific conditions are met, written as an
	300	escape:
	301
	302	\begin{twocollist}\twocolwidtha{4cm}
	303	\twocolitem{{\bf $\backslash$A}}{matches only at the beginning of the string
	304	(see \helpref{Matching}{wxresynmatching}, below,
	305	for how this differs from `{\bf \caret}')}
	306	\twocolitem{{\bf $\backslash$m}}{matches only at the beginning of a word}
	307	\twocolitem{{\bf $\backslash$M}}{matches only at the end of a word}
	308	\twocolitem{{\bf $\backslash$y}}{matches only at the beginning or end of a word}
	309	\twocolitem{{\bf $\backslash$Y}}{matches only at a point that is not the beginning or end of
	310	a word}
	311	\twocolitem{{\bf $\backslash$Z}}{matches only at the end of the string
	312	(see \helpref{Matching}{wxresynmatching}, below, for
	313	how this differs from `{\bf \$}')}
	314	\twocolitem{{\bf $\backslash${\it m}}}{(where {\it m} is a nonzero digit) a {\it back reference},
	315	see below}
	316	\twocolitem{{\bf $\backslash${\it mnn}}}{(where {\it m} is a nonzero digit, and {\it nn} is some more digits,
	317	and the decimal value {\it mnn} is not greater than the number of closing capturing
	318	parentheses seen so far) a {\it back reference}, see below}
	319	\end{twocollist}
	320
	321	A word is defined
	322	as in the specification of {\bf $[[:$<$:]]$} and {\bf $[[:$>$:]]$} above. Constraint escapes are
	323	illegal within bracket expressions.
	324
	325	A back reference (AREs only) matches
	326	the same string matched by the parenthesized subexpression specified by
	327	the number, so that (e.g.) {\bf ($[bc]$)$\backslash$1} matches {\bf bb} or {\bf cc} but not `{\bf bc}'.
	328	The subexpression
	329	must entirely precede the back reference in the RE. Subexpressions are numbered
	330	in the order of their leading parentheses. Non-capturing parentheses do not
	331	define subexpressions.
	332
	333	There is an inherent historical ambiguity between
	334	octal character-entry escapes and back references, which is resolved by
	335	heuristics, as hinted at above. A leading zero always indicates an octal
	336	escape. A single non-zero digit, not followed by another digit, is always
	337	taken as a back reference. A multi-digit sequence not starting with a zero
	338	is taken as a back reference if it comes after a suitable subexpression
	339	(i.e. the number is in the legal range for a back reference), and otherwise
	340	is taken as octal.
	341
	342	\subsection{Metasyntax}\label{remetasyntax}
	343
	344	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	345
	346	In addition to the main syntax described above,
	347	there are some special forms and miscellaneous syntactic facilities available.
	348
	349	Normally the flavor of RE being used is specified by application-dependent
	350	means. However, this can be overridden by a {\it director}. If an RE of any flavor
	351	begins with `{\bf ***:}', the rest of the RE is an ARE. If an RE of any flavor begins
	352	with `{\bf ***=}', the rest of the RE is taken to be a literal string, with all
	353	characters considered ordinary characters.
	354
	355	An ARE may begin with {\it embedded options}: a sequence {\bf (?xyz)}
	356	(where {\it xyz} is one or more alphabetic characters)
	357	specifies options affecting the rest of the RE. These supplement, and can
	358	override, any options specified by the application. The available option
	359	letters are:
	360
	361	\begin{twocollist}\twocolwidtha{4cm}
	362	\twocolitem{{\bf b}}{rest of RE is a BRE}
	363	\twocolitem{{\bf c}}{case-sensitive matching (usual default)}
	364	\twocolitem{{\bf e}}{rest of RE is an ERE}
	365	\twocolitem{{\bf i}}{case-insensitive matching (see \helpref{Matching}{wxresynmatching}, below)}
	366	\twocolitem{{\bf m}}{historical synonym for {\bf n}}
	367	\twocolitem{{\bf n}}{newline-sensitive matching (see \helpref{Matching}{wxresynmatching}, below)}
	368	\twocolitem{{\bf p}}{partial newline-sensitive matching (see \helpref{Matching}{wxresynmatching}, below)}
	369	\twocolitem{{\bf q}}{rest of RE
	370	is a literal (``quoted'') string, all ordinary characters}
	371	\twocolitem{{\bf s}}{non-newline-sensitive matching (usual default)}
	372	\twocolitem{{\bf t}}{tight syntax (usual default; see below)}
	373	\twocolitem{{\bf w}}{inverse
	374	partial newline-sensitive (``weird'') matching (see \helpref{Matching}{wxresynmatching}, below)}
	375	\twocolitem{{\bf x}}{expanded syntax (see below)}
	376	\end{twocollist}
	377
	378	Embedded options take effect at the {\bf )} terminating the
	379	sequence. They are available only at the start of an ARE, and may not be
	380	used later within it.
	381
	382	In addition to the usual ({\it tight}) RE syntax, in which
	383	all characters are significant, there is an {\it expanded} syntax, available
	384	%in all flavors of RE with the {\bf -expanded} switch, or
	385	in AREs with the embedded
	386	x option. In the expanded syntax, white-space characters are ignored and
	387	all characters between a {\bf \#} and the following newline (or the end of the
	388	RE) are ignored, permitting paragraphing and commenting a complex RE. There
	389	are three exceptions to that basic rule:
	390	{\itemize
	391	\item%
	392	a white-space character or `{\bf \#}' preceded
	393	by `{\bf $\backslash$}' is retained
	394	\item%
	395	white space or `{\bf \#}' within a bracket expression is retained
	396	\item%
	397	white space and comments are illegal within multi-character symbols like
	398	the ARE `{\bf (?:}' or the BRE `{\bf $\backslash$(}'
	399	}
	400	Expanded-syntax white-space characters are blank,
	401	tab, newline, and any character that belongs to the {\it space} character class.
	402
	403	Finally, in an ARE, outside bracket expressions, the sequence `{\bf (?\#ttt)}' (where
	404	{\it ttt} is any text not containing a `{\bf )}') is a comment, completely ignored. Again,
	405	this is not allowed between the characters of multi-character symbols like
	406	`{\bf (?:}'. Such comments are more a historical artifact than a useful facility,
	407	and their use is deprecated; use the expanded syntax instead.
	408
	409	{\it None} of these
	410	metasyntax extensions is available if the application (or an initial {\bf ***=}
	411	director) has specified that the user's input be treated as a literal string
	412	rather than as an RE.
	413
	414	\subsection{Matching}\label{wxresynmatching}
	415
	416	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	417
	418	In the event that an RE could match more than
	419	one substring of a given string, the RE matches the one starting earliest
	420	in the string. If the RE could match more than one substring starting at
	421	that point, its choice is determined by its {\it preference}: either the longest
	422	substring, or the shortest.
	423
	424	Most atoms, and all constraints, have no preference.
	425	A parenthesized RE has the same preference (possibly none) as the RE. A
	426	quantified atom with quantifier {\bf \{m\}} or {\bf \{m\}?} has the same preference (possibly
	427	none) as the atom itself. A quantified atom with other normal quantifiers
	428	(including {\bf \{m,n\}} with {\it m} equal to {\it n}) prefers longest match. A quantified
	429	atom with other non-greedy quantifiers (including {\bf \{m,n\}?} with {\it m} equal to
	430	{\it n}) prefers shortest match. A branch has the same preference as the first
	431	quantified atom in it which has a preference. An RE consisting of two or
	432	more branches connected by the {\bf $\|$} operator prefers longest match.
	433
	434	Subject to the constraints imposed by the rules for matching the whole RE, subexpressions
	435	also match the longest or shortest possible substrings, based on their
	436	preferences, with subexpressions starting earlier in the RE taking priority
	437	over ones starting later. Note that outer subexpressions thus take priority
	438	over their component subexpressions.
	439
	440	Note that the quantifiers {\bf \{1,1\}} and
	441	{\bf \{1,1\}?} can be used to force longest and shortest preference, respectively,
	442	on a subexpression or a whole RE.
	443
	444	Match lengths are measured in characters,
	445	not collating elements. An empty string is considered longer than no match
	446	at all. For example, {\bf bb*} matches the three middle characters
	447	of `{\bf abbbc}', {\bf (week$\|$wee)(night$\|$knights)}
	448	matches all ten characters of `{\bf weeknights}', when {\bf (.*).*} is matched against
	449	{\bf abc} the parenthesized subexpression matches all three characters, and when
	450	{\bf (a*)*} is matched against {\bf bc} both the whole RE and the parenthesized subexpression
	451	match an empty string.
	452
	453	If case-independent matching is specified, the effect
	454	is much as if all case distinctions had vanished from the alphabet. When
	455	an alphabetic that exists in multiple cases appears as an ordinary character
	456	outside a bracket expression, it is effectively transformed into a bracket
	457	expression containing both cases, so that {\bf x} becomes `{\bf $[xX]$}'. When it appears
	458	inside a bracket expression, all case counterparts of it are added to the
	459	bracket expression, so that {\bf $[x]$} becomes {\bf $[xX]$} and {\bf $[$\caret$x]$} becomes `{\bf $[$\caret$xX]$}'.
	460
	461	If newline-sensitive
	462	matching is specified, {\bf .} and bracket expressions using {\bf \caret} will never match
	463	the newline character (so that matches will never cross newlines unless
	464	the RE explicitly arranges it) and {\bf \caret} and {\bf \$} will match the empty string after
	465	and before a newline respectively, in addition to matching at beginning
	466	and end of string respectively. ARE {\bf $\backslash$A} and {\bf $\backslash$Z} continue to match beginning
	467	or end of string {\it only}.
	468
	469	If partial newline-sensitive matching is specified,
	470	this affects {\bf .} and bracket expressions as with newline-sensitive matching,
	471	but not {\bf \caret} and `{\bf \$}'.
	472
	473	If inverse partial newline-sensitive matching is specified,
	474	this affects {\bf \caret} and {\bf \$} as with newline-sensitive matching, but not {\bf .} and bracket
	475	expressions. This isn't very useful but is provided for symmetry.
	476
	477	\subsection{Limits And Compatibility}\label{relimits}
	478
	479	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	480
	481	No particular limit is imposed on the length of REs. Programs
	482	intended to be highly portable should not employ REs longer than 256 bytes,
	483	as a POSIX-compliant implementation can refuse to accept such REs.
	484
	485	The only
	486	feature of AREs that is actually incompatible with POSIX EREs is that {\bf $\backslash$}
	487	does not lose its special significance inside bracket expressions. All other
	488	ARE features use syntax which is illegal or has undefined or unspecified
	489	effects in POSIX EREs; the {\bf ***} syntax of directors likewise is outside
	490	the POSIX syntax for both BREs and EREs.
	491
	492	Many of the ARE extensions are
	493	borrowed from Perl, but some have been changed to clean them up, and a
	494	few Perl extensions are not present. Incompatibilities of note include `{\bf $\backslash$b}',
	495	`{\bf $\backslash$B}', the lack of special treatment for a trailing newline, the addition of
	496	complemented bracket expressions to the things affected by newline-sensitive
	497	matching, the restrictions on parentheses and back references in lookahead
	498	constraints, and the longest/shortest-match (rather than first-match) matching
	499	semantics.
	500
	501	The matching rules for REs containing both normal and non-greedy
	502	quantifiers have changed since early beta-test versions of this package.
	503	(The new rules are much simpler and cleaner, but don't work as hard at guessing
	504	the user's real intentions.)
	505
	506	Henry Spencer's original 1986 {\it regexp} package, still in widespread use,
	507	%(e.g., in pre-8.1 releases of Tcl),
	508	implemented an early version of today's EREs. There are four incompatibilities between {\it regexp}'s
	509	near-EREs (`RREs' for short) and AREs. In roughly increasing order of significance:
	510	{\itemize
	511	\item In AREs, {\bf $\backslash$} followed by an alphanumeric character is either an escape or
	512	an error, while in RREs, it was just another way of writing the alphanumeric.
	513	This should not be a problem because there was no reason to write such
	514	a sequence in RREs.
	515
	516	\item {\bf \{} followed by a digit in an ARE is the beginning of
	517	a bound, while in RREs, {\bf \{} was always an ordinary character. Such sequences
	518	should be rare, and will often result in an error because following characters
	519	will not look like a valid bound.
	520
	521	\item In AREs, {\bf $\backslash$} remains a special character
	522	within `{\bf $[]$}', so a literal {\bf $\backslash$} within {\bf $[]$} must be
	523	written `{\bf $\backslash\backslash$}'. {\bf $\backslash\backslash$} also gives a literal
	524	{\bf $\backslash$} within {\bf $[]$} in RREs, but only truly paranoid programmers routinely doubled
	525	the backslash.
	526
	527	\item AREs report the longest/shortest match for the RE, rather
	528	than the first found in a specified search order. This may affect some RREs
	529	which were written in the expectation that the first match would be reported.
	530	(The careful crafting of RREs to optimize the search order for fast matching
	531	is obsolete (AREs examine all possible matches in parallel, and their performance
	532	is largely insensitive to their complexity) but cases where the search
	533	order was exploited to deliberately find a match which was {\it not} the longest/shortest
	534	will need rewriting.)
	535	}
	536
	537	\subsection{Basic Regular Expressions}\label{wxresynbre}
	538
	539	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	540
	541	BREs differ from EREs in
	542	several respects. `{\bf $\|$}', `{\bf +}', and {\bf ?} are ordinary characters and there is no equivalent
	543	for their functionality. The delimiters for bounds
	544	are {\bf $\backslash$\{} and `{\bf $\backslash$\}}', with {\bf \{} and
	545	{\bf \}} by themselves ordinary characters. The parentheses for nested subexpressions
	546	are {\bf $\backslash$(} and `{\bf $\backslash$)}', with {\bf (} and {\bf )} by themselves
	547	ordinary characters. {\bf \caret} is an ordinary
	548	character except at the beginning of the RE or the beginning of a parenthesized
	549	subexpression, {\bf \$} is an ordinary character except at the end of the RE or
	550	the end of a parenthesized subexpression, and {\bf *} is an ordinary character
	551	if it appears at the beginning of the RE or the beginning of a parenthesized
	552	subexpression (after a possible leading `{\bf \caret}'). Finally, single-digit back references
	553	are available, and {\bf $\backslash<$} and {\bf $\backslash>$} are synonyms
	554	for {\bf $[[:<:]]$} and {\bf $[[:>:]]$} respectively;
	555	no other escapes are available.
	556
	557	\subsection{Regular Expression Character Names}\label{wxresynchars}
	558
	559	\helpref{Syntax of the builtin regular expression library}{wxresyn}
	560
	561	Note that the character names are case sensitive.
	562
	563	\begin{twocollist}
	564	\twocolitem{NUL}{'$\backslash$0'}
	565	\twocolitem{SOH}{'$\backslash$001'}
	566	\twocolitem{STX}{'$\backslash$002'}
	567	\twocolitem{ETX}{'$\backslash$003'}
	568	\twocolitem{EOT}{'$\backslash$004'}
	569	\twocolitem{ENQ}{'$\backslash$005'}
	570	\twocolitem{ACK}{'$\backslash$006'}
	571	\twocolitem{BEL}{'$\backslash$007'}
	572	\twocolitem{alert}{'$\backslash$007'}
	573	\twocolitem{BS}{'$\backslash$010'}
	574	\twocolitem{backspace}{'$\backslash$b'}
	575	\twocolitem{HT}{'$\backslash$011'}
	576	\twocolitem{tab}{'$\backslash$t'}
	577	\twocolitem{LF}{'$\backslash$012'}
	578	\twocolitem{newline}{'$\backslash$n'}
	579	\twocolitem{VT}{'$\backslash$013'}
	580	\twocolitem{vertical-tab}{'$\backslash$v'}
	581	\twocolitem{FF}{'$\backslash$014'}
	582	\twocolitem{form-feed}{'$\backslash$f'}
	583	\twocolitem{CR}{'$\backslash$015'}
	584	\twocolitem{carriage-return}{'$\backslash$r'}
	585	\twocolitem{SO}{'$\backslash$016'}
	586	\twocolitem{SI}{'$\backslash$017'}
	587	\twocolitem{DLE}{'$\backslash$020'}
	588	\twocolitem{DC1}{'$\backslash$021'}
	589	\twocolitem{DC2}{'$\backslash$022'}
	590	\twocolitem{DC3}{'$\backslash$023'}
	591	\twocolitem{DC4}{'$\backslash$024'}
	592	\twocolitem{NAK}{'$\backslash$025'}
	593	\twocolitem{SYN}{'$\backslash$026'}
	594	\twocolitem{ETB}{'$\backslash$027'}
	595	\twocolitem{CAN}{'$\backslash$030'}
	596	\twocolitem{EM}{'$\backslash$031'}
	597	\twocolitem{SUB}{'$\backslash$032'}
	598	\twocolitem{ESC}{'$\backslash$033'}
	599	\twocolitem{IS4}{'$\backslash$034'}
	600	\twocolitem{FS}{'$\backslash$034'}
	601	\twocolitem{IS3}{'$\backslash$035'}
	602	\twocolitem{GS}{'$\backslash$035'}
	603	\twocolitem{IS2}{'$\backslash$036'}
	604	\twocolitem{RS}{'$\backslash$036'}
	605	\twocolitem{IS1}{'$\backslash$037'}
	606	\twocolitem{US}{'$\backslash$037'}
	607	\twocolitem{space}{' '}
	608	\twocolitem{exclamation-mark}{'!'}
	609	\twocolitem{quotation-mark}{'"'}
	610	\twocolitem{number-sign}{'\#'}
	611	\twocolitem{dollar-sign}{'\$'}
	612	\twocolitem{percent-sign}{'\%'}
	613	\twocolitem{ampersand}{'\&'}
	614	\twocolitem{apostrophe}{'$\backslash$''}
	615	\twocolitem{left-parenthesis}{'('}
	616	\twocolitem{right-parenthesis}{')'}
	617	\twocolitem{asterisk}{'*'}
	618	\twocolitem{plus-sign}{'+'}
	619	\twocolitem{comma}{','}
	620	\twocolitem{hyphen}{'-'}
	621	\twocolitem{hyphen-minus}{'-'}
	622	\twocolitem{period}{'.'}
	623	\twocolitem{full-stop}{'.'}
	624	\twocolitem{slash}{'/'}
	625	\twocolitem{solidus}{'/'}
	626	\twocolitem{zero}{'0'}
	627	\twocolitem{one}{'1'}
	628	\twocolitem{two}{'2'}
	629	\twocolitem{three}{'3'}
	630	\twocolitem{four}{'4'}
	631	\twocolitem{five}{'5'}
	632	\twocolitem{six}{'6'}
	633	\twocolitem{seven}{'7'}
	634	\twocolitem{eight}{'8'}
	635	\twocolitem{nine}{'9'}
	636	\twocolitem{colon}{':'}
	637	\twocolitem{semicolon}{';'}
	638	\twocolitem{less-than-sign}{'<'}
	639	\twocolitem{equals-sign}{'='}
	640	\twocolitem{greater-than-sign}{'>'}
	641	\twocolitem{question-mark}{'?'}
	642	\twocolitem{commercial-at}{'@'}
	643	\twocolitem{left-square-bracket}{'$[$'}
	644	\twocolitem{backslash}{'$\backslash$'}
	645	\twocolitem{reverse-solidus}{'$\backslash$'}
	646	\twocolitem{right-square-bracket}{'$]$'}
	647	\twocolitem{circumflex}{'\caret'}
	648	\twocolitem{circumflex-accent}{'\caret'}
	649	\twocolitem{underscore}{'\_'}
	650	\twocolitem{low-line}{'\_'}
	651	\twocolitem{grave-accent}{'`'}
	652	\twocolitem{left-brace}{'\{'}
	653	\twocolitem{left-curly-bracket}{'\{'}
	654	\twocolitem{vertical-line}{'$\|$'}
	655	\twocolitem{right-brace}{'\}'}
	656	\twocolitem{right-curly-bracket}{'\}'}
	657	\twocolitem{tilde}{'\destruct{}'}
	658	\twocolitem{DEL}{'$\backslash$177'}
	659	\end{twocollist}
	660