From 0aa7fa9afdd1499d06b56bcff8adcd4adb6ca3f7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?V=C3=A1clav=20Slav=C3=ADk?= Date: Thu, 19 Feb 2004 18:02:48 +0000 Subject: [PATCH] applied patch 890642: wxRE_ADVANCED flag and docs git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@25870 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- autoconf_inc.m4 | 3 - docs/latex/wx/re_syntax.tex | 676 ++++++++++++++++++++++++++++++++++++ docs/latex/wx/regex.tex | 38 +- docs/latex/wx/topics.tex | 2 +- setup.h.in | 6 + setup.h_vms | 6 + 6 files changed, 718 insertions(+), 13 deletions(-) create mode 100644 docs/latex/wx/re_syntax.tex diff --git a/autoconf_inc.m4 b/autoconf_inc.m4 index 476509c6fc..1e7be34afa 100644 --- a/autoconf_inc.m4 +++ b/autoconf_inc.m4 @@ -2,9 +2,6 @@ dnl This macro was generated by dnl Bakefile 0.1.3 (http://bakefile.sourceforge.net) dnl Do not modify, all changes will be overwritten! - -dnl Conditions: - dnl ### begin block 0_AC_BAKEFILE_PRECOMP_HEADERS ### AC_BAKEFILE_PRECOMP_HEADERS diff --git a/docs/latex/wx/re_syntax.tex b/docs/latex/wx/re_syntax.tex new file mode 100644 index 0000000000..677dad856f --- /dev/null +++ b/docs/latex/wx/re_syntax.tex @@ -0,0 +1,676 @@ +% manual page source format generated by PolyglotMan v3.0.9, +% available via anonymous ftp from ftp.cs.berkeley.edu:/ucb/people/phelps/tcltk/rman.tar.Z + +\section{Syntax of the builtin regular expression library}\label{wxresyn} + +A {\it regular expression} describes strings of characters. It's a +pattern that matches certain strings and doesn't match others. + +\wxheading{See also} + +\helpref{wxRegEx}{wxregex} + + +\subsection{Different Flavors of REs} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +Regular expressions (``RE''s), as defined by POSIX, come in two +flavors: {\it extended} REs (``EREs'') and {\it basic} REs (``BREs''). EREs are roughly those +of the traditional {\it egrep}, while BREs are roughly those of the traditional + {\it ed}. 
This implementation adds a third flavor, {\it advanced} REs (``AREs''), basically +EREs with some significant extensions. + +This manual page primarily describes +AREs. BREs mostly exist for backward compatibility in some old programs; +they will be discussed at the \helpref{end}{wxresynbre}. POSIX EREs are almost an exact subset +of AREs. Features of AREs that are not present in EREs will be indicated. + + +\subsection{Regular Expression Syntax} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +These regular expressions are implemented using +the package written by Henry Spencer, based on the 1003.2 spec and some +(not quite all) of the Perl5 extensions (thanks, Henry!). Much of the description +of regular expressions below is copied verbatim from his manual entry. + +An +ARE is one or more {\it branches}, separated by `{\bf $|$}', matching anything that matches +any of the branches. + +A branch is zero or more {\it constraints} or {\it quantified +atoms}, concatenated. It matches a match for the first, followed by a match +for the second, etc; an empty branch matches the empty string. + +A quantified +atom is an {\it atom} possibly followed by a single {\it quantifier}. Without a quantifier, +it matches a match for the atom. The quantifiers, and what a so-quantified +atom matches, are: + +\begin{twocollist}\twocolwidtha{4cm} +\twocolitem{{\bf *}}{a sequence of 0 or more matches of the atom} +\twocolitem{{\bf +}}{a sequence of 1 or more matches of the atom} +\twocolitem{{\bf ?}}{a sequence of 0 or 1 matches of the atom} +\twocolitem{{\bf \{m\}}}{a sequence of exactly {\it m} matches of the atom} +\twocolitem{{\bf \{m,\}}}{a sequence of {\it m} or more matches of the atom} +\twocolitem{{\bf \{m,n\}}}{a sequence of {\it m} through {\it n} (inclusive) +matches of the atom; {\it m} may not exceed {\it n}} +\twocolitem{{\bf *? +? ?? \{m\}? \{m,\}? 
\{m,n\}?}}{{\it non-greedy} quantifiers, +which match the same possibilities, but prefer the +smallest number rather than the largest number of matches (see \helpref{Matching}{wxresynmatching})} +\end{twocollist} + +The forms using {\bf \{} and {\bf \}} are known as {\it bound}s. The numbers {\it m} and {\it n} are unsigned +decimal integers with permissible values from 0 to 255 inclusive. +An atom is one of: + +\begin{twocollist}\twocolwidtha{4cm} +\twocolitem{{\bf (re)}}{(where {\it re} is any regular expression) matches a match for + {\it re}, with the match noted for possible reporting} +\twocolitem{{\bf (?:re)}}{as previous, but +does no reporting (a ``non-capturing'' set of parentheses)} +\twocolitem{{\bf ()}}{matches an empty +string, noted for possible reporting} +\twocolitem{{\bf (?:)}}{matches an empty string, without reporting} +\twocolitem{{\bf $[chars]$}}{a {\it bracket expression}, matching any one of the {\it chars} +(see \helpref{Bracket Expressions}{wxresynbracket} for more detail)} +\twocolitem{{\bf .}}{matches any single character } +\twocolitem{{\bf $\backslash$k}}{(where {\it k} is a non-alphanumeric character) +matches that character taken as an ordinary character, e.g. $\backslash\backslash$ matches a backslash +character} +\twocolitem{{\bf $\backslash$c}}{where {\it c} is alphanumeric (possibly followed by other characters), +an {\it escape} (AREs only), see \helpref{Escapes}{wxresynescapes} below} +\twocolitem{{\bf \{}}{when followed by a character +other than a digit, matches the left-brace character `{\bf \{}'; when followed by +a digit, it is the beginning of a {\it bound} (see above)} +\twocolitem{{\bf x}}{where {\it x} is a single +character with no other significance, matches that character.} +\end{twocollist} + +A {\it constraint} +matches an empty string when specific conditions are met. A constraint may +not be followed by a quantifier. 
The simple constraints are as follows; +some more constraints are described later, under \helpref{Escapes}{wxresynescapes}. + +\begin{twocollist}\twocolwidtha{4cm} +\twocolitem{{\bf $^$}}{matches at the beginning of a line} +\twocolitem{{\bf \$}}{matches at the end of a line} +\twocolitem{{\bf (?=re)}}{{\it positive lookahead} +(AREs only), matches at any point where a substring matching {\it re} begins} +\twocolitem{{\bf (?!re)}}{{\it negative lookahead} (AREs only), +matches at any point where no substring matching {\it re} begins} +\end{twocollist} + +The lookahead constraints may not contain back references +(see later), and all parentheses within them are considered non-capturing. + +An RE may not end with `{\bf $\backslash$}'. + + +\subsection{Bracket Expressions}\label{wxresynbracket} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +A {\it bracket expression} is a list +of characters enclosed in `{\bf $[]$}'. It normally matches any single character from +the list (but see below). If the list begins with `{\bf $^$}', it matches any single +character (but see below) {\it not} from the rest of the list. + +If two characters +in the list are separated by `{\bf -}', this is shorthand for the full {\it range} of +characters between those two (inclusive) in the collating sequence, e.g. + {\bf $[0-9]$} in ASCII matches any decimal digit. Two ranges may not share an endpoint, +so e.g. {\bf a-c-e} is illegal. Ranges are very collating-sequence-dependent, and portable +programs should avoid relying on them. + +To include a literal {\bf $]$} or {\bf -} in the +list, the simplest method is to enclose it in {\bf $[.$} and {\bf $.]$} to make it a collating +element (see below). Alternatively, make it the first character (following +a possible `{\bf $^$}'), or (AREs only) precede it with `{\bf $\backslash$}'. +Alternatively, for `{\bf -}', make +it the last character, or the second endpoint of a range. 
To use a literal + {\bf -} as the first endpoint of a range, make it a collating element or (AREs +only) precede it with `{\bf $\backslash$}'. With the exception of these, some combinations using + {\bf $[$} (see next paragraphs), and escapes, all other special characters lose +their special significance within a bracket expression. + +Within a bracket +expression, a collating element (a character, a multi-character sequence +that collates as if it were a single character, or a collating-sequence +name for either) enclosed in {\bf $[.$} and {\bf $.]$} stands for the +sequence of characters of that collating element. + +{\it wxWindows}: Currently no multi-character collating elements are defined. +So in {\bf $[.X.]$}, {\it X} can either be a single character literal or +the name of a character. For example, the following two are identical: + {\bf $[[.0.]-[.9.]]$} and {\bf $[[.zero.]-[.nine.]]$}; both mean the same as + {\bf $[0-9]$}. + See \helpref{Character Names}{wxresynchars}. + +%The sequence is a single element of the bracket +%expression's list. A bracket expression in a locale that has multi-character +%collating elements can thus match more than one character. So (insidiously), +%a bracket expression that starts with {\bf $^$} can match multi-character collating +%elements even if none of them appear in the bracket expression! ({\it Note:} +%Tcl currently has no multi-character collating elements. This information +%is only for illustration.) +% +%For example, assume the collating sequence includes +%a {\bf ch} multi-character collating element. Then the RE {\bf $[[.ch.]]*c$} (zero or more +% {\bf ch}'s followed by {\bf c}) matches the first five characters of `{\bf chchcc}'. Also, the +%RE {\bf $[^c]b$} matches all of `{\bf chb}' (because {\bf $[^c]$} matches the multi-character {\bf ch}). 
+ +Within a bracket expression, a collating element enclosed in {\bf $[=$} and {\bf $=]$} +is an equivalence class, standing for the sequences of characters of all +collating elements equivalent to that one, including itself. +%(If there are +%no other equivalent collating elements, the treatment is as if the enclosing +%delimiters were `{\bf $[.$}' and `{\bf $.]$}'.) For example, if {\bf o} +%and {\bf $^$} are the members of an +%equivalence class, then `{\bf $[[$=o=$]]$}', `{\bf $[[$=$^$=$]]$}', +%and `{\bf $[o^]$}' are all synonymous. +An equivalence class may not be an endpoint of a range. + +%({\it Note:} Tcl currently +%implements only the Unicode locale. It doesn't define any equivalence classes. +%The examples above are just illustrations.) + +{\it wxWindows}: Currently no equivalence classes are defined, so +{\bf $[=X=]$} stands for just the single character {\it X}. + {\it X} can either be a single character literal or the name of a character, +see \helpref{Character Names}{wxresynchars}. + +Within a bracket expression, +the name of a {\it character class} enclosed in {\bf $[:$} and {\bf $:]$} stands for the list +of all characters (not all collating elements!) belonging to that class. +Standard character classes are: + +\begin{twocollist}\twocolwidtha{3cm} +\twocolitem{{\bf alpha}}{A letter.} +\twocolitem{{\bf upper}}{An upper-case letter.} +\twocolitem{{\bf lower}}{A lower-case letter.} +\twocolitem{{\bf digit}}{A decimal digit.} +\twocolitem{{\bf xdigit}}{A hexadecimal digit.} +\twocolitem{{\bf alnum}}{An alphanumeric (letter or digit).} +\twocolitem{{\bf print}}{An alphanumeric (same as alnum).} +\twocolitem{{\bf blank}}{A space or tab character.} +\twocolitem{{\bf space}}{A character producing white space in displayed text.} +\twocolitem{{\bf punct}}{A punctuation character.} +\twocolitem{{\bf graph}}{A character with a visible representation.} +\twocolitem{{\bf cntrl}}{A control character.} +\end{twocollist} + +%A locale may provide others. 
(Note that the current Tcl +%implementation has only one locale: the Unicode locale.) +A character class may not be used as an endpoint of a range. + +{\it wxWindows:} In a non-Unicode build, these character classifications depend on the +current locale, and correspond to the values returned by the ANSI C 'is' +functions: isalpha, isupper, etc. In Unicode mode they are based on +Unicode classifications, and are not affected by the current locale. + +There are two special cases of bracket expressions: +the bracket expressions {\bf $[[:$<$:]]$} and {\bf $[[:$>$:]]$} are constraints, matching empty +strings at the beginning and end of a word respectively. A word is defined +as a sequence of word characters that is neither preceded nor followed +by word characters. A word character is an {\it alnum} character or an underscore +({\bf \_}). These special bracket expressions are deprecated; users of AREs should +use constraint escapes instead (see \helpref{Escapes}{wxresynescapes} below). + + +\subsection{Escapes}\label{wxresynescapes} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +Escapes (AREs only), +which begin with a {\bf $\backslash$} followed by an alphanumeric character, come in several +varieties: character entry, class shorthands, constraint escapes, and back +references. A {\bf $\backslash$} followed by an alphanumeric character but not constituting +a valid escape is illegal in AREs. In EREs, there are no escapes: outside +a bracket expression, a {\bf $\backslash$} followed by an alphanumeric character merely stands +for that character as an ordinary character, and inside a bracket expression, + {\bf $\backslash$} is an ordinary character. (The latter is the one actual incompatibility +between EREs and AREs.) 
+ +Character-entry escapes (AREs only) exist to make +it easier to specify non-printing and otherwise inconvenient characters +in REs: + +\begin{twocollist}\twocolwidtha{4cm} +\twocolitem{{\bf $\backslash$a}}{alert (bell) character, as in C} +\twocolitem{{\bf $\backslash$b}}{backspace, as in C} +\twocolitem{{\bf $\backslash$B}}{synonym +for {\bf $\backslash$} to help reduce backslash doubling in some applications where there +are multiple levels of backslash processing} +\twocolitem{{\bf $\backslash$c{\it X}}}{(where X is any character) +the character whose low-order 5 bits are the same as those of {\it X}, and whose +other bits are all zero} +\twocolitem{{\bf $\backslash$e}}{the character whose collating-sequence name is +`{\bf ESC}', or failing that, the character with octal value 033} +\twocolitem{{\bf $\backslash$f}}{formfeed, as in C} +\twocolitem{{\bf $\backslash$n}}{newline, as in C} +\twocolitem{{\bf $\backslash$r}}{carriage return, as in C} +\twocolitem{{\bf $\backslash$t}}{horizontal tab, as in C} +\twocolitem{{\bf $\backslash$u{\it wxyz}}}{(where {\it wxyz} is exactly four hexadecimal digits) +the Unicode +character {\bf U+{\it wxyz}} in the local byte ordering} +\twocolitem{{\bf $\backslash$U{\it stuvwxyz}}}{(where {\it stuvwxyz} is +exactly eight hexadecimal digits) reserved for a somewhat-hypothetical Unicode +extension to 32 bits} +\twocolitem{{\bf $\backslash$v}}{vertical tab, as in C} +\twocolitem{{\bf $\backslash$x{\it hhh}}}{(where + {\it hhh} is any sequence of hexadecimal digits) the character whose hexadecimal +value is {\bf 0x{\it hhh}} (a single character no matter how many hexadecimal digits +are used).} +\twocolitem{{\bf $\backslash$0}}{the character whose value is {\bf 0}} +\twocolitem{{\bf $\backslash${\it xy}}}{(where {\it xy} is exactly two +octal digits, and is not a {\it back reference} (see below)) the character whose +octal value is {\bf 0{\it xy}}} +\twocolitem{{\bf $\backslash${\it xyz}}}{(where {\it xyz} is 
exactly three octal digits, and is +not a back reference (see below)) +the character whose octal value is {\bf 0{\it xyz}}} +\end{twocollist} + +Hexadecimal digits are `{\bf 0}'-`{\bf 9}', `{\bf a}'-`{\bf f}', and `{\bf A}'-`{\bf F}'. Octal +digits are `{\bf 0}'-`{\bf 7}'. + +The character-entry +escapes are always taken as ordinary characters. For example, {\bf $\backslash$135} is {\bf ]} in +ASCII, but {\bf $\backslash$135} does not terminate a bracket expression. Beware, however, +that some applications (e.g., C compilers) interpret such sequences themselves +before the regular-expression package gets to see them, which may require +doubling (quadrupling, etc.) the `{\bf $\backslash$}'. + +Class-shorthand escapes (AREs only) provide +shorthands for certain commonly-used character classes: + +\begin{twocollist}\twocolwidtha{4cm} +\twocolitem{{\bf $\backslash$d}}{{\bf $[[:digit:]]$}} +\twocolitem{{\bf $\backslash$s}}{{\bf $[[:space:]]$}} +\twocolitem{{\bf $\backslash$w}}{{\bf $[[:alnum:]\_]$} (note underscore)} +\twocolitem{{\bf $\backslash$D}}{{\bf $[^[:digit:]]$}} +\twocolitem{{\bf $\backslash$S}}{{\bf $[^[:space:]]$}} +\twocolitem{{\bf $\backslash$W}}{{\bf $[^[:alnum:]\_]$} (note underscore)} +\end{twocollist} + +Within bracket expressions, `{\bf $\backslash$d}', `{\bf $\backslash$s}', and +`{\bf $\backslash$w}' lose their outer brackets, and `{\bf $\backslash$D}', +`{\bf $\backslash$S}', and `{\bf $\backslash$W}' are illegal. (So, for example, + {\bf $[$a-c$\backslash$d$]$} is equivalent to {\bf $[a-c[:digit:]]$}. +Also, {\bf $[$a-c$\backslash$D$]$}, which is equivalent to + {\bf $[a-c^[:digit:]]$}, is illegal.) 
+ +A constraint escape (AREs only) is a constraint, +matching the empty string if specific conditions are met, written as an +escape: + +\begin{twocollist}\twocolwidtha{4cm} +\twocolitem{{\bf $\backslash$A}}{matches only at the beginning of the string +(see \helpref{Matching}{wxresynmatching}, below, +for how this differs from `{\bf $^$}')} +\twocolitem{{\bf $\backslash$m}}{matches only at the beginning of a word} +\twocolitem{{\bf $\backslash$M}}{matches only at the end of a word} +\twocolitem{{\bf $\backslash$y}}{matches only at the beginning or end of a word} +\twocolitem{{\bf $\backslash$Y}}{matches only at a point that is not the beginning or end of +a word} +\twocolitem{{\bf $\backslash$Z}}{matches only at the end of the string +(see \helpref{Matching}{wxresynmatching}, below, for +how this differs from `{\bf \$}')} +\twocolitem{{\bf $\backslash${\it m}}}{(where {\it m} is a nonzero digit) a {\it back reference}, +see below} +\twocolitem{{\bf $\backslash${\it mnn}}}{(where {\it m} is a nonzero digit, and {\it nn} is some more digits, +and the decimal value {\it mnn} is not greater than the number of closing capturing +parentheses seen so far) a {\it back reference}, see below} +\end{twocollist} + +A word is defined +as in the specification of {\bf $[[:$<$:]]$} and {\bf $[[:$>$:]]$} above. Constraint escapes are +illegal within bracket expressions. + +A back reference (AREs only) matches +the same string matched by the parenthesized subexpression specified by +the number, so that (e.g.) {\bf ($[bc]$)$\backslash$1} matches {\bf bb} or {\bf cc} but not `{\bf bc}'. +The subexpression +must entirely precede the back reference in the RE. Subexpressions are numbered +in the order of their leading parentheses. Non-capturing parentheses do not +define subexpressions. + +There is an inherent historical ambiguity between +octal character-entry escapes and back references, which is resolved by +heuristics, as hinted at above. 
A leading zero always indicates an octal +escape. A single non-zero digit, not followed by another digit, is always +taken as a back reference. A multi-digit sequence not starting with a zero +is taken as a back reference if it comes after a suitable subexpression +(i.e. the number is in the legal range for a back reference), and otherwise +is taken as octal. + + +\subsection{Metasyntax} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +In addition to the main syntax described above, +there are some special forms and miscellaneous syntactic facilities available. + +Normally the flavor of RE being used is specified by application-dependent +means. However, this can be overridden by a {\it director}. If an RE of any flavor +begins with `{\bf ***:}', the rest of the RE is an ARE. If an RE of any flavor begins +with `{\bf ***=}', the rest of the RE is taken to be a literal string, with all +characters considered ordinary characters. + +An ARE may begin with {\it embedded options}: a sequence {\bf (?xyz)} +(where {\it xyz} is one or more alphabetic characters) +specifies options affecting the rest of the RE. These supplement, and can +override, any options specified by the application. 
The available option +letters are: + +\begin{twocollist}\twocolwidtha{4cm} +\twocolitem{{\bf b}}{rest of RE is a BRE} +\twocolitem{{\bf c}}{case-sensitive matching (usual default)} +\twocolitem{{\bf e}}{rest of RE is an ERE} +\twocolitem{{\bf i}}{case-insensitive matching (see \helpref{Matching}{wxresynmatching}, below)} +\twocolitem{{\bf m}}{historical synonym for {\bf n}} +\twocolitem{{\bf n}}{newline-sensitive matching (see \helpref{Matching}{wxresynmatching}, below)} +\twocolitem{{\bf p}}{partial newline-sensitive matching (see \helpref{Matching}{wxresynmatching}, below)} +\twocolitem{{\bf q}}{rest of RE +is a literal (``quoted'') string, all ordinary characters} +\twocolitem{{\bf s}}{non-newline-sensitive matching (usual default)} +\twocolitem{{\bf t}}{tight syntax (usual default; see below)} +\twocolitem{{\bf w}}{inverse +partial newline-sensitive (``weird'') matching (see \helpref{Matching}{wxresynmatching}, below)} +\twocolitem{{\bf x}}{expanded syntax (see below)} +\end{twocollist} + +Embedded options take effect at the {\bf )} terminating the +sequence. They are available only at the start of an ARE, and may not be +used later within it. + +In addition to the usual ({\it tight}) RE syntax, in which +all characters are significant, there is an {\it expanded} syntax, available +%in all flavors of RE with the {\bf -expanded} switch, or +in AREs with the embedded +x option. In the expanded syntax, white-space characters are ignored and +all characters between a {\bf \#} and the following newline (or the end of the +RE) are ignored, permitting paragraphing and commenting a complex RE. 
There +are three exceptions to that basic rule: +{\itemize +\item% +a white-space character or `{\bf \#}' preceded +by `{\bf $\backslash$}' is retained +\item% +white space or `{\bf \#}' within a bracket expression is retained +\item% +white space and comments are illegal within multi-character symbols like +the ARE `{\bf (?:}' or the BRE `{\bf $\backslash$(}' +} +Expanded-syntax white-space characters are blank, +tab, newline, and any character that belongs to the {\it space} character class. + +Finally, in an ARE, outside bracket expressions, the sequence `{\bf (?\#ttt)}' (where + {\it ttt} is any text not containing a `{\bf )}') is a comment, completely ignored. Again, +this is not allowed between the characters of multi-character symbols like +`{\bf (?:}'. Such comments are more a historical artifact than a useful facility, +and their use is deprecated; use the expanded syntax instead. + +{\it None} of these +metasyntax extensions is available if the application (or an initial {\bf ***=} +director) has specified that the user's input be treated as a literal string +rather than as an RE. + + +\subsection{Matching}\label{wxresynmatching} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +In the event that an RE could match more than +one substring of a given string, the RE matches the one starting earliest +in the string. If the RE could match more than one substring starting at +that point, its choice is determined by its {\it preference}: either the longest +substring, or the shortest. + +Most atoms, and all constraints, have no preference. +A parenthesized RE has the same preference (possibly none) as the RE. A +quantified atom with quantifier {\bf \{m\}} or {\bf \{m\}?} has the same preference (possibly +none) as the atom itself. A quantified atom with other normal quantifiers +(including {\bf \{m,n\}} with {\it m} equal to {\it n}) prefers longest match. 
A quantified +atom with other non-greedy quantifiers (including {\bf \{m,n\}?} with {\it m} equal to + {\it n}) prefers shortest match. A branch has the same preference as the first +quantified atom in it which has a preference. An RE consisting of two or +more branches connected by the {\bf $|$} operator prefers longest match. + +Subject +to the constraints imposed by the rules for matching the whole RE, subexpressions +also match the longest or shortest possible substrings, based on their +preferences, with subexpressions starting earlier in the RE taking priority +over ones starting later. Note that outer subexpressions thus take priority +over their component subexpressions. + +Note that the quantifiers {\bf \{1,1\}} and + {\bf \{1,1\}?} can be used to force longest and shortest preference, respectively, +on a subexpression or a whole RE. + +Match lengths are measured in characters, +not collating elements. An empty string is considered longer than no match +at all. For example, {\bf bb*} matches the three middle characters +of `{\bf abbbc}', {\bf (week$|$wee)(night$|$knights)} +matches all ten characters of `{\bf weeknights}', when {\bf (.*).*} is matched against + {\bf abc} the parenthesized subexpression matches all three characters, and when + {\bf (a*)*} is matched against {\bf bc} both the whole RE and the parenthesized subexpression +match an empty string. + +If case-independent matching is specified, the effect +is much as if all case distinctions had vanished from the alphabet. When +an alphabetic that exists in multiple cases appears as an ordinary character +outside a bracket expression, it is effectively transformed into a bracket +expression containing both cases, so that {\bf x} becomes `{\bf $[xX]$}'. When it appears +inside a bracket expression, all case counterparts of it are added to the +bracket expression, so that {\bf $[x]$} becomes {\bf $[xX]$} and {\bf $[^x]$} becomes `{\bf $[^xX]$}'. 
+ +If newline-sensitive +matching is specified, {\bf .} and bracket expressions using {\bf $^$} will never match +the newline character (so that matches will never cross newlines unless +the RE explicitly arranges it) and {\bf $^$} and {\bf \$} will match the empty string after +and before a newline respectively, in addition to matching at beginning +and end of string respectively. ARE {\bf $\backslash$A} and {\bf $\backslash$Z} continue to match beginning +or end of string {\it only}. + +If partial newline-sensitive matching is specified, +this affects {\bf .} and bracket expressions as with newline-sensitive matching, +but not {\bf $^$} and `{\bf \$}'. + +If inverse partial newline-sensitive matching is specified, +this affects {\bf $^$} and {\bf \$} as with newline-sensitive matching, but not {\bf .} and bracket +expressions. This isn't very useful but is provided for symmetry. + + +\subsection{Limits And Compatibility} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +No particular limit is imposed on the length of REs. Programs +intended to be highly portable should not employ REs longer than 256 bytes, +as a POSIX-compliant implementation can refuse to accept such REs. + +The only +feature of AREs that is actually incompatible with POSIX EREs is that {\bf $\backslash$} +does not lose its special significance inside bracket expressions. All other +ARE features use syntax which is illegal or has undefined or unspecified +effects in POSIX EREs; the {\bf ***} syntax of directors likewise is outside +the POSIX syntax for both BREs and EREs. + +Many of the ARE extensions are +borrowed from Perl, but some have been changed to clean them up, and a +few Perl extensions are not present. 
Incompatibilities of note include `{\bf $\backslash$b}', +`{\bf $\backslash$B}', the lack of special treatment for a trailing newline, the addition of +complemented bracket expressions to the things affected by newline-sensitive +matching, the restrictions on parentheses and back references in lookahead +constraints, and the longest/shortest-match (rather than first-match) matching +semantics. + +The matching rules for REs containing both normal and non-greedy +quantifiers have changed since early beta-test versions of this package. +(The new rules are much simpler and cleaner, but don't work as hard at guessing +the user's real intentions.) + +Henry Spencer's original 1986 {\it regexp} package, still in widespread use, +%(e.g., in pre-8.1 releases of Tcl), +implemented an early version of today's EREs. There are four incompatibilities between {\it regexp}'s +near-EREs (`RREs' for short) and AREs. In roughly increasing order of significance: +{\itemize +\item +In AREs, {\bf $\backslash$} followed by an alphanumeric character is either an escape or +an error, while in RREs, it was just another way of writing the alphanumeric. +This should not be a problem because there was no reason to write such +a sequence in RREs. + +\item% +{\bf \{} followed by a digit in an ARE is the beginning of +a bound, while in RREs, {\bf \{} was always an ordinary character. Such sequences +should be rare, and will often result in an error because following characters +will not look like a valid bound. + +\item% +In AREs, {\bf $\backslash$} remains a special character +within `{\bf $[]$}', so a literal {\bf $\backslash$} within {\bf $[]$} must be +written `{\bf $\backslash\backslash$}'. {\bf $\backslash\backslash$} also gives a literal + {\bf $\backslash$} within {\bf $[]$} in RREs, but only truly paranoid programmers routinely doubled +the backslash. + +\item% +AREs report the longest/shortest match for the RE, rather +than the first found in a specified search order. 
This may affect some RREs +which were written in the expectation that the first match would be reported. +(The careful crafting of RREs to optimize the search order for fast matching +is obsolete (AREs examine all possible matches in parallel, and their performance +is largely insensitive to their complexity) but cases where the search +order was exploited to deliberately find a match which was {\it not} the longest/shortest +will need rewriting.) +} + + +\subsection{Basic Regular Expressions}\label{wxresynbre} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +BREs differ from EREs in +several respects. `{\bf $|$}', `{\bf +}', and {\bf ?} are ordinary characters and there is no equivalent +for their functionality. The delimiters for bounds +are {\bf $\backslash$\{} and `{\bf $\backslash$\}}', with {\bf \{} and + {\bf \}} by themselves ordinary characters. The parentheses for nested subexpressions +are {\bf $\backslash$(} and `{\bf $\backslash$)}', with {\bf (} and {\bf )} by themselves +ordinary characters. {\bf $^$} is an ordinary +character except at the beginning of the RE or the beginning of a parenthesized +subexpression, {\bf \$} is an ordinary character except at the end of the RE or +the end of a parenthesized subexpression, and {\bf *} is an ordinary character +if it appears at the beginning of the RE or the beginning of a parenthesized +subexpression (after a possible leading `{\bf $^$}'). Finally, single-digit back references +are available, and {\bf $\backslash<$} and {\bf $\backslash>$} are synonyms +for {\bf $[[:<:]]$} and {\bf $[[:>:]]$} respectively; +no other escapes are available. + + +\subsection{Regular Expression Character Names}\label{wxresynchars} + +\helpref{Syntax of the builtin regular expression library}{wxresyn} + +Note that the character names are case sensitive. 
+ +\begin{twocollist} +\twocolitem{NUL}{'$\backslash$0'} +\twocolitem{SOH}{'$\backslash$001'} +\twocolitem{STX}{'$\backslash$002'} +\twocolitem{ETX}{'$\backslash$003'} +\twocolitem{EOT}{'$\backslash$004'} +\twocolitem{ENQ}{'$\backslash$005'} +\twocolitem{ACK}{'$\backslash$006'} +\twocolitem{BEL}{'$\backslash$007'} +\twocolitem{alert}{'$\backslash$007'} +\twocolitem{BS}{'$\backslash$010'} +\twocolitem{backspace}{'$\backslash$b'} +\twocolitem{HT}{'$\backslash$011'} +\twocolitem{tab}{'$\backslash$t'} +\twocolitem{LF}{'$\backslash$012'} +\twocolitem{newline}{'$\backslash$n'} +\twocolitem{VT}{'$\backslash$013'} +\twocolitem{vertical-tab}{'$\backslash$v'} +\twocolitem{FF}{'$\backslash$014'} +\twocolitem{form-feed}{'$\backslash$f'} +\twocolitem{CR}{'$\backslash$015'} +\twocolitem{carriage-return}{'$\backslash$r'} +\twocolitem{SO}{'$\backslash$016'} +\twocolitem{SI}{'$\backslash$017'} +\twocolitem{DLE}{'$\backslash$020'} +\twocolitem{DC1}{'$\backslash$021'} +\twocolitem{DC2}{'$\backslash$022'} +\twocolitem{DC3}{'$\backslash$023'} +\twocolitem{DC4}{'$\backslash$024'} +\twocolitem{NAK}{'$\backslash$025'} +\twocolitem{SYN}{'$\backslash$026'} +\twocolitem{ETB}{'$\backslash$027'} +\twocolitem{CAN}{'$\backslash$030'} +\twocolitem{EM}{'$\backslash$031'} +\twocolitem{SUB}{'$\backslash$032'} +\twocolitem{ESC}{'$\backslash$033'} +\twocolitem{IS4}{'$\backslash$034'} +\twocolitem{FS}{'$\backslash$034'} +\twocolitem{IS3}{'$\backslash$035'} +\twocolitem{GS}{'$\backslash$035'} +\twocolitem{IS2}{'$\backslash$036'} +\twocolitem{RS}{'$\backslash$036'} +\twocolitem{IS1}{'$\backslash$037'} +\twocolitem{US}{'$\backslash$037'} +\twocolitem{space}{' '} +\twocolitem{exclamation-mark}{'!'} +\twocolitem{quotation-mark}{'"'} +\twocolitem{number-sign}{'\#'} +\twocolitem{dollar-sign}{'\$'} +\twocolitem{percent-sign}{'\%'} +\twocolitem{ampersand}{'&'} +\twocolitem{apostrophe}{'$\backslash$''} +\twocolitem{left-parenthesis}{'('} +\twocolitem{right-parenthesis}{')'} +\twocolitem{asterisk}{'*'} 
+\twocolitem{plus-sign}{'+'} +\twocolitem{comma}{','} +\twocolitem{hyphen}{'-'} +\twocolitem{hyphen-minus}{'-'} +\twocolitem{period}{'.'} +\twocolitem{full-stop}{'.'} +\twocolitem{slash}{'/'} +\twocolitem{solidus}{'/'} +\twocolitem{zero}{'0'} +\twocolitem{one}{'1'} +\twocolitem{two}{'2'} +\twocolitem{three}{'3'} +\twocolitem{four}{'4'} +\twocolitem{five}{'5'} +\twocolitem{six}{'6'} +\twocolitem{seven}{'7'} +\twocolitem{eight}{'8'} +\twocolitem{nine}{'9'} +\twocolitem{colon}{':'} +\twocolitem{semicolon}{';'} +\twocolitem{less-than-sign}{'<'} +\twocolitem{equals-sign}{'='} +\twocolitem{greater-than-sign}{'>'} +\twocolitem{question-mark}{'?'} +\twocolitem{commercial-at}{'@'} +\twocolitem{left-square-bracket}{'$[$'} +\twocolitem{backslash}{'$\backslash$'} +\twocolitem{reverse-solidus}{'$\backslash$'} +\twocolitem{right-square-bracket}{'$]$'} +\twocolitem{circumflex}{'\^{}'} +\twocolitem{circumflex-accent}{'\^{}'} +\twocolitem{underscore}{'\_'} +\twocolitem{low-line}{'\_'} +\twocolitem{grave-accent}{'`'} +\twocolitem{left-brace}{'\{'} +\twocolitem{left-curly-bracket}{'\{'} +\twocolitem{vertical-line}{'$|$'} +\twocolitem{right-brace}{'\}'} +\twocolitem{right-curly-bracket}{'\}'} +\twocolitem{tilde}{'\~{}'} +\twocolitem{DEL}{'$\backslash$177'} +\end{twocollist} diff --git a/docs/latex/wx/regex.tex b/docs/latex/wx/regex.tex index 025d01e411..89542cff7c 100644 --- a/docs/latex/wx/regex.tex +++ b/docs/latex/wx/regex.tex @@ -11,14 +11,29 @@ \section{\class{wxRegEx}}\label{wxregex} -wxRegEx represents a regular expression. The regular expressions syntax -supported is the POSIX one. Both basic and extended regular expressions are -supported but, unlike POSIX C API, the extended ones are used by default. - -This class provides support for regular expressions matching and also -replacement. 
It is built on top of either the system library (if it has support -for POSIX regular expressions - which is the case of the most modern Unices) or -uses a version of Henry Spencer's library from tcl. +wxRegEx represents a regular expression. This class provides support +for regular expression matching and also replacement. + +It is built on top of either the system library (if it has support +for POSIX regular expressions - which is the case for most modern +Unices) or uses the builtin version of Henry Spencer's library. Henry Spencer +would appreciate being given credit in the documentation of software +which uses his library, but that is not a requirement. + +Regular expressions, as defined by POSIX, come in two flavours: {\it extended} +and {\it basic}. The builtin library also adds a third flavour +of expression, \helpref{advanced}{wxresyn}, which is not available +when using the system library. + +Unicode is fully supported only when using the builtin library. +When using the system library in Unicode mode, the expressions and data +are translated to the default 8-bit encoding before being passed to +the library. + +On platforms where a system library is available, the default is to use +the builtin library for Unicode builds, and the system library otherwise. +It is possible to use the other if preferred by selecting it when building +wxWindows. 
\wxheading{Derived from} @@ -31,8 +46,13 @@ Flags for regex compilation to be used with \helpref{Compile()}{wxregexcompile}: \begin{verbatim} enum { - // use extended regex syntax (default) + // use extended regex syntax wxRE_EXTENDED = 0, + + // use advanced RE syntax (built-in regex only) +#ifdef wxHAS_REGEX_ADVANCED + wxRE_ADVANCED = 1, +#endif // use basic RE syntax wxRE_BASIC = 2, diff --git a/docs/latex/wx/topics.tex b/docs/latex/wx/topics.tex index bae331270e..89eca18461 100644 --- a/docs/latex/wx/topics.tex +++ b/docs/latex/wx/topics.tex @@ -58,4 +58,4 @@ This chapter contains a selection of topic overviews, first things first: \input wxhtml.tex \input tenvvars.tex \input wxPython.tex - +\input re_syntax.tex diff --git a/setup.h.in b/setup.h.in index fb78a521b1..8c7a2988ae 100644 --- a/setup.h.in +++ b/setup.h.in @@ -182,6 +182,12 @@ * Use regex support */ #define wxUSE_REGEX 0 +/* + * The built-in regex supports advanced REs in addition to POSIX's basic + * and extended. Your system regex probably won't support this, and in this + * case WX_NO_REGEX_ADVANCED should be defined. + */ +#undef WX_NO_REGEX_ADVANCED /* * Use XML support */ diff --git a/setup.h_vms b/setup.h_vms index e533df83e8..30d52a5731 100644 --- a/setup.h_vms +++ b/setup.h_vms @@ -191,6 +191,12 @@ * Use regex support */ #define wxUSE_REGEX 0 +/* + * The built-in regex supports advanced REs in addition to POSIX's basic + * and extended. Your system regex probably won't support this, and in this + * case WX_NO_REGEX_ADVANCED should be defined. + */ +#undef WX_NO_REGEX_ADVANCED /* * Use XML support */ -- 2.45.2