applied patch 890642: wxRE_ADVANCED flag and docs

author Václav Slavík <vslavik@fastmail.fm>

Thu, 19 Feb 2004 18:02:48 +0000 (18:02 +0000)

committer Václav Slavík <vslavik@fastmail.fm>

Thu, 19 Feb 2004 18:02:48 +0000 (18:02 +0000)
author Václav Slavík <vslavik@fastmail.fm>
Thu, 19 Feb 2004 18:02:48 +0000 (18:02 +0000)
committer Václav Slavík <vslavik@fastmail.fm>
Thu, 19 Feb 2004 18:02:48 +0000 (18:02 +0000)
diff --git a/autoconf_inc.m4 b/autoconf_inc.m4

index 476509c6fc7337c3a61f7d8c7d2d9179f7e51f7a..1e7be34afa2cbdc8c718e0f19f9c515644e1feaf 100644 (file)
--- a/autoconf_inc.m4
+++ b/autoconf_inc.m4
@@ -2,9 +2,6 @@ dnl This macro was generated by
  dnl Bakefile 0.1.3 (http://bakefile.sourceforge.net)
  dnl Do not modify, all changes will be overwritten!
  
-
-dnl Conditions:
-
  dnl ### begin block 0_AC_BAKEFILE_PRECOMP_HEADERS ###
  AC_BAKEFILE_PRECOMP_HEADERS
  
diff --git a/docs/latex/wx/re_syntax.tex b/docs/latex/wx/re_syntax.tex

new file mode 100644 (file)

index 0000000..677dad8
--- /dev/null
+++ b/docs/latex/wx/re_syntax.tex
@@ -0,0 +1,676 @@
+% manual page source format generated by PolyglotMan v3.0.9,
+% available via anonymous ftp from ftp.cs.berkeley.edu:/ucb/people/phelps/tcltk/rman.tar.Z
+
+\section{Syntax of the builtin regular expression library}\label{wxresyn}
+
+A {\it regular expression} describes strings of characters. It's a
+pattern that matches certain strings and doesn't match others.  
+
+\wxheading{See also}
+
+\helpref{wxRegEx}{wxregex}
+
+ 
+\subsection{Different Flavors of REs}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+Regular expressions (``RE''s), as defined by POSIX, come in two
+flavors: {\it extended} REs (``EREs'') and {\it basic} REs (``BREs''). EREs are roughly those
+of the traditional {\it egrep}, while BREs are roughly those of the traditional
+ {\it ed}.  This implementation adds a third flavor, {\it advanced} REs (``AREs''), basically
+EREs with some significant extensions. 
+
+This manual page primarily describes
+AREs.  BREs mostly exist for backward compatibility in some old programs;
+they will be discussed at the \helpref{end}{wxresynbre}.  POSIX EREs are almost an exact subset
+of AREs.  Features of AREs that are not present in EREs will be indicated.
+
+ 
+\subsection{Regular Expression Syntax}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+These regular expressions are implemented using
+the package written by Henry Spencer, based on the 1003.2 spec and some
+(not quite all) of the Perl5 extensions (thanks, Henry!).  Much of the description
+of regular expressions below is copied verbatim from his manual entry. 
+
+An
+ARE is one or more {\it branches}, separated by `{\bf $|$}', matching anything that matches
+any of the branches. 
+
+A branch is zero or more {\it constraints} or {\it quantified
+atoms}, concatenated. It matches a match for the first, followed by a match
+for the second, etc; an empty branch matches the empty string. 
+
+A quantified
+atom is an {\it atom} possibly followed by a single {\it quantifier}. Without a quantifier,
+it matches a match for the atom. The quantifiers, and what a so-quantified
+atom matches, are:
+
+\begin{twocollist}\twocolwidtha{4cm}
+\twocolitem{{\bf *}}{a sequence of 0 or more matches of the atom}
+\twocolitem{{\bf +}}{a sequence of 1 or more matches of the atom}
+\twocolitem{{\bf ?}}{a sequence of 0 or 1 matches of the atom}
+\twocolitem{{\bf \{m\}}}{a sequence of exactly {\it m} matches of the atom}
+\twocolitem{{\bf \{m,\}}}{a sequence of {\it m} or more matches of the atom}
+\twocolitem{{\bf \{m,n\}}}{a sequence of {\it m} through {\it n} (inclusive)
+matches of the atom; {\it m} may not exceed {\it n}}
+\twocolitem{{\bf *?  +?  ??  \{m\}?  \{m,\}?  \{m,n\}?}}{{\it non-greedy} quantifiers,
+which match the same possibilities, but prefer the
+smallest number rather than the largest number of matches (see \helpref{Matching}{wxresynmatching})}
+\end{twocollist}
+
+The forms using {\bf \{} and {\bf \}} are known as {\it bound}s. The numbers {\it m} and {\it n} are unsigned
+decimal integers with permissible values from 0 to 255 inclusive. 
+An atom is one of:
+
+\begin{twocollist}\twocolwidtha{4cm}
+\twocolitem{{\bf (re)}}{(where {\it re} is any regular expression) matches a match for
+ {\it re}, with the match noted for possible reporting}
+\twocolitem{{\bf (?:re)}}{as previous, but
+does no reporting (a ``non-capturing'' set of parentheses)}
+\twocolitem{{\bf ()}}{matches an empty
+string, noted for possible reporting}
+\twocolitem{{\bf (?:)}}{matches an empty string, without reporting}
+\twocolitem{{\bf $[chars]$}}{a {\it bracket expression}, matching any one of the {\it chars}
+(see \helpref{Bracket Expressions}{wxresynbracket} for more detail)}
+\twocolitem{{\bf .}}{matches any single character }
+\twocolitem{{\bf $\backslash$k}}{(where {\it k} is a non-alphanumeric character)
+matches that character taken as an ordinary character, e.g. $\backslash\backslash$ matches a backslash
+character}
+\twocolitem{{\bf $\backslash$c}}{where {\it c} is alphanumeric (possibly followed by other characters),
+an {\it escape} (AREs only), see \helpref{Escapes}{wxresynescapes} below}
+\twocolitem{{\bf \{}}{when followed by a character
+other than a digit, matches the left-brace character `{\bf \{}'; when followed by
+a digit, it is the beginning of a {\it bound} (see above)}
+\twocolitem{{\bf x}}{where {\it x} is a single
+character with no other significance, matches that character.}
+\end{twocollist}
+
+A {\it constraint}
+matches an empty string when specific conditions are met. A constraint may
+not be followed by a quantifier. The simple constraints are as follows;
+some more constraints are described later, under \helpref{Escapes}{wxresynescapes}.
+
+\begin{twocollist}\twocolwidtha{4cm}
+\twocolitem{{\bf $^$}}{matches at the beginning of a line}
+\twocolitem{{\bf \$}}{matches at the end of a line}
+\twocolitem{{\bf (?=re)}}{{\it positive lookahead}
+(AREs only), matches at any point where a substring matching {\it re} begins}
+\twocolitem{{\bf (?!re)}}{{\it negative lookahead} (AREs only),
+matches at any point where no substring matching {\it re} begins}
+\end{twocollist}
+
+The lookahead constraints may not contain back references
+(see later), and all parentheses within them are considered non-capturing.
+
+An RE may not end with `{\bf $\backslash$}'.
+
+
+\subsection{Bracket Expressions}\label{wxresynbracket}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+A {\it bracket expression} is a list
+of characters enclosed in `{\bf $[]$}'. It normally matches any single character from
+the list (but see below). If the list begins with `{\bf $^$}', it matches any single
+character (but see below) {\it not} from the rest of the list. 
+
+If two characters
+in the list are separated by `{\bf -}', this is shorthand for the full {\it range} of
+characters between those two (inclusive) in the collating sequence, e.g.
+ {\bf $[0-9]$} in ASCII matches any decimal digit. Two ranges may not share an endpoint,
+so e.g. {\bf a-c-e} is illegal. Ranges are very collating-sequence-dependent, and portable
+programs should avoid relying on them. 
+
+To include a literal {\bf $]$} or {\bf -} in the
+list, the simplest method is to enclose it in {\bf $[.$} and {\bf $.]$} to make it a collating
+element (see below). Alternatively, make it the first character (following
+a possible `{\bf $^$}'), or (AREs only) precede it with `{\bf $\backslash$}'.
+Alternatively, for `{\bf -}', make
+it the last character, or the second endpoint of a range. To use a literal
+ {\bf -} as the first endpoint of a range, make it a collating element or (AREs
+only) precede it with `{\bf $\backslash$}'. With the exception of these, some combinations using
+ {\bf $[$} (see next paragraphs), and escapes, all other special characters lose
+their special significance within a bracket expression. 
+
+Within a bracket
+expression, a collating element (a character, a multi-character sequence
+that collates as if it were a single character, or a collating-sequence
+name for either) enclosed in {\bf $[.$} and {\bf $.]$} stands for the
+sequence of characters of that collating element.
+
+{\it wxWindows}: Currently no multi-character collating elements are defined.
+So in {\bf $[.X.]$}, {\it X} can either be a single character literal or
+the name of a character. For example,
+ {\bf $[[.0.]-[.9.]]$} and {\bf $[[.zero.]-[.nine.]]$} are equivalent, and both mean the same as
+ {\bf $[0-9]$}.
+ See \helpref{Character Names}{wxresynchars}.
+
+%The sequence is a single element of the bracket
+%expression's list. A bracket expression in a locale that has multi-character
+%collating elements can thus match more than one character. So (insidiously),
+%a bracket expression that starts with {\bf $^$} can match multi-character collating
+%elements even if none of them appear in the bracket expression! ({\it Note:}
+%Tcl currently has no multi-character collating elements. This information
+%is only for illustration.) 
+%
+%For example, assume the collating sequence includes
+%a {\bf ch} multi-character collating element. Then the RE {\bf $[[.ch.]]*c$} (zero or more
+% {\bf ch}'s followed by {\bf c}) matches the first five characters of `{\bf chchcc}'. Also, the
+%RE {\bf $[^c]b$} matches all of `{\bf chb}' (because {\bf $[^c]$} matches the multi-character {\bf ch}).
+
+Within a bracket expression, a collating element enclosed in {\bf $[=$} and {\bf $=]$}
+is an equivalence class, standing for the sequences of characters of all
+collating elements equivalent to that one, including itself.
+%(If there are
+%no other equivalent collating elements, the treatment is as if the enclosing
+%delimiters were `{\bf $[.$}' and `{\bf $.]$}'.) For example, if {\bf o}
+%and {\bf $^$} are the members of an
+%equivalence class, then `{\bf $[[$=o=$]]$}', `{\bf $[[$=$^$=$]]$}',
+%and `{\bf $[o^]$}' are all synonymous.
+An equivalence class may not be an endpoint of a range.
+
+%({\it Note:}  Tcl currently
+%implements only the Unicode locale. It doesn't define any equivalence classes.
+%The examples above are just illustrations.) 
+
+{\it wxWindows}: Currently no equivalence classes are defined, so 
+{\bf $[=X=]$} stands for just the single character {\it X}. 
+ {\it X} can either be a single character literal or the name of a character,
+see \helpref{Character Names}{wxresynchars}.
+
+Within a bracket expression,
+the name of a {\it character class} enclosed in {\bf $[:$} and {\bf $:]$} stands for the list
+of all characters (not all collating elements!) belonging to that class.
+Standard character classes are:
+
+\begin{twocollist}\twocolwidtha{3cm}
+\twocolitem{{\bf alpha}}{A letter.}
+\twocolitem{{\bf upper}}{An upper-case letter.}
+\twocolitem{{\bf lower}}{A lower-case letter.}
+\twocolitem{{\bf digit}}{A decimal digit.}
+\twocolitem{{\bf xdigit}}{A hexadecimal digit.}
+\twocolitem{{\bf alnum}}{An alphanumeric (letter or digit).}
+\twocolitem{{\bf print}}{An alphanumeric (same as alnum).}
+\twocolitem{{\bf blank}}{A space or tab character.}
+\twocolitem{{\bf space}}{A character producing white space in displayed text.}
+\twocolitem{{\bf punct}}{A punctuation character.}
+\twocolitem{{\bf graph}}{A character with a visible representation.}
+\twocolitem{{\bf cntrl}}{A control character.}
+\end{twocollist}
+
+%A locale may provide others.  (Note that the  current  Tcl
+%implementation  has  only one locale: the Unicode locale.)
+A character class may not be used as an endpoint of a range. 
+
+{\it wxWindows:} In a non-Unicode build, these character classifications depend on the
+current locale, and correspond to the values returned by the ANSI C 'is'
+functions: isalpha, isupper, etc. In Unicode mode they are based on
+Unicode classifications, and are not affected by the current locale.
+
+There are two special cases of bracket expressions:
+the bracket expressions {\bf $[[:$<$:]]$} and {\bf $[[:$>$:]]$} are constraints, matching empty
+strings at the beginning and end of a word respectively.  A word is defined
+as a sequence of word characters that is neither preceded nor followed
+by word characters. A word character is an {\it alnum} character or an underscore
+({\bf \_}). These special bracket expressions are deprecated; users of AREs should
+use constraint escapes instead (see \helpref{Escapes}{wxresynescapes} below). 
+
+
+\subsection{Escapes}\label{wxresynescapes}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+Escapes (AREs only),
+which begin with a {\bf $\backslash$} followed by an alphanumeric character, come in several
+varieties: character entry, class shorthands, constraint escapes, and back
+references. A {\bf $\backslash$} followed by an alphanumeric character but not constituting
+a valid escape is illegal in AREs. In EREs, there are no escapes: outside
+a bracket expression, a {\bf $\backslash$} followed by an alphanumeric character merely stands
+for that character as an ordinary character, and inside a bracket expression,
+ {\bf $\backslash$} is an ordinary character. (The latter is the one actual incompatibility
+between EREs and AREs.) 
+
+Character-entry escapes (AREs only) exist to make
+it easier to specify non-printing and otherwise inconvenient characters
+in REs:
+
+\begin{twocollist}\twocolwidtha{4cm}
+\twocolitem{{\bf $\backslash$a}}{alert (bell) character, as in C}
+\twocolitem{{\bf $\backslash$b}}{backspace, as in C}
+\twocolitem{{\bf $\backslash$B}}{synonym
+for {\bf $\backslash$} to help reduce backslash doubling in some applications where there
+are multiple levels of backslash processing}
+\twocolitem{{\bf $\backslash$c{\it X}}}{(where {\it X} is any character)
+the character whose low-order 5 bits are the same as those of {\it X}, and whose
+other bits are all zero}
+\twocolitem{{\bf $\backslash$e}}{the character whose collating-sequence name is
+`{\bf ESC}', or failing that, the character with octal value 033}
+\twocolitem{{\bf $\backslash$f}}{formfeed, as in C}
+\twocolitem{{\bf $\backslash$n}}{newline, as in C}
+\twocolitem{{\bf $\backslash$r}}{carriage return, as in C}
+\twocolitem{{\bf $\backslash$t}}{horizontal tab, as in C}
+\twocolitem{{\bf $\backslash$u{\it wxyz}}}{(where {\it wxyz} is exactly four hexadecimal digits)
+the Unicode
+character {\bf U+{\it wxyz}} in the local byte ordering}
+\twocolitem{{\bf $\backslash$U{\it stuvwxyz}}}{(where {\it stuvwxyz} is
+exactly eight hexadecimal digits) reserved for a somewhat-hypothetical Unicode
+extension to 32 bits}
+\twocolitem{{\bf $\backslash$v}}{vertical tab, as in C}
+\twocolitem{{\bf $\backslash$x{\it hhh}}}{(where
+ {\it hhh} is any sequence of hexadecimal digits) the character whose hexadecimal
+value is {\bf 0x{\it hhh}} (a single character no matter how many hexadecimal digits
+are used).}
+\twocolitem{{\bf $\backslash$0}}{the character whose value is {\bf 0}}
+\twocolitem{{\bf $\backslash${\it xy}}}{(where {\it xy} is exactly two
+octal digits, and is not a {\it back reference} (see below)) the character whose
+octal value is {\bf 0{\it xy}}}
+\twocolitem{{\bf $\backslash${\it xyz}}}{(where {\it xyz} is exactly three octal digits, and is
+not a back reference (see below))
+the character whose octal value is {\bf 0{\it xyz}}}
+\end{twocollist}
+
+Hexadecimal digits are `{\bf 0}'-`{\bf 9}', `{\bf a}'-`{\bf f}', and `{\bf A}'-`{\bf F}'. Octal
+digits are `{\bf 0}'-`{\bf 7}'. 
+
+The character-entry
+escapes are always taken as ordinary characters. For example, {\bf $\backslash$135} is {\bf ]} in
+ASCII, but {\bf $\backslash$135} does not terminate a bracket expression. Beware, however,
+that some applications (e.g., C compilers) interpret  such sequences themselves
+before the regular-expression package gets to see them, which may require
+doubling (quadrupling, etc.) the `{\bf $\backslash$}'. 
+
+Class-shorthand escapes (AREs only) provide
+shorthands for certain commonly-used character classes:
+
+\begin{twocollist}\twocolwidtha{4cm}
+\twocolitem{{\bf $\backslash$d}}{{\bf $[[:digit:]]$}}
+\twocolitem{{\bf $\backslash$s}}{{\bf $[[:space:]]$}}
+\twocolitem{{\bf $\backslash$w}}{{\bf $[[:alnum:]\_]$} (note underscore)}
+\twocolitem{{\bf $\backslash$D}}{{\bf $[^[:digit:]]$}}
+\twocolitem{{\bf $\backslash$S}}{{\bf $[^[:space:]]$}}
+\twocolitem{{\bf $\backslash$W}}{{\bf $[^[:alnum:]\_]$} (note underscore)}
+\end{twocollist}
+
+Within bracket expressions, `{\bf $\backslash$d}', `{\bf $\backslash$s}', and
+`{\bf $\backslash$w}' lose their outer brackets, and `{\bf $\backslash$D}',
+`{\bf $\backslash$S}', and `{\bf $\backslash$W}' are illegal. (So, for example,
+ {\bf $[$a-c$\backslash$d$]$} is equivalent to {\bf $[a-c[:digit:]]$}.
+Also, {\bf $[$a-c$\backslash$D$]$}, which is equivalent to
+ {\bf $[a-c^[:digit:]]$}, is illegal.) 
+
+A constraint escape (AREs only) is a constraint,
+matching the empty string if specific conditions are met, written as an
+escape:
+
+\begin{twocollist}\twocolwidtha{4cm}
+\twocolitem{{\bf $\backslash$A}}{matches only at the beginning of the string
+(see \helpref{Matching}{wxresynmatching}, below,
+for how this differs from `{\bf $^$}')}
+\twocolitem{{\bf $\backslash$m}}{matches only at the beginning of a word}
+\twocolitem{{\bf $\backslash$M}}{matches only at the end of a word}
+\twocolitem{{\bf $\backslash$y}}{matches only at the beginning or end of a word}
+\twocolitem{{\bf $\backslash$Y}}{matches only at a point that is not the beginning or end of
+a word}
+\twocolitem{{\bf $\backslash$Z}}{matches only at the end of the string
+(see \helpref{Matching}{wxresynmatching}, below, for
+how this differs from `{\bf \$}')}
+\twocolitem{{\bf $\backslash${\it m}}}{(where {\it m} is a nonzero digit) a {\it back reference},
+see below}
+\twocolitem{{\bf $\backslash${\it mnn}}}{(where {\it m} is a nonzero digit, and {\it nn} is some more digits,
+and the decimal value {\it mnn} is not greater than the number of closing capturing
+parentheses seen so far) a {\it back reference}, see below}
+\end{twocollist}
+
+A word is defined
+as in the specification of {\bf $[[:$<$:]]$} and {\bf $[[:$>$:]]$} above. Constraint escapes are
+illegal within bracket expressions. 
+
+A back reference (AREs only) matches
+the same string matched by the parenthesized subexpression specified by
+the number, so that (e.g.) {\bf ($[bc]$)$\backslash$1} matches {\bf bb} or {\bf cc} but not `{\bf bc}'.
+The subexpression
+must entirely precede the back reference in the RE. Subexpressions are numbered
+in the order of their leading parentheses. Non-capturing parentheses do not
+define subexpressions. 
+
+There is an inherent historical ambiguity between
+octal character-entry  escapes and back references, which is resolved by
+heuristics, as hinted at above. A leading zero always indicates an octal
+escape. A single non-zero digit, not followed by another digit, is always
+taken as a back reference. A multi-digit sequence not starting with a zero
+is taken as a back  reference if it comes after a suitable subexpression
+(i.e. the number is in the legal range for a back reference), and otherwise
+is taken as octal. 
+
+
+\subsection{Metasyntax}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+In addition to the main syntax described above,
+there are some special forms and miscellaneous syntactic facilities available.
+
+Normally the flavor of RE being used is specified by application-dependent
+means. However, this can be overridden by a {\it director}. If an RE of any flavor
+begins with `{\bf ***:}', the rest of the RE is an ARE. If an RE of any flavor begins
+with `{\bf ***=}', the rest of the RE is taken to be a literal string, with all
+characters considered ordinary characters. 
+
+An ARE may begin with {\it embedded options}: a sequence {\bf (?xyz)}
+(where {\it xyz} is one or more alphabetic characters)
+specifies options affecting the rest of the RE. These supplement, and can
+override, any options specified by the application. The available option
+letters are:
+
+\begin{twocollist}\twocolwidtha{4cm}
+\twocolitem{{\bf b}}{rest of RE is a BRE}
+\twocolitem{{\bf c}}{case-sensitive matching (usual default)}
+\twocolitem{{\bf e}}{rest of RE is an ERE}
+\twocolitem{{\bf i}}{case-insensitive matching (see \helpref{Matching}{wxresynmatching}, below)}
+\twocolitem{{\bf m}}{historical synonym for {\bf n}}
+\twocolitem{{\bf n}}{newline-sensitive matching (see \helpref{Matching}{wxresynmatching}, below)}
+\twocolitem{{\bf p}}{partial newline-sensitive matching (see \helpref{Matching}{wxresynmatching}, below)}
+\twocolitem{{\bf q}}{rest of RE
+is a literal (``quoted'') string, all ordinary characters}
+\twocolitem{{\bf s}}{non-newline-sensitive matching (usual default)}
+\twocolitem{{\bf t}}{tight syntax (usual default; see below)}
+\twocolitem{{\bf w}}{inverse
+partial newline-sensitive (``weird'') matching (see \helpref{Matching}{wxresynmatching}, below)}
+\twocolitem{{\bf x}}{expanded syntax (see below)}
+\end{twocollist}
+
+Embedded options take effect at the {\bf )} terminating the
+sequence. They are available only at the start of an ARE, and may not be
+used later within it. 
+
+In addition to the usual ({\it tight}) RE syntax, in which
+all characters are significant, there is an {\it expanded} syntax, available
+%in all flavors of RE with the {\bf -expanded} switch, or
+in AREs with the embedded
+x option. In the expanded syntax, white-space characters are ignored and
+all characters between a {\bf \#} and the following newline (or the end of the
+RE) are ignored, permitting paragraphing and commenting a complex RE. There
+are three exceptions to that basic rule:
+\begin{itemize}
+\item%
+a white-space character or `{\bf \#}' preceded
+by `{\bf $\backslash$}' is retained 
+\item%
+white space or `{\bf \#}' within a bracket expression is retained
+\item%
+white space and comments are illegal within multi-character symbols like
+the ARE `{\bf (?:}' or the BRE `{\bf $\backslash$(}' 
+\end{itemize}
+Expanded-syntax white-space characters are blank,
+tab, newline, and any character that belongs to the {\it space} character class.
+
+Finally, in an ARE, outside bracket expressions, the sequence `{\bf (?\#ttt)}' (where
+ {\it ttt} is any text not containing a `{\bf )}') is a comment, completely ignored. Again,
+this is not allowed between the characters of multi-character symbols like
+`{\bf (?:}'. Such comments are more a historical artifact than a useful facility,
+and their use is deprecated; use the expanded syntax instead. 
+
+{\it None} of these
+metasyntax extensions is available if the application (or an initial {\bf ***=}
+director) has specified that the user's input be treated as a literal string
+rather than as an RE. 
+
+
+\subsection{Matching}\label{wxresynmatching}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+In the event that an RE could match more than
+one substring of a given string, the RE matches the one starting earliest
+in the string. If the RE could match more than one substring starting at
+that point, its choice is determined by its {\it preference}: either the longest
+substring, or the shortest. 
+
+Most atoms, and all constraints, have no preference.
+A parenthesized RE has the same preference (possibly none) as the RE. A
+quantified atom with quantifier {\bf \{m\}} or {\bf \{m\}?} has the same preference (possibly
+none) as the atom itself. A quantified atom with other normal quantifiers
+(including {\bf \{m,n\}} with {\it m} equal to {\it n}) prefers longest match. A quantified
+atom with other non-greedy quantifiers (including {\bf \{m,n\}?} with {\it m} equal to
+ {\it n}) prefers shortest match. A branch has the same preference as the first
+quantified atom in it which has a preference. An RE consisting of two or
+more branches connected by the {\bf $|$} operator prefers longest match. 
+
+Subject
+to the constraints imposed by the rules for matching the whole RE, subexpressions
+also match the longest or shortest possible substrings, based on their
+preferences, with subexpressions starting earlier in the RE taking priority
+over ones starting later. Note that outer subexpressions thus take priority
+over their component subexpressions. 
+
+Note that the quantifiers {\bf \{1,1\}} and
+ {\bf \{1,1\}?} can be used to force longest and shortest preference, respectively,
+on a subexpression or a whole RE. 
+
+Match lengths are measured in characters,
+not collating elements. An empty string is considered longer than no match
+at all. For example, {\bf bb*} matches the three middle characters
+of `{\bf abbbc}', {\bf (week$|$wee)(night$|$knights)}
+matches all ten characters of `{\bf weeknights}', when {\bf (.*).*} is matched against
+ {\bf abc} the parenthesized subexpression matches all three characters, and when
+ {\bf (a*)*} is matched against {\bf bc} both the whole RE and the parenthesized subexpression
+match an empty string. 
+
+If case-independent matching is specified, the effect
+is much as if all case distinctions had vanished from the alphabet. When
+an alphabetic that exists in multiple cases appears as an ordinary character
+outside a bracket expression, it is effectively transformed into a bracket
+expression containing both cases, so that {\bf x} becomes `{\bf $[xX]$}'. When it appears
+inside a bracket expression, all case counterparts of it are added to the
+bracket expression, so that {\bf $[x]$} becomes {\bf $[xX]$} and {\bf $[^x]$} becomes `{\bf $[^xX]$}'. 
+
+If newline-sensitive
+matching is specified, {\bf .} and bracket expressions using {\bf $^$} will never match
+the newline character (so that matches will never cross newlines unless
+the RE explicitly arranges it) and {\bf $^$} and {\bf \$} will match the empty string after
+and before a newline respectively, in addition to matching at beginning
+and end of string respectively. ARE {\bf $\backslash$A} and {\bf $\backslash$Z} continue to match beginning
+or end of string {\it only}. 
+
+If partial newline-sensitive matching is specified,
+this affects {\bf .} and bracket expressions as with newline-sensitive matching,
+but not {\bf $^$} and `{\bf \$}'. 
+
+If inverse partial newline-sensitive matching is specified,
+this affects {\bf $^$} and {\bf \$} as with newline-sensitive matching, but not {\bf .} and bracket
+expressions. This isn't very useful but is provided for symmetry. 
+
+
+\subsection{Limits And Compatibility}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+No particular limit is imposed on the length of REs. Programs
+intended to be highly portable should not employ REs longer than 256 bytes,
+as a POSIX-compliant implementation can refuse to accept such REs. 
+
+The only
+feature of AREs that is actually incompatible with POSIX EREs is that {\bf $\backslash$}
+does not lose its special significance inside bracket expressions. All other
+ARE features use syntax which is illegal or has undefined or unspecified
+effects in POSIX EREs; the {\bf ***} syntax of directors likewise is outside
+the POSIX syntax for both BREs and EREs. 
+
+Many of the ARE extensions are
+borrowed from Perl, but some have been changed to clean them up, and a
+few Perl extensions are not present. Incompatibilities of note include `{\bf $\backslash$b}',
+`{\bf $\backslash$B}', the lack of special treatment for a trailing newline, the addition of
+complemented bracket expressions to the things affected by newline-sensitive
+matching, the restrictions on parentheses and back references in lookahead
+constraints, and the longest/shortest-match (rather than first-match) matching
+semantics. 
+
+The matching rules for REs containing both normal and non-greedy
+quantifiers have changed since early beta-test versions of this package.
+(The new rules are much simpler and cleaner, but don't work as hard at guessing
+the user's real intentions.) 
+
+Henry Spencer's original 1986 {\it regexp} package, still in widespread use,
+%(e.g., in pre-8.1 releases of Tcl),
+implemented an early version of today's EREs. There are four incompatibilities between {\it regexp}'s
+near-EREs (`RREs' for short) and AREs. In roughly increasing order of significance:
+\begin{itemize}
+\item
+In AREs, {\bf $\backslash$} followed by an alphanumeric character is either an escape or
+an error, while in RREs, it was just another way of writing the  alphanumeric.
+This should not be a problem because there was no reason to write such
+a sequence in RREs. 
+
+\item%
+{\bf \{} followed by a digit in an ARE is the beginning of
+a bound, while in RREs, {\bf \{} was always an ordinary character. Such sequences
+should be rare, and will often result in an error because following characters
+will not look like a valid bound. 
+
+\item%
+In AREs, {\bf $\backslash$} remains a special character
+within `{\bf $[]$}', so a literal {\bf $\backslash$} within {\bf $[]$} must be
+written `{\bf $\backslash\backslash$}'. {\bf $\backslash\backslash$} also gives a literal
+ {\bf $\backslash$} within {\bf $[]$} in RREs, but only truly paranoid programmers routinely doubled
+the backslash. 
+
+\item%
+AREs report the longest/shortest match for the RE, rather
+than the first found in a specified search order. This may affect some RREs
+which were written in the expectation that the first match would be reported.
+(The careful crafting of RREs to optimize the search order for fast matching
+is obsolete (AREs examine all possible matches in parallel, and their performance
+is largely insensitive to their complexity) but cases where the search
+order was exploited to deliberately  find a match which was {\it not} the longest/shortest
+will need rewriting.)  
+\end{itemize}
+
+
+\subsection{Basic Regular Expressions}\label{wxresynbre}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+BREs differ from EREs in
+several respects.  `{\bf $|$}', `{\bf +}', and {\bf ?} are ordinary characters and there is no equivalent
+for their functionality. The delimiters for bounds
+are {\bf $\backslash$\{} and `{\bf $\backslash$\}}', with {\bf \{} and
+ {\bf \}} by themselves ordinary characters. The parentheses for nested subexpressions
+are {\bf $\backslash$(} and `{\bf $\backslash$)}', with {\bf (} and {\bf )} by themselves
+ordinary characters. {\bf $^$} is an ordinary
+character except at the beginning of the RE or the beginning of a parenthesized
+subexpression, {\bf \$} is an ordinary character except at the end of the RE or
+the end of a parenthesized subexpression, and {\bf *} is an ordinary character
+if it appears at the beginning of the RE or the beginning of a parenthesized
+subexpression (after a possible leading `{\bf $^$}'). Finally, single-digit back references
+are available, and {\bf $\backslash<$} and {\bf $\backslash>$} are synonyms
+for {\bf $[[:<:]]$} and {\bf $[[:>:]]$} respectively;
+no other escapes are available.  
+
+
+\subsection{Regular Expression Character Names}\label{wxresynchars}
+
+\helpref{Syntax of the builtin regular expression library}{wxresyn}
+
+Note that the character names are case sensitive.
+
+\begin{twocollist}
+\twocolitem{NUL}{'$\backslash$0'}
+\twocolitem{SOH}{'$\backslash$001'}
+\twocolitem{STX}{'$\backslash$002'}
+\twocolitem{ETX}{'$\backslash$003'}
+\twocolitem{EOT}{'$\backslash$004'}
+\twocolitem{ENQ}{'$\backslash$005'}
+\twocolitem{ACK}{'$\backslash$006'}
+\twocolitem{BEL}{'$\backslash$007'}
+\twocolitem{alert}{'$\backslash$007'}
+\twocolitem{BS}{'$\backslash$010'}
+\twocolitem{backspace}{'$\backslash$b'}
+\twocolitem{HT}{'$\backslash$011'}
+\twocolitem{tab}{'$\backslash$t'}
+\twocolitem{LF}{'$\backslash$012'}
+\twocolitem{newline}{'$\backslash$n'}
+\twocolitem{VT}{'$\backslash$013'}
+\twocolitem{vertical-tab}{'$\backslash$v'}
+\twocolitem{FF}{'$\backslash$014'}
+\twocolitem{form-feed}{'$\backslash$f'}
+\twocolitem{CR}{'$\backslash$015'}
+\twocolitem{carriage-return}{'$\backslash$r'}
+\twocolitem{SO}{'$\backslash$016'}
+\twocolitem{SI}{'$\backslash$017'}
+\twocolitem{DLE}{'$\backslash$020'}
+\twocolitem{DC1}{'$\backslash$021'}
+\twocolitem{DC2}{'$\backslash$022'}
+\twocolitem{DC3}{'$\backslash$023'}
+\twocolitem{DC4}{'$\backslash$024'}
+\twocolitem{NAK}{'$\backslash$025'}
+\twocolitem{SYN}{'$\backslash$026'}
+\twocolitem{ETB}{'$\backslash$027'}
+\twocolitem{CAN}{'$\backslash$030'}
+\twocolitem{EM}{'$\backslash$031'}
+\twocolitem{SUB}{'$\backslash$032'}
+\twocolitem{ESC}{'$\backslash$033'}
+\twocolitem{IS4}{'$\backslash$034'}
+\twocolitem{FS}{'$\backslash$034'}
+\twocolitem{IS3}{'$\backslash$035'}
+\twocolitem{GS}{'$\backslash$035'}
+\twocolitem{IS2}{'$\backslash$036'}
+\twocolitem{RS}{'$\backslash$036'}
+\twocolitem{IS1}{'$\backslash$037'}
+\twocolitem{US}{'$\backslash$037'}
+\twocolitem{space}{' '}
+\twocolitem{exclamation-mark}{'!'}
+\twocolitem{quotation-mark}{'"'}
+\twocolitem{number-sign}{'\#'}
+\twocolitem{dollar-sign}{'\$'}
+\twocolitem{percent-sign}{'\%'}
+\twocolitem{ampersand}{'\&'}
+\twocolitem{apostrophe}{'$\backslash$''}
+\twocolitem{left-parenthesis}{'('}
+\twocolitem{right-parenthesis}{')'}
+\twocolitem{asterisk}{'*'}
+\twocolitem{plus-sign}{'+'}
+\twocolitem{comma}{','}
+\twocolitem{hyphen}{'-'}
+\twocolitem{hyphen-minus}{'-'}
+\twocolitem{period}{'.'}
+\twocolitem{full-stop}{'.'}
+\twocolitem{slash}{'/'}
+\twocolitem{solidus}{'/'}
+\twocolitem{zero}{'0'}
+\twocolitem{one}{'1'}
+\twocolitem{two}{'2'}
+\twocolitem{three}{'3'}
+\twocolitem{four}{'4'}
+\twocolitem{five}{'5'}
+\twocolitem{six}{'6'}
+\twocolitem{seven}{'7'}
+\twocolitem{eight}{'8'}
+\twocolitem{nine}{'9'}
+\twocolitem{colon}{':'}
+\twocolitem{semicolon}{';'}
+\twocolitem{less-than-sign}{'<'}
+\twocolitem{equals-sign}{'='}
+\twocolitem{greater-than-sign}{'>'}
+\twocolitem{question-mark}{'?'}
+\twocolitem{commercial-at}{'@'}
+\twocolitem{left-square-bracket}{'$[$'}
+\twocolitem{backslash}{'$\backslash$'}
+\twocolitem{reverse-solidus}{'$\backslash$'}
+\twocolitem{right-square-bracket}{'$]$'}
+\twocolitem{circumflex}{'$^$'}
+\twocolitem{circumflex-accent}{'$^$'}
+\twocolitem{underscore}{'\_'}
+\twocolitem{low-line}{'\_'}
+\twocolitem{grave-accent}{'`'}
+\twocolitem{left-brace}{'\{'}
+\twocolitem{left-curly-bracket}{'\{'}
+\twocolitem{vertical-line}{'$|$'}
+\twocolitem{right-brace}{'\}'}
+\twocolitem{right-curly-bracket}{'\}'}
+\twocolitem{tilde}{'$~$'}
+\twocolitem{DEL}{'$\backslash$177'}
+\end{twocollist}
diff --git a/docs/latex/wx/regex.tex b/docs/latex/wx/regex.tex

index 025d01e411db896956b4ad1eb45e9fb18e784f14..89542cff7c45cc9090988d298122cf4802d98614 100644 (file)
--- a/docs/latex/wx/regex.tex
+++ b/docs/latex/wx/regex.tex
@@ -11,14 +11,29 @@
  
  \section{\class{wxRegEx}}\label{wxregex}
  
-wxRegEx represents a regular expression. The regular expressions syntax
-supported is the POSIX one. Both basic and extended regular expressions are
-supported but, unlike POSIX C API, the extended ones are used by default.
-
-This class provides support for regular expressions matching and also
-replacement. It is built on top of either the system library (if it has support
-for POSIX regular expressions - which is the case of the most modern Unices) or
-uses a version of Henry Spencer's library from tcl.
+wxRegEx represents a regular expression.  This class provides support
+for regular expression matching and replacement.
+
+It is built on top of either the system library (if it has support
+for POSIX regular expressions, which is the case for most modern
+Unices) or the builtin version of Henry Spencer's library.  Henry Spencer
+would appreciate being given credit in the documentation of software
+which uses his library, but that is not a requirement.
+
+Regular expressions, as defined by POSIX, come in two flavours: {\it extended}
+and {\it basic}.  The builtin library also adds a third flavour
+of expression, \helpref{advanced}{wxresyn}, which is not available
+when using the system library.
+
+Unicode is fully supported only when using the builtin library.
+When using the system library in Unicode mode, the expressions and data
+are translated to the default 8-bit encoding before being passed to
+the library.
+
+On platforms where a system library is available, the default is to use
+the builtin library for Unicode builds, and the system library otherwise.
+It is possible to use the other one, if preferred, by selecting it when
+building wxWindows.
  
  \wxheading{Derived from}
  
@@ -31,8 +46,13 @@ Flags for regex compilation to be used with \helpref{Compile()}{wxregexcompile}:
  \begin{verbatim}
  enum
  {
-    // use extended regex syntax (default)
+    // use extended regex syntax
      wxRE_EXTENDED = 0,
+    
+    // use advanced RE syntax (built-in regex only)
+#ifdef wxHAS_REGEX_ADVANCED
+    wxRE_ADVANCED = 1,
+#endif
  
      // use basic RE syntax
      wxRE_BASIC    = 2,
diff --git a/docs/latex/wx/topics.tex b/docs/latex/wx/topics.tex

index bae331270e2fe63458622ecda5179c89f413cd31..89eca1846121cfb7a53e606379f5ccf4294eabc3 100644 (file)
--- a/docs/latex/wx/topics.tex
+++ b/docs/latex/wx/topics.tex
@@ -58,4 +58,4 @@ This chapter contains a selection of topic overviews, first things first:
  \input wxhtml.tex
  \input tenvvars.tex
  \input wxPython.tex
-
+\input re_syntax.tex
diff --git a/setup.h.in b/setup.h.in

index fb78a521b121e30de4bc8e86bc3418f14e0011fc..8c7a2988aeb610152a69c07ac3b03ad2e105ffa7 100644 (file)
--- a/setup.h.in
+++ b/setup.h.in
@@ -182,6 +182,12 @@
   * Use regex support
   */
  #define wxUSE_REGEX 0
+/*
+ * The built-in regex supports advanced REs in addition to POSIX's basic
+ * and extended. Your system regex probably won't support this, and in this
+ * case WX_NO_REGEX_ADVANCED should be defined.
+ */
+#undef WX_NO_REGEX_ADVANCED
  /*
   * Use XML support
   */
diff --git a/setup.h_vms b/setup.h_vms

index e533df83e8b18304dcf06f722499dc6afbe779aa..30d52a5731846f9f9afb19eaa75eac06a989c707 100644 (file)
--- a/setup.h_vms
+++ b/setup.h_vms
@@ -191,6 +191,12 @@
   * Use regex support
   */
  #define wxUSE_REGEX 0
+/*
+ * The built-in regex supports advanced REs in addition to POSIX's basic
+ * and extended. Your system regex probably won't support this, and in this
+ * case WX_NO_REGEX_ADVANCED should be defined.
+ */
+#undef WX_NO_REGEX_ADVANCED
  /*
   * Use XML support
   */
author	Václav Slavík <vslavik@fastmail.fm>
	Thu, 19 Feb 2004 18:02:48 +0000 (18:02 +0000)
committer	Václav Slavík <vslavik@fastmail.fm>
	Thu, 19 Feb 2004 18:02:48 +0000 (18:02 +0000)
autoconf_inc.m4		patch \| blob \| blame \| history
docs/latex/wx/re_syntax.tex	[new file with mode: 0644]	patch \| blob
docs/latex/wx/regex.tex		patch \| blob \| blame \| history
docs/latex/wx/topics.tex		patch \| blob \| blame \| history
setup.h.in		patch \| blob \| blame \| history
setup.h_vms		patch \| blob \| blame \| history