Tcl regex lib

author Ryan Norton <wxprojects@comcast.net>

Thu, 5 Aug 1999 01:16:55 +0000 (01:16 +0000)

committer Ryan Norton <wxprojects@comcast.net>

Thu, 5 Aug 1999 01:16:55 +0000 (01:16 +0000)
author Ryan Norton <wxprojects@comcast.net>
Thu, 5 Aug 1999 01:16:55 +0000 (01:16 +0000)
committer Ryan Norton <wxprojects@comcast.net>
Thu, 5 Aug 1999 01:16:55 +0000 (01:16 +0000)
diff --git a/src/regex/re_syntax.n b/src/regex/re_syntax.n

new file mode 100644 (file)

index 0000000..f37bb85
--- /dev/null
+++ b/src/regex/re_syntax.n
@@ -0,0 +1,970 @@
+'\"
+'\" Copyright (c) 1998 Sun Microsystems, Inc.
+'\" Copyright (c) 1999 Scriptics Corporation
+'\"
+'\" This software is copyrighted by the Regents of the University of
+'\" California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+'\" Corporation and other parties.  The following terms apply to all files
+'\" associated with the software unless explicitly disclaimed in
+'\" individual files.
+'\" 
+'\" The authors hereby grant permission to use, copy, modify, distribute,
+'\" and license this software and its documentation for any purpose, provided
+'\" that existing copyright notices are retained in all copies and that this
+'\" notice is included verbatim in any distributions. No written agreement,
+'\" license, or royalty fee is required for any of the authorized uses.
+'\" Modifications to this software may be copyrighted by their authors
+'\" and need not follow the licensing terms described here, provided that
+'\" the new terms are clearly indicated on the first page of each file where
+'\" they apply.
+'\" 
+'\" IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+'\" FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+'\" ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+'\" DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+'\" POSSIBILITY OF SUCH DAMAGE.
+'\" 
+'\" THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+'\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+'\" FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+'\" IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+'\" NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+'\" MODIFICATIONS.
+'\" 
+'\" GOVERNMENT USE: If you are acquiring this software on behalf of the
+'\" U.S. government, the Government shall have only "Restricted Rights"
+'\" in the software and related documentation as defined in the Federal 
+'\" Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+'\" are acquiring the software on behalf of the Department of Defense, the
+'\" software shall be classified as "Commercial Computer Software" and the
+'\" Government shall have only "Restricted Rights" as defined in Clause
+'\" 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+'\" authors grant the U.S. Government and others acting in its behalf
+'\" permission to use and distribute the software in accordance with the
+'\" terms specified in this license. 
+'\" 
+'\" RCS: @(#) Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp 
+'\"
+.so man.macros
+.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
+.BS
+.SH NAME
+re_syntax \- Syntax of Tcl regular expressions.
+.BE
+
+.SH DESCRIPTION
+.PP
+A \fIregular expression\fR describes strings of characters.
+It's a pattern that matches certain strings and doesn't match others.
+
+.SH "DIFFERENT FLAVORS OF REs"
+Regular expressions (``RE''s), as defined by POSIX, come in two
+flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs'').
+EREs are roughly those of the traditional \fIegrep\fR, while BREs are
+roughly those of the traditional \fIed\fR.  This implementation adds
+a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with
+some significant extensions.
+.PP
+This manual page primarily describes AREs.  BREs mostly exist for
+backward compatibility in some old programs; they will be discussed at
+the end.  POSIX EREs are almost an exact subset of AREs.  Features of
+AREs that are not present in EREs will be indicated.
+
+.SH "REGULAR EXPRESSION SYNTAX"
+.PP
+Tcl regular expressions are implemented using the package written by
+Henry Spencer, based on the 1003.2 spec and some (not quite all) of
+the Perl5 extensions (thanks, Henry!).  Much of the description of
+regular expressions below is copied verbatim from his manual entry.
+.PP
+An ARE is one or more \fIbranches\fR,
+separated by `\fB|\fR',
+matching anything that matches any of the branches.
+.PP
+A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
+concatenated.
+It matches a match for the first, followed by a match for the second, etc;
+an empty branch matches the empty string.
+.PP
+A quantified atom is an \fIatom\fR possibly followed
+by a single \fIquantifier\fR.
+Without a quantifier, it matches a match for the atom.
+The quantifiers,
+and what a so-quantified atom matches, are:
+.RS 2
+.TP 6
+\fB*\fR
+a sequence of 0 or more matches of the atom
+.TP
+\fB+\fR
+a sequence of 1 or more matches of the atom
+.TP
+\fB?\fR
+a sequence of 0 or 1 matches of the atom
+.TP
+\fB{\fIm\fB}\fR
+a sequence of exactly \fIm\fR matches of the atom
+.TP
+\fB{\fIm\fB,}\fR
+a sequence of \fIm\fR or more matches of the atom
+.TP
+\fB{\fIm\fB,\fIn\fB}\fR
+a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
+\fIm\fR may not exceed \fIn\fR
+.TP
+\fB*?  +?  ??  {\fIm\fB}?  {\fIm\fB,}?  {\fIm\fB,\fIn\fB}?\fR
+\fInon-greedy\fR quantifiers,
+which match the same possibilities,
+but prefer the smallest number rather than the largest number
+of matches (see MATCHING)
+.RE
+.PP
+The forms using
+\fB{\fR and \fB}\fR
+are known as \fIbound\fRs.
+The numbers
+\fIm\fR and \fIn\fR are unsigned decimal integers
+with permissible values from 0 to 255 inclusive.
+.PP
+An atom is one of:
+.RS 2
+.TP 6
+\fB(\fIre\fB)\fR
+(where \fIre\fR is any regular expression)
+matches a match for
+\fIre\fR, with the match noted for possible reporting
+.TP
+\fB(?:\fIre\fB)\fR
+as previous,
+but does no reporting
+(a ``non-capturing'' set of parentheses)
+.TP
+\fB()\fR
+matches an empty string,
+noted for possible reporting
+.TP
+\fB(?:)\fR
+matches an empty string,
+without reporting
+.TP
+\fB[\fIchars\fB]\fR
+a \fIbracket expression\fR,
+matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail)
+.TP
+ \fB.\fR
+matches any single character
+.TP
+\fB\e\fIk\fR
+(where \fIk\fR is a non-alphanumeric character)
+matches that character taken as an ordinary character,
+e.g. \e\e matches a backslash character
+.TP
+\fB\e\fIc\fR
+where \fIc\fR is alphanumeric
+(possibly followed by other characters),
+an \fIescape\fR (AREs only),
+see ESCAPES below
+.TP
+\fB{\fR
+when followed by a character other than a digit,
+matches the left-brace character `\fB{\fR';
+when followed by a digit, it is the beginning of a
+\fIbound\fR (see above)
+.TP
+\fIx\fR
+where \fIx\fR is
+a single character with no other significance, matches that character.
+.RE
+.PP
+A \fIconstraint\fR matches an empty string when specific conditions
+are met.
+A constraint may not be followed by a quantifier.
+The simple constraints are as follows; some more constraints are
+described later, under ESCAPES.
+.RS 2
+.TP 8
+\fB^\fR
+matches at the beginning of a line
+.TP
+\fB$\fR
+matches at the end of a line
+.TP
+\fB(?=\fIre\fB)\fR
+\fIpositive lookahead\fR (AREs only), matches at any point
+where a substring matching \fIre\fR begins
+.TP
+\fB(?!\fIre\fB)\fR
+\fInegative lookahead\fR (AREs only), matches at any point
+where no substring matching \fIre\fR begins
+.RE
+.PP
+The lookahead constraints may not contain back references (see later),
+and all parentheses within them are considered non-capturing.
+.PP
+An RE may not end with `\fB\e\fR'.
+
+.SH "BRACKET EXPRESSIONS"
+A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'.
+It normally matches any single character from the list (but see below).
+If the list begins with `\fB^\fR',
+it matches any single character
+(but see below) \fInot\fR from the rest of the list.
+.PP
+If two characters in the list are separated by `\fB\-\fR',
+this is shorthand
+for the full \fIrange\fR of characters between those two (inclusive) in the
+collating sequence,
+e.g.
+\fB[0\-9]\fR
+in ASCII matches any decimal digit.
+Two ranges may not share an
+endpoint, so e.g.
+\fBa\-c\-e\fR
+is illegal.
+Ranges are very collating-sequence-dependent,
+and portable programs should avoid relying on them.
+.PP
+To include a literal
+\fB]\fR
+or
+\fB\-\fR
+in the list,
+the simplest method is to
+enclose it in
+\fB[.\fR and \fB.]\fR
+to make it a collating element (see below).
+Alternatively,
+make it the first character
+(following a possible `\fB^\fR'),
+or (AREs only) precede it with `\fB\e\fR'.
+Alternatively, for `\fB\-\fR',
+make it the last character,
+or the second endpoint of a range.
+To use a literal
+\fB\-\fR
+as the first endpoint of a range,
+make it a collating element
+or (AREs only) precede it with `\fB\e\fR'.
+With the exception of these, some combinations using
+\fB[\fR
+(see next
+paragraphs), and escapes,
+all other special characters lose their
+special significance within a bracket expression.
+.PP
+Within a bracket expression, a collating element (a character,
+a multi-character sequence that collates as if it were a single character,
+or a collating-sequence name for either)
+enclosed in
+\fB[.\fR and \fB.]\fR
+stands for the
+sequence of characters of that collating element.
+The sequence is a single element of the bracket expression's list.
+A bracket expression in a locale that has
+multi-character collating elements
+can thus match more than one character.
+.VS 8.2
+So (insidiously), a bracket expression that starts with \fB^\fR
+can match multi-character collating elements even if none of them
+appear in the bracket expression!
+(\fINote:\fR Tcl currently has no multi-character collating elements.
+This information is only for illustration.)
+.PP
+For example, assume the collating sequence includes a \fBch\fR
+multi-character collating element.
+Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP)
+matches the first five characters of `\fBchchcc\fR'.
+Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR'
+(because \fB[^c]\fR matches the multi-character \fBch\fR).
+.VE 8.2
+.PP
+Within a bracket expression, a collating element enclosed in
+\fB[=\fR
+and
+\fB=]\fR
+is an equivalence class, standing for the sequences of characters
+of all collating elements equivalent to that one, including itself.
+(If there are no other equivalent collating elements,
+the treatment is as if the enclosing delimiters were `\fB[.\fR'\&
+and `\fB.]\fR'.)
+For example, if
+\fBo\fR
+and
+\fB\o'o^'\fR
+are the members of an equivalence class,
+then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR',
+and `\fB[o\o'o^']\fR'\&
+are all synonymous.
+An equivalence class may not be an endpoint
+of a range.
+.VS 8.2
+(\fINote:\fR 
+Tcl currently implements only the Unicode locale.
+It doesn't define any equivalence classes.
+The examples above are just illustrations.)
+.VE 8.2
+.PP
+Within a bracket expression, the name of a \fIcharacter class\fR enclosed
+in
+\fB[:\fR
+and
+\fB:]\fR
+stands for the list of all characters
+(not all collating elements!)
+belonging to that
+class.
+Standard character classes are:
+.PP
+.RS
+.ne 5
+.nf
+.ta 3c
+\fBalpha\fR    A letter. 
+\fBupper\fR    An upper-case letter. 
+\fBlower\fR    A lower-case letter. 
+\fBdigit\fR    A decimal digit. 
+\fBxdigit\fR   A hexadecimal digit. 
+\fBalnum\fR    An alphanumeric (letter or digit). 
+\fBprint\fR    An alphanumeric (same as alnum).
+\fBblank\fR    A space or tab character.
+\fBspace\fR    A character producing white space in displayed text. 
+\fBpunct\fR    A punctuation character. 
+\fBgraph\fR    A character with a visible representation. 
+\fBcntrl\fR    A control character. 
+.fi
+.RE
+.PP
+A locale may provide others.
+.VS 8.2
+(Note that the current Tcl implementation has only one locale:
+the Unicode locale.)
+.VE 8.2
+A character class may not be used as an endpoint of a range.
+.PP
+There are two special cases of bracket expressions:
+the bracket expressions
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+are constraints, matching empty strings at
+the beginning and end of a word respectively.
+'\" note, discussion of escapes below references this definition of word
+A word is defined as a sequence of
+word characters
+that is neither preceded nor followed by
+word characters.
+A word character is an
+\fIalnum\fR
+character
+or an underscore
+(\fB_\fR).
+These special bracket expressions are deprecated;
+users of AREs should use constraint escapes instead (see below).
+.SH ESCAPES
+Escapes (AREs only), which begin with a
+\fB\e\fR
+followed by an alphanumeric character,
+come in several varieties:
+character entry, class shorthands, constraint escapes, and back references.
+A
+\fB\e\fR
+followed by an alphanumeric character but not constituting
+a valid escape is illegal in AREs.
+In EREs, there are no escapes:
+outside a bracket expression,
+a
+\fB\e\fR
+followed by an alphanumeric character merely stands for that
+character as an ordinary character,
+and inside a bracket expression,
+\fB\e\fR
+is an ordinary character.
+(The latter is the one actual incompatibility between EREs and AREs.)
+.PP
+Character-entry escapes (AREs only) exist to make it easier to specify
+non-printing and otherwise inconvenient characters in REs:
+.RS 2
+.TP 5
+\fB\ea\fR
+alert (bell) character, as in C
+.TP
+\fB\eb\fR
+backspace, as in C
+.TP
+\fB\eB\fR
+synonym for
+\fB\e\fR
+to help reduce backslash doubling in some
+applications where there are multiple levels of backslash processing
+.TP
+\fB\ec\fIX\fR
+(where X is any character) the character whose
+low-order 5 bits are the same as those of
+\fIX\fR,
+and whose other bits are all zero
+.TP
+\fB\ee\fR
+the character whose collating-sequence name
+is `\fBESC\fR',
+or failing that, the character with octal value 033
+.TP
+\fB\ef\fR
+formfeed, as in C
+.TP
+\fB\en\fR
+newline, as in C
+.TP
+\fB\er\fR
+carriage return, as in C
+.TP
+\fB\et\fR
+horizontal tab, as in C
+.TP
+\fB\eu\fIwxyz\fR
+(where
+\fIwxyz\fR
+is exactly four hexadecimal digits)
+the Unicode character
+\fBU+\fIwxyz\fR
+in the local byte ordering
+.TP
+\fB\eU\fIstuvwxyz\fR
+(where
+\fIstuvwxyz\fR
+is exactly eight hexadecimal digits)
+reserved for a somewhat-hypothetical Unicode extension to 32 bits
+.TP
+\fB\ev\fR
+vertical tab, as in C
+are all available.
+.TP
+\fB\ex\fIhhh\fR
+(where
+\fIhhh\fR
+is any sequence of hexadecimal digits)
+the character whose hexadecimal value is
+\fB0x\fIhhh\fR
+(a single character no matter how many hexadecimal digits are used).
+.TP
+\fB\e0\fR
+the character whose value is
+\fB0\fR
+.TP
+\fB\e\fIxy\fR
+(where
+\fIxy\fR
+is exactly two octal digits,
+and is not a
+\fIback reference\fR (see below))
+the character whose octal value is
+\fB0\fIxy\fR
+.TP
+\fB\e\fIxyz\fR
+(where
+\fIxyz\fR
+is exactly three octal digits,
+and is not a
+back reference (see below))
+the character whose octal value is
+\fB0\fIxyz\fR
+.RE
+.PP
+Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR',
+and `\fBA\fR'-`\fBF\fR'.
+Octal digits are `\fB0\fR'-`\fB7\fR'.
+.PP
+The character-entry escapes are always taken as ordinary characters.
+For example,
+\fB\e135\fR
+is
+\fB]\fR
+in ASCII,
+but
+\fB\e135\fR
+does not terminate a bracket expression.
+Beware, however, that some applications (e.g., C compilers) interpret 
+such sequences themselves before the regular-expression package
+gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'.
+.PP
+Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used
+character classes:
+.RS 2
+.TP 10
+\fB\ed\fR
+\fB[[:digit:]]\fR
+.TP
+\fB\es\fR
+\fB[[:space:]]\fR
+.TP
+\fB\ew\fR
+\fB[[:alnum:]_]\fR
+(note underscore)
+.TP
+\fB\eD\fR
+\fB[^[:digit:]]\fR
+.TP
+\fB\eS\fR
+\fB[^[:space:]]\fR
+.TP
+\fB\eW\fR
+\fB[^[:alnum:]_]\fR
+(note underscore)
+.RE
+.PP
+Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
+and `\fB\ew\fR'\&
+lose their outer brackets,
+and `\fB\eD\fR', `\fB\eS\fR',
+and `\fB\eW\fR'\&
+are illegal.
+.VS 8.2
+(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
+Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
+.VE 8.2
+.PP
+A constraint escape (AREs only) is a constraint,
+matching the empty string if specific conditions are met,
+written as an escape:
+.RS 2
+.TP 6
+\fB\eA\fR
+matches only at the beginning of the string
+(see MATCHING, below, for how this differs from `\fB^\fR')
+.TP
+\fB\em\fR
+matches only at the beginning of a word
+.TP
+\fB\eM\fR
+matches only at the end of a word
+.TP
+\fB\ey\fR
+matches only at the beginning or end of a word
+.TP
+\fB\eY\fR
+matches only at a point that is not the beginning or end of a word
+.TP
+\fB\eZ\fR
+matches only at the end of the string
+(see MATCHING, below, for how this differs from `\fB$\fR')
+.TP
+\fB\e\fIm\fR
+(where
+\fIm\fR
+is a nonzero digit) a \fIback reference\fR, see below
+.TP
+\fB\e\fImnn\fR
+(where
+\fIm\fR
+is a nonzero digit, and
+\fInn\fR
+is some more digits,
+and the decimal value
+\fImnn\fR
+is not greater than the number of closing capturing parentheses seen so far)
+a \fIback reference\fR, see below
+.RE
+.PP
+A word is defined as in the specification of
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+above.
+Constraint escapes are illegal within bracket expressions.
+.PP
+A back reference (AREs only) matches the same string matched by the parenthesized
+subexpression specified by the number,
+so that (e.g.)
+\fB([bc])\e1\fR
+matches
+\fBbb\fR
+or
+\fBcc\fR
+but not `\fBbc\fR'.
+The subexpression must entirely precede the back reference in the RE.
+Subexpressions are numbered in the order of their leading parentheses.
+Non-capturing parentheses do not define subexpressions.
+.PP
+There is an inherent historical ambiguity between octal character-entry 
+escapes and back references, which is resolved by heuristics,
+as hinted at above.
+A leading zero always indicates an octal escape.
+A single non-zero digit, not followed by another digit,
+is always taken as a back reference.
+A multi-digit sequence not starting with a zero is taken as a back 
+reference if it comes after a suitable subexpression
+(i.e. the number is in the legal range for a back reference),
+and otherwise is taken as octal.
+.SH "METASYNTAX"
+In addition to the main syntax described above, there are some special
+forms and miscellaneous syntactic facilities available.
+.PP
+Normally the flavor of RE being used is specified by
+application-dependent means.
+However, this can be overridden by a \fIdirector\fR.
+If an RE of any flavor begins with `\fB***:\fR',
+the rest of the RE is an ARE.
+If an RE of any flavor begins with `\fB***=\fR',
+the rest of the RE is taken to be a literal string,
+with all characters considered ordinary characters.
+.PP
+An ARE may begin with \fIembedded options\fR:
+a sequence
+\fB(?\fIxyz\fB)\fR
+(where
+\fIxyz\fR
+is one or more alphabetic characters)
+specifies options affecting the rest of the RE.
+These supplement, and can override,
+any options specified by the application.
+The available option letters are:
+.RS 2
+.TP 3
+\fBb\fR
+rest of RE is a BRE
+.TP 3
+\fBc\fR
+case-sensitive matching (usual default)
+.TP 3
+\fBe\fR
+rest of RE is an ERE
+.TP 3
+\fBi\fR
+case-insensitive matching (see MATCHING, below)
+.TP 3
+\fBm\fR
+historical synonym for
+\fBn\fR
+.TP 3
+\fBn\fR
+newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBp\fR
+partial newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBq\fR
+rest of RE is a literal (``quoted'') string, all ordinary characters
+.TP 3
+\fBs\fR
+non-newline-sensitive matching (usual default)
+.TP 3
+\fBt\fR
+tight syntax (usual default; see below)
+.TP 3
+\fBw\fR
+inverse partial newline-sensitive (``weird'') matching (see MATCHING, below)
+.TP 3
+\fBx\fR
+expanded syntax (see below)
+.RE
+.PP
+Embedded options take effect at the
+\fB)\fR
+terminating the sequence.
+They are available only at the start of an ARE,
+and may not be used later within it.
+.PP
+In addition to the usual (\fItight\fR) RE syntax, in which all characters are
+significant, there is an \fIexpanded\fR syntax,
+available in all flavors of RE
+with the \fB-expanded\fR switch, or in AREs with the embedded x option.
+In the expanded syntax,
+white-space characters are ignored
+and all characters between a
+\fB#\fR
+and the following newline (or the end of the RE) are ignored,
+permitting paragraphing and commenting a complex RE.
+There are three exceptions to that basic rule:
+.RS 2
+.PP
+a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained
+.PP
+white space or `\fB#\fR' within a bracket expression is retained
+.PP
+white space and comments are illegal within multi-character symbols
+like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR'
+.RE
+.PP
+Expanded-syntax white-space characters are blank, tab, newline, and
+.VS 8.2
+any character that belongs to the \fIspace\fR character class.
+.VE 8.2
+.PP
+Finally, in an ARE,
+outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR'
+(where
+\fIttt\fR
+is any text not containing a `\fB)\fR')
+is a comment,
+completely ignored.
+Again, this is not allowed between the characters of
+multi-character symbols like `\fB(?:\fR'.
+Such comments are more a historical artifact than a useful facility,
+and their use is deprecated;
+use the expanded syntax instead.
+.PP
+\fINone\fR of these metasyntax extensions is available if the application
+(or an initial
+\fB***=\fR
+director)
+has specified that the user's input be treated as a literal string
+rather than as an RE.
+.SH MATCHING
+In the event that an RE could match more than one substring of a given
+string,
+the RE matches the one starting earliest in the string.
+If the RE could match more than one substring starting at that point,
+its choice is determined by its \fIpreference\fR:
+either the longest substring, or the shortest.
+.PP
+Most atoms, and all constraints, have no preference.
+A parenthesized RE has the same preference (possibly none) as the RE.
+A quantified atom with quantifier
+\fB{\fIm\fB}\fR
+or
+\fB{\fIm\fB}?\fR
+has the same preference (possibly none) as the atom itself.
+A quantified atom with other normal quantifiers (including
+\fB{\fIm\fB,\fIn\fB}\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers longest match.
+A quantified atom with other non-greedy quantifiers (including
+\fB{\fIm\fB,\fIn\fB}?\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers shortest match.
+A branch has the same preference as the first quantified atom in it
+which has a preference.
+An RE consisting of two or more branches connected by the
+\fB|\fR
+operator prefers longest match.
+.PP
+Subject to the constraints imposed by the rules for matching the whole RE,
+subexpressions also match the longest or shortest possible substrings,
+based on their preferences,
+with subexpressions starting earlier in the RE taking priority over
+ones starting later.
+Note that outer subexpressions thus take priority over
+their component subexpressions.
+.PP
+Note that the quantifiers
+\fB{1,1}\fR
+and
+\fB{1,1}?\fR
+can be used to force longest and shortest preference, respectively,
+on a subexpression or a whole RE.
+.PP
+Match lengths are measured in characters, not collating elements.
+An empty string is considered longer than no match at all.
+For example,
+\fBbb*\fR
+matches the three middle characters of `\fBabbbc\fR',
+\fB(week|wee)(night|knights)\fR
+matches all ten characters of `\fBweeknights\fR',
+when
+\fB(.*).*\fR
+is matched against
+\fBabc\fR
+the parenthesized subexpression
+matches all three characters, and
+when
+\fB(a*)*\fR
+is matched against
+\fBbc\fR
+both the whole RE and the parenthesized
+subexpression match an empty string.
+.PP
+If case-independent matching is specified,
+the effect is much as if all case distinctions had vanished from the
+alphabet.
+When an alphabetic that exists in multiple cases appears as an
+ordinary character outside a bracket expression, it is effectively
+transformed into a bracket expression containing both cases,
+so that
+\fBx\fR
+becomes `\fB[xX]\fR'.
+When it appears inside a bracket expression, all case counterparts
+of it are added to the bracket expression, so that
+\fB[x]\fR
+becomes
+\fB[xX]\fR
+and
+\fB[^x]\fR
+becomes `\fB[^xX]\fR'.
+.PP
+If newline-sensitive matching is specified, \fB.\fR
+and bracket expressions using
+\fB^\fR
+will never match the newline character
+(so that matches will never cross newlines unless the RE
+explicitly arranges it)
+and
+\fB^\fR
+and
+\fB$\fR
+will match the empty string after and before a newline
+respectively, in addition to matching at beginning and end of string
+respectively.
+ARE
+\fB\eA\fR
+and
+\fB\eZ\fR
+continue to match beginning or end of string \fIonly\fR.
+.PP
+If partial newline-sensitive matching is specified,
+this affects \fB.\fR
+and bracket expressions
+as with newline-sensitive matching, but not
+\fB^\fR
+and `\fB$\fR'.
+.PP
+If inverse partial newline-sensitive matching is specified,
+this affects
+\fB^\fR
+and
+\fB$\fR
+as with
+newline-sensitive matching,
+but not \fB.\fR
+and bracket expressions.
+This isn't very useful but is provided for symmetry.
+.SH "LIMITS AND COMPATIBILITY"
+No particular limit is imposed on the length of REs.
+Programs intended to be highly portable should not employ REs longer
+than 256 bytes,
+as a POSIX-compliant implementation can refuse to accept such REs.
+.PP
+The only feature of AREs that is actually incompatible with
+POSIX EREs is that
+\fB\e\fR
+does not lose its special
+significance inside bracket expressions.
+All other ARE features use syntax which is illegal or has
+undefined or unspecified effects in POSIX EREs;
+the
+\fB***\fR
+syntax of directors likewise is outside the POSIX
+syntax for both BREs and EREs.
+.PP
+Many of the ARE extensions are borrowed from Perl, but some have
+been changed to clean them up, and a few Perl extensions are not present.
+Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
+the lack of special treatment for a trailing newline,
+the addition of complemented bracket expressions to the things
+affected by newline-sensitive matching,
+the restrictions on parentheses and back references in lookahead constraints,
+and the longest/shortest-match (rather than first-match) matching semantics.
+.PP
+The matching rules for REs containing both normal and non-greedy quantifiers
+have changed since early beta-test versions of this package.
+(The new rules are much simpler and cleaner,
+but don't work as hard at guessing the user's real intentions.)
+.PP
+Henry Spencer's original 1986 \fIregexp\fR package,
+still in widespread use (e.g., in pre-8.1 releases of Tcl),
+implemented an early version of today's EREs.
+There are four incompatibilities between \fIregexp\fR's near-EREs
+(`RREs' for short) and AREs.
+In roughly increasing order of significance:
+.PP
+.RS
+In AREs,
+\fB\e\fR
+followed by an alphanumeric character is either an
+escape or an error,
+while in RREs, it was just another way of writing the 
+alphanumeric.
+This should not be a problem because there was no reason to write
+such a sequence in RREs.
+.PP
+\fB{\fR
+followed by a digit in an ARE is the beginning of a bound,
+while in RREs,
+\fB{\fR
+was always an ordinary character.
+Such sequences should be rare,
+and will often result in an error because following characters
+will not look like a valid bound.
+.PP
+In AREs,
+\fB\e\fR
+remains a special character within `\fB[\|]\fR',
+so a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+must be written `\fB\e\e\fR'.
+\fB\e\e\fR
+also gives a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+in RREs,
+but only truly paranoid programmers routinely doubled the backslash.
+.PP
+AREs report the longest/shortest match for the RE,
+rather than the first found in a specified search order.
+This may affect some RREs which were written in the expectation that
+the first match would be reported.
+(The careful crafting of RREs to optimize the search order for fast
+matching is obsolete (AREs examine all possible matches
+in parallel, and their performance is largely insensitive to their
+complexity) but cases where the search order was exploited to deliberately 
+find a match which was \fInot\fR the longest/shortest will need rewriting.)
+.RE
+
+.SH "BASIC REGULAR EXPRESSIONS"
+BREs differ from EREs in several respects.  `\fB|\fR', `\fB+\fR',
+and
+\fB?\fR
+are ordinary characters and there is no equivalent
+for their functionality.
+The delimiters for bounds are
+\fB\e{\fR
+and `\fB\e}\fR',
+with
+\fB{\fR
+and
+\fB}\fR
+by themselves ordinary characters.
+The parentheses for nested subexpressions are
+\fB\e(\fR
+and `\fB\e)\fR',
+with
+\fB(\fR
+and
+\fB)\fR
+by themselves ordinary characters.
+\fB^\fR
+is an ordinary character except at the beginning of the
+RE or the beginning of a parenthesized subexpression,
+\fB$\fR
+is an ordinary character except at the end of the
+RE or the end of a parenthesized subexpression,
+and
+\fB*\fR
+is an ordinary character if it appears at the beginning of the
+RE or the beginning of a parenthesized subexpression
+(after a possible leading `\fB^\fR').
+Finally,
+single-digit back references are available,
+and
+\fB\e<\fR
+and
+\fB\e>\fR
+are synonyms for
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+respectively;
+no other escapes are available.
+
+.SH "SEE ALSO"
+RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
+
+.SH KEYWORDS
+match, regular expression, string
diff --git a/src/regex/regc_nfa.c b/src/regex/regc_nfa.c

new file mode 100644 (file)

index 0000000..cc9f6ea
--- /dev/null
+++ b/src/regex/regc_nfa.c
@@ -0,0 +1,1559 @@
+/*
+ * NFA utilities.
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.     All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header$
+ *
+ *
+ * One or two things that technically ought to be in here
+ * are actually in color.c, thanks to some incestuous relationships in
+ * the color chains.
+ */
+
+#define NISERR()       VISERR(nfa->v)
+#define NERR(e)                VERR(nfa->v, (e))
+
+
+/*
+ * newnfa - set up an NFA
+ */
+static struct nfa *                            /* the NFA, or NULL */
+newnfa(struct vars * v,
+          struct colormap * cm,
+          struct nfa * parent)         /* NULL if primary NFA */
+{
+       struct nfa *nfa;
+
+       nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
+       if (nfa == NULL)
+               return NULL;
+
+       nfa->states = NULL;
+       nfa->slast = NULL;
+       nfa->free = NULL;
+       nfa->nstates = 0;
+       nfa->cm = cm;
+       nfa->v = v;
+       nfa->bos[0] = nfa->bos[1] = COLORLESS;
+       nfa->eos[0] = nfa->eos[1] = COLORLESS;
+       nfa->post = newfstate(nfa, '@');        /* number 0 */
+       nfa->pre = newfstate(nfa, '>');         /* number 1 */
+       nfa->parent = parent;
+
+       nfa->init = newstate(nfa);      /* may become invalid later */
+       nfa->final = newstate(nfa);
+       if (ISERR())
+       {
+               freenfa(nfa);
+               return NULL;
+       }
+       rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
+       newarc(nfa, '^', 1, nfa->pre, nfa->init);
+       newarc(nfa, '^', 0, nfa->pre, nfa->init);
+       rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
+       newarc(nfa, '$', 1, nfa->final, nfa->post);
+       newarc(nfa, '$', 0, nfa->final, nfa->post);
+
+       if (ISERR())
+       {
+               freenfa(nfa);
+               return NULL;
+       }
+       return nfa;
+}
+
+/*
+ * freenfa - free an entire NFA
+ */
+static void
+freenfa(struct nfa * nfa)
+{
+       struct state *s;
+
+       while ((s = nfa->states) != NULL)
+       {
+               s->nins = s->nouts = 0; /* don't worry about arcs */
+               freestate(nfa, s);
+       }
+       while ((s = nfa->free) != NULL)
+       {
+               nfa->free = s->next;
+               destroystate(nfa, s);
+       }
+
+       nfa->slast = NULL;
+       nfa->nstates = -1;
+       nfa->pre = NULL;
+       nfa->post = NULL;
+       FREE(nfa);
+}
+
+/*
+ * newstate - allocate an NFA state, with zero flag value
+ */
+static struct state *                  /* NULL on error */
+newstate(struct nfa * nfa)
+{
+       struct state *s;
+
+       if (nfa->free != NULL)
+       {
+               s = nfa->free;
+               nfa->free = s->next;
+       }
+       else
+       {
+               s = (struct state *) MALLOC(sizeof(struct state));
+               if (s == NULL)
+               {
+                       NERR(REG_ESPACE);
+                       return NULL;
+               }
+               s->oas.next = NULL;
+               s->free = NULL;
+               s->noas = 0;
+       }
+
+       assert(nfa->nstates >= 0);
+       s->no = nfa->nstates++;
+       s->flag = 0;
+       if (nfa->states == NULL)
+               nfa->states = s;
+       s->nins = 0;
+       s->ins = NULL;
+       s->nouts = 0;
+       s->outs = NULL;
+       s->tmp = NULL;
+       s->next = NULL;
+       if (nfa->slast != NULL)
+       {
+               assert(nfa->slast->next == NULL);
+               nfa->slast->next = s;
+       }
+       s->prev = nfa->slast;
+       nfa->slast = s;
+       return s;
+}
+
+/*
+ * newfstate - allocate an NFA state with a specified flag value
+ */
+static struct state *                  /* NULL on error */
+newfstate(struct nfa * nfa, int flag)
+{
+       struct state *s;
+
+       s = newstate(nfa);
+       if (s != NULL)
+               s->flag = (char) flag;
+       return s;
+}
+
+/*
+ * dropstate - delete a state's inarcs and outarcs and free it
+ */
+static void
+dropstate(struct nfa * nfa,
+                 struct state * s)
+{
+       struct arc *a;
+
+       while ((a = s->ins) != NULL)
+               freearc(nfa, a);
+       while ((a = s->outs) != NULL)
+               freearc(nfa, a);
+       freestate(nfa, s);
+}
+
+/*
+ * freestate - free a state, which has no in-arcs or out-arcs
+ */
+static void
+freestate(struct nfa * nfa,
+                 struct state * s)
+{
+       assert(s != NULL);
+       assert(s->nins == 0 && s->nouts == 0);
+
+       s->no = FREESTATE;
+       s->flag = 0;
+       if (s->next != NULL)
+               s->next->prev = s->prev;
+       else
+       {
+               assert(s == nfa->slast);
+               nfa->slast = s->prev;
+       }
+       if (s->prev != NULL)
+               s->prev->next = s->next;
+       else
+       {
+               assert(s == nfa->states);
+               nfa->states = s->next;
+       }
+       s->prev = NULL;
+       s->next = nfa->free;            /* don't delete it, put it on the free
+                                                                * list */
+       nfa->free = s;
+}
+
+/*
+ * destroystate - really get rid of an already-freed state
+ */
+static void
+destroystate(struct nfa * nfa,
+                        struct state * s)
+{
+       struct arcbatch *ab;
+       struct arcbatch *abnext;
+
+       assert(s->no == FREESTATE);
+       for (ab = s->oas.next; ab != NULL; ab = abnext)
+       {
+               abnext = ab->next;
+               FREE(ab);
+       }
+       s->ins = NULL;
+       s->outs = NULL;
+       s->next = NULL;
+       FREE(s);
+}
+
+/*
+ * newarc - set up a new arc within an NFA
+ */
+static void
+newarc(struct nfa * nfa,
+          int t,
+          pcolor co,
+          struct state * from,
+          struct state * to)
+{
+       struct arc *a;
+
+       assert(from != NULL && to != NULL);
+
+       /* check for duplicates */
+       for (a = from->outs; a != NULL; a = a->outchain)
+               if (a->to == to && a->co == co && a->type == t)
+                       return;
+
+       a = allocarc(nfa, from);
+       if (NISERR())
+               return;
+       assert(a != NULL);
+
+       a->type = t;
+       a->co = (color) co;
+       a->to = to;
+       a->from = from;
+
+       /*
+        * Put the new arc on the beginning, not the end, of the chains. Not
+        * only is this easier, it has the very useful side effect that
+        * deleting the most-recently-added arc is the cheapest case rather
+        * than the most expensive one.
+        */
+       a->inchain = to->ins;
+       to->ins = a;
+       a->outchain = from->outs;
+       from->outs = a;
+
+       from->nouts++;
+       to->nins++;
+
+       if (COLORED(a) && nfa->parent == NULL)
+               colorchain(nfa->cm, a);
+
+       return;
+}
+
+/*
+ * allocarc - allocate a new out-arc within a state
+ */
+static struct arc *                            /* NULL for failure */
+allocarc(struct nfa * nfa,
+                struct state * s)
+{
+       struct arc *a;
+       struct arcbatch *new;
+       int                     i;
+
+       /* shortcut */
+       if (s->free == NULL && s->noas < ABSIZE)
+       {
+               a = &s->oas.a[s->noas];
+               s->noas++;
+               return a;
+       }
+
+       /* if none at hand, get more */
+       if (s->free == NULL)
+       {
+               new = (struct arcbatch *) MALLOC(sizeof(struct arcbatch));
+               if (new == NULL)
+               {
+                       NERR(REG_ESPACE);
+                       return NULL;
+               }
+               new->next = s->oas.next;
+               s->oas.next = new;
+
+               for (i = 0; i < ABSIZE; i++)
+               {
+                       new->a[i].type = 0;
+                       new->a[i].freechain = &new->a[i + 1];
+               }
+               new->a[ABSIZE - 1].freechain = NULL;
+               s->free = &new->a[0];
+       }
+       assert(s->free != NULL);
+
+       a = s->free;
+       s->free = a->freechain;
+       return a;
+}
+
+/*
+ * freearc - free an arc
+ */
+static void
+freearc(struct nfa * nfa,
+               struct arc * victim)
+{
+       struct state *from = victim->from;
+       struct state *to = victim->to;
+       struct arc *a;
+
+       assert(victim->type != 0);
+
+       /* take it off color chain if necessary */
+       if (COLORED(victim) && nfa->parent == NULL)
+               uncolorchain(nfa->cm, victim);
+
+       /* take it off source's out-chain */
+       assert(from != NULL);
+       assert(from->outs != NULL);
+       a = from->outs;
+       if (a == victim)                        /* simple case:  first in chain */
+               from->outs = victim->outchain;
+       else
+       {
+               for (; a != NULL && a->outchain != victim; a = a->outchain)
+                       continue;
+               assert(a != NULL);
+               a->outchain = victim->outchain;
+       }
+       from->nouts--;
+
+       /* take it off target's in-chain */
+       assert(to != NULL);
+       assert(to->ins != NULL);
+       a = to->ins;
+       if (a == victim)                        /* simple case:  first in chain */
+               to->ins = victim->inchain;
+       else
+       {
+               for (; a != NULL && a->inchain != victim; a = a->inchain)
+                       continue;
+               assert(a != NULL);
+               a->inchain = victim->inchain;
+       }
+       to->nins--;
+
+       /* clean up and place on free list */
+       victim->type = 0;
+       victim->from = NULL;            /* precautions... */
+       victim->to = NULL;
+       victim->inchain = NULL;
+       victim->outchain = NULL;
+       victim->freechain = from->free;
+       from->free = victim;
+}
+
+/*
+ * findarc - find arc, if any, from given source with given type and color
+ * If there is more than one such arc, the result is random.
+ */
+static struct arc *
+findarc(struct state * s,
+               int type,
+               pcolor co)
+{
+       struct arc *a;
+
+       for (a = s->outs; a != NULL; a = a->outchain)
+               if (a->type == type && a->co == co)
+                       return a;
+       return NULL;
+}
+
+/*
+ * cparc - allocate a new arc within an NFA, copying details from old one
+ */
+static void
+cparc(struct nfa * nfa,
+         struct arc * oa,
+         struct state * from,
+         struct state * to)
+{
+       newarc(nfa, oa->type, oa->co, from, to);
+}
+
+/*
+ * moveins - move all in arcs of a state to another state
+ *
+ * You might think this could be done better by just updating the
+ * existing arcs, and you would be right if it weren't for the desire
+ * for duplicate suppression, which makes it easier to just make new
+ * ones to exploit the suppression built into newarc.
+ */
+static void
+moveins(struct nfa * nfa,
+               struct state * old,
+               struct state * new)
+{
+       struct arc *a;
+
+       assert(old != new);
+
+       while ((a = old->ins) != NULL)
+       {
+               cparc(nfa, a, a->from, new);
+               freearc(nfa, a);
+       }
+       assert(old->nins == 0);
+       assert(old->ins == NULL);
+}
+
+/*
+ * copyins - copy all in arcs of a state to another state
+ */
+static void
+copyins(struct nfa * nfa,
+               struct state * old,
+               struct state * new)
+{
+       struct arc *a;
+
+       assert(old != new);
+
+       for (a = old->ins; a != NULL; a = a->inchain)
+               cparc(nfa, a, a->from, new);
+}
+
+/*
+ * moveouts - move all out arcs of a state to another state
+ */
+static void
+moveouts(struct nfa * nfa,
+                struct state * old,
+                struct state * new)
+{
+       struct arc *a;
+
+       assert(old != new);
+
+       while ((a = old->outs) != NULL)
+       {
+               cparc(nfa, a, new, a->to);
+               freearc(nfa, a);
+       }
+}
+
+/*
+ * copyouts - copy all out arcs of a state to another state
+ */
+static void
+copyouts(struct nfa * nfa,
+                struct state * old,
+                struct state * new)
+{
+       struct arc *a;
+
+       assert(old != new);
+
+       for (a = old->outs; a != NULL; a = a->outchain)
+               cparc(nfa, a, new, a->to);
+}
+
+/*
+ * cloneouts - copy out arcs of a state to another state pair, modifying type
+ */
+static void
+cloneouts(struct nfa * nfa,
+                 struct state * old,
+                 struct state * from,
+                 struct state * to,
+                 int type)
+{
+       struct arc *a;
+
+       assert(old != from);
+
+       for (a = old->outs; a != NULL; a = a->outchain)
+               newarc(nfa, type, a->co, from, to);
+}
+
+/*
+ * delsub - delete a sub-NFA, updating subre pointers if necessary
+ *
+ * This uses a recursive traversal of the sub-NFA, marking already-seen
+ * states using their tmp pointer.
+ */
+static void
+delsub(struct nfa * nfa,
+          struct state * lp,           /* the sub-NFA goes from here... */
+          struct state * rp)           /* ...to here, *not* inclusive */
+{
+       assert(lp != rp);
+
+       rp->tmp = rp;                           /* mark end */
+
+       deltraverse(nfa, lp, lp);
+       assert(lp->nouts == 0 && rp->nins == 0);        /* did the job */
+       assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
+
+       rp->tmp = NULL;                         /* unmark end */
+       lp->tmp = NULL;                         /* and begin, marked by deltraverse */
+}
+
+/*
+ * deltraverse - the recursive heart of delsub
+ * This routine's basic job is to destroy all out-arcs of the state.
+ */
+static void
+deltraverse(struct nfa * nfa,
+                       struct state * leftend,
+                       struct state * s)
+{
+       struct arc *a;
+       struct state *to;
+
+       if (s->nouts == 0)
+               return;                                 /* nothing to do */
+       if (s->tmp != NULL)
+               return;                                 /* already in progress */
+
+       s->tmp = s;                                     /* mark as in progress */
+
+       while ((a = s->outs) != NULL)
+       {
+               to = a->to;
+               deltraverse(nfa, leftend, to);
+               assert(to->nouts == 0 || to->tmp != NULL);
+               freearc(nfa, a);
+               if (to->nins == 0 && to->tmp == NULL)
+               {
+                       assert(to->nouts == 0);
+                       freestate(nfa, to);
+               }
+       }
+
+       assert(s->no != FREESTATE); /* we're still here */
+       assert(s == leftend || s->nins != 0);           /* and still reachable */
+       assert(s->nouts == 0);          /* but have no outarcs */
+
+       s->tmp = NULL;                          /* we're done here */
+}
+
+/*
+ * dupnfa - duplicate sub-NFA
+ *
+ * Another recursive traversal, this time using tmp to point to duplicates
+ * as well as mark already-seen states.  (You knew there was a reason why
+ * it's a state pointer, didn't you? :-))
+ */
+static void
+dupnfa(struct nfa * nfa,
+          struct state * start,        /* duplicate of subNFA starting here */
+          struct state * stop,         /* and stopping here */
+          struct state * from,         /* stringing duplicate from here */
+          struct state * to)           /* to here */
+{
+       if (start == stop)
+       {
+               newarc(nfa, EMPTY, 0, from, to);
+               return;
+       }
+
+       stop->tmp = to;
+       duptraverse(nfa, start, from);
+       /* done, except for clearing out the tmp pointers */
+
+       stop->tmp = NULL;
+       cleartraverse(nfa, start);
+}
+
+/*
+ * duptraverse - recursive heart of dupnfa
+ */
+static void
+duptraverse(struct nfa * nfa,
+                       struct state * s,
+                       struct state * stmp)    /* s's duplicate, or NULL */
+{
+       struct arc *a;
+
+       if (s->tmp != NULL)
+               return;                                 /* already done */
+
+       s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
+       if (s->tmp == NULL)
+       {
+               assert(NISERR());
+               return;
+       }
+
+       for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
+       {
+               duptraverse(nfa, a->to, (struct state *) NULL);
+               assert(a->to->tmp != NULL);
+               cparc(nfa, a, s->tmp, a->to->tmp);
+       }
+}
+
+/*
+ * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
+ */
+static void
+cleartraverse(struct nfa * nfa,
+                         struct state * s)
+{
+       struct arc *a;
+
+       if (s->tmp == NULL)
+               return;
+       s->tmp = NULL;
+
+       for (a = s->outs; a != NULL; a = a->outchain)
+               cleartraverse(nfa, a->to);
+}
+
+/*
+ * specialcolors - fill in special colors for an NFA
+ */
+static void
+specialcolors(struct nfa * nfa)
+{
+       /* false colors for BOS, BOL, EOS, EOL */
+       if (nfa->parent == NULL)
+       {
+               nfa->bos[0] = pseudocolor(nfa->cm);
+               nfa->bos[1] = pseudocolor(nfa->cm);
+               nfa->eos[0] = pseudocolor(nfa->cm);
+               nfa->eos[1] = pseudocolor(nfa->cm);
+       }
+       else
+       {
+               assert(nfa->parent->bos[0] != COLORLESS);
+               nfa->bos[0] = nfa->parent->bos[0];
+               assert(nfa->parent->bos[1] != COLORLESS);
+               nfa->bos[1] = nfa->parent->bos[1];
+               assert(nfa->parent->eos[0] != COLORLESS);
+               nfa->eos[0] = nfa->parent->eos[0];
+               assert(nfa->parent->eos[1] != COLORLESS);
+               nfa->eos[1] = nfa->parent->eos[1];
+       }
+}
+
+/*
+ * optimize - optimize an NFA
+ */
+static long                                            /* re_info bits */
+optimize(struct nfa * nfa,
+                FILE *f)                               /* for debug output; NULL none */
+{
+#ifdef REG_DEBUG
+       int                     verbose = (f != NULL) ? 1 : 0;
+
+       if (verbose)
+               fprintf(f, "\ninitial cleanup:\n");
+#endif
+       cleanup(nfa);                           /* may simplify situation */
+#ifdef REG_DEBUG
+       if (verbose)
+               dumpnfa(nfa, f);
+       if (verbose)
+               fprintf(f, "\nempties:\n");
+#endif
+       fixempties(nfa, f);                     /* get rid of EMPTY arcs */
+#ifdef REG_DEBUG
+       if (verbose)
+               fprintf(f, "\nconstraints:\n");
+#endif
+       pullback(nfa, f);                       /* pull back constraints backward */
+       pushfwd(nfa, f);                        /* push fwd constraints forward */
+#ifdef REG_DEBUG
+       if (verbose)
+               fprintf(f, "\nfinal cleanup:\n");
+#endif
+       cleanup(nfa);                           /* final tidying */
+       return analyze(nfa);            /* and analysis */
+}
+
+/*
+ * pullback - pull back constraints backward to (with luck) eliminate them
+ */
+static void
+pullback(struct nfa * nfa,
+                FILE *f)                               /* for debug output; NULL none */
+{
+       struct state *s;
+       struct state *nexts;
+       struct arc *a;
+       struct arc *nexta;
+       int                     progress;
+
+       /* find and pull until there are no more */
+       do
+       {
+               progress = 0;
+               for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+               {
+                       nexts = s->next;
+                       for (a = s->outs; a != NULL && !NISERR(); a = nexta)
+                       {
+                               nexta = a->outchain;
+                               if (a->type == '^' || a->type == BEHIND)
+                                       if (pull(nfa, a))
+                                               progress = 1;
+                               assert(nexta == NULL || s->no != FREESTATE);
+                       }
+               }
+               if (progress && f != NULL)
+                       dumpnfa(nfa, f);
+       } while (progress && !NISERR());
+       if (NISERR())
+               return;
+
+       for (a = nfa->pre->outs; a != NULL; a = nexta)
+       {
+               nexta = a->outchain;
+               if (a->type == '^')
+               {
+                       assert(a->co == 0 || a->co == 1);
+                       newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
+                       freearc(nfa, a);
+               }
+       }
+}
+
+/*
+ * pull - pull a back constraint backward past its source state
+ * A significant property of this function is that it deletes at most
+ * one state -- the constraint's from state -- and only if the constraint
+ * was that state's last outarc.
+ */
+static int                                             /* 0 couldn't, 1 could */
+pull(struct nfa * nfa,
+        struct arc * con)
+{
+       struct state *from = con->from;
+       struct state *to = con->to;
+       struct arc *a;
+       struct arc *nexta;
+       struct state *s;
+
+       if (from == to)
+       {                                                       /* circular constraint is pointless */
+               freearc(nfa, con);
+               return 1;
+       }
+       if (from->flag)                         /* can't pull back beyond start */
+               return 0;
+       if (from->nins == 0)
+       {                                                       /* unreachable */
+               freearc(nfa, con);
+               return 1;
+       }
+
+       /* first, clone from state if necessary to avoid other outarcs */
+       if (from->nouts > 1)
+       {
+               s = newstate(nfa);
+               if (NISERR())
+                       return 0;
+               assert(to != from);             /* con is not an inarc */
+               copyins(nfa, from, s);  /* duplicate inarcs */
+               cparc(nfa, con, s, to); /* move constraint arc */
+               freearc(nfa, con);
+               from = s;
+               con = from->outs;
+       }
+       assert(from->nouts == 1);
+
+       /* propagate the constraint into the from state's inarcs */
+       for (a = from->ins; a != NULL; a = nexta)
+       {
+               nexta = a->inchain;
+               switch (combine(con, a))
+               {
+                       case INCOMPATIBLE:      /* destroy the arc */
+                               freearc(nfa, a);
+                               break;
+                       case SATISFIED:         /* no action needed */
+                               break;
+                       case COMPATIBLE:        /* swap the two arcs, more or less */
+                               s = newstate(nfa);
+                               if (NISERR())
+                                       return 0;
+                               cparc(nfa, a, s, to);   /* anticipate move */
+                               cparc(nfa, con, a->from, s);
+                               if (NISERR())
+                                       return 0;
+                               freearc(nfa, a);
+                               break;
+                       default:
+                               assert(NOTREACHED);
+                               break;
+               }
+       }
+
+       /* remaining inarcs, if any, incorporate the constraint */
+       moveins(nfa, from, to);
+       dropstate(nfa, from);           /* will free the constraint */
+       return 1;
+}
+
+/*
+ * pushfwd - push forward constraints forward to (with luck) eliminate them
+ */
+static void
+pushfwd(struct nfa * nfa,
+               FILE *f)                                /* for debug output; NULL none */
+{
+       struct state *s;
+       struct state *nexts;
+       struct arc *a;
+       struct arc *nexta;
+       int                     progress;
+
+       /* find and push until there are no more */
+       do
+       {
+               progress = 0;
+               for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+               {
+                       nexts = s->next;
+                       for (a = s->ins; a != NULL && !NISERR(); a = nexta)
+                       {
+                               nexta = a->inchain;
+                               if (a->type == '$' || a->type == AHEAD)
+                                       if (push(nfa, a))
+                                               progress = 1;
+                               assert(nexta == NULL || s->no != FREESTATE);
+                       }
+               }
+               if (progress && f != NULL)
+                       dumpnfa(nfa, f);
+       } while (progress && !NISERR());
+       if (NISERR())
+               return;
+
+       for (a = nfa->post->ins; a != NULL; a = nexta)
+       {
+               nexta = a->inchain;
+               if (a->type == '$')
+               {
+                       assert(a->co == 0 || a->co == 1);
+                       newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
+                       freearc(nfa, a);
+               }
+       }
+}
+
+/*
+ * push - push a forward constraint forward past its destination state
+ * A significant property of this function is that it deletes at most
+ * one state -- the constraint's to state -- and only if the constraint
+ * was that state's last inarc.
+ */
+static int                                             /* 0 couldn't, 1 could */
+push(struct nfa * nfa,
+        struct arc * con)
+{
+       struct state *from = con->from;
+       struct state *to = con->to;
+       struct arc *a;
+       struct arc *nexta;
+       struct state *s;
+
+       if (to == from)
+       {                                                       /* circular constraint is pointless */
+               freearc(nfa, con);
+               return 1;
+       }
+       if (to->flag)                           /* can't push forward beyond end */
+               return 0;
+       if (to->nouts == 0)
+       {                                                       /* dead end */
+               freearc(nfa, con);
+               return 1;
+       }
+
+       /* first, clone to state if necessary to avoid other inarcs */
+       if (to->nins > 1)
+       {
+               s = newstate(nfa);
+               if (NISERR())
+                       return 0;
+               copyouts(nfa, to, s);   /* duplicate outarcs */
+               cparc(nfa, con, from, s);               /* move constraint */
+               freearc(nfa, con);
+               to = s;
+               con = to->ins;
+       }
+       assert(to->nins == 1);
+
+       /* propagate the constraint into the to state's outarcs */
+       for (a = to->outs; a != NULL; a = nexta)
+       {
+               nexta = a->outchain;
+               switch (combine(con, a))
+               {
+                       case INCOMPATIBLE:      /* destroy the arc */
+                               freearc(nfa, a);
+                               break;
+                       case SATISFIED:         /* no action needed */
+                               break;
+                       case COMPATIBLE:        /* swap the two arcs, more or less */
+                               s = newstate(nfa);
+                               if (NISERR())
+                                       return 0;
+                               cparc(nfa, con, s, a->to);              /* anticipate move */
+                               cparc(nfa, a, from, s);
+                               if (NISERR())
+                                       return 0;
+                               freearc(nfa, a);
+                               break;
+                       default:
+                               assert(NOTREACHED);
+                               break;
+               }
+       }
+
+       /* remaining outarcs, if any, incorporate the constraint */
+       moveouts(nfa, to, from);
+       dropstate(nfa, to);                     /* will free the constraint */
+       return 1;
+}
+
+/*
+ * combine - constraint lands on an arc, what happens?
+ *
+ * #def INCOMPATIBLE   1       // destroys arc
+ * #def SATISFIED              2       // constraint satisfied
+ * #def COMPATIBLE             3       // compatible but not satisfied yet
+ */
+static int
+combine(struct arc * con,
+               struct arc * a)
+{
+#define  CA(ct,at)      (((ct)<<CHAR_BIT) | (at))
+
+       switch (CA(con->type, a->type))
+       {
+               case CA('^', PLAIN):    /* newlines are handled separately */
+               case CA('$', PLAIN):
+                       return INCOMPATIBLE;
+                       break;
+               case CA(AHEAD, PLAIN):  /* color constraints meet colors */
+               case CA(BEHIND, PLAIN):
+                       if (con->co == a->co)
+                               return SATISFIED;
+                       return INCOMPATIBLE;
+                       break;
+               case CA('^', '^'):              /* collision, similar constraints */
+               case CA('$', '$'):
+               case CA(AHEAD, AHEAD):
+               case CA(BEHIND, BEHIND):
+                       if (con->co == a->co)           /* true duplication */
+                               return SATISFIED;
+                       return INCOMPATIBLE;
+                       break;
+               case CA('^', BEHIND):   /* collision, dissimilar constraints */
+               case CA(BEHIND, '^'):
+               case CA('$', AHEAD):
+               case CA(AHEAD, '$'):
+                       return INCOMPATIBLE;
+                       break;
+               case CA('^', '$'):              /* constraints passing each other */
+               case CA('^', AHEAD):
+               case CA(BEHIND, '$'):
+               case CA(BEHIND, AHEAD):
+               case CA('$', '^'):
+               case CA('$', BEHIND):
+               case CA(AHEAD, '^'):
+               case CA(AHEAD, BEHIND):
+               case CA('^', LACON):
+               case CA(BEHIND, LACON):
+               case CA('$', LACON):
+               case CA(AHEAD, LACON):
+                       return COMPATIBLE;
+                       break;
+       }
+       assert(NOTREACHED);
+       return INCOMPATIBLE;            /* for benefit of blind compilers */
+}
+
+/*
+ * fixempties - get rid of EMPTY arcs
+ */
+static void
+fixempties(struct nfa * nfa,
+                  FILE *f)                             /* for debug output; NULL none */
+{
+       struct state *s;
+       struct state *nexts;
+       struct arc *a;
+       struct arc *nexta;
+       int                     progress;
+
+       /* find and eliminate empties until there are no more */
+       do
+       {
+               progress = 0;
+               for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+               {
+                       nexts = s->next;
+                       for (a = s->outs; a != NULL && !NISERR(); a = nexta)
+                       {
+                               nexta = a->outchain;
+                               if (a->type == EMPTY && unempty(nfa, a))
+                                       progress = 1;
+                               assert(nexta == NULL || s->no != FREESTATE);
+                       }
+               }
+               if (progress && f != NULL)
+                       dumpnfa(nfa, f);
+       } while (progress && !NISERR());
+}
+
+/*
+ * unempty - optimize out an EMPTY arc, if possible
+ *
+ * Actually, as it stands this function always succeeds, but the return
+ * value is kept with an eye on possible future changes.
+ */
+static int                                             /* 0 couldn't, 1 could */
+unempty(struct nfa * nfa,
+               struct arc * a)
+{
+       struct state *from = a->from;
+       struct state *to = a->to;
+       int                     usefrom;                /* work on from, as opposed to to? */
+
+       assert(a->type == EMPTY);
+       assert(from != nfa->pre && to != nfa->post);
+
+       if (from == to)
+       {                                                       /* vacuous loop */
+               freearc(nfa, a);
+               return 1;
+       }
+
+       /* decide which end to work on */
+       usefrom = 1;                            /* default:  attack from */
+       if (from->nouts > to->nins)
+               usefrom = 0;
+       else if (from->nouts == to->nins)
+       {
+               /* decide on secondary issue:  move/copy fewest arcs */
+               if (from->nins > to->nouts)
+                       usefrom = 0;
+       }
+
+       freearc(nfa, a);
+       if (usefrom)
+       {
+               if (from->nouts == 0)
+               {
+                       /* was the state's only outarc */
+                       moveins(nfa, from, to);
+                       freestate(nfa, from);
+               }
+               else
+                       copyins(nfa, from, to);
+       }
+       else
+       {
+               if (to->nins == 0)
+               {
+                       /* was the state's only inarc */
+                       moveouts(nfa, to, from);
+                       freestate(nfa, to);
+               }
+               else
+                       copyouts(nfa, to, from);
+       }
+
+       return 1;
+}
+
+/*
+ * cleanup - clean up NFA after optimizations
+ */
+static void
+cleanup(struct nfa * nfa)
+{
+       struct state *s;
+       struct state *nexts;
+       int                     n;
+
+       /* clear out unreachable or dead-end states */
+       /* use pre to mark reachable, then post to mark can-reach-post */
+       markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
+       markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
+       for (s = nfa->states; s != NULL; s = nexts)
+       {
+               nexts = s->next;
+               if (s->tmp != nfa->post && !s->flag)
+                       dropstate(nfa, s);
+       }
+       assert(nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
+       cleartraverse(nfa, nfa->pre);
+       assert(nfa->post->nins == 0 || nfa->post->tmp == NULL);
+       /* the nins==0 (final unreachable) case will be caught later */
+
+       /* renumber surviving states */
+       n = 0;
+       for (s = nfa->states; s != NULL; s = s->next)
+               s->no = n++;
+       nfa->nstates = n;
+}
+
+/*
+ * markreachable - recursive marking of reachable states
+ */
+static void
+markreachable(struct nfa * nfa,
+                         struct state * s,
+                         struct state * okay,          /* consider only states with this
+                                                                                * mark */
+                         struct state * mark)          /* the value to mark with */
+{
+       struct arc *a;
+
+       if (s->tmp != okay)
+               return;
+       s->tmp = mark;
+
+       for (a = s->outs; a != NULL; a = a->outchain)
+               markreachable(nfa, a->to, okay, mark);
+}
+
+/*
+ * markcanreach - recursive marking of states which can reach here
+ */
+static void
+markcanreach(struct nfa * nfa,
+                        struct state * s,
+                        struct state * okay,           /* consider only states with this
+                                                                                * mark */
+                        struct state * mark)           /* the value to mark with */
+{
+       struct arc *a;
+
+       if (s->tmp != okay)
+               return;
+       s->tmp = mark;
+
+       for (a = s->ins; a != NULL; a = a->inchain)
+               markcanreach(nfa, a->from, okay, mark);
+}
+
+/*
+ * analyze - ascertain potentially-useful facts about an optimized NFA
+ */
+static long                                            /* re_info bits to be ORed in */
+analyze(struct nfa * nfa)
+{
+       struct arc *a;
+       struct arc *aa;
+
+       if (nfa->pre->outs == NULL)
+               return REG_UIMPOSSIBLE;
+       for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+               for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
+                       if (aa->to == nfa->post)
+                               return REG_UEMPTYMATCH;
+       return 0;
+}
+
+/*
+ * compact - compact an NFA
+ */
+static void
+compact(struct nfa * nfa,
+               struct cnfa * cnfa)
+{
+       struct state *s;
+       struct arc *a;
+       size_t          nstates;
+       size_t          narcs;
+       struct carc *ca;
+       struct carc *first;
+
+       assert(!NISERR());
+
+       nstates = 0;
+       narcs = 0;
+       for (s = nfa->states; s != NULL; s = s->next)
+       {
+               nstates++;
+               narcs += 1 + s->nouts + 1;
+               /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+       }
+
+       cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
+       cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
+       if (cnfa->states == NULL || cnfa->arcs == NULL)
+       {
+               if (cnfa->states != NULL)
+                       FREE(cnfa->states);
+               if (cnfa->arcs != NULL)
+                       FREE(cnfa->arcs);
+               NERR(REG_ESPACE);
+               return;
+       }
+       cnfa->nstates = nstates;
+       cnfa->pre = nfa->pre->no;
+       cnfa->post = nfa->post->no;
+       cnfa->bos[0] = nfa->bos[0];
+       cnfa->bos[1] = nfa->bos[1];
+       cnfa->eos[0] = nfa->eos[0];
+       cnfa->eos[1] = nfa->eos[1];
+       cnfa->ncolors = maxcolor(nfa->cm) + 1;
+       cnfa->flags = 0;
+
+       ca = cnfa->arcs;
+       for (s = nfa->states; s != NULL; s = s->next)
+       {
+               assert((size_t) s->no < nstates);
+               cnfa->states[s->no] = ca;
+               ca->co = 0;                             /* clear and skip flags "arc" */
+               ca++;
+               first = ca;
+               for (a = s->outs; a != NULL; a = a->outchain)
+                       switch (a->type)
+                       {
+                               case PLAIN:
+                                       ca->co = a->co;
+                                       ca->to = a->to->no;
+                                       ca++;
+                                       break;
+                               case LACON:
+                                       assert(s->no != cnfa->pre);
+                                       ca->co = (color) (cnfa->ncolors + a->co);
+                                       ca->to = a->to->no;
+                                       ca++;
+                                       cnfa->flags |= HASLACONS;
+                                       break;
+                               default:
+                                       assert(NOTREACHED);
+                                       break;
+                       }
+               carcsort(first, ca - 1);
+               ca->co = COLORLESS;
+               ca->to = 0;
+               ca++;
+       }
+       assert(ca == &cnfa->arcs[narcs]);
+       assert(cnfa->nstates != 0);
+
+       /* mark no-progress states */
+       for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+               cnfa->states[a->to->no]->co = 1;
+       cnfa->states[nfa->pre->no]->co = 1;
+}
+
+/*
+ * carcsort - sort compacted-NFA arcs by color
+ *
+ * Really dumb algorithm, but if the list is long enough for that to matter,
+ * you're in real trouble anyway.
+ */
+static void
+carcsort(struct carc * first,
+                struct carc * last)
+{
+       struct carc *p;
+       struct carc *q;
+       struct carc tmp;
+
+       if (last - first <= 1)
+               return;
+
+       for (p = first; p <= last; p++)
+               for (q = p; q <= last; q++)
+                       if (p->co > q->co ||
+                               (p->co == q->co && p->to > q->to))
+                       {
+                               assert(p != q);
+                               tmp = *p;
+                               *p = *q;
+                               *q = tmp;
+                       }
+}
+
+/*
+ * freecnfa - free a compacted NFA
+ */
+static void
+freecnfa(struct cnfa * cnfa)
+{
+       assert(cnfa->nstates != 0); /* not empty already */
+       cnfa->nstates = 0;
+       FREE(cnfa->states);
+       FREE(cnfa->arcs);
+}
+
+/*
+ * dumpnfa - dump an NFA in human-readable form
+ */
+static void
+dumpnfa(struct nfa * nfa,
+               FILE *f)
+{
+#ifdef REG_DEBUG
+       struct state *s;
+
+       fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
+       if (nfa->bos[0] != COLORLESS)
+               fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
+       if (nfa->bos[1] != COLORLESS)
+               fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
+       if (nfa->eos[0] != COLORLESS)
+               fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
+       if (nfa->eos[1] != COLORLESS)
+               fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
+       fprintf(f, "\n");
+       for (s = nfa->states; s != NULL; s = s->next)
+               dumpstate(s, f);
+       if (nfa->parent == NULL)
+               dumpcolors(nfa->cm, f);
+       fflush(f);
+#endif
+}
+
+#ifdef REG_DEBUG                               /* subordinates of dumpnfa */
+
+/*
+ * dumpstate - dump an NFA state in human-readable form
+ */
+static void
+dumpstate(struct state * s,
+                 FILE *f)
+{
+       struct arc *a;
+
+       fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
+                       (s->flag) ? s->flag : '.');
+       if (s->prev != NULL && s->prev->next != s)
+               fprintf(f, "\tstate chain bad\n");
+       if (s->nouts == 0)
+               fprintf(f, "\tno out arcs\n");
+       else
+               dumparcs(s, f);
+       fflush(f);
+       for (a = s->ins; a != NULL; a = a->inchain)
+       {
+               if (a->to != s)
+                       fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
+                                       a->from->no, a->to->no, s->no);
+       }
+}
+
+/*
+ * dumparcs - dump out-arcs in human-readable form
+ */
+static void
+dumparcs(struct state * s,
+                FILE *f)
+{
+       int                     pos;
+
+       assert(s->nouts > 0);
+       /* printing arcs in reverse order is usually clearer */
+       pos = dumprarcs(s->outs, s, f, 1);
+       if (pos != 1)
+               fprintf(f, "\n");
+}
+
+/*
+ * dumprarcs - dump remaining outarcs, recursively, in reverse order
+ */
+static int                                             /* resulting print position */
+dumprarcs(struct arc * a,
+                 struct state * s,
+                 FILE *f,
+                 int pos)                              /* initial print position */
+{
+       if (a->outchain != NULL)
+               pos = dumprarcs(a->outchain, s, f, pos);
+       dumparc(a, s, f);
+       if (pos == 5)
+       {
+               fprintf(f, "\n");
+               pos = 1;
+       }
+       else
+               pos++;
+       return pos;
+}
+
+/*
+ * dumparc - dump one outarc in readable form, including prefixing tab
+ */
+static void
+dumparc(struct arc * a,
+               struct state * s,
+               FILE *f)
+{
+       struct arc *aa;
+       struct arcbatch *ab;
+
+       fprintf(f, "\t");
+       switch (a->type)
+       {
+               case PLAIN:
+                       fprintf(f, "[%ld]", (long) a->co);
+                       break;
+               case AHEAD:
+                       fprintf(f, ">%ld>", (long) a->co);
+                       break;
+               case BEHIND:
+                       fprintf(f, "<%ld<", (long) a->co);
+                       break;
+               case LACON:
+                       fprintf(f, ":%ld:", (long) a->co);
+                       break;
+               case '^':
+               case '$':
+                       fprintf(f, "%c%d", a->type, (int) a->co);
+                       break;
+               case EMPTY:
+                       break;
+               default:
+                       fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
+                       break;
+       }
+       if (a->from != s)
+               fprintf(f, "?%d?", a->from->no);
+       for (ab = &a->from->oas; ab != NULL; ab = ab->next)
+       {
+               for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++)
+                       if (aa == a)
+                               break;                  /* NOTE BREAK OUT */
+               if (aa < &ab->a[ABSIZE])        /* propagate break */
+                       break;                          /* NOTE BREAK OUT */
+       }
+       if (ab == NULL)
+               fprintf(f, "?!?");              /* not in allocated space */
+       fprintf(f, "->");
+       if (a->to == NULL)
+       {
+               fprintf(f, "NULL");
+               return;
+       }
+       fprintf(f, "%d", a->to->no);
+       for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
+               if (aa == a)
+                       break;                          /* NOTE BREAK OUT */
+       if (aa == NULL)
+               fprintf(f, "?!?");              /* missing from in-chain */
+}
+#endif   /* REG_DEBUG */
+
+/*
+ * dumpcnfa - dump a compacted NFA in human-readable form
+ */
+#ifdef REG_DEBUG
+static void
+dumpcnfa(struct cnfa * cnfa,
+                FILE *f)
+{
+       int                     st;
+
+       fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
+       if (cnfa->bos[0] != COLORLESS)
+               fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
+       if (cnfa->bos[1] != COLORLESS)
+               fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
+       if (cnfa->eos[0] != COLORLESS)
+               fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
+       if (cnfa->eos[1] != COLORLESS)
+               fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
+       if (cnfa->flags & HASLACONS)
+               fprintf(f, ", haslacons");
+       fprintf(f, "\n");
+       for (st = 0; st < cnfa->nstates; st++)
+               dumpcstate(st, cnfa->states[st], cnfa, f);
+       fflush(f);
+}
+#endif
+
+#ifdef REG_DEBUG                               /* subordinates of dumpcnfa */
+
+/*
+ * dumpcstate - dump a compacted-NFA state in human-readable form
+ */
+static void
+dumpcstate(int st,
+                  struct carc * ca,
+                  struct cnfa * cnfa,
+                  FILE *f)
+{
+       int                     i;
+       int                     pos;
+
+       fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+       pos = 1;
+       for (i = 1; ca[i].co != COLORLESS; i++)
+       {
+               if (ca[i].co < cnfa->ncolors)
+                       fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
+               else
+                       fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
+                                       ca[i].to);
+               if (pos == 5)
+               {
+                       fprintf(f, "\n");
+                       pos = 1;
+               }
+               else
+                       pos++;
+       }
+       if (i == 1 || pos != 1)
+               fprintf(f, "\n");
+       fflush(f);
+}
+
+#endif   /* REG_DEBUG */
diff --git a/src/regex/rege_dfa.c b/src/regex/rege_dfa.c

new file mode 100644 (file)

index 0000000..5347b90
--- /dev/null
+++ b/src/regex/rege_dfa.c
@@ -0,0 +1,699 @@
+/*
+ * DFA routines
+ * This file is #included by regexec.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.     All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header$
+ *
+ */
+
+/*
+ * longest - longest-preferred matching engine
+ */
+static chr *                                   /* endpoint, or NULL */
+longest(struct vars * v,               /* used only for debug and exec flags */
+               struct dfa * d,
+               chr *start,                             /* where the match should start */
+               chr *stop,                              /* match must end at or before here */
+               int *hitstopp)                  /* record whether hit v->stop, if non-NULL */
+{
+       chr                *cp;
+       chr                *realstop = (stop == v->stop) ? stop : stop + 1;
+       color           co;
+       struct sset *css;
+       struct sset *ss;
+       chr                *post;
+       int                     i;
+       struct colormap *cm = d->cm;
+
+       /* initialize */
+       css = initialize(v, d, start);
+       cp = start;
+       if (hitstopp != NULL)
+               *hitstopp = 0;
+
+       /* startup */
+       FDEBUG(("+++ startup +++\n"));
+       if (cp == v->start)
+       {
+               co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+               FDEBUG(("color %ld\n", (long) co));
+       }
+       else
+       {
+               co = GETCOLOR(cm, *(cp - 1));
+               FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+       }
+       css = miss(v, d, css, co, cp, start);
+       if (css == NULL)
+               return NULL;
+       css->lastseen = cp;
+
+       /* main loop */
+       if (v->eflags & REG_FTRACE)
+               while (cp < realstop)
+               {
+                       FDEBUG(("+++ at c%d +++\n", css - d->ssets));
+                       co = GETCOLOR(cm, *cp);
+                       FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+                       ss = css->outs[co];
+                       if (ss == NULL)
+                       {
+                               ss = miss(v, d, css, co, cp + 1, start);
+                               if (ss == NULL)
+                                       break;          /* NOTE BREAK OUT */
+                       }
+                       cp++;
+                       ss->lastseen = cp;
+                       css = ss;
+               }
+       else
+               while (cp < realstop)
+               {
+                       co = GETCOLOR(cm, *cp);
+                       ss = css->outs[co];
+                       if (ss == NULL)
+                       {
+                               ss = miss(v, d, css, co, cp + 1, start);
+                               if (ss == NULL)
+                                       break;          /* NOTE BREAK OUT */
+                       }
+                       cp++;
+                       ss->lastseen = cp;
+                       css = ss;
+               }
+
+       /* shutdown */
+       FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
+       if (cp == v->stop && stop == v->stop)
+       {
+               if (hitstopp != NULL)
+                       *hitstopp = 1;
+               co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+               FDEBUG(("color %ld\n", (long) co));
+               ss = miss(v, d, css, co, cp, start);
+               /* special case:  match ended at eol? */
+               if (ss != NULL && (ss->flags & POSTSTATE))
+                       return cp;
+               else if (ss != NULL)
+                       ss->lastseen = cp;      /* to be tidy */
+       }
+
+       /* find last match, if any */
+       post = d->lastpost;
+       for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
+               if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
+                       (post == NULL || post < ss->lastseen))
+                       post = ss->lastseen;
+       if (post != NULL)                       /* found one */
+               return post - 1;
+
+       return NULL;
+}
+
+/*
+ * shortest - shortest-preferred matching engine
+ */
+static chr *                                   /* endpoint, or NULL */
+shortest(struct vars * v,
+                struct dfa * d,
+                chr *start,                    /* where the match should start */
+                chr *min,                              /* match must end at or after here */
+                chr *max,                              /* match must end at or before here */
+                chr **coldp,                   /* store coldstart pointer here, if
+                                                                * nonNULL */
+                int *hitstopp)                 /* record whether hit v->stop, if non-NULL */
+{
+       chr                *cp;
+       chr                *realmin = (min == v->stop) ? min : min + 1;
+       chr                *realmax = (max == v->stop) ? max : max + 1;
+       color           co;
+       struct sset *css;
+       struct sset *ss;
+       struct colormap *cm = d->cm;
+
+       /* initialize */
+       css = initialize(v, d, start);
+       cp = start;
+       if (hitstopp != NULL)
+               *hitstopp = 0;
+
+       /* startup */
+       FDEBUG(("--- startup ---\n"));
+       if (cp == v->start)
+       {
+               co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+               FDEBUG(("color %ld\n", (long) co));
+       }
+       else
+       {
+               co = GETCOLOR(cm, *(cp - 1));
+               FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+       }
+       css = miss(v, d, css, co, cp, start);
+       if (css == NULL)
+               return NULL;
+       css->lastseen = cp;
+       ss = css;
+
+       /* main loop */
+       if (v->eflags & REG_FTRACE)
+               while (cp < realmax)
+               {
+                       FDEBUG(("--- at c%d ---\n", css - d->ssets));
+                       co = GETCOLOR(cm, *cp);
+                       FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+                       ss = css->outs[co];
+                       if (ss == NULL)
+                       {
+                               ss = miss(v, d, css, co, cp + 1, start);
+                               if (ss == NULL)
+                                       break;          /* NOTE BREAK OUT */
+                       }
+                       cp++;
+                       ss->lastseen = cp;
+                       css = ss;
+                       if ((ss->flags & POSTSTATE) && cp >= realmin)
+                               break;                  /* NOTE BREAK OUT */
+               }
+       else
+               while (cp < realmax)
+               {
+                       co = GETCOLOR(cm, *cp);
+                       ss = css->outs[co];
+                       if (ss == NULL)
+                       {
+                               ss = miss(v, d, css, co, cp + 1, start);
+                               if (ss == NULL)
+                                       break;          /* NOTE BREAK OUT */
+                       }
+                       cp++;
+                       ss->lastseen = cp;
+                       css = ss;
+                       if ((ss->flags & POSTSTATE) && cp >= realmin)
+                               break;                  /* NOTE BREAK OUT */
+               }
+
+       if (ss == NULL)
+               return NULL;
+
+       if (coldp != NULL)                      /* report last no-progress state set, if
+                                                                * any */
+               *coldp = lastcold(v, d);
+
+       if ((ss->flags & POSTSTATE) && cp > min)
+       {
+               assert(cp >= realmin);
+               cp--;
+       }
+       else if (cp == v->stop && max == v->stop)
+       {
+               co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+               FDEBUG(("color %ld\n", (long) co));
+               ss = miss(v, d, css, co, cp, start);
+               /* match might have ended at eol */
+               if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
+                       *hitstopp = 1;
+       }
+
+       if (ss == NULL || !(ss->flags & POSTSTATE))
+               return NULL;
+
+       return cp;
+}
+
+/*
+ * lastcold - determine last point at which no progress had been made
+ */
+static chr *                                   /* endpoint, or NULL */
+lastcold(struct vars * v,
+                struct dfa * d)
+{
+       struct sset *ss;
+       chr                *nopr;
+       int                     i;
+
+       nopr = d->lastnopr;
+       if (nopr == NULL)
+               nopr = v->start;
+       for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
+               if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
+                       nopr = ss->lastseen;
+       return nopr;
+}
+
+/*
+ * newdfa - set up a fresh DFA
+ */
+static struct dfa *
+newdfa(struct vars * v,
+          struct cnfa * cnfa,
+          struct colormap * cm,
+          struct smalldfa * small) /* preallocated space, may be NULL */
+{
+       struct dfa *d;
+       size_t          nss = cnfa->nstates * 2;
+       int                     wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
+       struct smalldfa *smallwas = small;
+
+       assert(cnfa != NULL && cnfa->nstates != 0);
+
+       if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
+       {
+               assert(wordsper == 1);
+               if (small == NULL)
+               {
+                       small = (struct smalldfa *) MALLOC(
+                                                                                          sizeof(struct smalldfa));
+                       if (small == NULL)
+                       {
+                               ERR(REG_ESPACE);
+                               return NULL;
+                       }
+               }
+               d = &small->dfa;
+               d->ssets = small->ssets;
+               d->statesarea = small->statesarea;
+               d->work = &d->statesarea[nss];
+               d->outsarea = small->outsarea;
+               d->incarea = small->incarea;
+               d->cptsmalloced = 0;
+               d->mallocarea = (smallwas == NULL) ? (char *) small : NULL;
+       }
+       else
+       {
+               d = (struct dfa *) MALLOC(sizeof(struct dfa));
+               if (d == NULL)
+               {
+                       ERR(REG_ESPACE);
+                       return NULL;
+               }
+               d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
+               d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
+                                                                                       sizeof(unsigned));
+               d->work = &d->statesarea[nss * wordsper];
+               d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
+                                                                                         sizeof(struct sset *));
+               d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
+                                                                                       sizeof(struct arcp));
+               d->cptsmalloced = 1;
+               d->mallocarea = (char *) d;
+               if (d->ssets == NULL || d->statesarea == NULL ||
+                       d->outsarea == NULL || d->incarea == NULL)
+               {
+                       freedfa(d);
+                       ERR(REG_ESPACE);
+                       return NULL;
+               }
+       }
+
+       d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
+       d->nssused = 0;
+       d->nstates = cnfa->nstates;
+       d->ncolors = cnfa->ncolors;
+       d->wordsper = wordsper;
+       d->cnfa = cnfa;
+       d->cm = cm;
+       d->lastpost = NULL;
+       d->lastnopr = NULL;
+       d->search = d->ssets;
+
+       /* initialization of sset fields is done as needed */
+
+       return d;
+}
+
+/*
+ * freedfa - free a DFA
+ */
+static void
+freedfa(struct dfa * d)
+{
+       if (d->cptsmalloced)
+       {
+               if (d->ssets != NULL)
+                       FREE(d->ssets);
+               if (d->statesarea != NULL)
+                       FREE(d->statesarea);
+               if (d->outsarea != NULL)
+                       FREE(d->outsarea);
+               if (d->incarea != NULL)
+                       FREE(d->incarea);
+       }
+
+       if (d->mallocarea != NULL)
+               FREE(d->mallocarea);
+}
+
+/*
+ * hash - construct a hash code for a bitvector
+ *
+ * There are probably better ways, but they're more expensive.
+ */
+static unsigned
+hash(unsigned *uv,
+        int n)
+{
+       int                     i;
+       unsigned        h;
+
+       h = 0;
+       for (i = 0; i < n; i++)
+               h ^= uv[i];
+       return h;
+}
+
+/*
+ * initialize - hand-craft a cache entry for startup, otherwise get ready
+ */
+static struct sset *
+initialize(struct vars * v,            /* used only for debug flags */
+                  struct dfa * d,
+                  chr *start)
+{
+       struct sset *ss;
+       int                     i;
+
+       /* is previous one still there? */
+       if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
+               ss = &d->ssets[0];
+       else
+       {                                                       /* no, must (re)build it */
+               ss = getvacant(v, d, start, start);
+               for (i = 0; i < d->wordsper; i++)
+                       ss->states[i] = 0;
+               BSET(ss->states, d->cnfa->pre);
+               ss->hash = HASH(ss->states, d->wordsper);
+               assert(d->cnfa->pre != d->cnfa->post);
+               ss->flags = STARTER | LOCKED | NOPROGRESS;
+               /* lastseen dealt with below */
+       }
+
+       for (i = 0; i < d->nssused; i++)
+               d->ssets[i].lastseen = NULL;
+       ss->lastseen = start;           /* maybe untrue, but harmless */
+       d->lastpost = NULL;
+       d->lastnopr = NULL;
+       return ss;
+}
+
+/*
+ * miss - handle a cache miss
+ */
+static struct sset *                   /* NULL if goes to empty set */
+miss(struct vars * v,                  /* used only for debug flags */
+        struct dfa * d,
+        struct sset * css,
+        pcolor co,
+        chr *cp,                                       /* next chr */
+        chr *start)                            /* where the attempt got started */
+{
+       struct cnfa *cnfa = d->cnfa;
+       int                     i;
+       unsigned        h;
+       struct carc *ca;
+       struct sset *p;
+       int                     ispost;
+       int                     noprogress;
+       int                     gotstate;
+       int                     dolacons;
+       int                     sawlacons;
+
+       /* for convenience, we can be called even if it might not be a miss */
+       if (css->outs[co] != NULL)
+       {
+               FDEBUG(("hit\n"));
+               return css->outs[co];
+       }
+       FDEBUG(("miss\n"));
+
+       /* first, what set of states would we end up in? */
+       for (i = 0; i < d->wordsper; i++)
+               d->work[i] = 0;
+       ispost = 0;
+       noprogress = 1;
+       gotstate = 0;
+       for (i = 0; i < d->nstates; i++)
+               if (ISBSET(css->states, i))
+                       for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
+                               if (ca->co == co)
+                               {
+                                       BSET(d->work, ca->to);
+                                       gotstate = 1;
+                                       if (ca->to == cnfa->post)
+                                               ispost = 1;
+                                       if (!cnfa->states[ca->to]->co)
+                                               noprogress = 0;
+                                       FDEBUG(("%d -> %d\n", i, ca->to));
+                               }
+       dolacons = (gotstate) ? (cnfa->flags & HASLACONS) : 0;
+       sawlacons = 0;
+       while (dolacons)
+       {                                                       /* transitive closure */
+               dolacons = 0;
+               for (i = 0; i < d->nstates; i++)
+                       if (ISBSET(d->work, i))
+                               for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
+                                        ca++)
+                               {
+                                       if (ca->co <= cnfa->ncolors)
+                                               continue;               /* NOTE CONTINUE */
+                                       sawlacons = 1;
+                                       if (ISBSET(d->work, ca->to))
+                                               continue;               /* NOTE CONTINUE */
+                                       if (!lacon(v, cnfa, cp, ca->co))
+                                               continue;               /* NOTE CONTINUE */
+                                       BSET(d->work, ca->to);
+                                       dolacons = 1;
+                                       if (ca->to == cnfa->post)
+                                               ispost = 1;
+                                       if (!cnfa->states[ca->to]->co)
+                                               noprogress = 0;
+                                       FDEBUG(("%d :> %d\n", i, ca->to));
+                               }
+       }
+       if (!gotstate)
+               return NULL;
+       h = HASH(d->work, d->wordsper);
+
+       /* next, is that in the cache? */
+       for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
+               if (HIT(h, d->work, p, d->wordsper))
+               {
+                       FDEBUG(("cached c%d\n", p - d->ssets));
+                       break;                          /* NOTE BREAK OUT */
+               }
+       if (i == 0)
+       {                                                       /* nope, need a new cache entry */
+               p = getvacant(v, d, cp, start);
+               assert(p != css);
+               for (i = 0; i < d->wordsper; i++)
+                       p->states[i] = d->work[i];
+               p->hash = h;
+               p->flags = (ispost) ? POSTSTATE : 0;
+               if (noprogress)
+                       p->flags |= NOPROGRESS;
+               /* lastseen to be dealt with by caller */
+       }
+
+       if (!sawlacons)
+       {                                                       /* lookahead conds. always cache miss */
+               FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets));
+               css->outs[co] = p;
+               css->inchain[co] = p->ins;
+               p->ins.ss = css;
+               p->ins.co = (color) co;
+       }
+       return p;
+}
+
+/*
+ * lacon - lookahead-constraint checker for miss()
+ */
+static int                                             /* predicate:  constraint satisfied? */
+lacon(struct vars * v,
+         struct cnfa * pcnfa,          /* parent cnfa */
+         chr *cp,
+         pcolor co)                            /* "color" of the lookahead constraint */
+{
+       int                     n;
+       struct subre *sub;
+       struct dfa *d;
+       struct smalldfa sd;
+       chr                *end;
+
+       n = co - pcnfa->ncolors;
+       assert(n < v->g->nlacons && v->g->lacons != NULL);
+       FDEBUG(("=== testing lacon %d\n", n));
+       sub = &v->g->lacons[n];
+       d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
+       if (d == NULL)
+       {
+               ERR(REG_ESPACE);
+               return 0;
+       }
+       end = longest(v, d, cp, v->stop, (int *) NULL);
+       freedfa(d);
+       FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
+       return (sub->subno) ? (end != NULL) : (end == NULL);
+}
+
+/*
+ * getvacant - get a vacant state set
+ * This routine clears out the inarcs and outarcs, but does not otherwise
+ * clear the innards of the state set -- that's up to the caller.
+ */
+static struct sset *
+getvacant(struct vars * v,             /* used only for debug flags */
+                 struct dfa * d,
+                 chr *cp,
+                 chr *start)
+{
+       int                     i;
+       struct sset *ss;
+       struct sset *p;
+       struct arcp ap;
+       struct arcp lastap;
+       color           co;
+
+       ss = pickss(v, d, cp, start);
+       assert(!(ss->flags & LOCKED));
+
+       /* clear out its inarcs, including self-referential ones */
+       ap = ss->ins;
+       while ((p = ap.ss) != NULL)
+       {
+               co = ap.co;
+               FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long) co));
+               p->outs[co] = NULL;
+               ap = p->inchain[co];
+               p->inchain[co].ss = NULL;               /* paranoia */
+       }
+       ss->ins.ss = NULL;
+
+       /* take it off the inarc chains of the ssets reached by its outarcs */
+       for (i = 0; i < d->ncolors; i++)
+       {
+               p = ss->outs[i];
+               assert(p != ss);                /* not self-referential */
+               if (p == NULL)
+                       continue;                       /* NOTE CONTINUE */
+               FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
+               if (p->ins.ss == ss && p->ins.co == i)
+                       p->ins = ss->inchain[i];
+               else
+               {
+                       assert(p->ins.ss != NULL);
+                       for (ap = p->ins; ap.ss != NULL &&
+                                !(ap.ss == ss && ap.co == i);
+                                ap = ap.ss->inchain[ap.co])
+                               lastap = ap;
+                       assert(ap.ss != NULL);
+                       lastap.ss->inchain[lastap.co] = ss->inchain[i];
+               }
+               ss->outs[i] = NULL;
+               ss->inchain[i].ss = NULL;
+       }
+
+       /* if ss was a success state, may need to remember location */
+       if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
+               (d->lastpost == NULL || d->lastpost < ss->lastseen))
+               d->lastpost = ss->lastseen;
+
+       /* likewise for a no-progress state */
+       if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
+               (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
+               d->lastnopr = ss->lastseen;
+
+       return ss;
+}
+
+/*
+ * pickss - pick the next stateset to be used
+ */
+static struct sset *
+pickss(struct vars * v,                        /* used only for debug flags */
+          struct dfa * d,
+          chr *cp,
+          chr *start)
+{
+       int                     i;
+       struct sset *ss;
+       struct sset *end;
+       chr                *ancient;
+
+       /* shortcut for cases where cache isn't full */
+       if (d->nssused < d->nssets)
+       {
+               i = d->nssused;
+               d->nssused++;
+               ss = &d->ssets[i];
+               FDEBUG(("new c%d\n", i));
+               /* set up innards */
+               ss->states = &d->statesarea[i * d->wordsper];
+               ss->flags = 0;
+               ss->ins.ss = NULL;
+               ss->ins.co = WHITE;             /* give it some value */
+               ss->outs = &d->outsarea[i * d->ncolors];
+               ss->inchain = &d->incarea[i * d->ncolors];
+               for (i = 0; i < d->ncolors; i++)
+               {
+                       ss->outs[i] = NULL;
+                       ss->inchain[i].ss = NULL;
+               }
+               return ss;
+       }
+
+       /* look for oldest, or old enough anyway */
+       if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
+               ancient = cp - d->nssets * 2 / 3;
+       else
+               ancient = start;
+       for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
+               if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+                       !(ss->flags & LOCKED))
+               {
+                       d->search = ss + 1;
+                       FDEBUG(("replacing c%d\n", ss - d->ssets));
+                       return ss;
+               }
+       for (ss = d->ssets, end = d->search; ss < end; ss++)
+               if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+                       !(ss->flags & LOCKED))
+               {
+                       d->search = ss + 1;
+                       FDEBUG(("replacing c%d\n", ss - d->ssets));
+                       return ss;
+               }
+
+       /* nobody's old enough?!? -- something's really wrong */
+       FDEBUG(("can't find victim to replace!\n"));
+       assert(NOTREACHED);
+       ERR(REG_ASSERT);
+       return d->ssets;
+}
author	Ryan Norton <wxprojects@comcast.net>
	Thu, 5 Aug 1999 01:16:55 +0000 (01:16 +0000)
committer	Ryan Norton <wxprojects@comcast.net>
	Thu, 5 Aug 1999 01:16:55 +0000 (01:16 +0000)
src/regex/re_syntax.n	[new file with mode: 0644]	patch \| blob
src/regex/regc_nfa.c	[new file with mode: 0644]	patch \| blob
src/regex/rege_dfa.c	[new file with mode: 0644]	patch \| blob