--- /dev/null
+'\"
+'\" Copyright (c) 1998 Sun Microsystems, Inc.
+'\" Copyright (c) 1999 Scriptics Corporation
+'\"
+'\" This software is copyrighted by the Regents of the University of
+'\" California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+'\" Corporation and other parties. The following terms apply to all files
+'\" associated with the software unless explicitly disclaimed in
+'\" individual files.
+'\"
+'\" The authors hereby grant permission to use, copy, modify, distribute,
+'\" and license this software and its documentation for any purpose, provided
+'\" that existing copyright notices are retained in all copies and that this
+'\" notice is included verbatim in any distributions. No written agreement,
+'\" license, or royalty fee is required for any of the authorized uses.
+'\" Modifications to this software may be copyrighted by their authors
+'\" and need not follow the licensing terms described here, provided that
+'\" the new terms are clearly indicated on the first page of each file where
+'\" they apply.
+'\"
+'\" IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+'\" FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+'\" ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+'\" DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+'\" POSSIBILITY OF SUCH DAMAGE.
+'\"
+'\" THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+'\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+'\" FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
+'\" IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+'\" NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+'\" MODIFICATIONS.
+'\"
+'\" GOVERNMENT USE: If you are acquiring this software on behalf of the
+'\" U.S. government, the Government shall have only "Restricted Rights"
+'\" in the software and related documentation as defined in the Federal
+'\" Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
+'\" are acquiring the software on behalf of the Department of Defense, the
+'\" software shall be classified as "Commercial Computer Software" and the
+'\" Government shall have only "Restricted Rights" as defined in Clause
+'\" 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
+'\" authors grant the U.S. Government and others acting in its behalf
+'\" permission to use and distribute the software in accordance with the
+'\" terms specified in this license.
+'\"
+'\" RCS: @(#) Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp
+'\"
+.so man.macros
+.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
+.BS
+.SH NAME
+re_syntax \- Syntax of Tcl regular expressions.
+.BE
+
+.SH DESCRIPTION
+.PP
+A \fIregular expression\fR describes strings of characters.
+It's a pattern that matches certain strings and doesn't match others.
+
+.SH "DIFFERENT FLAVORS OF REs"
+Regular expressions (``RE''s), as defined by POSIX, come in two
+flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs'').
+EREs are roughly those of the traditional \fIegrep\fR, while BREs are
+roughly those of the traditional \fIed\fR. This implementation adds
+a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with
+some significant extensions.
+.PP
+This manual page primarily describes AREs. BREs mostly exist for
+backward compatibility in some old programs; they will be discussed at
+the end. POSIX EREs are almost an exact subset of AREs. Features of
+AREs that are not present in EREs will be indicated.
+
+.SH "REGULAR EXPRESSION SYNTAX"
+.PP
+Tcl regular expressions are implemented using the package written by
+Henry Spencer, based on the 1003.2 spec and some (not quite all) of
+the Perl5 extensions (thanks, Henry!). Much of the description of
+regular expressions below is copied verbatim from his manual entry.
+.PP
+An ARE is one or more \fIbranches\fR,
+separated by `\fB|\fR',
+matching anything that matches any of the branches.
+.PP
+A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
+concatenated.
+It matches a match for the first, followed by a match for the second, etc;
+an empty branch matches the empty string.
+.PP
+A quantified atom is an \fIatom\fR possibly followed
+by a single \fIquantifier\fR.
+Without a quantifier, it matches a match for the atom.
+The quantifiers,
+and what a so-quantified atom matches, are:
+.RS 2
+.TP 6
+\fB*\fR
+a sequence of 0 or more matches of the atom
+.TP
+\fB+\fR
+a sequence of 1 or more matches of the atom
+.TP
+\fB?\fR
+a sequence of 0 or 1 matches of the atom
+.TP
+\fB{\fIm\fB}\fR
+a sequence of exactly \fIm\fR matches of the atom
+.TP
+\fB{\fIm\fB,}\fR
+a sequence of \fIm\fR or more matches of the atom
+.TP
+\fB{\fIm\fB,\fIn\fB}\fR
+a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
+\fIm\fR may not exceed \fIn\fR
+.TP
+\fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR
+\fInon-greedy\fR quantifiers,
+which match the same possibilities,
+but prefer the smallest number rather than the largest number
+of matches (see MATCHING)
+.RE
+.PP
+The forms using
+\fB{\fR and \fB}\fR
+are known as \fIbound\fRs.
+The numbers
+\fIm\fR and \fIn\fR are unsigned decimal integers
+with permissible values from 0 to 255 inclusive.
+.PP
+An atom is one of:
+.RS 2
+.TP 6
+\fB(\fIre\fB)\fR
+(where \fIre\fR is any regular expression)
+matches a match for
+\fIre\fR, with the match noted for possible reporting
+.TP
+\fB(?:\fIre\fB)\fR
+as previous,
+but does no reporting
+(a ``non-capturing'' set of parentheses)
+.TP
+\fB()\fR
+matches an empty string,
+noted for possible reporting
+.TP
+\fB(?:)\fR
+matches an empty string,
+without reporting
+.TP
+\fB[\fIchars\fB]\fR
+a \fIbracket expression\fR,
+matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail)
+.TP
+ \fB.\fR
+matches any single character
+.TP
+\fB\e\fIk\fR
+(where \fIk\fR is a non-alphanumeric character)
+matches that character taken as an ordinary character,
+e.g. \e\e matches a backslash character
+.TP
+\fB\e\fIc\fR
+where \fIc\fR is alphanumeric
+(possibly followed by other characters),
+an \fIescape\fR (AREs only),
+see ESCAPES below
+.TP
+\fB{\fR
+when followed by a character other than a digit,
+matches the left-brace character `\fB{\fR';
+when followed by a digit, it is the beginning of a
+\fIbound\fR (see above)
+.TP
+\fIx\fR
+where \fIx\fR is
+a single character with no other significance, matches that character.
+.RE
+.PP
+A \fIconstraint\fR matches an empty string when specific conditions
+are met.
+A constraint may not be followed by a quantifier.
+The simple constraints are as follows; some more constraints are
+described later, under ESCAPES.
+.RS 2
+.TP 8
+\fB^\fR
+matches at the beginning of a line
+.TP
+\fB$\fR
+matches at the end of a line
+.TP
+\fB(?=\fIre\fB)\fR
+\fIpositive lookahead\fR (AREs only), matches at any point
+where a substring matching \fIre\fR begins
+.TP
+\fB(?!\fIre\fB)\fR
+\fInegative lookahead\fR (AREs only), matches at any point
+where no substring matching \fIre\fR begins
+.RE
+.PP
+The lookahead constraints may not contain back references (see later),
+and all parentheses within them are considered non-capturing.
+.PP
+An RE may not end with `\fB\e\fR'.
+
+.SH "BRACKET EXPRESSIONS"
+A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'.
+It normally matches any single character from the list (but see below).
+If the list begins with `\fB^\fR',
+it matches any single character
+(but see below) \fInot\fR from the rest of the list.
+.PP
+If two characters in the list are separated by `\fB\-\fR',
+this is shorthand
+for the full \fIrange\fR of characters between those two (inclusive) in the
+collating sequence,
+e.g.
+\fB[0\-9]\fR
+in ASCII matches any decimal digit.
+Two ranges may not share an
+endpoint, so e.g.
+\fBa\-c\-e\fR
+is illegal.
+Ranges are very collating-sequence-dependent,
+and portable programs should avoid relying on them.
+.PP
+To include a literal
+\fB]\fR
+or
+\fB\-\fR
+in the list,
+the simplest method is to
+enclose it in
+\fB[.\fR and \fB.]\fR
+to make it a collating element (see below).
+Alternatively,
+make it the first character
+(following a possible `\fB^\fR'),
+or (AREs only) precede it with `\fB\e\fR'.
+Alternatively, for `\fB\-\fR',
+make it the last character,
+or the second endpoint of a range.
+To use a literal
+\fB\-\fR
+as the first endpoint of a range,
+make it a collating element
+or (AREs only) precede it with `\fB\e\fR'.
+With the exception of these, some combinations using
+\fB[\fR
+(see next
+paragraphs), and escapes,
+all other special characters lose their
+special significance within a bracket expression.
+.PP
+Within a bracket expression, a collating element (a character,
+a multi-character sequence that collates as if it were a single character,
+or a collating-sequence name for either)
+enclosed in
+\fB[.\fR and \fB.]\fR
+stands for the
+sequence of characters of that collating element.
+The sequence is a single element of the bracket expression's list.
+A bracket expression in a locale that has
+multi-character collating elements
+can thus match more than one character.
+.VS 8.2
+So (insidiously), a bracket expression that starts with \fB^\fR
+can match multi-character collating elements even if none of them
+appear in the bracket expression!
+(\fINote:\fR Tcl currently has no multi-character collating elements.
+This information is only for illustration.)
+.PP
+For example, assume the collating sequence includes a \fBch\fR
+multi-character collating element.
+Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP)
+matches the first five characters of `\fBchchcc\fR'.
+Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR'
+(because \fB[^c]\fR matches the multi-character \fBch\fR).
+.VE 8.2
+.PP
+Within a bracket expression, a collating element enclosed in
+\fB[=\fR
+and
+\fB=]\fR
+is an equivalence class, standing for the sequences of characters
+of all collating elements equivalent to that one, including itself.
+(If there are no other equivalent collating elements,
+the treatment is as if the enclosing delimiters were `\fB[.\fR'\&
+and `\fB.]\fR'.)
+For example, if
+\fBo\fR
+and
+\fB\o'o^'\fR
+are the members of an equivalence class,
+then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR',
+and `\fB[o\o'o^']\fR'\&
+are all synonymous.
+An equivalence class may not be an endpoint
+of a range.
+.VS 8.2
+(\fINote:\fR
+Tcl currently implements only the Unicode locale.
+It doesn't define any equivalence classes.
+The examples above are just illustrations.)
+.VE 8.2
+.PP
+Within a bracket expression, the name of a \fIcharacter class\fR enclosed
+in
+\fB[:\fR
+and
+\fB:]\fR
+stands for the list of all characters
+(not all collating elements!)
+belonging to that
+class.
+Standard character classes are:
+.PP
+.RS
+.ne 5
+.nf
+.ta 3c
+\fBalpha\fR A letter.
+\fBupper\fR An upper-case letter.
+\fBlower\fR A lower-case letter.
+\fBdigit\fR A decimal digit.
+\fBxdigit\fR A hexadecimal digit.
+\fBalnum\fR An alphanumeric (letter or digit).
+\fBprint\fR An alphanumeric (same as alnum).
+\fBblank\fR A space or tab character.
+\fBspace\fR A character producing white space in displayed text.
+\fBpunct\fR A punctuation character.
+\fBgraph\fR A character with a visible representation.
+\fBcntrl\fR A control character.
+.fi
+.RE
+.PP
+A locale may provide others.
+.VS 8.2
+(Note that the current Tcl implementation has only one locale:
+the Unicode locale.)
+.VE 8.2
+A character class may not be used as an endpoint of a range.
+.PP
+There are two special cases of bracket expressions:
+the bracket expressions
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+are constraints, matching empty strings at
+the beginning and end of a word respectively.
+'\" note, discussion of escapes below references this definition of word
+A word is defined as a sequence of
+word characters
+that is neither preceded nor followed by
+word characters.
+A word character is an
+\fIalnum\fR
+character
+or an underscore
+(\fB_\fR).
+These special bracket expressions are deprecated;
+users of AREs should use constraint escapes instead (see below).
+.SH ESCAPES
+Escapes (AREs only), which begin with a
+\fB\e\fR
+followed by an alphanumeric character,
+come in several varieties:
+character entry, class shorthands, constraint escapes, and back references.
+A
+\fB\e\fR
+followed by an alphanumeric character but not constituting
+a valid escape is illegal in AREs.
+In EREs, there are no escapes:
+outside a bracket expression,
+a
+\fB\e\fR
+followed by an alphanumeric character merely stands for that
+character as an ordinary character,
+and inside a bracket expression,
+\fB\e\fR
+is an ordinary character.
+(The latter is the one actual incompatibility between EREs and AREs.)
+.PP
+Character-entry escapes (AREs only) exist to make it easier to specify
+non-printing and otherwise inconvenient characters in REs:
+.RS 2
+.TP 5
+\fB\ea\fR
+alert (bell) character, as in C
+.TP
+\fB\eb\fR
+backspace, as in C
+.TP
+\fB\eB\fR
+synonym for
+\fB\e\fR
+to help reduce backslash doubling in some
+applications where there are multiple levels of backslash processing
+.TP
+\fB\ec\fIX\fR
+(where X is any character) the character whose
+low-order 5 bits are the same as those of
+\fIX\fR,
+and whose other bits are all zero
+.TP
+\fB\ee\fR
+the character whose collating-sequence name
+is `\fBESC\fR',
+or failing that, the character with octal value 033
+.TP
+\fB\ef\fR
+formfeed, as in C
+.TP
+\fB\en\fR
+newline, as in C
+.TP
+\fB\er\fR
+carriage return, as in C
+.TP
+\fB\et\fR
+horizontal tab, as in C
+.TP
+\fB\eu\fIwxyz\fR
+(where
+\fIwxyz\fR
+is exactly four hexadecimal digits)
+the Unicode character
+\fBU+\fIwxyz\fR
+in the local byte ordering
+.TP
+\fB\eU\fIstuvwxyz\fR
+(where
+\fIstuvwxyz\fR
+is exactly eight hexadecimal digits)
+reserved for a somewhat-hypothetical Unicode extension to 32 bits
+.TP
+\fB\ev\fR
+vertical tab, as in C
+are all available.
+.TP
+\fB\ex\fIhhh\fR
+(where
+\fIhhh\fR
+is any sequence of hexadecimal digits)
+the character whose hexadecimal value is
+\fB0x\fIhhh\fR
+(a single character no matter how many hexadecimal digits are used).
+.TP
+\fB\e0\fR
+the character whose value is
+\fB0\fR
+.TP
+\fB\e\fIxy\fR
+(where
+\fIxy\fR
+is exactly two octal digits,
+and is not a
+\fIback reference\fR (see below))
+the character whose octal value is
+\fB0\fIxy\fR
+.TP
+\fB\e\fIxyz\fR
+(where
+\fIxyz\fR
+is exactly three octal digits,
+and is not a
+back reference (see below))
+the character whose octal value is
+\fB0\fIxyz\fR
+.RE
+.PP
+Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR',
+and `\fBA\fR'-`\fBF\fR'.
+Octal digits are `\fB0\fR'-`\fB7\fR'.
+.PP
+The character-entry escapes are always taken as ordinary characters.
+For example,
+\fB\e135\fR
+is
+\fB]\fR
+in ASCII,
+but
+\fB\e135\fR
+does not terminate a bracket expression.
+Beware, however, that some applications (e.g., C compilers) interpret
+such sequences themselves before the regular-expression package
+gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'.
+.PP
+Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used
+character classes:
+.RS 2
+.TP 10
+\fB\ed\fR
+\fB[[:digit:]]\fR
+.TP
+\fB\es\fR
+\fB[[:space:]]\fR
+.TP
+\fB\ew\fR
+\fB[[:alnum:]_]\fR
+(note underscore)
+.TP
+\fB\eD\fR
+\fB[^[:digit:]]\fR
+.TP
+\fB\eS\fR
+\fB[^[:space:]]\fR
+.TP
+\fB\eW\fR
+\fB[^[:alnum:]_]\fR
+(note underscore)
+.RE
+.PP
+Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
+and `\fB\ew\fR'\&
+lose their outer brackets,
+and `\fB\eD\fR', `\fB\eS\fR',
+and `\fB\eW\fR'\&
+are illegal.
+.VS 8.2
+(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
+Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
+.VE 8.2
+.PP
+A constraint escape (AREs only) is a constraint,
+matching the empty string if specific conditions are met,
+written as an escape:
+.RS 2
+.TP 6
+\fB\eA\fR
+matches only at the beginning of the string
+(see MATCHING, below, for how this differs from `\fB^\fR')
+.TP
+\fB\em\fR
+matches only at the beginning of a word
+.TP
+\fB\eM\fR
+matches only at the end of a word
+.TP
+\fB\ey\fR
+matches only at the beginning or end of a word
+.TP
+\fB\eY\fR
+matches only at a point that is not the beginning or end of a word
+.TP
+\fB\eZ\fR
+matches only at the end of the string
+(see MATCHING, below, for how this differs from `\fB$\fR')
+.TP
+\fB\e\fIm\fR
+(where
+\fIm\fR
+is a nonzero digit) a \fIback reference\fR, see below
+.TP
+\fB\e\fImnn\fR
+(where
+\fIm\fR
+is a nonzero digit, and
+\fInn\fR
+is some more digits,
+and the decimal value
+\fImnn\fR
+is not greater than the number of closing capturing parentheses seen so far)
+a \fIback reference\fR, see below
+.RE
+.PP
+A word is defined as in the specification of
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+above.
+Constraint escapes are illegal within bracket expressions.
+.PP
+A back reference (AREs only) matches the same string matched by the parenthesized
+subexpression specified by the number,
+so that (e.g.)
+\fB([bc])\e1\fR
+matches
+\fBbb\fR
+or
+\fBcc\fR
+but not `\fBbc\fR'.
+The subexpression must entirely precede the back reference in the RE.
+Subexpressions are numbered in the order of their leading parentheses.
+Non-capturing parentheses do not define subexpressions.
+.PP
+There is an inherent historical ambiguity between octal character-entry
+escapes and back references, which is resolved by heuristics,
+as hinted at above.
+A leading zero always indicates an octal escape.
+A single non-zero digit, not followed by another digit,
+is always taken as a back reference.
+A multi-digit sequence not starting with a zero is taken as a back
+reference if it comes after a suitable subexpression
+(i.e. the number is in the legal range for a back reference),
+and otherwise is taken as octal.
+.SH "METASYNTAX"
+In addition to the main syntax described above, there are some special
+forms and miscellaneous syntactic facilities available.
+.PP
+Normally the flavor of RE being used is specified by
+application-dependent means.
+However, this can be overridden by a \fIdirector\fR.
+If an RE of any flavor begins with `\fB***:\fR',
+the rest of the RE is an ARE.
+If an RE of any flavor begins with `\fB***=\fR',
+the rest of the RE is taken to be a literal string,
+with all characters considered ordinary characters.
+.PP
+An ARE may begin with \fIembedded options\fR:
+a sequence
+\fB(?\fIxyz\fB)\fR
+(where
+\fIxyz\fR
+is one or more alphabetic characters)
+specifies options affecting the rest of the RE.
+These supplement, and can override,
+any options specified by the application.
+The available option letters are:
+.RS 2
+.TP 3
+\fBb\fR
+rest of RE is a BRE
+.TP 3
+\fBc\fR
+case-sensitive matching (usual default)
+.TP 3
+\fBe\fR
+rest of RE is an ERE
+.TP 3
+\fBi\fR
+case-insensitive matching (see MATCHING, below)
+.TP 3
+\fBm\fR
+historical synonym for
+\fBn\fR
+.TP 3
+\fBn\fR
+newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBp\fR
+partial newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBq\fR
+rest of RE is a literal (``quoted'') string, all ordinary characters
+.TP 3
+\fBs\fR
+non-newline-sensitive matching (usual default)
+.TP 3
+\fBt\fR
+tight syntax (usual default; see below)
+.TP 3
+\fBw\fR
+inverse partial newline-sensitive (``weird'') matching (see MATCHING, below)
+.TP 3
+\fBx\fR
+expanded syntax (see below)
+.RE
+.PP
+Embedded options take effect at the
+\fB)\fR
+terminating the sequence.
+They are available only at the start of an ARE,
+and may not be used later within it.
+.PP
+In addition to the usual (\fItight\fR) RE syntax, in which all characters are
+significant, there is an \fIexpanded\fR syntax,
+available in all flavors of RE
+with the \fB-expanded\fR switch, or in AREs with the embedded x option.
+In the expanded syntax,
+white-space characters are ignored
+and all characters between a
+\fB#\fR
+and the following newline (or the end of the RE) are ignored,
+permitting paragraphing and commenting a complex RE.
+There are three exceptions to that basic rule:
+.RS 2
+.PP
+a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained
+.PP
+white space or `\fB#\fR' within a bracket expression is retained
+.PP
+white space and comments are illegal within multi-character symbols
+like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR'
+.RE
+.PP
+Expanded-syntax white-space characters are blank, tab, newline, and
+.VS 8.2
+any character that belongs to the \fIspace\fR character class.
+.VE 8.2
+.PP
+Finally, in an ARE,
+outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR'
+(where
+\fIttt\fR
+is any text not containing a `\fB)\fR')
+is a comment,
+completely ignored.
+Again, this is not allowed between the characters of
+multi-character symbols like `\fB(?:\fR'.
+Such comments are more a historical artifact than a useful facility,
+and their use is deprecated;
+use the expanded syntax instead.
+.PP
+\fINone\fR of these metasyntax extensions is available if the application
+(or an initial
+\fB***=\fR
+director)
+has specified that the user's input be treated as a literal string
+rather than as an RE.
+.SH MATCHING
+In the event that an RE could match more than one substring of a given
+string,
+the RE matches the one starting earliest in the string.
+If the RE could match more than one substring starting at that point,
+its choice is determined by its \fIpreference\fR:
+either the longest substring, or the shortest.
+.PP
+Most atoms, and all constraints, have no preference.
+A parenthesized RE has the same preference (possibly none) as the RE.
+A quantified atom with quantifier
+\fB{\fIm\fB}\fR
+or
+\fB{\fIm\fB}?\fR
+has the same preference (possibly none) as the atom itself.
+A quantified atom with other normal quantifiers (including
+\fB{\fIm\fB,\fIn\fB}\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers longest match.
+A quantified atom with other non-greedy quantifiers (including
+\fB{\fIm\fB,\fIn\fB}?\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers shortest match.
+A branch has the same preference as the first quantified atom in it
+which has a preference.
+An RE consisting of two or more branches connected by the
+\fB|\fR
+operator prefers longest match.
+.PP
+Subject to the constraints imposed by the rules for matching the whole RE,
+subexpressions also match the longest or shortest possible substrings,
+based on their preferences,
+with subexpressions starting earlier in the RE taking priority over
+ones starting later.
+Note that outer subexpressions thus take priority over
+their component subexpressions.
+.PP
+Note that the quantifiers
+\fB{1,1}\fR
+and
+\fB{1,1}?\fR
+can be used to force longest and shortest preference, respectively,
+on a subexpression or a whole RE.
+.PP
+Match lengths are measured in characters, not collating elements.
+An empty string is considered longer than no match at all.
+For example,
+\fBbb*\fR
+matches the three middle characters of `\fBabbbc\fR',
+\fB(week|wee)(night|knights)\fR
+matches all ten characters of `\fBweeknights\fR',
+when
+\fB(.*).*\fR
+is matched against
+\fBabc\fR
+the parenthesized subexpression
+matches all three characters, and
+when
+\fB(a*)*\fR
+is matched against
+\fBbc\fR
+both the whole RE and the parenthesized
+subexpression match an empty string.
+.PP
+If case-independent matching is specified,
+the effect is much as if all case distinctions had vanished from the
+alphabet.
+When an alphabetic that exists in multiple cases appears as an
+ordinary character outside a bracket expression, it is effectively
+transformed into a bracket expression containing both cases,
+so that
+\fBx\fR
+becomes `\fB[xX]\fR'.
+When it appears inside a bracket expression, all case counterparts
+of it are added to the bracket expression, so that
+\fB[x]\fR
+becomes
+\fB[xX]\fR
+and
+\fB[^x]\fR
+becomes `\fB[^xX]\fR'.
+.PP
+If newline-sensitive matching is specified, \fB.\fR
+and bracket expressions using
+\fB^\fR
+will never match the newline character
+(so that matches will never cross newlines unless the RE
+explicitly arranges it)
+and
+\fB^\fR
+and
+\fB$\fR
+will match the empty string after and before a newline
+respectively, in addition to matching at beginning and end of string
+respectively.
+ARE
+\fB\eA\fR
+and
+\fB\eZ\fR
+continue to match beginning or end of string \fIonly\fR.
+.PP
+If partial newline-sensitive matching is specified,
+this affects \fB.\fR
+and bracket expressions
+as with newline-sensitive matching, but not
+\fB^\fR
+and `\fB$\fR'.
+.PP
+If inverse partial newline-sensitive matching is specified,
+this affects
+\fB^\fR
+and
+\fB$\fR
+as with
+newline-sensitive matching,
+but not \fB.\fR
+and bracket expressions.
+This isn't very useful but is provided for symmetry.
+.SH "LIMITS AND COMPATIBILITY"
+No particular limit is imposed on the length of REs.
+Programs intended to be highly portable should not employ REs longer
+than 256 bytes,
+as a POSIX-compliant implementation can refuse to accept such REs.
+.PP
+The only feature of AREs that is actually incompatible with
+POSIX EREs is that
+\fB\e\fR
+does not lose its special
+significance inside bracket expressions.
+All other ARE features use syntax which is illegal or has
+undefined or unspecified effects in POSIX EREs;
+the
+\fB***\fR
+syntax of directors likewise is outside the POSIX
+syntax for both BREs and EREs.
+.PP
+Many of the ARE extensions are borrowed from Perl, but some have
+been changed to clean them up, and a few Perl extensions are not present.
+Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
+the lack of special treatment for a trailing newline,
+the addition of complemented bracket expressions to the things
+affected by newline-sensitive matching,
+the restrictions on parentheses and back references in lookahead constraints,
+and the longest/shortest-match (rather than first-match) matching semantics.
+.PP
+The matching rules for REs containing both normal and non-greedy quantifiers
+have changed since early beta-test versions of this package.
+(The new rules are much simpler and cleaner,
+but don't work as hard at guessing the user's real intentions.)
+.PP
+Henry Spencer's original 1986 \fIregexp\fR package,
+still in widespread use (e.g., in pre-8.1 releases of Tcl),
+implemented an early version of today's EREs.
+There are four incompatibilities between \fIregexp\fR's near-EREs
+(`RREs' for short) and AREs.
+In roughly increasing order of significance:
+.PP
+.RS
+In AREs,
+\fB\e\fR
+followed by an alphanumeric character is either an
+escape or an error,
+while in RREs, it was just another way of writing the
+alphanumeric.
+This should not be a problem because there was no reason to write
+such a sequence in RREs.
+.PP
+\fB{\fR
+followed by a digit in an ARE is the beginning of a bound,
+while in RREs,
+\fB{\fR
+was always an ordinary character.
+Such sequences should be rare,
+and will often result in an error because following characters
+will not look like a valid bound.
+.PP
+In AREs,
+\fB\e\fR
+remains a special character within `\fB[\|]\fR',
+so a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+must be written `\fB\e\e\fR'.
+\fB\e\e\fR
+also gives a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+in RREs,
+but only truly paranoid programmers routinely doubled the backslash.
+.PP
+AREs report the longest/shortest match for the RE,
+rather than the first found in a specified search order.
+This may affect some RREs which were written in the expectation that
+the first match would be reported.
+(The careful crafting of RREs to optimize the search order for fast
+matching is obsolete (AREs examine all possible matches
+in parallel, and their performance is largely insensitive to their
+complexity) but cases where the search order was exploited to deliberately
+find a match which was \fInot\fR the longest/shortest will need rewriting.)
+.RE
+
+.SH "BASIC REGULAR EXPRESSIONS"
+BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR',
+and
+\fB?\fR
+are ordinary characters and there is no equivalent
+for their functionality.
+The delimiters for bounds are
+\fB\e{\fR
+and `\fB\e}\fR',
+with
+\fB{\fR
+and
+\fB}\fR
+by themselves ordinary characters.
+The parentheses for nested subexpressions are
+\fB\e(\fR
+and `\fB\e)\fR',
+with
+\fB(\fR
+and
+\fB)\fR
+by themselves ordinary characters.
+\fB^\fR
+is an ordinary character except at the beginning of the
+RE or the beginning of a parenthesized subexpression,
+\fB$\fR
+is an ordinary character except at the end of the
+RE or the end of a parenthesized subexpression,
+and
+\fB*\fR
+is an ordinary character if it appears at the beginning of the
+RE or the beginning of a parenthesized subexpression
+(after a possible leading `\fB^\fR').
+Finally,
+single-digit back references are available,
+and
+\fB\e<\fR
+and
+\fB\e>\fR
+are synonyms for
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+respectively;
+no other escapes are available.
+
+.SH "SEE ALSO"
+RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
+
+.SH KEYWORDS
+match, regular expression, string
--- /dev/null
+/*
+ * NFA utilities.
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header$
+ *
+ *
+ * One or two things that technically ought to be in here
+ * are actually in color.c, thanks to some incestuous relationships in
+ * the color chains.
+ */
+
+#define NISERR() VISERR(nfa->v)
+#define NERR(e) VERR(nfa->v, (e))
+
+
+/*
+ * newnfa - set up an NFA
+ */
+static struct nfa * /* the NFA, or NULL */
+newnfa(struct vars * v,
+ struct colormap * cm,
+ struct nfa * parent) /* NULL if primary NFA */
+{
+ struct nfa *nfa;
+
+ nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
+ if (nfa == NULL)
+ return NULL;
+
+ nfa->states = NULL;
+ nfa->slast = NULL;
+ nfa->free = NULL;
+ nfa->nstates = 0;
+ nfa->cm = cm;
+ nfa->v = v;
+ nfa->bos[0] = nfa->bos[1] = COLORLESS;
+ nfa->eos[0] = nfa->eos[1] = COLORLESS;
+ nfa->post = newfstate(nfa, '@'); /* number 0 */
+ nfa->pre = newfstate(nfa, '>'); /* number 1 */
+ nfa->parent = parent;
+
+ nfa->init = newstate(nfa); /* may become invalid later */
+ nfa->final = newstate(nfa);
+ if (ISERR())
+ {
+ freenfa(nfa);
+ return NULL;
+ }
+ rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
+ newarc(nfa, '^', 1, nfa->pre, nfa->init);
+ newarc(nfa, '^', 0, nfa->pre, nfa->init);
+ rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
+ newarc(nfa, '$', 1, nfa->final, nfa->post);
+ newarc(nfa, '$', 0, nfa->final, nfa->post);
+
+ if (ISERR())
+ {
+ freenfa(nfa);
+ return NULL;
+ }
+ return nfa;
+}
+
+/*
+ * freenfa - free an entire NFA
+ */
+static void
+freenfa(struct nfa * nfa)
+{
+ struct state *s;
+
+ while ((s = nfa->states) != NULL)
+ {
+ s->nins = s->nouts = 0; /* don't worry about arcs */
+ freestate(nfa, s);
+ }
+ while ((s = nfa->free) != NULL)
+ {
+ nfa->free = s->next;
+ destroystate(nfa, s);
+ }
+
+ nfa->slast = NULL;
+ nfa->nstates = -1;
+ nfa->pre = NULL;
+ nfa->post = NULL;
+ FREE(nfa);
+}
+
+/*
+ * newstate - allocate an NFA state, with zero flag value
+ */
+static struct state * /* NULL on error */
+newstate(struct nfa * nfa)
+{
+ struct state *s;
+
+ if (nfa->free != NULL)
+ {
+ s = nfa->free;
+ nfa->free = s->next;
+ }
+ else
+ {
+ s = (struct state *) MALLOC(sizeof(struct state));
+ if (s == NULL)
+ {
+ NERR(REG_ESPACE);
+ return NULL;
+ }
+ s->oas.next = NULL;
+ s->free = NULL;
+ s->noas = 0;
+ }
+
+ assert(nfa->nstates >= 0);
+ s->no = nfa->nstates++;
+ s->flag = 0;
+ if (nfa->states == NULL)
+ nfa->states = s;
+ s->nins = 0;
+ s->ins = NULL;
+ s->nouts = 0;
+ s->outs = NULL;
+ s->tmp = NULL;
+ s->next = NULL;
+ if (nfa->slast != NULL)
+ {
+ assert(nfa->slast->next == NULL);
+ nfa->slast->next = s;
+ }
+ s->prev = nfa->slast;
+ nfa->slast = s;
+ return s;
+}
+
+/*
+ * newfstate - allocate an NFA state with a specified flag value
+ */
+static struct state * /* NULL on error */
+newfstate(struct nfa * nfa, int flag)
+{
+ struct state *s;
+
+ s = newstate(nfa);
+ if (s != NULL)
+ s->flag = (char) flag;
+ return s;
+}
+
+/*
+ * dropstate - delete a state's inarcs and outarcs and free it
+ */
+static void
+dropstate(struct nfa * nfa,
+ struct state * s)
+{
+ struct arc *a;
+
+ while ((a = s->ins) != NULL)
+ freearc(nfa, a);
+ while ((a = s->outs) != NULL)
+ freearc(nfa, a);
+ freestate(nfa, s);
+}
+
+/*
+ * freestate - free a state, which has no in-arcs or out-arcs
+ */
+static void
+freestate(struct nfa * nfa,
+ struct state * s)
+{
+ assert(s != NULL);
+ assert(s->nins == 0 && s->nouts == 0);
+
+ s->no = FREESTATE;
+ s->flag = 0;
+ if (s->next != NULL)
+ s->next->prev = s->prev;
+ else
+ {
+ assert(s == nfa->slast);
+ nfa->slast = s->prev;
+ }
+ if (s->prev != NULL)
+ s->prev->next = s->next;
+ else
+ {
+ assert(s == nfa->states);
+ nfa->states = s->next;
+ }
+ s->prev = NULL;
+ s->next = nfa->free; /* don't delete it, put it on the free
+ * list */
+ nfa->free = s;
+}
+
+/*
+ * destroystate - really get rid of an already-freed state
+ */
+static void
+destroystate(struct nfa * nfa,
+ struct state * s)
+{
+ struct arcbatch *ab;
+ struct arcbatch *abnext;
+
+ assert(s->no == FREESTATE);
+ for (ab = s->oas.next; ab != NULL; ab = abnext)
+ {
+ abnext = ab->next;
+ FREE(ab);
+ }
+ s->ins = NULL;
+ s->outs = NULL;
+ s->next = NULL;
+ FREE(s);
+}
+
+/*
+ * newarc - set up a new arc within an NFA
+ */
+static void
+newarc(struct nfa * nfa,
+ int t,
+ pcolor co,
+ struct state * from,
+ struct state * to)
+{
+ struct arc *a;
+
+ assert(from != NULL && to != NULL);
+
+ /* check for duplicates */
+ for (a = from->outs; a != NULL; a = a->outchain)
+ if (a->to == to && a->co == co && a->type == t)
+ return;
+
+ a = allocarc(nfa, from);
+ if (NISERR())
+ return;
+ assert(a != NULL);
+
+ a->type = t;
+ a->co = (color) co;
+ a->to = to;
+ a->from = from;
+
+ /*
+ * Put the new arc on the beginning, not the end, of the chains. Not
+ * only is this easier, it has the very useful side effect that
+ * deleting the most-recently-added arc is the cheapest case rather
+ * than the most expensive one.
+ */
+ a->inchain = to->ins;
+ to->ins = a;
+ a->outchain = from->outs;
+ from->outs = a;
+
+ from->nouts++;
+ to->nins++;
+
+ if (COLORED(a) && nfa->parent == NULL)
+ colorchain(nfa->cm, a);
+
+ return;
+}
+
+/*
+ * allocarc - allocate a new out-arc within a state
+ */
+static struct arc * /* NULL for failure */
+allocarc(struct nfa * nfa,
+ struct state * s)
+{
+ struct arc *a;
+ struct arcbatch *new;
+ int i;
+
+ /* shortcut */
+ if (s->free == NULL && s->noas < ABSIZE)
+ {
+ a = &s->oas.a[s->noas];
+ s->noas++;
+ return a;
+ }
+
+ /* if none at hand, get more */
+ if (s->free == NULL)
+ {
+ new = (struct arcbatch *) MALLOC(sizeof(struct arcbatch));
+ if (new == NULL)
+ {
+ NERR(REG_ESPACE);
+ return NULL;
+ }
+ new->next = s->oas.next;
+ s->oas.next = new;
+
+ for (i = 0; i < ABSIZE; i++)
+ {
+ new->a[i].type = 0;
+ new->a[i].freechain = &new->a[i + 1];
+ }
+ new->a[ABSIZE - 1].freechain = NULL;
+ s->free = &new->a[0];
+ }
+ assert(s->free != NULL);
+
+ a = s->free;
+ s->free = a->freechain;
+ return a;
+}
+
+/*
+ * freearc - free an arc
+ */
+static void
+freearc(struct nfa * nfa,
+ struct arc * victim)
+{
+ struct state *from = victim->from;
+ struct state *to = victim->to;
+ struct arc *a;
+
+ assert(victim->type != 0);
+
+ /* take it off color chain if necessary */
+ if (COLORED(victim) && nfa->parent == NULL)
+ uncolorchain(nfa->cm, victim);
+
+ /* take it off source's out-chain */
+ assert(from != NULL);
+ assert(from->outs != NULL);
+ a = from->outs;
+ if (a == victim) /* simple case: first in chain */
+ from->outs = victim->outchain;
+ else
+ {
+ for (; a != NULL && a->outchain != victim; a = a->outchain)
+ continue;
+ assert(a != NULL);
+ a->outchain = victim->outchain;
+ }
+ from->nouts--;
+
+ /* take it off target's in-chain */
+ assert(to != NULL);
+ assert(to->ins != NULL);
+ a = to->ins;
+ if (a == victim) /* simple case: first in chain */
+ to->ins = victim->inchain;
+ else
+ {
+ for (; a != NULL && a->inchain != victim; a = a->inchain)
+ continue;
+ assert(a != NULL);
+ a->inchain = victim->inchain;
+ }
+ to->nins--;
+
+ /* clean up and place on free list */
+ victim->type = 0;
+ victim->from = NULL; /* precautions... */
+ victim->to = NULL;
+ victim->inchain = NULL;
+ victim->outchain = NULL;
+ victim->freechain = from->free;
+ from->free = victim;
+}
+
+/*
+ * findarc - find arc, if any, from given source with given type and color
+ * If there is more than one such arc, the result is random.
+ */
+static struct arc *
+findarc(struct state * s,
+ int type,
+ pcolor co)
+{
+ struct arc *a;
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ if (a->type == type && a->co == co)
+ return a;
+ return NULL;
+}
+
+/*
+ * cparc - allocate a new arc within an NFA, copying details from old one
+ */
+static void
+cparc(struct nfa * nfa,
+ struct arc * oa,
+ struct state * from,
+ struct state * to)
+{
+ newarc(nfa, oa->type, oa->co, from, to);
+}
+
+/*
+ * moveins - move all in arcs of a state to another state
+ *
+ * You might think this could be done better by just updating the
+ * existing arcs, and you would be right if it weren't for the desire
+ * for duplicate suppression, which makes it easier to just make new
+ * ones to exploit the suppression built into newarc.
+ */
+static void
+moveins(struct nfa * nfa,
+ struct state * old,
+ struct state * new)
+{
+ struct arc *a;
+
+ assert(old != new);
+
+ while ((a = old->ins) != NULL)
+ {
+ cparc(nfa, a, a->from, new);
+ freearc(nfa, a);
+ }
+ assert(old->nins == 0);
+ assert(old->ins == NULL);
+}
+
+/*
+ * copyins - copy all in arcs of a state to another state
+ */
+static void
+copyins(struct nfa * nfa,
+ struct state * old,
+ struct state * new)
+{
+ struct arc *a;
+
+ assert(old != new);
+
+ for (a = old->ins; a != NULL; a = a->inchain)
+ cparc(nfa, a, a->from, new);
+}
+
+/*
+ * moveouts - move all out arcs of a state to another state
+ */
+static void
+moveouts(struct nfa * nfa,
+ struct state * old,
+ struct state * new)
+{
+ struct arc *a;
+
+ assert(old != new);
+
+ while ((a = old->outs) != NULL)
+ {
+ cparc(nfa, a, new, a->to);
+ freearc(nfa, a);
+ }
+}
+
+/*
+ * copyouts - copy all out arcs of a state to another state
+ */
+static void
+copyouts(struct nfa * nfa,
+ struct state * old,
+ struct state * new)
+{
+ struct arc *a;
+
+ assert(old != new);
+
+ for (a = old->outs; a != NULL; a = a->outchain)
+ cparc(nfa, a, new, a->to);
+}
+
+/*
+ * cloneouts - copy out arcs of a state to another state pair, modifying type
+ */
+static void
+cloneouts(struct nfa * nfa,
+ struct state * old,
+ struct state * from,
+ struct state * to,
+ int type)
+{
+ struct arc *a;
+
+ assert(old != from);
+
+ for (a = old->outs; a != NULL; a = a->outchain)
+ newarc(nfa, type, a->co, from, to);
+}
+
+/*
+ * delsub - delete a sub-NFA, updating subre pointers if necessary
+ *
+ * This uses a recursive traversal of the sub-NFA, marking already-seen
+ * states using their tmp pointer.
+ */
+static void
+delsub(struct nfa * nfa,
+ struct state * lp, /* the sub-NFA goes from here... */
+ struct state * rp) /* ...to here, *not* inclusive */
+{
+ assert(lp != rp);
+
+ rp->tmp = rp; /* mark end */
+
+ deltraverse(nfa, lp, lp);
+ assert(lp->nouts == 0 && rp->nins == 0); /* did the job */
+ assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
+
+ rp->tmp = NULL; /* unmark end */
+ lp->tmp = NULL; /* and begin, marked by deltraverse */
+}
+
+/*
+ * deltraverse - the recursive heart of delsub
+ * This routine's basic job is to destroy all out-arcs of the state.
+ */
+static void
+deltraverse(struct nfa * nfa,
+ struct state * leftend,
+ struct state * s)
+{
+ struct arc *a;
+ struct state *to;
+
+ if (s->nouts == 0)
+ return; /* nothing to do */
+ if (s->tmp != NULL)
+ return; /* already in progress */
+
+ s->tmp = s; /* mark as in progress */
+
+ while ((a = s->outs) != NULL)
+ {
+ to = a->to;
+ deltraverse(nfa, leftend, to);
+ assert(to->nouts == 0 || to->tmp != NULL);
+ freearc(nfa, a);
+ if (to->nins == 0 && to->tmp == NULL)
+ {
+ assert(to->nouts == 0);
+ freestate(nfa, to);
+ }
+ }
+
+ assert(s->no != FREESTATE); /* we're still here */
+ assert(s == leftend || s->nins != 0); /* and still reachable */
+ assert(s->nouts == 0); /* but have no outarcs */
+
+ s->tmp = NULL; /* we're done here */
+}
+
+/*
+ * dupnfa - duplicate sub-NFA
+ *
+ * Another recursive traversal, this time using tmp to point to duplicates
+ * as well as mark already-seen states. (You knew there was a reason why
+ * it's a state pointer, didn't you? :-))
+ */
+static void
+dupnfa(struct nfa * nfa,
+ struct state * start, /* duplicate of subNFA starting here */
+ struct state * stop, /* and stopping here */
+ struct state * from, /* stringing duplicate from here */
+ struct state * to) /* to here */
+{
+ if (start == stop)
+ {
+ newarc(nfa, EMPTY, 0, from, to);
+ return;
+ }
+
+ stop->tmp = to;
+ duptraverse(nfa, start, from);
+ /* done, except for clearing out the tmp pointers */
+
+ stop->tmp = NULL;
+ cleartraverse(nfa, start);
+}
+
+/*
+ * duptraverse - recursive heart of dupnfa
+ */
+static void
+duptraverse(struct nfa * nfa,
+ struct state * s,
+ struct state * stmp) /* s's duplicate, or NULL */
+{
+ struct arc *a;
+
+ if (s->tmp != NULL)
+ return; /* already done */
+
+ s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
+ if (s->tmp == NULL)
+ {
+ assert(NISERR());
+ return;
+ }
+
+ for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
+ {
+ duptraverse(nfa, a->to, (struct state *) NULL);
+ assert(a->to->tmp != NULL);
+ cparc(nfa, a, s->tmp, a->to->tmp);
+ }
+}
+
+/*
+ * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
+ */
+static void
+cleartraverse(struct nfa * nfa,
+ struct state * s)
+{
+ struct arc *a;
+
+ if (s->tmp == NULL)
+ return;
+ s->tmp = NULL;
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ cleartraverse(nfa, a->to);
+}
+
+/*
+ * specialcolors - fill in special colors for an NFA
+ */
+static void
+specialcolors(struct nfa * nfa)
+{
+ /* false colors for BOS, BOL, EOS, EOL */
+ if (nfa->parent == NULL)
+ {
+ nfa->bos[0] = pseudocolor(nfa->cm);
+ nfa->bos[1] = pseudocolor(nfa->cm);
+ nfa->eos[0] = pseudocolor(nfa->cm);
+ nfa->eos[1] = pseudocolor(nfa->cm);
+ }
+ else
+ {
+ assert(nfa->parent->bos[0] != COLORLESS);
+ nfa->bos[0] = nfa->parent->bos[0];
+ assert(nfa->parent->bos[1] != COLORLESS);
+ nfa->bos[1] = nfa->parent->bos[1];
+ assert(nfa->parent->eos[0] != COLORLESS);
+ nfa->eos[0] = nfa->parent->eos[0];
+ assert(nfa->parent->eos[1] != COLORLESS);
+ nfa->eos[1] = nfa->parent->eos[1];
+ }
+}
+
+/*
+ * optimize - optimize an NFA
+ */
+static long /* re_info bits */
+optimize(struct nfa * nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+#ifdef REG_DEBUG
+ int verbose = (f != NULL) ? 1 : 0;
+
+ if (verbose)
+ fprintf(f, "\ninitial cleanup:\n");
+#endif
+ cleanup(nfa); /* may simplify situation */
+#ifdef REG_DEBUG
+ if (verbose)
+ dumpnfa(nfa, f);
+ if (verbose)
+ fprintf(f, "\nempties:\n");
+#endif
+ fixempties(nfa, f); /* get rid of EMPTY arcs */
+#ifdef REG_DEBUG
+ if (verbose)
+ fprintf(f, "\nconstraints:\n");
+#endif
+ pullback(nfa, f); /* pull back constraints backward */
+ pushfwd(nfa, f); /* push fwd constraints forward */
+#ifdef REG_DEBUG
+ if (verbose)
+ fprintf(f, "\nfinal cleanup:\n");
+#endif
+ cleanup(nfa); /* final tidying */
+ return analyze(nfa); /* and analysis */
+}
+
+/*
+ * pullback - pull back constraints backward to (with luck) eliminate them
+ */
+static void
+pullback(struct nfa * nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+ struct state *s;
+ struct state *nexts;
+ struct arc *a;
+ struct arc *nexta;
+ int progress;
+
+ /* find and pull until there are no more */
+ do
+ {
+ progress = 0;
+ for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+ {
+ nexts = s->next;
+ for (a = s->outs; a != NULL && !NISERR(); a = nexta)
+ {
+ nexta = a->outchain;
+ if (a->type == '^' || a->type == BEHIND)
+ if (pull(nfa, a))
+ progress = 1;
+ assert(nexta == NULL || s->no != FREESTATE);
+ }
+ }
+ if (progress && f != NULL)
+ dumpnfa(nfa, f);
+ } while (progress && !NISERR());
+ if (NISERR())
+ return;
+
+ for (a = nfa->pre->outs; a != NULL; a = nexta)
+ {
+ nexta = a->outchain;
+ if (a->type == '^')
+ {
+ assert(a->co == 0 || a->co == 1);
+ newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
+ freearc(nfa, a);
+ }
+ }
+}
+
+/*
+ * pull - pull a back constraint backward past its source state
+ * A significant property of this function is that it deletes at most
+ * one state -- the constraint's from state -- and only if the constraint
+ * was that state's last outarc.
+ */
+static int /* 0 couldn't, 1 could */
+pull(struct nfa * nfa,
+ struct arc * con)
+{
+ struct state *from = con->from;
+ struct state *to = con->to;
+ struct arc *a;
+ struct arc *nexta;
+ struct state *s;
+
+ if (from == to)
+ { /* circular constraint is pointless */
+ freearc(nfa, con);
+ return 1;
+ }
+ if (from->flag) /* can't pull back beyond start */
+ return 0;
+ if (from->nins == 0)
+ { /* unreachable */
+ freearc(nfa, con);
+ return 1;
+ }
+
+ /* first, clone from state if necessary to avoid other outarcs */
+ if (from->nouts > 1)
+ {
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ assert(to != from); /* con is not an inarc */
+ copyins(nfa, from, s); /* duplicate inarcs */
+ cparc(nfa, con, s, to); /* move constraint arc */
+ freearc(nfa, con);
+ from = s;
+ con = from->outs;
+ }
+ assert(from->nouts == 1);
+
+ /* propagate the constraint into the from state's inarcs */
+ for (a = from->ins; a != NULL; a = nexta)
+ {
+ nexta = a->inchain;
+ switch (combine(con, a))
+ {
+ case INCOMPATIBLE: /* destroy the arc */
+ freearc(nfa, a);
+ break;
+ case SATISFIED: /* no action needed */
+ break;
+ case COMPATIBLE: /* swap the two arcs, more or less */
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ cparc(nfa, a, s, to); /* anticipate move */
+ cparc(nfa, con, a->from, s);
+ if (NISERR())
+ return 0;
+ freearc(nfa, a);
+ break;
+ default:
+ assert(NOTREACHED);
+ break;
+ }
+ }
+
+ /* remaining inarcs, if any, incorporate the constraint */
+ moveins(nfa, from, to);
+ dropstate(nfa, from); /* will free the constraint */
+ return 1;
+}
+
+/*
+ * pushfwd - push forward constraints forward to (with luck) eliminate them
+ */
+static void
+pushfwd(struct nfa * nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+ struct state *s;
+ struct state *nexts;
+ struct arc *a;
+ struct arc *nexta;
+ int progress;
+
+ /* find and push until there are no more */
+ do
+ {
+ progress = 0;
+ for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+ {
+ nexts = s->next;
+ for (a = s->ins; a != NULL && !NISERR(); a = nexta)
+ {
+ nexta = a->inchain;
+ if (a->type == '$' || a->type == AHEAD)
+ if (push(nfa, a))
+ progress = 1;
+ assert(nexta == NULL || s->no != FREESTATE);
+ }
+ }
+ if (progress && f != NULL)
+ dumpnfa(nfa, f);
+ } while (progress && !NISERR());
+ if (NISERR())
+ return;
+
+ for (a = nfa->post->ins; a != NULL; a = nexta)
+ {
+ nexta = a->inchain;
+ if (a->type == '$')
+ {
+ assert(a->co == 0 || a->co == 1);
+ newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
+ freearc(nfa, a);
+ }
+ }
+}
+
+/*
+ * push - push a forward constraint forward past its destination state
+ * A significant property of this function is that it deletes at most
+ * one state -- the constraint's to state -- and only if the constraint
+ * was that state's last inarc.
+ */
+static int /* 0 couldn't, 1 could */
+push(struct nfa * nfa,
+ struct arc * con)
+{
+ struct state *from = con->from;
+ struct state *to = con->to;
+ struct arc *a;
+ struct arc *nexta;
+ struct state *s;
+
+ if (to == from)
+ { /* circular constraint is pointless */
+ freearc(nfa, con);
+ return 1;
+ }
+ if (to->flag) /* can't push forward beyond end */
+ return 0;
+ if (to->nouts == 0)
+ { /* dead end */
+ freearc(nfa, con);
+ return 1;
+ }
+
+ /* first, clone to state if necessary to avoid other inarcs */
+ if (to->nins > 1)
+ {
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ copyouts(nfa, to, s); /* duplicate outarcs */
+ cparc(nfa, con, from, s); /* move constraint */
+ freearc(nfa, con);
+ to = s;
+ con = to->ins;
+ }
+ assert(to->nins == 1);
+
+ /* propagate the constraint into the to state's outarcs */
+ for (a = to->outs; a != NULL; a = nexta)
+ {
+ nexta = a->outchain;
+ switch (combine(con, a))
+ {
+ case INCOMPATIBLE: /* destroy the arc */
+ freearc(nfa, a);
+ break;
+ case SATISFIED: /* no action needed */
+ break;
+ case COMPATIBLE: /* swap the two arcs, more or less */
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ cparc(nfa, con, s, a->to); /* anticipate move */
+ cparc(nfa, a, from, s);
+ if (NISERR())
+ return 0;
+ freearc(nfa, a);
+ break;
+ default:
+ assert(NOTREACHED);
+ break;
+ }
+ }
+
+ /* remaining outarcs, if any, incorporate the constraint */
+ moveouts(nfa, to, from);
+ dropstate(nfa, to); /* will free the constraint */
+ return 1;
+}
+
+/*
+ * combine - constraint lands on an arc, what happens?
+ *
+ * #def INCOMPATIBLE 1 // destroys arc
+ * #def SATISFIED 2 // constraint satisfied
+ * #def COMPATIBLE 3 // compatible but not satisfied yet
+ */
+static int
+combine(struct arc * con,
+ struct arc * a)
+{
+#define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
+
+ switch (CA(con->type, a->type))
+ {
+ case CA('^', PLAIN): /* newlines are handled separately */
+ case CA('$', PLAIN):
+ return INCOMPATIBLE;
+ break;
+ case CA(AHEAD, PLAIN): /* color constraints meet colors */
+ case CA(BEHIND, PLAIN):
+ if (con->co == a->co)
+ return SATISFIED;
+ return INCOMPATIBLE;
+ break;
+ case CA('^', '^'): /* collision, similar constraints */
+ case CA('$', '$'):
+ case CA(AHEAD, AHEAD):
+ case CA(BEHIND, BEHIND):
+ if (con->co == a->co) /* true duplication */
+ return SATISFIED;
+ return INCOMPATIBLE;
+ break;
+ case CA('^', BEHIND): /* collision, dissimilar constraints */
+ case CA(BEHIND, '^'):
+ case CA('$', AHEAD):
+ case CA(AHEAD, '$'):
+ return INCOMPATIBLE;
+ break;
+ case CA('^', '$'): /* constraints passing each other */
+ case CA('^', AHEAD):
+ case CA(BEHIND, '$'):
+ case CA(BEHIND, AHEAD):
+ case CA('$', '^'):
+ case CA('$', BEHIND):
+ case CA(AHEAD, '^'):
+ case CA(AHEAD, BEHIND):
+ case CA('^', LACON):
+ case CA(BEHIND, LACON):
+ case CA('$', LACON):
+ case CA(AHEAD, LACON):
+ return COMPATIBLE;
+ break;
+ }
+ assert(NOTREACHED);
+ return INCOMPATIBLE; /* for benefit of blind compilers */
+}
+
+/*
+ * fixempties - get rid of EMPTY arcs
+ */
+static void
+fixempties(struct nfa * nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+ struct state *s;
+ struct state *nexts;
+ struct arc *a;
+ struct arc *nexta;
+ int progress;
+
+ /* find and eliminate empties until there are no more */
+ do
+ {
+ progress = 0;
+ for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+ {
+ nexts = s->next;
+ for (a = s->outs; a != NULL && !NISERR(); a = nexta)
+ {
+ nexta = a->outchain;
+ if (a->type == EMPTY && unempty(nfa, a))
+ progress = 1;
+ assert(nexta == NULL || s->no != FREESTATE);
+ }
+ }
+ if (progress && f != NULL)
+ dumpnfa(nfa, f);
+ } while (progress && !NISERR());
+}
+
+/*
+ * unempty - optimize out an EMPTY arc, if possible
+ *
+ * Actually, as it stands this function always succeeds, but the return
+ * value is kept with an eye on possible future changes.
+ */
+static int /* 0 couldn't, 1 could */
+unempty(struct nfa * nfa,
+ struct arc * a)
+{
+ struct state *from = a->from;
+ struct state *to = a->to;
+ int usefrom; /* work on from, as opposed to to? */
+
+ assert(a->type == EMPTY);
+ assert(from != nfa->pre && to != nfa->post);
+
+ if (from == to)
+ { /* vacuous loop */
+ freearc(nfa, a);
+ return 1;
+ }
+
+ /* decide which end to work on */
+ usefrom = 1; /* default: attack from */
+ if (from->nouts > to->nins)
+ usefrom = 0;
+ else if (from->nouts == to->nins)
+ {
+ /* decide on secondary issue: move/copy fewest arcs */
+ if (from->nins > to->nouts)
+ usefrom = 0;
+ }
+
+ freearc(nfa, a);
+ if (usefrom)
+ {
+ if (from->nouts == 0)
+ {
+ /* was the state's only outarc */
+ moveins(nfa, from, to);
+ freestate(nfa, from);
+ }
+ else
+ copyins(nfa, from, to);
+ }
+ else
+ {
+ if (to->nins == 0)
+ {
+ /* was the state's only inarc */
+ moveouts(nfa, to, from);
+ freestate(nfa, to);
+ }
+ else
+ copyouts(nfa, to, from);
+ }
+
+ return 1;
+}
+
+/*
+ * cleanup - clean up NFA after optimizations
+ */
+static void
+cleanup(struct nfa * nfa)
+{
+ struct state *s;
+ struct state *nexts;
+ int n;
+
+ /* clear out unreachable or dead-end states */
+ /* use pre to mark reachable, then post to mark can-reach-post */
+ markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
+ markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
+ for (s = nfa->states; s != NULL; s = nexts)
+ {
+ nexts = s->next;
+ if (s->tmp != nfa->post && !s->flag)
+ dropstate(nfa, s);
+ }
+ assert(nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
+ cleartraverse(nfa, nfa->pre);
+ assert(nfa->post->nins == 0 || nfa->post->tmp == NULL);
+ /* the nins==0 (final unreachable) case will be caught later */
+
+ /* renumber surviving states */
+ n = 0;
+ for (s = nfa->states; s != NULL; s = s->next)
+ s->no = n++;
+ nfa->nstates = n;
+}
+
+/*
+ * markreachable - recursive marking of reachable states
+ */
+static void
+markreachable(struct nfa * nfa,
+ struct state * s,
+ struct state * okay, /* consider only states with this
+ * mark */
+ struct state * mark) /* the value to mark with */
+{
+ struct arc *a;
+
+ if (s->tmp != okay)
+ return;
+ s->tmp = mark;
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ markreachable(nfa, a->to, okay, mark);
+}
+
+/*
+ * markcanreach - recursive marking of states which can reach here
+ */
+static void
+markcanreach(struct nfa * nfa,
+ struct state * s,
+ struct state * okay, /* consider only states with this
+ * mark */
+ struct state * mark) /* the value to mark with */
+{
+ struct arc *a;
+
+ if (s->tmp != okay)
+ return;
+ s->tmp = mark;
+
+ for (a = s->ins; a != NULL; a = a->inchain)
+ markcanreach(nfa, a->from, okay, mark);
+}
+
+/*
+ * analyze - ascertain potentially-useful facts about an optimized NFA
+ */
+static long /* re_info bits to be ORed in */
+analyze(struct nfa * nfa)
+{
+ struct arc *a;
+ struct arc *aa;
+
+ if (nfa->pre->outs == NULL)
+ return REG_UIMPOSSIBLE;
+ for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+ for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
+ if (aa->to == nfa->post)
+ return REG_UEMPTYMATCH;
+ return 0;
+}
+
+/*
+ * compact - compact an NFA
+ */
+static void
+compact(struct nfa * nfa,
+ struct cnfa * cnfa)
+{
+ struct state *s;
+ struct arc *a;
+ size_t nstates;
+ size_t narcs;
+ struct carc *ca;
+ struct carc *first;
+
+ assert(!NISERR());
+
+ nstates = 0;
+ narcs = 0;
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ nstates++;
+ narcs += 1 + s->nouts + 1;
+ /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+ }
+
+ cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
+ cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
+ if (cnfa->states == NULL || cnfa->arcs == NULL)
+ {
+ if (cnfa->states != NULL)
+ FREE(cnfa->states);
+ if (cnfa->arcs != NULL)
+ FREE(cnfa->arcs);
+ NERR(REG_ESPACE);
+ return;
+ }
+ cnfa->nstates = nstates;
+ cnfa->pre = nfa->pre->no;
+ cnfa->post = nfa->post->no;
+ cnfa->bos[0] = nfa->bos[0];
+ cnfa->bos[1] = nfa->bos[1];
+ cnfa->eos[0] = nfa->eos[0];
+ cnfa->eos[1] = nfa->eos[1];
+ cnfa->ncolors = maxcolor(nfa->cm) + 1;
+ cnfa->flags = 0;
+
+ ca = cnfa->arcs;
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ assert((size_t) s->no < nstates);
+ cnfa->states[s->no] = ca;
+ ca->co = 0; /* clear and skip flags "arc" */
+ ca++;
+ first = ca;
+ for (a = s->outs; a != NULL; a = a->outchain)
+ switch (a->type)
+ {
+ case PLAIN:
+ ca->co = a->co;
+ ca->to = a->to->no;
+ ca++;
+ break;
+ case LACON:
+ assert(s->no != cnfa->pre);
+ ca->co = (color) (cnfa->ncolors + a->co);
+ ca->to = a->to->no;
+ ca++;
+ cnfa->flags |= HASLACONS;
+ break;
+ default:
+ assert(NOTREACHED);
+ break;
+ }
+ carcsort(first, ca - 1);
+ ca->co = COLORLESS;
+ ca->to = 0;
+ ca++;
+ }
+ assert(ca == &cnfa->arcs[narcs]);
+ assert(cnfa->nstates != 0);
+
+ /* mark no-progress states */
+ for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+ cnfa->states[a->to->no]->co = 1;
+ cnfa->states[nfa->pre->no]->co = 1;
+}
+
+/*
+ * carcsort - sort compacted-NFA arcs by color
+ *
+ * Really dumb algorithm, but if the list is long enough for that to matter,
+ * you're in real trouble anyway.
+ */
+static void
+carcsort(struct carc * first,
+ struct carc * last)
+{
+ struct carc *p;
+ struct carc *q;
+ struct carc tmp;
+
+ if (last - first <= 1)
+ return;
+
+ for (p = first; p <= last; p++)
+ for (q = p; q <= last; q++)
+ if (p->co > q->co ||
+ (p->co == q->co && p->to > q->to))
+ {
+ assert(p != q);
+ tmp = *p;
+ *p = *q;
+ *q = tmp;
+ }
+}
+
+/*
+ * freecnfa - free a compacted NFA
+ */
+static void
+freecnfa(struct cnfa * cnfa)
+{
+ assert(cnfa->nstates != 0); /* not empty already */
+ cnfa->nstates = 0;
+ FREE(cnfa->states);
+ FREE(cnfa->arcs);
+}
+
+/*
+ * dumpnfa - dump an NFA in human-readable form
+ */
+static void
+dumpnfa(struct nfa * nfa,
+ FILE *f)
+{
+#ifdef REG_DEBUG
+ struct state *s;
+
+ fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
+ if (nfa->bos[0] != COLORLESS)
+ fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
+ if (nfa->bos[1] != COLORLESS)
+ fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
+ if (nfa->eos[0] != COLORLESS)
+ fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
+ if (nfa->eos[1] != COLORLESS)
+ fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
+ fprintf(f, "\n");
+ for (s = nfa->states; s != NULL; s = s->next)
+ dumpstate(s, f);
+ if (nfa->parent == NULL)
+ dumpcolors(nfa->cm, f);
+ fflush(f);
+#endif
+}
+
+#ifdef REG_DEBUG /* subordinates of dumpnfa */
+
+/*
+ * dumpstate - dump an NFA state in human-readable form
+ */
+static void
+dumpstate(struct state * s,
+ FILE *f)
+{
+ struct arc *a;
+
+ fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
+ (s->flag) ? s->flag : '.');
+ if (s->prev != NULL && s->prev->next != s)
+ fprintf(f, "\tstate chain bad\n");
+ if (s->nouts == 0)
+ fprintf(f, "\tno out arcs\n");
+ else
+ dumparcs(s, f);
+ fflush(f);
+ for (a = s->ins; a != NULL; a = a->inchain)
+ {
+ if (a->to != s)
+ fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
+ a->from->no, a->to->no, s->no);
+ }
+}
+
+/*
+ * dumparcs - dump out-arcs in human-readable form
+ */
+static void
+dumparcs(struct state * s,
+ FILE *f)
+{
+ int pos;
+
+ assert(s->nouts > 0);
+ /* printing arcs in reverse order is usually clearer */
+ pos = dumprarcs(s->outs, s, f, 1);
+ if (pos != 1)
+ fprintf(f, "\n");
+}
+
+/*
+ * dumprarcs - dump remaining outarcs, recursively, in reverse order
+ */
+static int /* resulting print position */
+dumprarcs(struct arc * a,
+ struct state * s,
+ FILE *f,
+ int pos) /* initial print position */
+{
+ if (a->outchain != NULL)
+ pos = dumprarcs(a->outchain, s, f, pos);
+ dumparc(a, s, f);
+ if (pos == 5)
+ {
+ fprintf(f, "\n");
+ pos = 1;
+ }
+ else
+ pos++;
+ return pos;
+}
+
+/*
+ * dumparc - dump one outarc in readable form, including prefixing tab
+ */
+static void
+dumparc(struct arc * a,
+ struct state * s,
+ FILE *f)
+{
+ struct arc *aa;
+ struct arcbatch *ab;
+
+ fprintf(f, "\t");
+ switch (a->type)
+ {
+ case PLAIN:
+ fprintf(f, "[%ld]", (long) a->co);
+ break;
+ case AHEAD:
+ fprintf(f, ">%ld>", (long) a->co);
+ break;
+ case BEHIND:
+ fprintf(f, "<%ld<", (long) a->co);
+ break;
+ case LACON:
+ fprintf(f, ":%ld:", (long) a->co);
+ break;
+ case '^':
+ case '$':
+ fprintf(f, "%c%d", a->type, (int) a->co);
+ break;
+ case EMPTY:
+ break;
+ default:
+ fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
+ break;
+ }
+ if (a->from != s)
+ fprintf(f, "?%d?", a->from->no);
+ for (ab = &a->from->oas; ab != NULL; ab = ab->next)
+ {
+ for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++)
+ if (aa == a)
+ break; /* NOTE BREAK OUT */
+ if (aa < &ab->a[ABSIZE]) /* propagate break */
+ break; /* NOTE BREAK OUT */
+ }
+ if (ab == NULL)
+ fprintf(f, "?!?"); /* not in allocated space */
+ fprintf(f, "->");
+ if (a->to == NULL)
+ {
+ fprintf(f, "NULL");
+ return;
+ }
+ fprintf(f, "%d", a->to->no);
+ for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
+ if (aa == a)
+ break; /* NOTE BREAK OUT */
+ if (aa == NULL)
+ fprintf(f, "?!?"); /* missing from in-chain */
+}
+#endif /* REG_DEBUG */
+
+/*
+ * dumpcnfa - dump a compacted NFA in human-readable form
+ */
+#ifdef REG_DEBUG
+static void
+dumpcnfa(struct cnfa * cnfa,
+ FILE *f)
+{
+ int st;
+
+ fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
+ if (cnfa->bos[0] != COLORLESS)
+ fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
+ if (cnfa->bos[1] != COLORLESS)
+ fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
+ if (cnfa->eos[0] != COLORLESS)
+ fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
+ if (cnfa->eos[1] != COLORLESS)
+ fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
+ if (cnfa->flags & HASLACONS)
+ fprintf(f, ", haslacons");
+ fprintf(f, "\n");
+ for (st = 0; st < cnfa->nstates; st++)
+ dumpcstate(st, cnfa->states[st], cnfa, f);
+ fflush(f);
+}
+#endif
+
+#ifdef REG_DEBUG /* subordinates of dumpcnfa */
+
+/*
+ * dumpcstate - dump a compacted-NFA state in human-readable form
+ */
+static void
+dumpcstate(int st,
+ struct carc * ca,
+ struct cnfa * cnfa,
+ FILE *f)
+{
+ int i;
+ int pos;
+
+ fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+ pos = 1;
+ for (i = 1; ca[i].co != COLORLESS; i++)
+ {
+ if (ca[i].co < cnfa->ncolors)
+ fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
+ else
+ fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
+ ca[i].to);
+ if (pos == 5)
+ {
+ fprintf(f, "\n");
+ pos = 1;
+ }
+ else
+ pos++;
+ }
+ if (i == 1 || pos != 1)
+ fprintf(f, "\n");
+ fflush(f);
+}
+
+#endif /* REG_DEBUG */
--- /dev/null
+/*
+ * DFA routines
+ * This file is #included by regexec.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header$
+ *
+ */
+
+/*
+ * longest - longest-preferred matching engine
+ */
+static chr * /* endpoint, or NULL */
+longest(struct vars * v, /* used only for debug and exec flags */
+ struct dfa * d,
+ chr *start, /* where the match should start */
+ chr *stop, /* match must end at or before here */
+ int *hitstopp) /* record whether hit v->stop, if non-NULL */
+{
+ chr *cp;
+ chr *realstop = (stop == v->stop) ? stop : stop + 1;
+ color co;
+ struct sset *css;
+ struct sset *ss;
+ chr *post;
+ int i;
+ struct colormap *cm = d->cm;
+
+ /* initialize */
+ css = initialize(v, d, start);
+ cp = start;
+ if (hitstopp != NULL)
+ *hitstopp = 0;
+
+ /* startup */
+ FDEBUG(("+++ startup +++\n"));
+ if (cp == v->start)
+ {
+ co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ }
+ else
+ {
+ co = GETCOLOR(cm, *(cp - 1));
+ FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+ }
+ css = miss(v, d, css, co, cp, start);
+ if (css == NULL)
+ return NULL;
+ css->lastseen = cp;
+
+ /* main loop */
+ if (v->eflags & REG_FTRACE)
+ while (cp < realstop)
+ {
+ FDEBUG(("+++ at c%d +++\n", css - d->ssets));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ else
+ while (cp < realstop)
+ {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+
+ /* shutdown */
+ FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
+ if (cp == v->stop && stop == v->stop)
+ {
+ if (hitstopp != NULL)
+ *hitstopp = 1;
+ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ ss = miss(v, d, css, co, cp, start);
+ /* special case: match ended at eol? */
+ if (ss != NULL && (ss->flags & POSTSTATE))
+ return cp;
+ else if (ss != NULL)
+ ss->lastseen = cp; /* to be tidy */
+ }
+
+ /* find last match, if any */
+ post = d->lastpost;
+ for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
+ if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
+ (post == NULL || post < ss->lastseen))
+ post = ss->lastseen;
+ if (post != NULL) /* found one */
+ return post - 1;
+
+ return NULL;
+}
+
+/*
+ * shortest - shortest-preferred matching engine
+ */
+static chr * /* endpoint, or NULL */
+shortest(struct vars * v,
+ struct dfa * d,
+ chr *start, /* where the match should start */
+ chr *min, /* match must end at or after here */
+ chr *max, /* match must end at or before here */
+ chr **coldp, /* store coldstart pointer here, if
+ * nonNULL */
+ int *hitstopp) /* record whether hit v->stop, if non-NULL */
+{
+ chr *cp;
+ chr *realmin = (min == v->stop) ? min : min + 1;
+ chr *realmax = (max == v->stop) ? max : max + 1;
+ color co;
+ struct sset *css;
+ struct sset *ss;
+ struct colormap *cm = d->cm;
+
+ /* initialize */
+ css = initialize(v, d, start);
+ cp = start;
+ if (hitstopp != NULL)
+ *hitstopp = 0;
+
+ /* startup */
+ FDEBUG(("--- startup ---\n"));
+ if (cp == v->start)
+ {
+ co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ }
+ else
+ {
+ co = GETCOLOR(cm, *(cp - 1));
+ FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+ }
+ css = miss(v, d, css, co, cp, start);
+ if (css == NULL)
+ return NULL;
+ css->lastseen = cp;
+ ss = css;
+
+ /* main loop */
+ if (v->eflags & REG_FTRACE)
+ while (cp < realmax)
+ {
+ FDEBUG(("--- at c%d ---\n", css - d->ssets));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ if ((ss->flags & POSTSTATE) && cp >= realmin)
+ break; /* NOTE BREAK OUT */
+ }
+ else
+ while (cp < realmax)
+ {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ if ((ss->flags & POSTSTATE) && cp >= realmin)
+ break; /* NOTE BREAK OUT */
+ }
+
+ if (ss == NULL)
+ return NULL;
+
+ if (coldp != NULL) /* report last no-progress state set, if
+ * any */
+ *coldp = lastcold(v, d);
+
+ if ((ss->flags & POSTSTATE) && cp > min)
+ {
+ assert(cp >= realmin);
+ cp--;
+ }
+ else if (cp == v->stop && max == v->stop)
+ {
+ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ ss = miss(v, d, css, co, cp, start);
+ /* match might have ended at eol */
+ if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
+ *hitstopp = 1;
+ }
+
+ if (ss == NULL || !(ss->flags & POSTSTATE))
+ return NULL;
+
+ return cp;
+}
+
+/*
+ * lastcold - determine last point at which no progress had been made
+ */
+static chr * /* endpoint, or NULL */
+lastcold(struct vars * v,
+ struct dfa * d)
+{
+ struct sset *ss;
+ chr *nopr;
+ int i;
+
+ nopr = d->lastnopr;
+ if (nopr == NULL)
+ nopr = v->start;
+ for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
+ if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
+ nopr = ss->lastseen;
+ return nopr;
+}
+
+/*
+ * newdfa - set up a fresh DFA
+ */
+static struct dfa *
+newdfa(struct vars * v,
+ struct cnfa * cnfa,
+ struct colormap * cm,
+ struct smalldfa * small) /* preallocated space, may be NULL */
+{
+ struct dfa *d;
+ size_t nss = cnfa->nstates * 2;
+ int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
+ struct smalldfa *smallwas = small;
+
+ assert(cnfa != NULL && cnfa->nstates != 0);
+
+ if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
+ {
+ assert(wordsper == 1);
+ if (small == NULL)
+ {
+ small = (struct smalldfa *) MALLOC(
+ sizeof(struct smalldfa));
+ if (small == NULL)
+ {
+ ERR(REG_ESPACE);
+ return NULL;
+ }
+ }
+ d = &small->dfa;
+ d->ssets = small->ssets;
+ d->statesarea = small->statesarea;
+ d->work = &d->statesarea[nss];
+ d->outsarea = small->outsarea;
+ d->incarea = small->incarea;
+ d->cptsmalloced = 0;
+ d->mallocarea = (smallwas == NULL) ? (char *) small : NULL;
+ }
+ else
+ {
+ d = (struct dfa *) MALLOC(sizeof(struct dfa));
+ if (d == NULL)
+ {
+ ERR(REG_ESPACE);
+ return NULL;
+ }
+ d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
+ d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
+ sizeof(unsigned));
+ d->work = &d->statesarea[nss * wordsper];
+ d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
+ sizeof(struct sset *));
+ d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
+ sizeof(struct arcp));
+ d->cptsmalloced = 1;
+ d->mallocarea = (char *) d;
+ if (d->ssets == NULL || d->statesarea == NULL ||
+ d->outsarea == NULL || d->incarea == NULL)
+ {
+ freedfa(d);
+ ERR(REG_ESPACE);
+ return NULL;
+ }
+ }
+
+ d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
+ d->nssused = 0;
+ d->nstates = cnfa->nstates;
+ d->ncolors = cnfa->ncolors;
+ d->wordsper = wordsper;
+ d->cnfa = cnfa;
+ d->cm = cm;
+ d->lastpost = NULL;
+ d->lastnopr = NULL;
+ d->search = d->ssets;
+
+ /* initialization of sset fields is done as needed */
+
+ return d;
+}
+
+/*
+ * freedfa - free a DFA
+ */
+static void
+freedfa(struct dfa * d)
+{
+ if (d->cptsmalloced)
+ {
+ if (d->ssets != NULL)
+ FREE(d->ssets);
+ if (d->statesarea != NULL)
+ FREE(d->statesarea);
+ if (d->outsarea != NULL)
+ FREE(d->outsarea);
+ if (d->incarea != NULL)
+ FREE(d->incarea);
+ }
+
+ if (d->mallocarea != NULL)
+ FREE(d->mallocarea);
+}
+
+/*
+ * hash - construct a hash code for a bitvector
+ *
+ * There are probably better ways, but they're more expensive.
+ */
+static unsigned
+hash(unsigned *uv,
+ int n)
+{
+ int i;
+ unsigned h;
+
+ h = 0;
+ for (i = 0; i < n; i++)
+ h ^= uv[i];
+ return h;
+}
+
+/*
+ * initialize - hand-craft a cache entry for startup, otherwise get ready
+ */
+static struct sset *
+initialize(struct vars * v, /* used only for debug flags */
+ struct dfa * d,
+ chr *start)
+{
+ struct sset *ss;
+ int i;
+
+ /* is previous one still there? */
+ if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
+ ss = &d->ssets[0];
+ else
+ { /* no, must (re)build it */
+ ss = getvacant(v, d, start, start);
+ for (i = 0; i < d->wordsper; i++)
+ ss->states[i] = 0;
+ BSET(ss->states, d->cnfa->pre);
+ ss->hash = HASH(ss->states, d->wordsper);
+ assert(d->cnfa->pre != d->cnfa->post);
+ ss->flags = STARTER | LOCKED | NOPROGRESS;
+ /* lastseen dealt with below */
+ }
+
+ for (i = 0; i < d->nssused; i++)
+ d->ssets[i].lastseen = NULL;
+ ss->lastseen = start; /* maybe untrue, but harmless */
+ d->lastpost = NULL;
+ d->lastnopr = NULL;
+ return ss;
+}
+
+/*
+ * miss - handle a cache miss
+ */
+static struct sset * /* NULL if goes to empty set */
+miss(struct vars * v, /* used only for debug flags */
+ struct dfa * d,
+ struct sset * css,
+ pcolor co,
+ chr *cp, /* next chr */
+ chr *start) /* where the attempt got started */
+{
+ struct cnfa *cnfa = d->cnfa;
+ int i;
+ unsigned h;
+ struct carc *ca;
+ struct sset *p;
+ int ispost;
+ int noprogress;
+ int gotstate;
+ int dolacons;
+ int sawlacons;
+
+ /* for convenience, we can be called even if it might not be a miss */
+ if (css->outs[co] != NULL)
+ {
+ FDEBUG(("hit\n"));
+ return css->outs[co];
+ }
+ FDEBUG(("miss\n"));
+
+ /* first, what set of states would we end up in? */
+ for (i = 0; i < d->wordsper; i++)
+ d->work[i] = 0;
+ ispost = 0;
+ noprogress = 1;
+ gotstate = 0;
+ for (i = 0; i < d->nstates; i++)
+ if (ISBSET(css->states, i))
+ for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
+ if (ca->co == co)
+ {
+ BSET(d->work, ca->to);
+ gotstate = 1;
+ if (ca->to == cnfa->post)
+ ispost = 1;
+ if (!cnfa->states[ca->to]->co)
+ noprogress = 0;
+ FDEBUG(("%d -> %d\n", i, ca->to));
+ }
+ dolacons = (gotstate) ? (cnfa->flags & HASLACONS) : 0;
+ sawlacons = 0;
+ while (dolacons)
+ { /* transitive closure */
+ dolacons = 0;
+ for (i = 0; i < d->nstates; i++)
+ if (ISBSET(d->work, i))
+ for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
+ ca++)
+ {
+ if (ca->co <= cnfa->ncolors)
+ continue; /* NOTE CONTINUE */
+ sawlacons = 1;
+ if (ISBSET(d->work, ca->to))
+ continue; /* NOTE CONTINUE */
+ if (!lacon(v, cnfa, cp, ca->co))
+ continue; /* NOTE CONTINUE */
+ BSET(d->work, ca->to);
+ dolacons = 1;
+ if (ca->to == cnfa->post)
+ ispost = 1;
+ if (!cnfa->states[ca->to]->co)
+ noprogress = 0;
+ FDEBUG(("%d :> %d\n", i, ca->to));
+ }
+ }
+ if (!gotstate)
+ return NULL;
+ h = HASH(d->work, d->wordsper);
+
+ /* next, is that in the cache? */
+ for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
+ if (HIT(h, d->work, p, d->wordsper))
+ {
+ FDEBUG(("cached c%d\n", p - d->ssets));
+ break; /* NOTE BREAK OUT */
+ }
+ if (i == 0)
+ { /* nope, need a new cache entry */
+ p = getvacant(v, d, cp, start);
+ assert(p != css);
+ for (i = 0; i < d->wordsper; i++)
+ p->states[i] = d->work[i];
+ p->hash = h;
+ p->flags = (ispost) ? POSTSTATE : 0;
+ if (noprogress)
+ p->flags |= NOPROGRESS;
+ /* lastseen to be dealt with by caller */
+ }
+
+ if (!sawlacons)
+ { /* lookahead conds. always cache miss */
+ FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets));
+ css->outs[co] = p;
+ css->inchain[co] = p->ins;
+ p->ins.ss = css;
+ p->ins.co = (color) co;
+ }
+ return p;
+}
+
+/*
+ * lacon - lookahead-constraint checker for miss()
+ */
+static int /* predicate: constraint satisfied? */
+lacon(struct vars * v,
+ struct cnfa * pcnfa, /* parent cnfa */
+ chr *cp,
+ pcolor co) /* "color" of the lookahead constraint */
+{
+ int n;
+ struct subre *sub;
+ struct dfa *d;
+ struct smalldfa sd;
+ chr *end;
+
+ n = co - pcnfa->ncolors;
+ assert(n < v->g->nlacons && v->g->lacons != NULL);
+ FDEBUG(("=== testing lacon %d\n", n));
+ sub = &v->g->lacons[n];
+ d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
+ if (d == NULL)
+ {
+ ERR(REG_ESPACE);
+ return 0;
+ }
+ end = longest(v, d, cp, v->stop, (int *) NULL);
+ freedfa(d);
+ FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
+ return (sub->subno) ? (end != NULL) : (end == NULL);
+}
+
+/*
+ * getvacant - get a vacant state set
+ * This routine clears out the inarcs and outarcs, but does not otherwise
+ * clear the innards of the state set -- that's up to the caller.
+ */
+static struct sset *
+getvacant(struct vars * v, /* used only for debug flags */
+ struct dfa * d,
+ chr *cp,
+ chr *start)
+{
+ int i;
+ struct sset *ss;
+ struct sset *p;
+ struct arcp ap;
+ struct arcp lastap;
+ color co;
+
+ ss = pickss(v, d, cp, start);
+ assert(!(ss->flags & LOCKED));
+
+ /* clear out its inarcs, including self-referential ones */
+ ap = ss->ins;
+ while ((p = ap.ss) != NULL)
+ {
+ co = ap.co;
+ FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long) co));
+ p->outs[co] = NULL;
+ ap = p->inchain[co];
+ p->inchain[co].ss = NULL; /* paranoia */
+ }
+ ss->ins.ss = NULL;
+
+ /* take it off the inarc chains of the ssets reached by its outarcs */
+ for (i = 0; i < d->ncolors; i++)
+ {
+ p = ss->outs[i];
+ assert(p != ss); /* not self-referential */
+ if (p == NULL)
+ continue; /* NOTE CONTINUE */
+ FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
+ if (p->ins.ss == ss && p->ins.co == i)
+ p->ins = ss->inchain[i];
+ else
+ {
+ assert(p->ins.ss != NULL);
+ for (ap = p->ins; ap.ss != NULL &&
+ !(ap.ss == ss && ap.co == i);
+ ap = ap.ss->inchain[ap.co])
+ lastap = ap;
+ assert(ap.ss != NULL);
+ lastap.ss->inchain[lastap.co] = ss->inchain[i];
+ }
+ ss->outs[i] = NULL;
+ ss->inchain[i].ss = NULL;
+ }
+
+ /* if ss was a success state, may need to remember location */
+ if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
+ (d->lastpost == NULL || d->lastpost < ss->lastseen))
+ d->lastpost = ss->lastseen;
+
+ /* likewise for a no-progress state */
+ if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
+ (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
+ d->lastnopr = ss->lastseen;
+
+ return ss;
+}
+
+/*
+ * pickss - pick the next stateset to be used
+ */
+static struct sset *
+pickss(struct vars * v, /* used only for debug flags */
+ struct dfa * d,
+ chr *cp,
+ chr *start)
+{
+ int i;
+ struct sset *ss;
+ struct sset *end;
+ chr *ancient;
+
+ /* shortcut for cases where cache isn't full */
+ if (d->nssused < d->nssets)
+ {
+ i = d->nssused;
+ d->nssused++;
+ ss = &d->ssets[i];
+ FDEBUG(("new c%d\n", i));
+ /* set up innards */
+ ss->states = &d->statesarea[i * d->wordsper];
+ ss->flags = 0;
+ ss->ins.ss = NULL;
+ ss->ins.co = WHITE; /* give it some value */
+ ss->outs = &d->outsarea[i * d->ncolors];
+ ss->inchain = &d->incarea[i * d->ncolors];
+ for (i = 0; i < d->ncolors; i++)
+ {
+ ss->outs[i] = NULL;
+ ss->inchain[i].ss = NULL;
+ }
+ return ss;
+ }
+
+ /* look for oldest, or old enough anyway */
+ if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
+ ancient = cp - d->nssets * 2 / 3;
+ else
+ ancient = start;
+ for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
+ if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+ !(ss->flags & LOCKED))
+ {
+ d->search = ss + 1;
+ FDEBUG(("replacing c%d\n", ss - d->ssets));
+ return ss;
+ }
+ for (ss = d->ssets, end = d->search; ss < end; ss++)
+ if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+ !(ss->flags & LOCKED))
+ {
+ d->search = ss + 1;
+ FDEBUG(("replacing c%d\n", ss - d->ssets));
+ return ss;
+ }
+
+ /* nobody's old enough?!? -- something's really wrong */
+ FDEBUG(("can't find victim to replace!\n"));
+ assert(NOTREACHED);
+ ERR(REG_ASSERT);
+ return d->ssets;
+}