Tcl regex lib

author Ryan Norton <wxprojects@comcast.net>

Mon, 29 Jul 2002 12:27:49 +0000 (12:27 +0000)

committer Ryan Norton <wxprojects@comcast.net>

Mon, 29 Jul 2002 12:27:49 +0000 (12:27 +0000)
author Ryan Norton <wxprojects@comcast.net>
Mon, 29 Jul 2002 12:27:49 +0000 (12:27 +0000)
committer Ryan Norton <wxprojects@comcast.net>
Mon, 29 Jul 2002 12:27:49 +0000 (12:27 +0000)
diff --git a/src/regex/regc_locale.c b/src/regex/regc_locale.c

new file mode 100644 (file)

index 0000000..4e13b84
--- /dev/null
+++ b/src/regex/regc_locale.c
@@ -0,0 +1,838 @@
+/*
+ * regc_locale.c --
+ *
+ *     This file contains locale-specific regexp routines.
+ *     This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ *
+ * This software is copyrighted by the Regents of the University of
+ * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+ * Corporation and other parties.  The following terms apply to all files
+ * associated with the software unless explicitly disclaimed in
+ * individual files.
+ *
+ * The authors hereby grant permission to use, copy, modify, distribute,
+ * and license this software and its documentation for any purpose, provided
+ * that existing copyright notices are retained in all copies and that this
+ * notice is included verbatim in any distributions. No written agreement,
+ * license, or royalty fee is required for any of the authorized uses.
+ * Modifications to this software may be copyrighted by their authors
+ * and need not follow the licensing terms described here, provided that
+ * the new terms are clearly indicated on the first page of each file where
+ * they apply.
+ *
+ * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+ * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+ * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.     THIS SOFTWARE
+ * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+ * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ * MODIFICATIONS.
+ *
+ * GOVERNMENT USE: If you are acquiring this software on behalf of the
+ * U.S. government, the Government shall have only "Restricted Rights"
+ * in the software and related documentation as defined in the Federal
+ * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
+ * are acquiring the software on behalf of the Department of Defense, the
+ * software shall be classified as "Commercial Computer Software" and the
+ * Government shall have only "Restricted Rights" as defined in Clause
+ * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+ * authors grant the U.S. Government and others acting in its behalf
+ * permission to use and distribute the software in accordance with the
+ * terms specified in this license.
+ *
+ * $Header$
+ */
+
+/*
+ * char_and_wchar_strncmp - compare a narrow string with a wide one
+ *
+ * Compares the first nNum characters of cp against the first nNum
+ * characters of wp.  Returns 0 if every pair is equal and nonzero
+ * otherwise; no ordering information is reported.  A count of zero
+ * compares equal without touching either buffer.
+ *
+ * The comparison does not stop at NULs, so both buffers must hold at
+ * least nNum characters.
+ */
+int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
+{
+       size_t          i;
+
+       for (i = 0; i < nNum; i++)
+       {
+               /*
+                * Widen the narrow character rather than narrowing the wide one:
+                * truncating *wp to char would let a wide character whose low
+                * byte happens to match compare equal.
+                */
+               if ((wx_wchar) (unsigned char) cp[i] != wp[i])
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Table of symbolic names for the ASCII characters.
+ *
+ * Each entry pairs a name with the character it denotes; several
+ * characters have more than one name.  The entry whose name is NULL
+ * terminates the table.
+ */
+
+static struct cname
+{
+       char       *name;               /* symbolic name */
+       char            code;           /* character the name stands for */
+}      cnames[] =
+
+{
+       /* control characters */
+       {"NUL", '\0'},
+       {"SOH", '\001'},
+       {"STX", '\002'},
+       {"ETX", '\003'},
+       {"EOT", '\004'},
+       {"ENQ", '\005'},
+       {"ACK", '\006'},
+       {"BEL", '\007'},
+       {"alert", '\007'},
+       {"BS", '\010'},
+       {"backspace", '\b'},
+       {"HT", '\011'},
+       {"tab", '\t'},
+       {"LF", '\012'},
+       {"newline", '\n'},
+       {"VT", '\013'},
+       {"vertical-tab", '\v'},
+       {"FF", '\014'},
+       {"form-feed", '\f'},
+       {"CR", '\015'},
+       {"carriage-return", '\r'},
+       {"SO", '\016'},
+       {"SI", '\017'},
+       {"DLE", '\020'},
+       {"DC1", '\021'},
+       {"DC2", '\022'},
+       {"DC3", '\023'},
+       {"DC4", '\024'},
+       {"NAK", '\025'},
+       {"SYN", '\026'},
+       {"ETB", '\027'},
+       {"CAN", '\030'},
+       {"EM", '\031'},
+       {"SUB", '\032'},
+       {"ESC", '\033'},
+       {"IS4", '\034'},
+       {"FS", '\034'},
+       {"IS3", '\035'},
+       {"GS", '\035'},
+       {"IS2", '\036'},
+       {"RS", '\036'},
+       {"IS1", '\037'},
+       {"US", '\037'},
+
+       /* space and punctuation */
+       {"space", ' '},
+       {"exclamation-mark", '!'},
+       {"quotation-mark", '"'},
+       {"number-sign", '#'},
+       {"dollar-sign", '$'},
+       {"percent-sign", '%'},
+       {"ampersand", '&'},
+       {"apostrophe", '\''},
+       {"left-parenthesis", '('},
+       {"right-parenthesis", ')'},
+       {"asterisk", '*'},
+       {"plus-sign", '+'},
+       {"comma", ','},
+       {"hyphen", '-'},
+       {"hyphen-minus", '-'},
+       {"period", '.'},
+       {"full-stop", '.'},
+       {"slash", '/'},
+       {"solidus", '/'},
+
+       /* digits */
+       {"zero", '0'},
+       {"one", '1'},
+       {"two", '2'},
+       {"three", '3'},
+       {"four", '4'},
+       {"five", '5'},
+       {"six", '6'},
+       {"seven", '7'},
+       {"eight", '8'},
+       {"nine", '9'},
+
+       /* remaining punctuation */
+       {"colon", ':'},
+       {"semicolon", ';'},
+       {"less-than-sign", '<'},
+       {"equals-sign", '='},
+       {"greater-than-sign", '>'},
+       {"question-mark", '?'},
+       {"commercial-at", '@'},
+       {"left-square-bracket", '['},
+       {"backslash", '\\'},
+       {"reverse-solidus", '\\'},
+       {"right-square-bracket", ']'},
+       {"circumflex", '^'},
+       {"circumflex-accent", '^'},
+       {"underscore", '_'},
+       {"low-line", '_'},
+       {"grave-accent", '`'},
+       {"left-brace", '{'},
+       {"left-curly-bracket", '{'},
+       {"vertical-line", '|'},
+       {"right-brace", '}'},
+       {"right-curly-bracket", '}'},
+       {"tilde", '~'},
+       {"DEL", '\177'},
+
+       /* end-of-table marker */
+       {NULL, 0}
+};
+
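+/*
+ * ctype wrappers that accept a wx_wchar: any value outside 0..UCHAR_MAX is
+ * rejected (or, for the case converters, returned unchanged) before the
+ * narrow <ctype.h> routine is consulted.
+ */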
+/* wx_isdigit - 1 if c lies in 0..UCHAR_MAX and isdigit() accepts it, else 0 */
+static int
+wx_isdigit(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return isdigit((unsigned char) c) != 0;
+}
+
+/* wx_isalpha - 1 if c lies in 0..UCHAR_MAX and isalpha() accepts it, else 0 */
+static int
+wx_isalpha(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return isalpha((unsigned char) c) != 0;
+}
+
+/* wx_isalnum - 1 if c lies in 0..UCHAR_MAX and isalnum() accepts it, else 0 */
+static int
+wx_isalnum(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return isalnum((unsigned char) c) != 0;
+}
+
+/* wx_isupper - 1 if c lies in 0..UCHAR_MAX and isupper() accepts it, else 0 */
+static int
+wx_isupper(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return isupper((unsigned char) c) != 0;
+}
+
+/* wx_islower - 1 if c lies in 0..UCHAR_MAX and islower() accepts it, else 0 */
+static int
+wx_islower(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return islower((unsigned char) c) != 0;
+}
+
+/* wx_isgraph - 1 if c lies in 0..UCHAR_MAX and isgraph() accepts it, else 0 */
+static int
+wx_isgraph(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return isgraph((unsigned char) c) != 0;
+}
+
+/* wx_ispunct - 1 if c lies in 0..UCHAR_MAX and ispunct() accepts it, else 0 */
+static int
+wx_ispunct(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return ispunct((unsigned char) c) != 0;
+}
+
+/* wx_isspace - 1 if c lies in 0..UCHAR_MAX and isspace() accepts it, else 0 */
+static int
+wx_isspace(wx_wchar c)
+{
+       if (c < 0 || c > UCHAR_MAX)
+               return 0;
+       return isspace((unsigned char) c) != 0;
+}
+
+/*
+ * wx_toupper - toupper() for a wx_wchar
+ *
+ * Values outside 0..UCHAR_MAX are returned unchanged.
+ */
+static wx_wchar
+wx_toupper(wx_wchar c)
+{
+       const int       in_range = (c >= 0 && c <= UCHAR_MAX);
+
+       return in_range ? (wx_wchar) toupper((unsigned char) c) : c;
+}
+
+/*
+ * wx_tolower - tolower() for a wx_wchar
+ *
+ * Values outside 0..UCHAR_MAX are returned unchanged.
+ */
+static wx_wchar
+wx_tolower(wx_wchar c)
+{
+       const int       in_range = (c >= 0 && c <= UCHAR_MAX);
+
+       return in_range ? (wx_wchar) tolower((unsigned char) c) : c;
+}
+
+
+/*
+ * nmcces - number of distinct multi-character collating elements (MCCEs)
+ *
+ * Always 0: no MCCEs are defined here.
+ */
+static int
+nmcces(struct vars * v)
+{
+       (void) v;                               /* context is not consulted */
+
+       return 0;
+}
+
+/*
+ * nleaders - number of chrs that can be the first chr of an MCCE
+ *
+ * Always 0.
+ */
+static int
+nleaders(struct vars * v)
+{
+       (void) v;                               /* context is not consulted */
+
+       return 0;
+}
+
+/*
+ * allmcces - return a cvec with all the MCCEs of the locale
+ *
+ * Nothing is added to the caller's cvec; the result is simply whatever
+ * clearcvec() returns for it.
+ */
+static struct cvec *
+allmcces(struct vars * v,              /* context (not consulted) */
+                struct cvec * cv)              /* caller-supplied cvec, assumed big enough */
+{
+       struct cvec *result = clearcvec(cv);
+
+       (void) v;
+       return result;
+}
+
+/*
+ * element - translate a collating-element name into its celt
+ *
+ * A one-chr name denotes that chr.  Longer names are looked up in
+ * cnames[]; REG_ULOCALE is noted before the lookup.  An unknown name
+ * raises REG_ECOLLATE and yields 0.
+ */
+static celt
+element(struct vars * v,               /* context */
+               chr *startp,                    /* first chr of the name */
+               chr *endp)                              /* one past the last chr of the name */
+{
+       struct cname *entry;
+       size_t          namelen;
+
+       assert(startp < endp);
+       namelen = endp - startp;
+
+       /* a single chr stands for itself */
+       if (namelen == 1)
+               return *startp;
+
+       NOTE(REG_ULOCALE);
+
+       /* linear scan; the table ends at the entry whose name is NULL */
+       for (entry = cnames; entry->name != NULL; entry++)
+       {
+               if (strlen(entry->name) != namelen)
+                       continue;
+               if (char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
+                       return CHR(entry->code);
+       }
+
+       /* no entry matched */
+       ERR(REG_ECOLLATE);
+       return 0;
+}
+
+/*
+ * range - build the cvec for a bracket range a-b, checking legality first
+ *
+ * Raises REG_ERANGE and returns NULL when a and b differ and a does not
+ * come before b.  Returns NULL as well when NOERRN() sees an error after
+ * the cvec request.
+ */
+static struct cvec *
+range(struct vars * v,                 /* context */
+         celt a,                                       /* range start */
+         celt b,                                       /* range end, might equal a */
+         int cases)                            /* case-independent? */
+{
+       struct cvec *cv;
+       celt            cur;
+
+       if (a != b && !before(a, b))
+       {
+               ERR(REG_ERANGE);
+               return NULL;
+       }
+
+       if (cases)
+       {
+               /*
+                * Case-independent: a single cvec range is not used.  Every
+                * element is listed individually along with its case variants,
+                * so ask for two slots per element plus four spare.
+                */
+               int                     capacity = (b - a + 1) * 2 + 4;
+
+               cv = getcvec(v, capacity, 0, 0);
+               NOERRN();
+
+               for (cur = a; cur <= b; cur++)
+               {
+                       celt            lower = wx_tolower((chr) cur);
+                       celt            upper = wx_toupper((chr) cur);
+
+                       addchr(cv, cur);
+                       if (lower != cur)
+                               addchr(cv, lower);
+                       if (upper != cur)
+                               addchr(cv, upper);
+               }
+               return cv;
+       }
+
+       /* case-sensitive: one range entry covers everything */
+       cv = getcvec(v, 0, 1, 0);
+       NOERRN();
+       addrange(cv, a, b);
+       return cv;
+}
+
+/*
+ * before - does celt x precede celt y, for purposes of range legality?
+ *
+ * With no MCCEs this is a plain numeric comparison.
+ */
+static int                                             /* 1 if x precedes y, else 0 */
+before(celt x, celt y)
+{
+       return (x < y) ? 1 : 0;
+}
+
+/*
+ * eclass - supply cvec for an equivalence class
+ *
+ * Must include case counterparts on request.  No real equivalence
+ * classes are defined: apart from the REG_FAKE test hook, the class of c
+ * is c itself, or c together with its case counterparts when cases is
+ * set.
+ *
+ * Returns NULL with REG_ESPACE raised if no cvec could be obtained.
+ */
+static struct cvec *
+eclass(struct vars * v,                        /* context */
+          celt c,                                      /* Collating element representing the
+                                                                * equivalence class. */
+          int cases)                           /* all cases? */
+{
+       struct cvec *cv;
+
+       /* crude fake equivalence class for testing */
+       if ((v->cflags & REG_FAKE) && c == 'x')
+       {
+               cv = getcvec(v, 4, 0, 0);
+               if (cv == NULL)
+               {
+                       /* do not write through a missing cvec */
+                       ERR(REG_ESPACE);
+                       return NULL;
+               }
+               addchr(cv, (chr) 'x');
+               addchr(cv, (chr) 'y');
+               if (cases)
+               {
+                       addchr(cv, (chr) 'X');
+                       addchr(cv, (chr) 'Y');
+               }
+               return cv;
+       }
+
+       /* otherwise, none */
+       if (cases)
+               return allcases(v, c);
+
+       cv = getcvec(v, 1, 0, 0);
+       if (cv == NULL)
+       {
+               /* a real check: an assert() would vanish under NDEBUG */
+               ERR(REG_ESPACE);
+               return NULL;
+       }
+       addchr(cv, (chr) c);
+       return cv;
+}
+
+/*
+ * cclass_isprint - membership test for the "print" class
+ *
+ * Only values in 0..UCHAR_MAX are handed to isprint(); everything else
+ * is rejected.
+ */
+static int
+cclass_isprint(wx_wchar c)
+{
+       return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
+}
+
+/*
+ * cclass_collect - cvec of every code in 0..UCHAR_MAX accepted by pred
+ *
+ * nranges is the number of range slots to reserve on top of the
+ * individual chrs.  Returns NULL if getcvec() supplied no cvec.
+ */
+static struct cvec *
+cclass_collect(struct vars * v,                        /* context */
+                          int (*pred) (wx_wchar),      /* membership test */
+                          int nranges)                         /* range slots to reserve */
+{
+       struct cvec *cv;
+       int                     i;
+
+       /* room for all UCHAR_MAX + 1 candidate codes */
+       cv = getcvec(v, UCHAR_MAX + 1, nranges, 0);
+       if (cv == NULL)
+               return NULL;
+
+       for (i = 0; i <= UCHAR_MAX; i++)
+       {
+               if (pred((chr) i))
+                       addchr(cv, (chr) i);
+       }
+       return cv;
+}
+
+/*
+ * cclass - supply cvec for a character class
+ *
+ * Must include case counterparts on request.
+ *
+ * The name in [startp, endp) is looked up among the known class names.
+ * An unknown name raises REG_ECTYPE and returns NULL.  If no cvec can be
+ * obtained, REG_ESPACE is raised and NULL is returned.
+ */
+static struct cvec *
+cclass(struct vars * v,                        /* context */
+          chr *startp,                         /* where the name starts */
+          chr *endp,                           /* just past the end of the name */
+          int cases)                           /* case-independent? */
+{
+       size_t          len;
+       struct cvec *cv = NULL;
+       const char *const *namePtr;
+       int                     i,
+                               cls;
+
+       /*
+        * The valid character class names.  The order must match enum
+        * classes below, since the table position is used as the enum value.
+        */
+
+       static const char *const classNames[] = {
+               "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
+               "lower", "print", "punct", "space", "upper", "xdigit", NULL
+       };
+
+       enum classes
+       {
+               CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
+               CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
+       };
+
+       /*
+        * Map the name to the corresponding enumerated value.
+        */
+       len = endp - startp;
+       cls = -1;
+       for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
+       {
+               if (strlen(*namePtr) == len &&
+                       char_and_wchar_strncmp(*namePtr, startp, len) == 0)
+               {
+                       cls = i;
+                       break;
+               }
+       }
+       if (cls == -1)
+       {
+               ERR(REG_ECTYPE);
+               return NULL;
+       }
+
+       /*
+        * Remap lower and upper to alpha if the match is case insensitive.
+        */
+
+       if (cases &&
+               ((enum classes) cls == CC_LOWER ||
+                (enum classes) cls == CC_UPPER))
+               cls = (int) CC_ALPHA;
+
+       /*
+        * Now compute the character class contents.
+        *
+        * For the moment, assume that only char codes < 256 can be in these
+        * classes.  Every case leaves cv NULL when no cvec could be obtained;
+        * that is reported once, after the switch.
+        */
+
+       switch ((enum classes) cls)
+       {
+               case CC_PRINT:
+
+                       /*
+                        * "print" is its own class (graphic characters plus space);
+                        * it must not be folded into "alnum", which would leave out
+                        * punctuation and the space character.
+                        */
+                       cv = cclass_collect(v, cclass_isprint, 0);
+                       break;
+               case CC_ALNUM:
+                       cv = cclass_collect(v, wx_isalpha, 1);
+                       if (cv)
+                               addrange(cv, (chr) '0', (chr) '9');
+                       break;
+               case CC_ALPHA:
+                       cv = cclass_collect(v, wx_isalpha, 0);
+                       break;
+               case CC_ASCII:
+                       cv = getcvec(v, 0, 1, 0);
+                       if (cv)
+                               addrange(cv, 0, 0x7f);
+                       break;
+               case CC_BLANK:
+                       cv = getcvec(v, 2, 0, 0);
+                       if (cv)
+                       {
+                               addchr(cv, '\t');
+                               addchr(cv, ' ');
+                       }
+                       break;
+               case CC_CNTRL:
+                       cv = getcvec(v, 0, 2, 0);
+                       if (cv)
+                       {
+                               addrange(cv, 0x0, 0x1f);
+                               addrange(cv, 0x7f, 0x9f);
+                       }
+                       break;
+               case CC_DIGIT:
+                       cv = getcvec(v, 0, 1, 0);
+                       if (cv)
+                               addrange(cv, (chr) '0', (chr) '9');
+                       break;
+               case CC_PUNCT:
+                       cv = cclass_collect(v, wx_ispunct, 0);
+                       break;
+               case CC_XDIGIT:
+                       cv = getcvec(v, 0, 3, 0);
+                       if (cv)
+                       {
+                               addrange(cv, '0', '9');
+                               addrange(cv, 'a', 'f');
+                               addrange(cv, 'A', 'F');
+                       }
+                       break;
+               case CC_SPACE:
+                       cv = cclass_collect(v, wx_isspace, 0);
+                       break;
+               case CC_LOWER:
+                       cv = cclass_collect(v, wx_islower, 0);
+                       break;
+               case CC_UPPER:
+                       cv = cclass_collect(v, wx_isupper, 0);
+                       break;
+               case CC_GRAPH:
+                       cv = cclass_collect(v, wx_isgraph, 0);
+                       break;
+       }
+       if (cv == NULL)
+               ERR(REG_ESPACE);
+       return cv;
+}
+
+/*
+ * allcases - supply cvec for all case counterparts of a chr (including itself)
+ *
+ * This is a shortcut, preferably an efficient one, for simple characters;
+ * messy cases are done via range().
+ *
+ * The cvec receives the lower-case form of pc and, when it differs, the
+ * upper-case form.  Returns NULL with REG_ESPACE raised if no cvec could
+ * be obtained.
+ */
+static struct cvec *
+allcases(struct vars * v,              /* context */
+                chr pc)                                /* character to get case equivs of */
+{
+       struct cvec *cv;
+       chr                     lc,
+                               uc;
+
+       lc = wx_tolower(pc);
+       uc = wx_toupper(pc);
+
+       cv = getcvec(v, 2, 0, 0);
+       if (cv == NULL)
+       {
+               /* do not write through a missing cvec */
+               ERR(REG_ESPACE);
+               return NULL;
+       }
+       addchr(cv, lc);
+       if (lc != uc)
+               addchr(cv, uc);
+       return cv;
+}
+
+/*
+ * cmp - compare two chr substrings of exactly len chrs
+ *
+ * Used for backrefs.  Only equal/unequal is meaningful to callers.  The
+ * whole length is compared bytewise, so embedded NULs do not end the
+ * comparison.
+ */
+static int                                             /* 0 for equal, nonzero for unequal */
+cmp(const chr *x, const chr *y, /* strings to compare */
+       size_t len)                                     /* exact length of comparison */
+{
+       const size_t nbytes = len * sizeof(chr);
+
+       return memcmp(VS(x), VS(y), nbytes);
+}
+
+/*
+ * casecmp - compare two chr substrings of exactly len chrs, ignoring case
+ *
+ * Used for REG_ICASE backrefs.  Only equal/unequal is reported.  All len
+ * positions are examined, so embedded NULs do not end the comparison.
+ */
+static int                                             /* 0 for equal, nonzero for unequal */
+casecmp(const chr *x, const chr *y,            /* strings to compare */
+               size_t len)                             /* exact length of comparison */
+{
+       size_t          i;
+
+       for (i = 0; i < len; i++)
+       {
+               /* identical chrs need no case folding */
+               if (x[i] == y[i])
+                       continue;
+               if (wx_tolower(x[i]) != wx_tolower(y[i]))
+                       return 1;
+       }
+       return 0;
+}
author	Ryan Norton <wxprojects@comcast.net>
	Mon, 29 Jul 2002 12:27:49 +0000 (12:27 +0000)
committer	Ryan Norton <wxprojects@comcast.net>
	Mon, 29 Jul 2002 12:27:49 +0000 (12:27 +0000)