src/regex/regc_locale.c

/*
 * regc_locale.c --
 *
 *	This file contains locale-specific regexp routines.
 *	This file is #included by regcomp.c.
 *
 * Copyright (c) 1998 by Scriptics Corporation.
 *
 * This software is copyrighted by the Regents of the University of
 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
 * Corporation and other parties.  The following terms apply to all files
 * associated with the software unless explicitly disclaimed in
 * individual files.
 *
 * The authors hereby grant permission to use, copy, modify, distribute,
 * and license this software and its documentation for any purpose, provided
 * that existing copyright notices are retained in all copies and that this
 * notice is included verbatim in any distributions. No written agreement,
 * license, or royalty fee is required for any of the authorized uses.
 * Modifications to this software may be copyrighted by their authors
 * and need not follow the licensing terms described here, provided that
 * the new terms are clearly indicated on the first page of each file where
 * they apply.
 *
 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.	THIS SOFTWARE
 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
 * MODIFICATIONS.
 *
 * GOVERNMENT USE: If you are acquiring this software on behalf of the
 * U.S. government, the Government shall have only "Restricted Rights"
 * in the software and related documentation as defined in the Federal
 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).	If you
 * are acquiring the software on behalf of the Department of Defense, the
 * software shall be classified as "Commercial Computer Software" and the
 * Government shall have only "Restricted Rights" as defined in Clause
 * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
 * authors grant the U.S. Government and others acting in its behalf
 * permission to use and distribute the software in accordance with the
 * terms specified in this license.
 *
 * $Header$
 */

int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
{
	while(*cp++ == (const char)*wp++ && --nNum){}

	return nNum;
}

/*
 * ASCII character-name table
 *
 * Maps collating-element names to the character code they stand for.
 * Several characters have more than one name.  The table is terminated by
 * an entry whose name is NULL.  The names are string literals, so the
 * pointer is const-qualified: they must never be written through.
 */

static struct cname
{
	const char *name;			/* collating-element name */
	char		code;			/* character the name denotes */
}	cnames[] =

{
	/* control characters */
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},

	/* space and punctuation */
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},

	/* digits */
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},

	/* remaining punctuation */
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},

	/* terminator */
	{NULL, 0}
};

/*
 * some ctype functions with non-ascii-char guard
 */
static int
wx_isdigit(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to isdigit() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return isdigit((unsigned char) c) != 0;
}

static int
wx_isalpha(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to isalpha() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return isalpha((unsigned char) c) != 0;
}

static int
wx_isalnum(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to isalnum() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return isalnum((unsigned char) c) != 0;
}

static int
wx_isupper(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to isupper() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return isupper((unsigned char) c) != 0;
}

static int
wx_islower(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to islower() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return islower((unsigned char) c) != 0;
}

static int
wx_isgraph(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to isgraph() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return isgraph((unsigned char) c) != 0;
}

static int
wx_ispunct(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to ispunct() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return ispunct((unsigned char) c) != 0;
}

static int
wx_isspace(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are never handed to isspace() */
	if (c < 0 || c > UCHAR_MAX)
		return 0;
	return isspace((unsigned char) c) != 0;
}

static wx_wchar
wx_toupper(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are passed through unchanged */
	if (c < 0 || c > UCHAR_MAX)
		return c;
	return toupper((unsigned char) c);
}

static wx_wchar
wx_tolower(wx_wchar c)
{
	/* values outside 0..UCHAR_MAX are passed through unchanged */
	if (c < 0 || c > UCHAR_MAX)
		return c;
	return tolower((unsigned char) c);
}


/*
 * nmcces - report the number of distinct multi-character collating
 * elements (MCCEs)
 *
 * None are defined at the moment, so the count is always zero and the
 * context argument is not consulted.
 */
static int
nmcces(struct vars * v)
{
	(void) v;
	return 0;
}

/*
 * nleaders - report how many chrs can be the first chr of an MCCE
 *
 * Always zero; the context argument is not consulted.
 */
static int
nleaders(struct vars * v)
{
	(void) v;
	return 0;
}

/*
 * allmcces - return a cvec with all the MCCEs of the locale
 *
 * The supplied cvec is handed to clearcvec() and whatever that returns is
 * passed back to the caller; nothing is added to it here.
 */
static struct cvec *
allmcces(struct vars * v,		/* context */
		 struct cvec * cv)		/* this is supposed to have enough room */
{
	struct cvec *cleared;

	cleared = clearcvec(cv);
	return cleared;
}

/*
 * element - map collating-element name to celt
 *
 * A one-chr name stands for that chr itself.  Longer names are looked up
 * in the cnames table; a name that is not in the table reports
 * REG_ECOLLATE and yields 0.
 */
static celt
element(struct vars * v,		/* context */
		chr *startp,			/* points to start of name */
		chr *endp)				/* points just past end of name */
{
	struct cname *entry;
	size_t		namelen;

	assert(startp < endp);
	namelen = endp - startp;

	/* generic:  a single chr is its own collating element */
	if (namelen == 1)
		return *startp;

	/* record REG_ULOCALE before consulting the name table */
	NOTE(REG_ULOCALE);

	/* walk the table; the first entry with a matching name wins */
	entry = cnames;
	while (entry->name != NULL)
	{
		if (strlen(entry->name) == namelen &&
			char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
			return CHR(entry->code);
		entry++;
	}

	/* reached the terminator: no such name */
	ERR(REG_ECOLLATE);
	return 0;
}

/*
 * range - supply cvec for a range, including legality check
 *
 * Reports REG_ERANGE and returns NULL when a and b differ and a does not
 * come before b.  Without case independence the result holds the single
 * range a..b; with it, every member is added individually together with
 * its lower- and upper-case forms.
 */
static struct cvec *
range(struct vars * v,			/* context */
	  celt a,					/* range start */
	  celt b,					/* range end, might equal a */
	  int cases)				/* case-independent? */
{
	struct cvec *result;
	celt		cur;
	celt		lower;
	celt		upper;
	int			capacity;

	/* a == b is a legal one-element range; otherwise a must precede b */
	if (a != b && !before(a, b))
	{
		ERR(REG_ERANGE);
		return NULL;
	}

	if (cases)
	{
		/*
		 * When case-independent, it's hard to decide when cvec ranges are
		 * usable, so for now at least, we won't try.  Room is reserved for
		 * two case variants of every member plus a little extra for the
		 * two title case variants.
		 */
		capacity = (b - a + 1) * 2 + 4;

		result = getcvec(v, capacity, 0, 0);
		NOERRN();

		cur = a;
		while (cur <= b)
		{
			lower = wx_tolower((chr) cur);
			upper = wx_toupper((chr) cur);

			addchr(result, cur);
			if (lower != cur)
				addchr(result, lower);
			if (upper != cur)
				addchr(result, upper);
			cur++;
		}

		return result;
	}

	/* easy version: a single range entry covers everything */
	result = getcvec(v, 0, 1, 0);
	NOERRN();
	addrange(result, a, b);
	return result;
}

/*
 * before - is celt x before celt y, for purposes of range legality?
 *
 * Returns 1 when x is strictly less than y and 0 otherwise.
 */
static int						/* predicate */
before(celt x, celt y)
{
	/* with no MCCEs, ordering is a plain numeric comparison */
	return (x < y) ? 1 : 0;
}

/*
 * eclass - supply cvec for an equivalence class
 * Must include case counterparts on request.
 *
 * No real equivalence classes are defined: the class of c is c itself,
 * plus its case counterparts (via allcases()) when cases is set.  With
 * REG_FAKE in the compile flags, 'x' gets a fake class of x and y for
 * testing.
 *
 * Returns NULL with REG_ESPACE reported if no cvec could be obtained,
 * rather than asserting or writing through a NULL cvec.
 */
static struct cvec *
eclass(struct vars * v,			/* context */
	   celt c,					/* Collating element representing the
								 * equivalence class. */
	   int cases)				/* all cases? */
{
	struct cvec *cv;

	/* crude fake equivalence class for testing */
	if ((v->cflags & REG_FAKE) && c == 'x')
	{
		cv = getcvec(v, 4, 0, 0);
		if (cv == NULL)
		{
			ERR(REG_ESPACE);
			return NULL;
		}
		addchr(cv, (chr) 'x');
		addchr(cv, (chr) 'y');
		if (cases)
		{
			addchr(cv, (chr) 'X');
			addchr(cv, (chr) 'Y');
		}
		return cv;
	}

	/* otherwise, none */
	if (cases)
		return allcases(v, c);
	cv = getcvec(v, 1, 0, 0);
	if (cv == NULL)
	{
		ERR(REG_ESPACE);
		return NULL;
	}
	addchr(cv, (chr) c);
	return cv;
}

/*
 * cclass - supply cvec for a character class
 *
 * Must include case counterparts on request.
 *
 * The name between startp and endp must be one of the class names listed
 * in classNames; otherwise REG_ECTYPE is reported and NULL is returned.
 * When cases is set, "lower" and "upper" are both treated as "alpha".
 * If no cvec can be obtained, REG_ESPACE is reported and NULL is returned.
 *
 * "print" is built from isprint(), so it contains space and punctuation
 * as well as the alphanumerics.
 */
static struct cvec *
cclass(struct vars * v,			/* context */
	   chr *startp,				/* where the name starts */
	   chr *endp,				/* just past the end of the name */
	   int cases)				/* case-independent? */
{
	size_t		len;
	struct cvec *cv = NULL;
	const char *const *namePtr;
	int			i,
				index;

	/*
	 * The following arrays define the valid character class names.  The
	 * order of classNames must match the order of enum classes.
	 */

	static const char *const classNames[] = {
		"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
		"lower", "print", "punct", "space", "upper", "xdigit", NULL
	};

	enum classes
	{
		CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
		CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
	};

	/*
	 * Map the name to the corresponding enumerated value.
	 */
	len = endp - startp;
	index = -1;
	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
	{
		if (strlen(*namePtr) == len &&
			char_and_wchar_strncmp(*namePtr, startp, len) == 0)
		{
			index = i;
			break;
		}
	}
	if (index == -1)
	{
		ERR(REG_ECTYPE);
		return NULL;
	}

	/*
	 * Remap lower and upper to alpha if the match is case insensitive.
	 */

	if (cases &&
		((enum classes) index == CC_LOWER ||
		 (enum classes) index == CC_UPPER))
		index = (int) CC_ALPHA;

	/*
	 * Now compute the character class contents.
	 *
	 * For the moment, assume that only char codes < 256 can be in these
	 * classes.  Every case leaves cv NULL when getcvec() fails; that is
	 * turned into REG_ESPACE after the switch.
	 */

	switch ((enum classes) index)
	{
		case CC_PRINT:
			/* room for every char code, in case all of them are printable */
			cv = getcvec(v, UCHAR_MAX + 1, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (isprint((unsigned char) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_ALNUM:
			cv = getcvec(v, UCHAR_MAX, 1, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isalpha((chr) i))
						addchr(cv, (chr) i);
				}
				addrange(cv, (chr) '0', (chr) '9');
			}
			break;
		case CC_ALPHA:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isalpha((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_ASCII:
			cv = getcvec(v, 0, 1, 0);
			if (cv)
				addrange(cv, 0, 0x7f);
			break;
		case CC_BLANK:
			cv = getcvec(v, 2, 0, 0);
			if (cv)
			{
				addchr(cv, '\t');
				addchr(cv, ' ');
			}
			break;
		case CC_CNTRL:
			cv = getcvec(v, 0, 2, 0);
			if (cv)
			{
				addrange(cv, 0x0, 0x1f);
				addrange(cv, 0x7f, 0x9f);
			}
			break;
		case CC_DIGIT:
			cv = getcvec(v, 0, 1, 0);
			if (cv)
				addrange(cv, (chr) '0', (chr) '9');
			break;
		case CC_PUNCT:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_ispunct((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_XDIGIT:
			cv = getcvec(v, 0, 3, 0);
			if (cv)
			{
				addrange(cv, '0', '9');
				addrange(cv, 'a', 'f');
				addrange(cv, 'A', 'F');
			}
			break;
		case CC_SPACE:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isspace((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_LOWER:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_islower((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_UPPER:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isupper((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_GRAPH:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isgraph((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
	}
	if (cv == NULL)
		ERR(REG_ESPACE);
	return cv;
}

/*
 * allcases - supply cvec for all case counterparts of a chr (including itself)
 *
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 *
 * The result holds the lower-case form of pc and, when it differs, the
 * upper-case form.  Returns NULL with REG_ESPACE reported if no cvec could
 * be obtained, instead of writing through a NULL cvec.
 */
static struct cvec *
allcases(struct vars * v,		/* context */
		 chr pc)				/* character to get case equivs of */
{
	struct cvec *cv;
	chr			c = (chr) pc;
	chr			lc,
				uc;

	lc = wx_tolower((chr) c);
	uc = wx_toupper((chr) c);

	cv = getcvec(v, 2, 0, 0);
	if (cv == NULL)
	{
		ERR(REG_ESPACE);
		return NULL;
	}
	addchr(cv, lc);
	if (lc != uc)
		addchr(cv, uc);
	return cv;
}

/*
 * cmp - chr-substring compare
 *
 * Backrefs need this.  It should preferably be efficient.
 * Only equal/unequal is reported.  The length is exact and counted in
 * chrs; embedded NULs do not end the comparison because the whole span is
 * compared as raw memory.
 */
static int						/* 0 for equal, nonzero for unequal */
cmp(const chr *x, const chr *y, /* strings to compare */
	size_t len)					/* exact length of comparison */
{
	const size_t nbytes = len * sizeof(chr);

	return memcmp(VS(x), VS(y), nbytes);
}

/*
 * casecmp - case-independent chr-substring compare
 *
 * REG_ICASE backrefs need this.  It should preferably be efficient.
 * Only equal/unequal is reported.  The length is exact, so embedded NULs
 * are compared like any other chr.
 */
static int						/* 0 for equal, nonzero for unequal */
casecmp(const chr *x, const chr *y,		/* strings to compare */
		size_t len)				/* exact length of comparison */
{
	size_t		i;

	for (i = 0; i < len; i++)
	{
		/* identical chrs need no case folding */
		if (x[i] == y[i])
			continue;
		if (wx_tolower(x[i]) != wx_tolower(y[i]))
			return 1;
	}
	return 0;
}
Commit	Line	Data
	1	/*
	2	* regc_locale.c --
	3	*
	4	* This file contains locale-specific regexp routines.
	5	* This file is #included by regcomp.c.
	6	*
	7	* Copyright (c) 1998 by Scriptics Corporation.
	8	*
	9	* This software is copyrighted by the Regents of the University of
	10	* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
	11	* Corporation and other parties. The following terms apply to all files
	12	* associated with the software unless explicitly disclaimed in
	13	* individual files.
	14	*
	15	* The authors hereby grant permission to use, copy, modify, distribute,
	16	* and license this software and its documentation for any purpose, provided
	17	* that existing copyright notices are retained in all copies and that this
	18	* notice is included verbatim in any distributions. No written agreement,
	19	* license, or royalty fee is required for any of the authorized uses.
	20	* Modifications to this software may be copyrighted by their authors
	21	* and need not follow the licensing terms described here, provided that
	22	* the new terms are clearly indicated on the first page of each file where
	23	* they apply.
	24	*
	25	* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
	26	* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
	27	* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
	28	* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
	29	* POSSIBILITY OF SUCH DAMAGE.
	30	*
	31	* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
	32	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
	33	* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
	34	* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
	35	* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
	36	* MODIFICATIONS.
	37	*
	38	* GOVERNMENT USE: If you are acquiring this software on behalf of the
	39	* U.S. government, the Government shall have only "Restricted Rights"
	40	* in the software and related documentation as defined in the Federal
	41	* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
	42	* are acquiring the software on behalf of the Department of Defense, the
	43	* software shall be classified as "Commercial Computer Software" and the
	44	* Government shall have only "Restricted Rights" as defined in Clause
	45	* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
	46	* authors grant the U.S. Government and others acting in its behalf
	47	* permission to use and distribute the software in accordance with the
	48	* terms specified in this license.
	49	*
	50	* $Header$
	51	*/
	52
	53	int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
	54	{
	55	while(cp++ == (const char)wp++ && --nNum){}
	56
	57	return nNum;
	58	}
	59
	60	/* ASCII character-name table */
	61
	62	static struct cname
	63	{
	64	char *name;
	65	char code;
	66	} cnames[] =
	67
	68	{
	69	{
	70	"NUL", '\0'
	71	},
	72	{
	73	"SOH", '\001'
	74	},
	75	{
	76	"STX", '\002'
	77	},
	78	{
	79	"ETX", '\003'
	80	},
	81	{
	82	"EOT", '\004'
	83	},
	84	{
	85	"ENQ", '\005'
	86	},
	87	{
	88	"ACK", '\006'
	89	},
	90	{
	91	"BEL", '\007'
	92	},
	93	{
	94	"alert", '\007'
	95	},
	96	{
	97	"BS", '\010'
	98	},
	99	{
	100	"backspace", '\b'
	101	},
	102	{
	103	"HT", '\011'
	104	},
	105	{
	106	"tab", '\t'
	107	},
	108	{
	109	"LF", '\012'
	110	},
	111	{
	112	"newline", '\n'
	113	},
	114	{
	115	"VT", '\013'
	116	},
	117	{
	118	"vertical-tab", '\v'
	119	},
	120	{
	121	"FF", '\014'
	122	},
	123	{
	124	"form-feed", '\f'
	125	},
	126	{
	127	"CR", '\015'
	128	},
	129	{
	130	"carriage-return", '\r'
	131	},
	132	{
	133	"SO", '\016'
	134	},
	135	{
	136	"SI", '\017'
	137	},
	138	{
	139	"DLE", '\020'
	140	},
	141	{
	142	"DC1", '\021'
	143	},
	144	{
	145	"DC2", '\022'
	146	},
	147	{
	148	"DC3", '\023'
	149	},
	150	{
	151	"DC4", '\024'
	152	},
	153	{
	154	"NAK", '\025'
	155	},
	156	{
	157	"SYN", '\026'
	158	},
	159	{
	160	"ETB", '\027'
	161	},
	162	{
	163	"CAN", '\030'
	164	},
	165	{
	166	"EM", '\031'
	167	},
	168	{
	169	"SUB", '\032'
	170	},
	171	{
	172	"ESC", '\033'
	173	},
	174	{
	175	"IS4", '\034'
	176	},
	177	{
	178	"FS", '\034'
	179	},
	180	{
	181	"IS3", '\035'
	182	},
	183	{
	184	"GS", '\035'
	185	},
	186	{
	187	"IS2", '\036'
	188	},
	189	{
	190	"RS", '\036'
	191	},
	192	{
	193	"IS1", '\037'
	194	},
	195	{
	196	"US", '\037'
	197	},
	198	{
	199	"space", ' '
	200	},
	201	{
	202	"exclamation-mark", '!'
	203	},
	204	{
	205	"quotation-mark", '"'
	206	},
	207	{
	208	"number-sign", '#'
	209	},
	210	{
	211	"dollar-sign", '$'
	212	},
	213	{
	214	"percent-sign", '%'
	215	},
	216	{
	217	"ampersand", '&'
	218	},
	219	{
	220	"apostrophe", '\''
	221	},
	222	{
	223	"left-parenthesis", '('
	224	},
	225	{
	226	"right-parenthesis", ')'
	227	},
	228	{
	229	"asterisk", '*'
	230	},
	231	{
	232	"plus-sign", '+'
	233	},
	234	{
	235	"comma", ','
	236	},
	237	{
	238	"hyphen", '-'
	239	},
	240	{
	241	"hyphen-minus", '-'
	242	},
	243	{
	244	"period", '.'
	245	},
	246	{
	247	"full-stop", '.'
	248	},
	249	{
	250	"slash", '/'
	251	},
	252	{
	253	"solidus", '/'
	254	},
	255	{
	256	"zero", '0'
	257	},
	258	{
	259	"one", '1'
	260	},
	261	{
	262	"two", '2'
	263	},
	264	{
	265	"three", '3'
	266	},
	267	{
	268	"four", '4'
	269	},
	270	{
	271	"five", '5'
	272	},
	273	{
	274	"six", '6'
	275	},
	276	{
	277	"seven", '7'
	278	},
	279	{
	280	"eight", '8'
	281	},
	282	{
	283	"nine", '9'
	284	},
	285	{
	286	"colon", ':'
	287	},
	288	{
	289	"semicolon", ';'
	290	},
	291	{
	292	"less-than-sign", '<'
	293	},
	294	{
	295	"equals-sign", '='
	296	},
	297	{
	298	"greater-than-sign", '>'
	299	},
	300	{
	301	"question-mark", '?'
	302	},
	303	{
	304	"commercial-at", '@'
	305	},
	306	{
	307	"left-square-bracket", '['
	308	},
	309	{
	310	"backslash", '\\'
	311	},
	312	{
	313	"reverse-solidus", '\\'
	314	},
	315	{
	316	"right-square-bracket", ']'
	317	},
	318	{
	319	"circumflex", '^'
	320	},
	321	{
	322	"circumflex-accent", '^'
	323	},
	324	{
	325	"underscore", '_'
	326	},
	327	{
	328	"low-line", '_'
	329	},
	330	{
	331	"grave-accent", '`'
	332	},
	333	{
	334	"left-brace", '{'
	335	},
	336	{
	337	"left-curly-bracket", '{'
	338	},
	339	{
	340	"vertical-line", '\|'
	341	},
	342	{
	343	"right-brace", '}'
	344	},
	345	{
	346	"right-curly-bracket", '}'
	347	},
	348	{
	349	"tilde", '~'
	350	},
	351	{
	352	"DEL", '\177'
	353	},
	354	{
	355	NULL, 0
	356	}
	357	};
	358
	359	/*
	360	* some ctype functions with non-ascii-char guard
	361	*/
	362	static int
	363	wx_isdigit(wx_wchar c)
	364	{
	365	return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
	366	}
	367
	368	static int
	369	wx_isalpha(wx_wchar c)
	370	{
	371	return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
	372	}
	373
	374	static int
	375	wx_isalnum(wx_wchar c)
	376	{
	377	return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
	378	}
	379
	380	static int
	381	wx_isupper(wx_wchar c)
	382	{
	383	return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
	384	}
	385
	386	static int
	387	wx_islower(wx_wchar c)
	388	{
	389	return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
	390	}
	391
	392	static int
	393	wx_isgraph(wx_wchar c)
	394	{
	395	return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
	396	}
	397
	398	static int
	399	wx_ispunct(wx_wchar c)
	400	{
	401	return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
	402	}
	403
	404	static int
	405	wx_isspace(wx_wchar c)
	406	{
	407	return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
	408	}
	409
	410	static wx_wchar
	411	wx_toupper(wx_wchar c)
	412	{
	413	if (c >= 0 && c <= UCHAR_MAX)
	414	return toupper((unsigned char) c);
	415	return c;
	416	}
	417
	418	static wx_wchar
	419	wx_tolower(wx_wchar c)
	420	{
	421	if (c >= 0 && c <= UCHAR_MAX)
	422	return tolower((unsigned char) c);
	423	return c;
	424	}
	425
	426
	427	/*
	428	* nmcces - how many distinct MCCEs are there?
	429	*/
	430	static int
	431	nmcces(struct vars * v)
	432	{
	433	/*
	434	* No multi-character collating elements defined at the moment.
	435	*/
	436	return 0;
	437	}
	438
	439	/*
	440	* nleaders - how many chrs can be first chrs of MCCEs?
	441	*/
	442	static int
	443	nleaders(struct vars * v)
	444	{
	445	return 0;
	446	}
	447
	448	/*
	449	* allmcces - return a cvec with all the MCCEs of the locale
	450	*/
	451	static struct cvec *
	452	allmcces(struct vars * v, /* context */
	453	struct cvec * cv) /* this is supposed to have enough room */
	454	{
	455	return clearcvec(cv);
	456	}
	457
	458	/*
	459	* element - map collating-element name to celt
	460	*/
	461	static celt
	462	element(struct vars * v, /* context */
	463	chr startp, / points to start of name */
	464	chr endp) / points just past end of name */
	465	{
	466	struct cname *cn;
	467	size_t len;
	468
	469	/* generic: one-chr names stand for themselves */
	470	assert(startp < endp);
	471	len = endp - startp;
	472	if (len == 1)
	473	return *startp;
	474
	475	NOTE(REG_ULOCALE);
	476
	477	/* search table */
	478	for (cn = cnames; cn->name != NULL; cn++)
	479	{
	480	if (strlen(cn->name) == len &&
	481	char_and_wchar_strncmp(cn->name, startp, len) == 0)
	482	{
	483	break; /* NOTE BREAK OUT */
	484	}
	485	}
	486	if (cn->name != NULL)
	487	return CHR(cn->code);
	488
	489	/* couldn't find it */
	490	ERR(REG_ECOLLATE);
	491	return 0;
	492	}
	493
	494	/*
	495	* range - supply cvec for a range, including legality check
	496	*/
	497	static struct cvec *
	498	range(struct vars * v, /* context */
	499	celt a, /* range start */
	500	celt b, /* range end, might equal a */
	501	int cases) /* case-independent? */
	502	{
	503	int nchrs;
	504	struct cvec *cv;
	505	celt c,
	506	lc,
	507	uc;
	508
	509	if (a != b && !before(a, b))
	510	{
	511	ERR(REG_ERANGE);
	512	return NULL;
	513	}
	514
	515	if (!cases)
	516	{ /* easy version */
	517	cv = getcvec(v, 0, 1, 0);
	518	NOERRN();
	519	addrange(cv, a, b);
	520	return cv;
	521	}
	522
	523	/*
	524	* When case-independent, it's hard to decide when cvec ranges are
	525	* usable, so for now at least, we won't try. We allocate enough
	526	* space for two case variants plus a little extra for the two title
	527	* case variants.
	528	*/
	529
	530	nchrs = (b - a + 1) * 2 + 4;
	531
	532	cv = getcvec(v, nchrs, 0, 0);
	533	NOERRN();
	534
	535	for (c = a; c <= b; c++)
	536	{
	537	addchr(cv, c);
	538	lc = wx_tolower((chr) c);
	539	if (c != lc)
	540	addchr(cv, lc);
	541	uc = wx_toupper((chr) c);
	542	if (c != uc)
	543	addchr(cv, uc);
	544	}
	545
	546	return cv;
	547	}
	548
	549	/*
	550	* before - is celt x before celt y, for purposes of range legality?
	551	*/
	552	static int /* predicate */
	553	before(celt x, celt y)
	554	{
	555	/* trivial because no MCCEs */
	556	if (x < y)
	557	return 1;
	558	return 0;
	559	}
	560
	561	/*
	562	* eclass - supply cvec for an equivalence class
	563	* Must include case counterparts on request.
	564	*/
	565	static struct cvec *
	566	eclass(struct vars * v, /* context */
	567	celt c, /* Collating element representing the
	568	* equivalence class. */
	569	int cases) /* all cases? */
	570	{
	571	struct cvec *cv;
	572
	573	/* crude fake equivalence class for testing */
	574	if ((v->cflags & REG_FAKE) && c == 'x')
	575	{
	576	cv = getcvec(v, 4, 0, 0);
	577	addchr(cv, (chr) 'x');
	578	addchr(cv, (chr) 'y');
	579	if (cases)
	580	{
	581	addchr(cv, (chr) 'X');
	582	addchr(cv, (chr) 'Y');
	583	}
	584	return cv;
	585	}
	586
	587	/* otherwise, none */
	588	if (cases)
	589	return allcases(v, c);
	590	cv = getcvec(v, 1, 0, 0);
	591	assert(cv != NULL);
	592	addchr(cv, (chr) c);
	593	return cv;
	594	}
	595
	596	/*
	597	* cclass - supply cvec for a character class
	598	*
	599	* Must include case counterparts on request.
	600	*/
	601	static struct cvec *
	602	cclass(struct vars * v, /* context */
	603	chr startp, / where the name starts */
	604	chr endp, / just past the end of the name */
	605	int cases) /* case-independent? */
	606	{
	607	size_t len;
	608	struct cvec *cv = NULL;
	609	char **namePtr;
	610	int i,
	611	index;
	612
	613	/*
	614	* The following arrays define the valid character class names.
	615	*/
	616
	617	static char *classNames[] = {
	618	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
	619	"lower", "print", "punct", "space", "upper", "xdigit", NULL
	620	};
	621
	622	enum classes
	623	{
	624	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
	625	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
	626	};
	627
	628	/*
	629	* Map the name to the corresponding enumerated value.
	630	*/
	631	len = endp - startp;
	632	index = -1;
	633	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
	634	{
	635	if (strlen(*namePtr) == len &&
	636	char_and_wchar_strncmp(*namePtr, startp, len) == 0)
	637	{
	638	index = i;
	639	break;
	640	}
	641	}
	642	if (index == -1)
	643	{
	644	ERR(REG_ECTYPE);
	645	return NULL;
	646	}
	647
	648	/*
	649	* Remap lower and upper to alpha if the match is case insensitive.
	650	*/
	651
	652	if (cases &&
	653	((enum classes) index == CC_LOWER \|\|
	654	(enum classes) index == CC_UPPER))
	655	index = (int) CC_ALPHA;
	656
	657	/*
	658	* Now compute the character class contents.
	659	*
	660	* For the moment, assume that only char codes < 256 can be in these
	661	* classes.
	662	*/
	663
	664	switch ((enum classes) index)
	665	{
	666	case CC_PRINT:
	667	case CC_ALNUM:
	668	cv = getcvec(v, UCHAR_MAX, 1, 0);
	669	if (cv)
	670	{
	671	for (i = 0; i <= UCHAR_MAX; i++)
	672	{
	673	if (wx_isalpha((chr) i))
	674	addchr(cv, (chr) i);
	675	}
	676	addrange(cv, (chr) '0', (chr) '9');
	677	}
	678	break;
	679	case CC_ALPHA:
	680	cv = getcvec(v, UCHAR_MAX, 0, 0);
	681	if (cv)
	682	{
	683	for (i = 0; i <= UCHAR_MAX; i++)
	684	{
	685	if (wx_isalpha((chr) i))
	686	addchr(cv, (chr) i);
	687	}
	688	}
	689	break;
	690	case CC_ASCII:
	691	cv = getcvec(v, 0, 1, 0);
	692	if (cv)
	693	addrange(cv, 0, 0x7f);
	694	break;
	695	case CC_BLANK:
	696	cv = getcvec(v, 2, 0, 0);
	697	addchr(cv, '\t');
	698	addchr(cv, ' ');
	699	break;
	700	case CC_CNTRL:
	701	cv = getcvec(v, 0, 2, 0);
	702	addrange(cv, 0x0, 0x1f);
	703	addrange(cv, 0x7f, 0x9f);
	704	break;
	705	case CC_DIGIT:
	706	cv = getcvec(v, 0, 1, 0);
	707	if (cv)
	708	addrange(cv, (chr) '0', (chr) '9');
	709	break;
	710	case CC_PUNCT:
	711	cv = getcvec(v, UCHAR_MAX, 0, 0);
	712	if (cv)
	713	{
	714	for (i = 0; i <= UCHAR_MAX; i++)
	715	{
	716	if (wx_ispunct((chr) i))
	717	addchr(cv, (chr) i);
	718	}
	719	}
	720	break;
	721	case CC_XDIGIT:
	722	cv = getcvec(v, 0, 3, 0);
	723	if (cv)
	724	{
	725	addrange(cv, '0', '9');
	726	addrange(cv, 'a', 'f');
	727	addrange(cv, 'A', 'F');
	728	}
	729	break;
	730	case CC_SPACE:
	731	cv = getcvec(v, UCHAR_MAX, 0, 0);
	732	if (cv)
	733	{
	734	for (i = 0; i <= UCHAR_MAX; i++)
	735	{
	736	if (wx_isspace((chr) i))
	737	addchr(cv, (chr) i);
	738	}
	739	}
	740	break;
	741	case CC_LOWER:
	742	cv = getcvec(v, UCHAR_MAX, 0, 0);
	743	if (cv)
	744	{
	745	for (i = 0; i <= UCHAR_MAX; i++)
	746	{
	747	if (wx_islower((chr) i))
	748	addchr(cv, (chr) i);
	749	}
	750	}
	751	break;
	752	case CC_UPPER:
	753	cv = getcvec(v, UCHAR_MAX, 0, 0);
	754	if (cv)
	755	{
	756	for (i = 0; i <= UCHAR_MAX; i++)
	757	{
	758	if (wx_isupper((chr) i))
	759	addchr(cv, (chr) i);
	760	}
	761	}
	762	break;
	763	case CC_GRAPH:
	764	cv = getcvec(v, UCHAR_MAX, 0, 0);
	765	if (cv)
	766	{
	767	for (i = 0; i <= UCHAR_MAX; i++)
	768	{
	769	if (wx_isgraph((chr) i))
	770	addchr(cv, (chr) i);
	771	}
	772	}
	773	break;
	774	}
	775	if (cv == NULL)
	776	ERR(REG_ESPACE);
	777	return cv;
	778	}
	779
	780	/*
	781	* allcases - supply cvec for all case counterparts of a chr (including itself)
	782	*
	783	* This is a shortcut, preferably an efficient one, for simple characters;
	784	* messy cases are done via range().
	785	*/
	786	static struct cvec *
	787	allcases(struct vars * v, /* context */
	788	chr pc) /* character to get case equivs of */
	789	{
	790	struct cvec *cv;
	791	chr c = (chr) pc;
	792	chr lc,
	793	uc;
	794
	795	lc = wx_tolower((chr) c);
	796	uc = wx_toupper((chr) c);
	797
	798	cv = getcvec(v, 2, 0, 0);
	799	addchr(cv, lc);
	800	if (lc != uc)
	801	addchr(cv, uc);
	802	return cv;
	803	}
	804
	805	/*
	806	* cmp - chr-substring compare
	807	*
	808	* Backrefs need this. It should preferably be efficient.
	809	* Note that it does not need to report anything except equal/unequal.
	810	* Note also that the length is exact, and the comparison should not
	811	* stop at embedded NULs!
	812	*/
	813	static int /* 0 for equal, nonzero for unequal */
	814	cmp(const chr x, const chr y, /* strings to compare */
	815	size_t len) /* exact length of comparison */
	816	{
	817	return memcmp(VS(x), VS(y), len * sizeof(chr));
	818	}
	819
	820	/*
	821	* casecmp - case-independent chr-substring compare
	822	*
	823	* REG_ICASE backrefs need this. It should preferably be efficient.
	824	* Note that it does not need to report anything except equal/unequal.
	825	* Note also that the length is exact, and the comparison should not
	826	* stop at embedded NULs!
	827	*/
	828	static int /* 0 for equal, nonzero for unequal */
	829	casecmp(const chr x, const chr y, /* strings to compare */
	830	size_t len) /* exact length of comparison */
	831	{
	832	for (; len > 0; len--, x++, y++)
	833	{
	834	if ((x != y) && (wx_tolower(x) != wx_tolower(y)))
	835	return 1;
	836	}
	837	return 0;
	838	}