[wxWidgets.git] / src / regex / regc_locale.c

/*
 * regc_locale.c --
 *
 *	This file contains locale-specific regexp routines.
 *	This file is #included by regcomp.c.
 *
 * Copyright (c) 1998 by Scriptics Corporation.
 *
 * This software is copyrighted by the Regents of the University of
 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
 * Corporation and other parties.  The following terms apply to all files
 * associated with the software unless explicitly disclaimed in
 * individual files.
 *
 * The authors hereby grant permission to use, copy, modify, distribute,
 * and license this software and its documentation for any purpose, provided
 * that existing copyright notices are retained in all copies and that this
 * notice is included verbatim in any distributions. No written agreement,
 * license, or royalty fee is required for any of the authorized uses.
 * Modifications to this software may be copyrighted by their authors
 * and need not follow the licensing terms described here, provided that
 * the new terms are clearly indicated on the first page of each file where
 * they apply.
 *
 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.	THIS SOFTWARE
 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
 * MODIFICATIONS.
 *
 * GOVERNMENT USE: If you are acquiring this software on behalf of the
 * U.S. government, the Government shall have only "Restricted Rights"
 * in the software and related documentation as defined in the Federal
 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).	If you
 * are acquiring the software on behalf of the Department of Defense, the
 * software shall be classified as "Commercial Computer Software" and the
 * Government shall have only "Restricted Rights" as defined in Clause
 * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
 * authors grant the U.S. Government and others acting in its behalf
 * permission to use and distribute the software in accordance with the
 * terms specified in this license.
 *
 * $Header$
 */

int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
{
	while(*cp++ == (const char)*wp++ && --nNum){}
	return nNum;
}

/*
 * Character classification wrappers.
 *
 * Each function hands its argument unchanged to the wx primitive of the
 * corresponding name and returns that primitive's result as an int.
 */
int
wx_isdigit(wx_wchar c)
{
	return wxIsdigit(c);
}

int
wx_isalpha(wx_wchar c)
{
	return wxIsalpha(c);
}

int
wx_isalnum(wx_wchar c)
{
	return wxIsalnum(c);
}

int
wx_isupper(wx_wchar c)
{
	return wxIsupper(c);
}

int
wx_islower(wx_wchar c)
{
	return wxIslower(c);
}

int
wx_isgraph(wx_wchar c)
{
	return wxIsgraph(c);
}

int
wx_ispunct(wx_wchar c)
{
	return wxIspunct(c);
}

int
wx_isspace(wx_wchar c)
{
	return wxIsspace(c);
}

/*
 * wx_toupper - return whatever wxToupper yields for c, as a wx_wchar
 */
wx_wchar
wx_toupper(wx_wchar c)
{
    const wx_wchar converted = wxToupper(c);

    return converted;
}

/*
 * wx_tolower - return whatever wxTolower yields for c, as a wx_wchar
 */
wx_wchar
wx_tolower(wx_wchar c)
{
    const wx_wchar converted = wxTolower(c);

    return converted;
}

/*
 * wx_strlen - length of a wx_wchar string, tolerating a NULL pointer
 *
 * A NULL argument yields 0; any other argument is measured by wxStrlen_
 * and that result is returned converted to int.
 */
int wx_strlen(const wx_wchar* szString)
{
    if (szString == NULL)
        return 0;

    return wxStrlen_(szString);
}
/* ASCII character-name table */

static struct cname
{
	char	   *name;
	char		code;
}	cnames[] =

{
	{
		"NUL", '\0'
	},
	{
		"SOH", '\001'
	},
	{
		"STX", '\002'
	},
	{
		"ETX", '\003'
	},
	{
		"EOT", '\004'
	},
	{
		"ENQ", '\005'
	},
	{
		"ACK", '\006'
	},
	{
		"BEL", '\007'
	},
	{
		"alert", '\007'
	},
	{
		"BS", '\010'
	},
	{
		"backspace", '\b'
	},
	{
		"HT", '\011'
	},
	{
		"tab", '\t'
	},
	{
		"LF", '\012'
	},
	{
		"newline", '\n'
	},
	{
		"VT", '\013'
	},
	{
		"vertical-tab", '\v'
	},
	{
		"FF", '\014'
	},
	{
		"form-feed", '\f'
	},
	{
		"CR", '\015'
	},
	{
		"carriage-return", '\r'
	},
	{
		"SO", '\016'
	},
	{
		"SI", '\017'
	},
	{
		"DLE", '\020'
	},
	{
		"DC1", '\021'
	},
	{
		"DC2", '\022'
	},
	{
		"DC3", '\023'
	},
	{
		"DC4", '\024'
	},
	{
		"NAK", '\025'
	},
	{
		"SYN", '\026'
	},
	{
		"ETB", '\027'
	},
	{
		"CAN", '\030'
	},
	{
		"EM", '\031'
	},
	{
		"SUB", '\032'
	},
	{
		"ESC", '\033'
	},
	{
		"IS4", '\034'
	},
	{
		"FS", '\034'
	},
	{
		"IS3", '\035'
	},
	{
		"GS", '\035'
	},
	{
		"IS2", '\036'
	},
	{
		"RS", '\036'
	},
	{
		"IS1", '\037'
	},
	{
		"US", '\037'
	},
	{
		"space", ' '
	},
	{
		"exclamation-mark", '!'
	},
	{
		"quotation-mark", '"'
	},
	{
		"number-sign", '#'
	},
	{
		"dollar-sign", '$'
	},
	{
		"percent-sign", '%'
	},
	{
		"ampersand", '&'
	},
	{
		"apostrophe", '\''
	},
	{
		"left-parenthesis", '('
	},
	{
		"right-parenthesis", ')'
	},
	{
		"asterisk", '*'
	},
	{
		"plus-sign", '+'
	},
	{
		"comma", ','
	},
	{
		"hyphen", '-'
	},
	{
		"hyphen-minus", '-'
	},
	{
		"period", '.'
	},
	{
		"full-stop", '.'
	},
	{
		"slash", '/'
	},
	{
		"solidus", '/'
	},
	{
		"zero", '0'
	},
	{
		"one", '1'
	},
	{
		"two", '2'
	},
	{
		"three", '3'
	},
	{
		"four", '4'
	},
	{
		"five", '5'
	},
	{
		"six", '6'
	},
	{
		"seven", '7'
	},
	{
		"eight", '8'
	},
	{
		"nine", '9'
	},
	{
		"colon", ':'
	},
	{
		"semicolon", ';'
	},
	{
		"less-than-sign", '<'
	},
	{
		"equals-sign", '='
	},
	{
		"greater-than-sign", '>'
	},
	{
		"question-mark", '?'
	},
	{
		"commercial-at", '@'
	},
	{
		"left-square-bracket", '['
	},
	{
		"backslash", '\\'
	},
	{
		"reverse-solidus", '\\'
	},
	{
		"right-square-bracket", ']'
	},
	{
		"circumflex", '^'
	},
	{
		"circumflex-accent", '^'
	},
	{
		"underscore", '_'
	},
	{
		"low-line", '_'
	},
	{
		"grave-accent", '`'
	},
	{
		"left-brace", '{'
	},
	{
		"left-curly-bracket", '{'
	},
	{
		"vertical-line", '|'
	},
	{
		"right-brace", '}'
	},
	{
		"right-curly-bracket", '}'
	},
	{
		"tilde", '~'
	},
	{
		"DEL", '\177'
	},
	{
		NULL, 0
	}
};


/*
 * nmcces - report the number of distinct multi-character collating
 * elements (MCCEs)
 *
 * None are defined at the moment, so the context is not consulted and the
 * answer is always zero.
 */
static int
nmcces(struct vars * v)
{
	(void) v;					/* unused */
	return 0;
}

/*
 * nleaders - report how many chrs can begin an MCCE
 *
 * Always zero; the context is not consulted.
 */
static int
nleaders(struct vars * v)
{
	(void) v;					/* unused */
	return 0;
}

/*
 * allmcces - hand back a cvec holding every MCCE of the locale
 *
 * The supplied vector is passed through clearcvec() and that result is
 * returned; nothing is added to it here.
 */
static struct cvec *
allmcces(struct vars * v,		/* context (unused) */
		 struct cvec * cv)		/* caller-provided vector, expected to be
								 * large enough */
{
	struct cvec *cleared;

	(void) v;
	cleared = clearcvec(cv);
	return cleared;
}

/*
 * element - translate a collating-element name into its celt
 *
 * A name consisting of a single chr denotes that chr.  Longer names are
 * looked up in cnames[]; if the name is not listed, REG_ECOLLATE is raised
 * and 0 is returned.
 */
static celt
element(struct vars * v,		/* context */
		chr *startp,			/* first chr of the name */
		chr *endp)				/* one past the last chr of the name */
{
	struct cname *entry;
	size_t		namelen;

	assert(startp < endp);
	namelen = endp - startp;

	/* a lone chr is its own name */
	if (namelen == 1)
		return *startp;

	NOTE(REG_ULOCALE);

	/* linear scan; the table ends at the entry with a NULL name */
	for (entry = cnames; entry->name != NULL; entry++)
	{
		if (strlen(entry->name) != namelen)
			continue;
		if (char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
			return CHR(entry->code);
	}

	/* name is not in the table */
	ERR(REG_ECOLLATE);
	return 0;
}

/*
 * range - build the cvec for the range a..b, after checking it is legal
 *
 * Raises REG_ERANGE and returns NULL when a differs from b and does not
 * come before it.  NULL is also returned whenever NOERRN() fires after the
 * getcvec() call.
 */
static struct cvec *
range(struct vars * v,			/* context */
	  celt a,					/* first element of the range */
	  celt b,					/* last element; may be the same as a */
	  int cases)				/* nonzero for case-independent matching */
{
	struct cvec *cv;
	celt		cur;
	celt		lower;
	celt		upper;
	int			capacity;

	/* a == b is always acceptable; otherwise a has to precede b */
	if (a != b && !before(a, b))
	{
		ERR(REG_ERANGE);
		return NULL;
	}

	if (cases)
	{
		/*
		 * Deciding when a cvec range survives case folding is hard, so no
		 * attempt is made: every chr is listed individually.  Room is
		 * reserved for two case variants of each chr, plus a little slack
		 * for the two title-case variants.
		 */
		capacity = (b - a + 1) * 2 + 4;

		cv = getcvec(v, capacity, 0, 0);
		NOERRN();

		cur = a;
		while (cur <= b)
		{
			addchr(cv, cur);
			lower = wx_tolower((chr) cur);
			if (cur != lower)
				addchr(cv, lower);
			upper = wx_toupper((chr) cur);
			if (cur != upper)
				addchr(cv, upper);
			cur++;
		}

		return cv;
	}

	/* case-sensitive: one range entry covers everything */
	cv = getcvec(v, 0, 1, 0);
	NOERRN();
	addrange(cv, a, b);
	return cv;
}

/*
 * before - does celt x precede celt y for range-legality purposes?
 *
 * With no MCCEs to worry about this is a plain numeric comparison.
 * Returns 1 when x < y, otherwise 0.
 */
static int						/* predicate */
before(celt x, celt y)
{
	return (x < y) ? 1 : 0;
}

/*
 * eclass - supply cvec for an equivalence class
 *
 * Must include case counterparts on request.
 *
 * No real equivalence classes exist: apart from the REG_FAKE test hook the
 * class of c is c alone, or c with its case counterparts when cases is set.
 * When getcvec() cannot supply a vector, REG_ESPACE is raised and NULL is
 * returned, the same way cclass() reports that failure.
 */
static struct cvec *
eclass(struct vars * v,			/* context */
	   celt c,					/* Collating element representing the
								 * equivalence class. */
	   int cases)				/* all cases? */
{
	struct cvec *cv;

	/* crude fake equivalence class for testing */
	if ((v->cflags & REG_FAKE) && c == 'x')
	{
		cv = getcvec(v, 4, 0, 0);
		if (cv == NULL)
		{
			ERR(REG_ESPACE);
			return NULL;
		}
		addchr(cv, (chr) 'x');
		addchr(cv, (chr) 'y');
		if (cases)
		{
			addchr(cv, (chr) 'X');
			addchr(cv, (chr) 'Y');
		}
		return cv;
	}

	/* otherwise, none */
	if (cases)
		return allcases(v, c);

	cv = getcvec(v, 1, 0, 0);
	if (cv == NULL)
	{
		/* do not rely on assert(): it is compiled out under NDEBUG */
		ERR(REG_ESPACE);
		return NULL;
	}
	addchr(cv, (chr) c);
	return cv;
}

/*
 * cclass - supply cvec for a character class
 *
 * Must include case counterparts on request.
 */
static struct cvec *
cclass(struct vars * v,			/* context */
	   chr *startp,				/* where the name starts */
	   chr *endp,				/* just past the end of the name */
	   int cases)				/* case-independent? */
{
	size_t		len;
	struct cvec *cv = NULL;
	char	  **namePtr;
	int			i,
				index;

	/*
	 * The following arrays define the valid character class names.
	 */

	static char *classNames[] = {
		"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
		"lower", "print", "punct", "space", "upper", "xdigit", NULL
	};

	enum classes
	{
		CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
		CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
	};

	/*
	 * Map the name to the corresponding enumerated value.
	 */
	len = endp - startp;
	index = -1;
	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
	{
		if (strlen(*namePtr) == len &&
			char_and_wchar_strncmp(*namePtr, startp, len) == 0)
		{
			index = i;
			break;
		}
	}
	if (index == -1)
	{
		ERR(REG_ECTYPE);
		return NULL;
	}

	/*
	 * Remap lower and upper to alpha if the match is case insensitive.
	 */

	if (cases &&
		((enum classes) index == CC_LOWER ||
		 (enum classes) index == CC_UPPER))
		index = (int) CC_ALPHA;

	/*
	 * Now compute the character class contents.
	 *
	 * For the moment, assume that only char codes < 256 can be in these
	 * classes.
	 */

	switch ((enum classes) index)
	{
		case CC_PRINT:
		case CC_ALNUM:
			cv = getcvec(v, UCHAR_MAX, 1, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isalpha((chr) i))
						addchr(cv, (chr) i);
				}
				addrange(cv, (chr) '0', (chr) '9');
			}
			break;
		case CC_ALPHA:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isalpha((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_ASCII:
			cv = getcvec(v, 0, 1, 0);
			if (cv)
				addrange(cv, 0, 0x7f);
			break;
		case CC_BLANK:
			cv = getcvec(v, 2, 0, 0);
			addchr(cv, '\t');
			addchr(cv, ' ');
			break;
		case CC_CNTRL:
			cv = getcvec(v, 0, 2, 0);
			addrange(cv, 0x0, 0x1f);
			addrange(cv, 0x7f, 0x9f);
			break;
		case CC_DIGIT:
			cv = getcvec(v, 0, 1, 0);
			if (cv)
				addrange(cv, (chr) '0', (chr) '9');
			break;
		case CC_PUNCT:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_ispunct((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_XDIGIT:
			cv = getcvec(v, 0, 3, 0);
			if (cv)
			{
				addrange(cv, '0', '9');
				addrange(cv, 'a', 'f');
				addrange(cv, 'A', 'F');
			}
			break;
		case CC_SPACE:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isspace((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_LOWER:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_islower((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_UPPER:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isupper((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_GRAPH:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isgraph((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
	}
	if (cv == NULL)
		ERR(REG_ESPACE);
	return cv;
}

/*
 * allcases - supply cvec for all case counterparts of a chr (including itself)
 *
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 *
 * The vector receives wx_tolower(pc) and, when it differs, wx_toupper(pc).
 * When getcvec() cannot supply a vector, REG_ESPACE is raised and NULL is
 * returned, the same way cclass() reports that failure.
 */
static struct cvec *
allcases(struct vars * v,		/* context */
		 chr pc)				/* character to get case equivs of */
{
	struct cvec *cv;
	chr			lc,
				uc;

	lc = wx_tolower(pc);
	uc = wx_toupper(pc);

	cv = getcvec(v, 2, 0, 0);
	if (cv == NULL)
	{
		ERR(REG_ESPACE);
		return NULL;
	}
	addchr(cv, lc);
	if (lc != uc)
		addchr(cv, uc);
	return cv;
}

/*
 * cmp - compare two chr substrings of equal, known length
 *
 * Used for back-references, so speed matters.  Only equal/unequal is
 * reported.  The length is exact: embedded NULs are compared like any
 * other chr and do not end the comparison.
 */
static int						/* 0 for equal, nonzero for unequal */
cmp(const chr *x, const chr *y, /* the two substrings */
	size_t len)					/* number of chrs to compare */
{
	const size_t nbytes = len * sizeof(chr);

	return memcmp(VS(x), VS(y), nbytes);
}

/*
 * casecmp - compare two chr substrings of equal, known length, ignoring case
 *
 * Used for back-references under REG_ICASE, so speed matters.  Only
 * equal/unequal is reported.  The length is exact: embedded NULs are
 * compared like any other chr and do not end the comparison.
 */
static int						/* 0 for equal, nonzero for unequal */
casecmp(const chr *x, const chr *y,		/* the two substrings */
		size_t len)				/* number of chrs to compare */
{
	size_t		i;

	for (i = 0; i < len; i++)
	{
		/* identical chrs need no case folding */
		if (x[i] == y[i])
			continue;
		if (wx_tolower(x[i]) != wx_tolower(y[i]))
			return 1;
	}
	return 0;
}
Commit	Line	Data
830efc9b RN	1	/*
	2	* regc_locale.c --
	3	*
	4	* This file contains locale-specific regexp routines.
	5	* This file is #included by regcomp.c.
	6	*
	7	* Copyright (c) 1998 by Scriptics Corporation.
	8	*
	9	* This software is copyrighted by the Regents of the University of
	10	* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
	11	* Corporation and other parties. The following terms apply to all files
	12	* associated with the software unless explicitly disclaimed in
	13	* individual files.
	14	*
	15	* The authors hereby grant permission to use, copy, modify, distribute,
	16	* and license this software and its documentation for any purpose, provided
	17	* that existing copyright notices are retained in all copies and that this
	18	* notice is included verbatim in any distributions. No written agreement,
	19	* license, or royalty fee is required for any of the authorized uses.
	20	* Modifications to this software may be copyrighted by their authors
	21	* and need not follow the licensing terms described here, provided that
	22	* the new terms are clearly indicated on the first page of each file where
	23	* they apply.
	24	*
	25	* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
	26	* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
	27	* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
	28	* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
	29	* POSSIBILITY OF SUCH DAMAGE.
	30	*
	31	* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
	32	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
	33	* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
	34	* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
	35	* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
	36	* MODIFICATIONS.
	37	*
	38	* GOVERNMENT USE: If you are acquiring this software on behalf of the
	39	* U.S. government, the Government shall have only "Restricted Rights"
	40	* in the software and related documentation as defined in the Federal
	41	* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
	42	* are acquiring the software on behalf of the Department of Defense, the
	43	* software shall be classified as "Commercial Computer Software" and the
	44	* Government shall have only "Restricted Rights" as defined in Clause
	45	* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
	46	* authors grant the U.S. Government and others acting in its behalf
	47	* permission to use and distribute the software in accordance with the
	48	* terms specified in this license.
	49	*
	50	* $Header$
	51	*/
	52
	53	int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
	54	{
	55	while(cp++ == (const char)wp++ && --nNum){}
830efc9b RN	56	return nNum;
	57	}
	58
b8f896a1 RN	59	int wx_isdigit(wx_wchar c) {return wxIsdigit(c);}
	60	int wx_isalpha(wx_wchar c) {return wxIsalpha(c);}
	61	int wx_isalnum(wx_wchar c) {return wxIsalnum(c);}
	62	int wx_isupper(wx_wchar c) {return wxIsupper(c);}
	63	int wx_islower(wx_wchar c) {return wxIslower(c);}
	64	int wx_isgraph(wx_wchar c) {return wxIsgraph(c);}
	65	int wx_ispunct(wx_wchar c) {return wxIspunct(c);}
	66	int wx_isspace(wx_wchar c) {return wxIsspace(c);}
c5feba0e RN	67
	68	wx_wchar wx_toupper(wx_wchar c)
	69	{
b8f896a1	70	return wxToupper(c);
c5feba0e RN	71	}
	72
	73	wx_wchar wx_tolower(wx_wchar c)
	74	{
b8f896a1	75	return wxTolower(c);
c5feba0e RN	76	}
	77
	78	int wx_strlen(const wx_wchar* szString)
	79	{
	80	/*
	81	Generic -- note that some clib functions also test for eol character '^Z'
	82
	83	int nLength = 0;
	84	for (; *(szString + nLength) != '\0'; nLength++);
	85	return nLength;
	86	*/
	87	return szString == NULL ? 0 : wxStrlen_(szString);
	88	}
830efc9b RN	89	/* ASCII character-name table */
	90
	91	static struct cname
	92	{
	93	char *name;
	94	char code;
	95	} cnames[] =
	96
	97	{
	98	{
	99	"NUL", '\0'
	100	},
	101	{
	102	"SOH", '\001'
	103	},
	104	{
	105	"STX", '\002'
	106	},
	107	{
	108	"ETX", '\003'
	109	},
	110	{
	111	"EOT", '\004'
	112	},
	113	{
	114	"ENQ", '\005'
	115	},
	116	{
	117	"ACK", '\006'
	118	},
	119	{
	120	"BEL", '\007'
	121	},
	122	{
	123	"alert", '\007'
	124	},
	125	{
	126	"BS", '\010'
	127	},
	128	{
	129	"backspace", '\b'
	130	},
	131	{
	132	"HT", '\011'
	133	},
	134	{
	135	"tab", '\t'
	136	},
	137	{
	138	"LF", '\012'
	139	},
	140	{
	141	"newline", '\n'
	142	},
	143	{
	144	"VT", '\013'
	145	},
	146	{
	147	"vertical-tab", '\v'
	148	},
	149	{
	150	"FF", '\014'
	151	},
	152	{
153	"form-feed", '\f'
154	},
155	{
156	"CR", '\015'
157	},
158	{
159	"carriage-return", '\r'
160	},
161	{
162	"SO", '\016'
163	},
164	{
165	"SI", '\017'
166	},
167	{
168	"DLE", '\020'
169	},
170	{
171	"DC1", '\021'
172	},
173	{
174	"DC2", '\022'
175	},
176	{
177	"DC3", '\023'
178	},
179	{
180	"DC4", '\024'
181	},
182	{
183	"NAK", '\025'
184	},
185	{
186	"SYN", '\026'
187	},
188	{
189	"ETB", '\027'
190	},
191	{
192	"CAN", '\030'
193	},
194	{
195	"EM", '\031'
196	},
197	{
198	"SUB", '\032'
199	},
200	{
201	"ESC", '\033'
202	},
203	{
204	"IS4", '\034'
205	},
206	{
207	"FS", '\034'
208	},
209	{
210	"IS3", '\035'
211	},
212	{
213	"GS", '\035'
214	},
215	{
216	"IS2", '\036'
217	},
218	{
219	"RS", '\036'
220	},
221	{
222	"IS1", '\037'
223	},
224	{
225	"US", '\037'
226	},
227	{
228	"space", ' '
229	},
230	{
231	"exclamation-mark", '!'
232	},
233	{
234	"quotation-mark", '"'
235	},
236	{
237	"number-sign", '#'
238	},
239	{
240	"dollar-sign", '$'
241	},
242	{
243	"percent-sign", '%'
244	},
245	{
246	"ampersand", '&'
247	},
248	{
249	"apostrophe", '\''
250	},
251	{
252	"left-parenthesis", '('
253	},
254	{
255	"right-parenthesis", ')'
256	},
257	{
258	"asterisk", '*'
259	},
260	{
261	"plus-sign", '+'
262	},
263	{
264	"comma", ','
265	},
266	{
267	"hyphen", '-'
268	},
269	{
270	"hyphen-minus", '-'
271	},
272	{
273	"period", '.'
274	},
275	{
276	"full-stop", '.'
277	},
278	{
279	"slash", '/'
280	},
281	{
282	"solidus", '/'
283	},
284	{
285	"zero", '0'
286	},
287	{
288	"one", '1'
289	},
290	{
291	"two", '2'
292	},
293	{
294	"three", '3'
295	},
296	{
297	"four", '4'
298	},
299	{
300	"five", '5'
301	},
302	{
303	"six", '6'
304	},
305	{
306	"seven", '7'
307	},
308	{
309	"eight", '8'
310	},
311	{
312	"nine", '9'
313	},
314	{
315	"colon", ':'
316	},
317	{
318	"semicolon", ';'
319	},
320	{
321	"less-than-sign", '<'
322	},
323	{
324	"equals-sign", '='
325	},
326	{
327	"greater-than-sign", '>'
328	},
329	{
330	"question-mark", '?'
331	},
332	{
333	"commercial-at", '@'
334	},
335	{
336	"left-square-bracket", '['
337	},
338	{
339	"backslash", '\\'
340	},
341	{
342	"reverse-solidus", '\\'
343	},
344	{
345	"right-square-bracket", ']'
346	},
347	{
348	"circumflex", '^'
349	},
350	{
351	"circumflex-accent", '^'
352	},
353	{
354	"underscore", '_'
355	},
356	{
357	"low-line", '_'
358	},
359	{
360	"grave-accent", '`'
361	},
362	{
363	"left-brace", '{'
364	},
365	{
366	"left-curly-bracket", '{'
367	},
368	{
369	"vertical-line", '\|'
370	},
371	{
372	"right-brace", '}'
373	},
374	{
375	"right-curly-bracket", '}'
376	},
377	{
378	"tilde", '~'
379	},
380	{
381	"DEL", '\177'
382	},
383	{
384	NULL, 0
385	}
386	};
387
830efc9b RN	388
	389	/*
	390	* nmcces - how many distinct MCCEs are there?
	391	*/
	392	static int
	393	nmcces(struct vars * v)
	394	{
	395	/*
	396	* No multi-character collating elements defined at the moment.
	397	*/
	398	return 0;
	399	}
	400
	401	/*
	402	* nleaders - how many chrs can be first chrs of MCCEs?
	403	*/
	404	static int
	405	nleaders(struct vars * v)
	406	{
	407	return 0;
	408	}
	409
	410	/*
	411	* allmcces - return a cvec with all the MCCEs of the locale
	412	*/
	413	static struct cvec *
	414	allmcces(struct vars * v, /* context */
	415	struct cvec * cv) /* this is supposed to have enough room */
	416	{
	417	return clearcvec(cv);
	418	}
	419
	420	/*
	421	* element - map collating-element name to celt
	422	*/
	423	static celt
	424	element(struct vars * v, /* context */
	425	chr startp, / points to start of name */
	426	chr endp) / points just past end of name */
	427	{
	428	struct cname *cn;
	429	size_t len;
	430
	431	/* generic: one-chr names stand for themselves */
	432	assert(startp < endp);
	433	len = endp - startp;
	434	if (len == 1)
	435	return *startp;
	436
	437	NOTE(REG_ULOCALE);
	438
	439	/* search table */
	440	for (cn = cnames; cn->name != NULL; cn++)
	441	{
	442	if (strlen(cn->name) == len &&
	443	char_and_wchar_strncmp(cn->name, startp, len) == 0)
	444	{
	445	break; /* NOTE BREAK OUT */
	446	}
	447	}
	448	if (cn->name != NULL)
	449	return CHR(cn->code);
	450
	451	/* couldn't find it */
452	ERR(REG_ECOLLATE);
453	return 0;
454	}
455
456	/*
457	* range - supply cvec for a range, including legality check
458	*/
459	static struct cvec *
460	range(struct vars * v, /* context */
461	celt a, /* range start */
462	celt b, /* range end, might equal a */
463	int cases) /* case-independent? */
464	{
465	int nchrs;
466	struct cvec *cv;
467	celt c,
468	lc,
469	uc;
470
471	if (a != b && !before(a, b))
472	{
473	ERR(REG_ERANGE);
474	return NULL;
475	}
476
477	if (!cases)
478	{ /* easy version */
479	cv = getcvec(v, 0, 1, 0);
480	NOERRN();
481	addrange(cv, a, b);
482	return cv;
483	}
484
485	/*
486	* When case-independent, it's hard to decide when cvec ranges are
487	* usable, so for now at least, we won't try. We allocate enough
488	* space for two case variants plus a little extra for the two title
489	* case variants.
490	*/
491
492	nchrs = (b - a + 1) * 2 + 4;
493
494	cv = getcvec(v, nchrs, 0, 0);
495	NOERRN();
496
497	for (c = a; c <= b; c++)
498	{
499	addchr(cv, c);
500	lc = wx_tolower((chr) c);
501	if (c != lc)
502	addchr(cv, lc);
503	uc = wx_toupper((chr) c);
504	if (c != uc)
505	addchr(cv, uc);
506	}
507
508	return cv;
509	}
510
511	/*
512	* before - is celt x before celt y, for purposes of range legality?
513	*/
514	static int /* predicate */
515	before(celt x, celt y)
516	{
517	/* trivial because no MCCEs */
518	if (x < y)
519	return 1;
520	return 0;
521	}
522
523	/*
524	* eclass - supply cvec for an equivalence class
525	* Must include case counterparts on request.
526	*/
527	static struct cvec *
528	eclass(struct vars * v, /* context */
529	celt c, /* Collating element representing the
530	* equivalence class. */
531	int cases) /* all cases? */
532	{
533	struct cvec *cv;
534
535	/* crude fake equivalence class for testing */
536	if ((v->cflags & REG_FAKE) && c == 'x')
537	{
538	cv = getcvec(v, 4, 0, 0);
539	addchr(cv, (chr) 'x');
540	addchr(cv, (chr) 'y');
541	if (cases)
542	{
543	addchr(cv, (chr) 'X');
544	addchr(cv, (chr) 'Y');
545	}
546	return cv;
547	}
548
549	/* otherwise, none */
550	if (cases)
551	return allcases(v, c);
552	cv = getcvec(v, 1, 0, 0);
553	assert(cv != NULL);
554	addchr(cv, (chr) c);
555	return cv;
556	}
557
558	/*
559	* cclass - supply cvec for a character class
560	*
561	* Must include case counterparts on request.
562	*/
563	static struct cvec *
564	cclass(struct vars * v, /* context */
565	chr startp, / where the name starts */
566	chr endp, / just past the end of the name */
567	int cases) /* case-independent? */
568	{
569	size_t len;
570	struct cvec *cv = NULL;
571	char **namePtr;
572	int i,
573	index;
574
575	/*
576	* The following arrays define the valid character class names.
577	*/
578
579	static char *classNames[] = {
580	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
581	"lower", "print", "punct", "space", "upper", "xdigit", NULL
582	};
583
584	enum classes
585	{
586	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
587	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
588	};
589
590	/*
591	* Map the name to the corresponding enumerated value.
592	*/
593	len = endp - startp;
594	index = -1;
595	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
596	{
597	if (strlen(*namePtr) == len &&
598	char_and_wchar_strncmp(*namePtr, startp, len) == 0)
599	{
600	index = i;
601	break;
602	}
603	}
604	if (index == -1)
605	{
606	ERR(REG_ECTYPE);
607	return NULL;
608	}
609
610	/*
611	* Remap lower and upper to alpha if the match is case insensitive.
612	*/
613
614	if (cases &&
615	((enum classes) index == CC_LOWER \|\|
616	(enum classes) index == CC_UPPER))
617	index = (int) CC_ALPHA;
618
619	/*
620	* Now compute the character class contents.
621	*
622	* For the moment, assume that only char codes < 256 can be in these
623	* classes.
624	*/
625
626	switch ((enum classes) index)
627	{
628	case CC_PRINT:
629	case CC_ALNUM:
630	cv = getcvec(v, UCHAR_MAX, 1, 0);
631	if (cv)
632	{
633	for (i = 0; i <= UCHAR_MAX; i++)
634	{
635	if (wx_isalpha((chr) i))
636	addchr(cv, (chr) i);
637	}
638	addrange(cv, (chr) '0', (chr) '9');
639	}
640	break;
641	case CC_ALPHA:
642	cv = getcvec(v, UCHAR_MAX, 0, 0);
643	if (cv)
644	{
645	for (i = 0; i <= UCHAR_MAX; i++)
646	{
647	if (wx_isalpha((chr) i))
648	addchr(cv, (chr) i);
649	}
650	}
651	break;
652	case CC_ASCII:
653	cv = getcvec(v, 0, 1, 0);
654	if (cv)
655	addrange(cv, 0, 0x7f);
656	break;
657	case CC_BLANK:
658	cv = getcvec(v, 2, 0, 0);
659	addchr(cv, '\t');
660	addchr(cv, ' ');
661	break;
662	case CC_CNTRL:
663	cv = getcvec(v, 0, 2, 0);
664	addrange(cv, 0x0, 0x1f);
665	addrange(cv, 0x7f, 0x9f);
666	break;
667	case CC_DIGIT:
668	cv = getcvec(v, 0, 1, 0);
669	if (cv)
670	addrange(cv, (chr) '0', (chr) '9');
671	break;
672	case CC_PUNCT:
673	cv = getcvec(v, UCHAR_MAX, 0, 0);
674	if (cv)
675	{
676	for (i = 0; i <= UCHAR_MAX; i++)
677	{
678	if (wx_ispunct((chr) i))
679	addchr(cv, (chr) i);
680	}
681	}
682	break;
683	case CC_XDIGIT:
684	cv = getcvec(v, 0, 3, 0);
685	if (cv)
686	{
687	addrange(cv, '0', '9');
688	addrange(cv, 'a', 'f');
689	addrange(cv, 'A', 'F');
690	}
691	break;
692	case CC_SPACE:
693	cv = getcvec(v, UCHAR_MAX, 0, 0);
694	if (cv)
695	{
696	for (i = 0; i <= UCHAR_MAX; i++)
697	{
698	if (wx_isspace((chr) i))
699	addchr(cv, (chr) i);
700	}
701	}
702	break;
703	case CC_LOWER:
704	cv = getcvec(v, UCHAR_MAX, 0, 0);
705	if (cv)
706	{
707	for (i = 0; i <= UCHAR_MAX; i++)
708	{
709	if (wx_islower((chr) i))
710	addchr(cv, (chr) i);
711	}
712	}
713	break;
714	case CC_UPPER:
715	cv = getcvec(v, UCHAR_MAX, 0, 0);
716	if (cv)
717	{
718	for (i = 0; i <= UCHAR_MAX; i++)
719	{
720	if (wx_isupper((chr) i))
721	addchr(cv, (chr) i);
722	}
723	}
724	break;
725	case CC_GRAPH:
726	cv = getcvec(v, UCHAR_MAX, 0, 0);
727	if (cv)
728	{
729	for (i = 0; i <= UCHAR_MAX; i++)
730	{
731	if (wx_isgraph((chr) i))
732	addchr(cv, (chr) i);
733	}
734	}
735	break;
736	}
737	if (cv == NULL)
738	ERR(REG_ESPACE);
739	return cv;
740	}
741
742	/*
743	* allcases - supply cvec for all case counterparts of a chr (including itself)
744	*
745	* This is a shortcut, preferably an efficient one, for simple characters;
746	* messy cases are done via range().
747	*/
748	static struct cvec *
749	allcases(struct vars * v, /* context */
750	chr pc) /* character to get case equivs of */
751	{
752	struct cvec *cv;
753	chr c = (chr) pc;
754	chr lc,
755	uc;
756
757	lc = wx_tolower((chr) c);
758	uc = wx_toupper((chr) c);
759
760	cv = getcvec(v, 2, 0, 0);
761	addchr(cv, lc);
762	if (lc != uc)
763	addchr(cv, uc);
764	return cv;
765	}
766
767	/*
768	* cmp - chr-substring compare
769	*
770	* Backrefs need this. It should preferably be efficient.
771	* Note that it does not need to report anything except equal/unequal.
772	* Note also that the length is exact, and the comparison should not
773	* stop at embedded NULs!
774	*/
775	static int /* 0 for equal, nonzero for unequal */
776	cmp(const chr x, const chr y, /* strings to compare */
777	size_t len) /* exact length of comparison */
778	{
779	return memcmp(VS(x), VS(y), len * sizeof(chr));
780	}
781
782	/*
783	* casecmp - case-independent chr-substring compare
784	*
785	* REG_ICASE backrefs need this. It should preferably be efficient.
786	* Note that it does not need to report anything except equal/unequal.
787	* Note also that the length is exact, and the comparison should not
788	* stop at embedded NULs!
789	*/
790	static int /* 0 for equal, nonzero for unequal */
791	casecmp(const chr x, const chr y, /* strings to compare */
792	size_t len) /* exact length of comparison */
793	{
794	for (; len > 0; len--, x++, y++)
795	{
796	if ((x != y) && (wx_tolower(x) != wx_tolower(y)))
797	return 1;
798	}
799	return 0;
800	}