[wxWidgets.git] / src / regex / regc_locale.c

/*
 * regc_locale.c --
 *
 *	This file contains locale-specific regexp routines.
 *	This file is #included by regcomp.c.
 *
 * Copyright (c) 1998 by Scriptics Corporation.
 *
 * This software is copyrighted by the Regents of the University of
 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
 * Corporation and other parties.  The following terms apply to all files
 * associated with the software unless explicitly disclaimed in
 * individual files.
 *
 * The authors hereby grant permission to use, copy, modify, distribute,
 * and license this software and its documentation for any purpose, provided
 * that existing copyright notices are retained in all copies and that this
 * notice is included verbatim in any distributions. No written agreement,
 * license, or royalty fee is required for any of the authorized uses.
 * Modifications to this software may be copyrighted by their authors
 * and need not follow the licensing terms described here, provided that
 * the new terms are clearly indicated on the first page of each file where
 * they apply.
 *
 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.	THIS SOFTWARE
 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
 * MODIFICATIONS.
 *
 * GOVERNMENT USE: If you are acquiring this software on behalf of the
 * U.S. government, the Government shall have only "Restricted Rights"
 * in the software and related documentation as defined in the Federal
 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).	If you
 * are acquiring the software on behalf of the Department of Defense, the
 * software shall be classified as "Commercial Computer Software" and the
 * Government shall have only "Restricted Rights" as defined in Clause
 * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
 * authors grant the U.S. Government and others acting in its behalf
 * permission to use and distribute the software in accordance with the
 * terms specified in this license.
 *
 * $Header$
 */

int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
{
	while(*cp++ == (const char)*wp++ && --nNum){}

	return nNum;
}

/*
 * ASCII character-name table
 *
 * Maps collating-element names to the ASCII character they stand for.
 * Several characters have more than one name.  The table is terminated
 * by an entry whose name is NULL.  The names are string literals, so
 * they are held through a pointer to const.
 */

static struct cname
{
	const char *name;
	char		code;
}	cnames[] =

{
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},
	{NULL, 0}					/* end-of-table sentinel */
};


/*
 * nmcces - report the number of distinct multi-character collating
 * elements (MCCEs)
 *
 * Always 0: this implementation defines no MCCEs.  The context
 * argument is not consulted.
 */
static int
nmcces(struct vars * v)
{
	(void) v;					/* context is unused here */

	return 0;
}

/*
 * nleaders - report how many chrs can begin an MCCE
 *
 * Always 0.  The context argument is not consulted.
 */
static int
nleaders(struct vars * v)
{
	(void) v;					/* context is unused here */

	return 0;
}

/*
 * allmcces - produce a cvec holding every MCCE of the locale
 *
 * Hands the caller's cvec to clearcvec() and returns whatever that
 * gives back; nothing is added to it here.
 */
static struct cvec *
allmcces(struct vars * v,		/* context (unused) */
		 struct cvec * cv)		/* caller-supplied cvec to reuse */
{
	struct cvec *cleared = clearcvec(cv);

	(void) v;
	return cleared;
}

/*
 * element - translate a collating-element name into a celt
 *
 * A name that is exactly one chr long stands for that chr.  Any longer
 * name is looked up in the cnames table; if it is not found,
 * REG_ECOLLATE is reported through ERR() and 0 is returned.
 */
static celt
element(struct vars * v,		/* context */
		chr *startp,			/* points to start of name */
		chr *endp)				/* points just past end of name */
{
	struct cname *entry;
	size_t		nameLen;

	assert(startp < endp);
	nameLen = endp - startp;

	/* a single chr names itself */
	if (nameLen == 1)
		return *startp;

	NOTE(REG_ULOCALE);

	/* walk the table up to its NULL-name sentinel */
	for (entry = cnames; entry->name != NULL; entry++)
	{
		if (strlen(entry->name) != nameLen)
			continue;			/* lengths differ, cannot match */
		if (char_and_wchar_strncmp(entry->name, startp, nameLen) == 0)
			return CHR(entry->code);
	}

	/* no entry carries this name */
	ERR(REG_ECOLLATE);
	return 0;
}

/*
 * range - build the cvec for the range a..b, checking legality first
 *
 * A range whose start is neither equal to nor before its end is
 * rejected with REG_ERANGE and NULL is returned.  NULL is also returned
 * when NOERRN() detects an error after getcvec().
 */
static struct cvec *
range(struct vars * v,			/* context */
	  celt a,					/* range start */
	  celt b,					/* range end, might equal a */
	  int cases)				/* case-independent? */
{
	struct cvec *result;
	celt		cur;

	if (a != b && !before(a, b))
	{
		ERR(REG_ERANGE);
		return NULL;
	}

	if (cases)
	{
		/*
		 * Case-independent: no cvec ranges are used; every chr is
		 * listed individually together with its lower- and upper-case
		 * forms.  Room is reserved for two entries per chr plus four
		 * spare slots.
		 */
		int			capacity = (b - a + 1) * 2 + 4;

		result = getcvec(v, capacity, 0, 0);
		NOERRN();

		for (cur = a; cur <= b; cur++)
		{
			celt		lower,
						upper;

			addchr(result, cur);
			lower = wx_tolower((chr) cur);
			if (lower != cur)
				addchr(result, lower);
			upper = wx_toupper((chr) cur);
			if (upper != cur)
				addchr(result, upper);
		}

		return result;
	}

	/* case-sensitive: a single cvec range covers it */
	result = getcvec(v, 0, 1, 0);
	NOERRN();
	addrange(result, a, b);
	return result;
}

/*
 * before - does celt x sort before celt y, as far as range legality goes?
 *
 * With no MCCEs this is a plain numeric comparison.  Returns 1 when
 * x is less than y, 0 otherwise.
 */
static int						/* predicate */
before(celt x, celt y)
{
	return (x < y) ? 1 : 0;
}

/*
 * eclass - supply cvec for an equivalence class
 * Must include case counterparts on request.
 *
 * Apart from the REG_FAKE test class for 'x' (which contains 'x' and
 * 'y', plus 'X' and 'Y' when cases is set), every collating element is
 * its own equivalence class.
 *
 * Returns NULL, with REG_ESPACE reported, if no cvec could be obtained;
 * this matches how range() and cclass() report a getcvec() failure.
 */
static struct cvec *
eclass(struct vars * v,			/* context */
	   celt c,					/* Collating element representing the
								 * equivalence class. */
	   int cases)				/* all cases? */
{
	struct cvec *cv;

	/* crude fake equivalence class for testing */
	if ((v->cflags & REG_FAKE) && c == 'x')
	{
		cv = getcvec(v, 4, 0, 0);
		if (cv == NULL)
		{
			/* do not write through a cvec we failed to get */
			ERR(REG_ESPACE);
			return NULL;
		}
		addchr(cv, (chr) 'x');
		addchr(cv, (chr) 'y');
		if (cases)
		{
			addchr(cv, (chr) 'X');
			addchr(cv, (chr) 'Y');
		}
		return cv;
	}

	/* otherwise, none */
	if (cases)
		return allcases(v, c);
	cv = getcvec(v, 1, 0, 0);
	if (cv == NULL)
	{
		/* checked at run time rather than by assert() only */
		ERR(REG_ESPACE);
		return NULL;
	}
	addchr(cv, (chr) c);
	return cv;
}

/*
 * cclass - supply cvec for a character class
 *
 * Must include case counterparts on request.
 *
 * The name between startp and endp must be one of the entries of
 * classNames; otherwise REG_ECTYPE is reported and NULL is returned.
 * When cases is set, "lower" and "upper" are both treated as "alpha".
 * "print" is handled exactly like "alnum".
 *
 * Returns NULL, with REG_ESPACE reported, if no cvec could be obtained.
 */
static struct cvec *
cclass(struct vars * v,			/* context */
	   chr *startp,				/* where the name starts */
	   chr *endp,				/* just past the end of the name */
	   int cases)				/* case-independent? */
{
	size_t		len;
	struct cvec *cv = NULL;
	const char *const *namePtr;
	int			i,
				index;

	/*
	 * The following arrays define the valid character class names.
	 * The order of classNames must match the order of enum classes.
	 */

	static const char *const classNames[] = {
		"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
		"lower", "print", "punct", "space", "upper", "xdigit", NULL
	};

	enum classes
	{
		CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
		CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
	};

	/*
	 * Map the name to the corresponding enumerated value.
	 */
	len = endp - startp;
	index = -1;
	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
	{
		if (strlen(*namePtr) == len &&
			char_and_wchar_strncmp(*namePtr, startp, len) == 0)
		{
			index = i;
			break;
		}
	}
	if (index == -1)
	{
		ERR(REG_ECTYPE);
		return NULL;
	}

	/*
	 * Remap lower and upper to alpha if the match is case insensitive.
	 */

	if (cases &&
		((enum classes) index == CC_LOWER ||
		 (enum classes) index == CC_UPPER))
		index = (int) CC_ALPHA;

	/*
	 * Now compute the character class contents.
	 *
	 * For the moment, assume that only char codes < 256 can be in these
	 * classes.
	 *
	 * Every case checks the getcvec() result before writing to it; a
	 * NULL result falls through to the REG_ESPACE report below.
	 */

	switch ((enum classes) index)
	{
		case CC_PRINT:
		case CC_ALNUM:
			cv = getcvec(v, UCHAR_MAX, 1, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isalpha((chr) i))
						addchr(cv, (chr) i);
				}
				addrange(cv, (chr) '0', (chr) '9');
			}
			break;
		case CC_ALPHA:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isalpha((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_ASCII:
			cv = getcvec(v, 0, 1, 0);
			if (cv)
				addrange(cv, 0, 0x7f);
			break;
		case CC_BLANK:
			cv = getcvec(v, 2, 0, 0);
			if (cv)
			{
				addchr(cv, '\t');
				addchr(cv, ' ');
			}
			break;
		case CC_CNTRL:
			cv = getcvec(v, 0, 2, 0);
			if (cv)
			{
				addrange(cv, 0x0, 0x1f);
				addrange(cv, 0x7f, 0x9f);
			}
			break;
		case CC_DIGIT:
			cv = getcvec(v, 0, 1, 0);
			if (cv)
				addrange(cv, (chr) '0', (chr) '9');
			break;
		case CC_PUNCT:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_ispunct((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_XDIGIT:
			cv = getcvec(v, 0, 3, 0);
			if (cv)
			{
				addrange(cv, '0', '9');
				addrange(cv, 'a', 'f');
				addrange(cv, 'A', 'F');
			}
			break;
		case CC_SPACE:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isspace((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_LOWER:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_islower((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_UPPER:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isupper((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_GRAPH:
			cv = getcvec(v, UCHAR_MAX, 0, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (wx_isgraph((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
	}
	if (cv == NULL)
		ERR(REG_ESPACE);
	return cv;
}

/*
 * allcases - supply cvec for the case counterparts of a chr
 *
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 *
 * The cvec receives the lower-case form of pc and, when it differs,
 * the upper-case form.
 *
 * Returns NULL, with REG_ESPACE reported, if no cvec could be obtained.
 */
static struct cvec *
allcases(struct vars * v,		/* context */
		 chr pc)				/* character to get case equivs of */
{
	struct cvec *cv;
	chr			lc,
				uc;

	lc = wx_tolower((chr) pc);
	uc = wx_toupper((chr) pc);

	cv = getcvec(v, 2, 0, 0);
	if (cv == NULL)
	{
		/* do not write through a cvec we failed to get */
		ERR(REG_ESPACE);
		return NULL;
	}
	addchr(cv, lc);
	if (lc != uc)
		addchr(cv, uc);
	return cv;
}

/*
 * cmp - compare two chr substrings of the same, exact length
 *
 * Used for backrefs, so it should be cheap.  Only equal/unequal is
 * reported.  All len chrs are compared as raw memory, so embedded
 * NULs do not end the comparison.
 */
static int						/* 0 for equal, nonzero for unequal */
cmp(const chr *x, const chr *y, /* strings to compare */
	size_t len)					/* exact length of comparison */
{
	size_t		nbytes = len * sizeof(chr);

	return memcmp(VS(x), VS(y), nbytes);
}

/*
 * casecmp - compare two chr substrings of the same, exact length,
 * ignoring case
 *
 * Used for REG_ICASE backrefs, so it should be cheap.  Only
 * equal/unequal is reported.  Exactly len chrs are examined, so
 * embedded NULs do not end the comparison.
 */
static int						/* 0 for equal, nonzero for unequal */
casecmp(const chr *x, const chr *y,		/* strings to compare */
		size_t len)				/* exact length of comparison */
{
	size_t		pos;

	for (pos = 0; pos < len; pos++)
	{
		/* identical chrs need no case folding */
		if (x[pos] == y[pos])
			continue;
		if (wx_tolower(x[pos]) != wx_tolower(y[pos]))
			return 1;
	}
	return 0;
}
Commit	Line	Data
830efc9b RN	1	/*
	2	* regc_locale.c --
	3	*
	4	* This file contains locale-specific regexp routines.
	5	* This file is #included by regcomp.c.
	6	*
	7	* Copyright (c) 1998 by Scriptics Corporation.
	8	*
	9	* This software is copyrighted by the Regents of the University of
	10	* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
	11	* Corporation and other parties. The following terms apply to all files
	12	* associated with the software unless explicitly disclaimed in
	13	* individual files.
	14	*
	15	* The authors hereby grant permission to use, copy, modify, distribute,
	16	* and license this software and its documentation for any purpose, provided
	17	* that existing copyright notices are retained in all copies and that this
	18	* notice is included verbatim in any distributions. No written agreement,
	19	* license, or royalty fee is required for any of the authorized uses.
	20	* Modifications to this software may be copyrighted by their authors
	21	* and need not follow the licensing terms described here, provided that
	22	* the new terms are clearly indicated on the first page of each file where
	23	* they apply.
	24	*
	25	* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
	26	* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
	27	* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
	28	* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
	29	* POSSIBILITY OF SUCH DAMAGE.
	30	*
	31	* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
	32	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
	33	* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
	34	* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
	35	* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
	36	* MODIFICATIONS.
	37	*
	38	* GOVERNMENT USE: If you are acquiring this software on behalf of the
	39	* U.S. government, the Government shall have only "Restricted Rights"
	40	* in the software and related documentation as defined in the Federal
	41	* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
	42	* are acquiring the software on behalf of the Department of Defense, the
	43	* software shall be classified as "Commercial Computer Software" and the
	44	* Government shall have only "Restricted Rights" as defined in Clause
	45	* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
	46	* authors grant the U.S. Government and others acting in its behalf
	47	* permission to use and distribute the software in accordance with the
	48	* terms specified in this license.
	49	*
	50	* $Header$
	51	*/
	52
	53	int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
	54	{
	55	while(cp++ == (const char)wp++ && --nNum){}
	56
	57	return nNum;
	58	}
	59
	60	/* ASCII character-name table */
	61
	62	static struct cname
	63	{
	64	char *name;
65	char code;
66	} cnames[] =
67
68	{
69	{
70	"NUL", '\0'
71	},
72	{
73	"SOH", '\001'
74	},
75	{
76	"STX", '\002'
77	},
78	{
79	"ETX", '\003'
80	},
81	{
82	"EOT", '\004'
83	},
84	{
85	"ENQ", '\005'
86	},
87	{
88	"ACK", '\006'
89	},
90	{
91	"BEL", '\007'
92	},
93	{
94	"alert", '\007'
95	},
96	{
97	"BS", '\010'
98	},
99	{
100	"backspace", '\b'
101	},
102	{
103	"HT", '\011'
104	},
105	{
106	"tab", '\t'
107	},
108	{
109	"LF", '\012'
110	},
111	{
112	"newline", '\n'
113	},
114	{
115	"VT", '\013'
116	},
117	{
118	"vertical-tab", '\v'
119	},
120	{
121	"FF", '\014'
122	},
123	{
124	"form-feed", '\f'
125	},
126	{
127	"CR", '\015'
128	},
129	{
130	"carriage-return", '\r'
131	},
132	{
133	"SO", '\016'
134	},
135	{
136	"SI", '\017'
137	},
138	{
139	"DLE", '\020'
140	},
141	{
142	"DC1", '\021'
143	},
144	{
145	"DC2", '\022'
146	},
147	{
148	"DC3", '\023'
149	},
150	{
151	"DC4", '\024'
152	},
153	{
154	"NAK", '\025'
155	},
156	{
157	"SYN", '\026'
158	},
159	{
160	"ETB", '\027'
161	},
162	{
163	"CAN", '\030'
164	},
165	{
166	"EM", '\031'
167	},
168	{
169	"SUB", '\032'
170	},
171	{
172	"ESC", '\033'
173	},
174	{
175	"IS4", '\034'
176	},
177	{
178	"FS", '\034'
179	},
180	{
181	"IS3", '\035'
182	},
183	{
184	"GS", '\035'
185	},
186	{
187	"IS2", '\036'
188	},
189	{
190	"RS", '\036'
191	},
192	{
193	"IS1", '\037'
194	},
195	{
196	"US", '\037'
197	},
198	{
199	"space", ' '
200	},
201	{
202	"exclamation-mark", '!'
203	},
204	{
205	"quotation-mark", '"'
206	},
207	{
208	"number-sign", '#'
209	},
210	{
211	"dollar-sign", '$'
212	},
213	{
214	"percent-sign", '%'
215	},
216	{
217	"ampersand", '&'
218	},
219	{
220	"apostrophe", '\''
221	},
222	{
223	"left-parenthesis", '('
224	},
225	{
226	"right-parenthesis", ')'
227	},
228	{
229	"asterisk", '*'
230	},
231	{
232	"plus-sign", '+'
233	},
234	{
235	"comma", ','
236	},
237	{
238	"hyphen", '-'
239	},
240	{
241	"hyphen-minus", '-'
242	},
243	{
244	"period", '.'
245	},
246	{
247	"full-stop", '.'
248	},
249	{
250	"slash", '/'
251	},
252	{
253	"solidus", '/'
254	},
255	{
256	"zero", '0'
257	},
258	{
259	"one", '1'
260	},
261	{
262	"two", '2'
263	},
264	{
265	"three", '3'
266	},
267	{
268	"four", '4'
269	},
270	{
271	"five", '5'
272	},
273	{
274	"six", '6'
275	},
276	{
277	"seven", '7'
278	},
279	{
280	"eight", '8'
281	},
282	{
283	"nine", '9'
284	},
285	{
286	"colon", ':'
287	},
288	{
289	"semicolon", ';'
290	},
291	{
292	"less-than-sign", '<'
293	},
294	{
295	"equals-sign", '='
296	},
297	{
298	"greater-than-sign", '>'
299	},
300	{
301	"question-mark", '?'
302	},
303	{
304	"commercial-at", '@'
305	},
306	{
307	"left-square-bracket", '['
308	},
309	{
310	"backslash", '\\'
311	},
312	{
313	"reverse-solidus", '\\'
314	},
315	{
316	"right-square-bracket", ']'
317	},
318	{
319	"circumflex", '^'
320	},
321	{
322	"circumflex-accent", '^'
323	},
324	{
325	"underscore", '_'
326	},
327	{
328	"low-line", '_'
329	},
330	{
331	"grave-accent", '`'
332	},
333	{
334	"left-brace", '{'
335	},
336	{
337	"left-curly-bracket", '{'
338	},
339	{
340	"vertical-line", '\|'
341	},
342	{
343	"right-brace", '}'
344	},
345	{
346	"right-curly-bracket", '}'
347	},
348	{
349	"tilde", '~'
350	},
351	{
352	"DEL", '\177'
353	},
354	{
355	NULL, 0
356	}
357	};
358
830efc9b RN	359
	360	/*
	361	* nmcces - how many distinct MCCEs are there?
	362	*/
	363	static int
	364	nmcces(struct vars * v)
	365	{
	366	/*
	367	* No multi-character collating elements defined at the moment.
	368	*/
	369	return 0;
	370	}
	371
	372	/*
	373	* nleaders - how many chrs can be first chrs of MCCEs?
	374	*/
	375	static int
	376	nleaders(struct vars * v)
	377	{
	378	return 0;
	379	}
	380
	381	/*
	382	* allmcces - return a cvec with all the MCCEs of the locale
	383	*/
	384	static struct cvec *
	385	allmcces(struct vars * v, /* context */
	386	struct cvec * cv) /* this is supposed to have enough room */
	387	{
	388	return clearcvec(cv);
	389	}
	390
	391	/*
	392	* element - map collating-element name to celt
	393	*/
	394	static celt
	395	element(struct vars * v, /* context */
	396	chr startp, / points to start of name */
	397	chr endp) / points just past end of name */
	398	{
	399	struct cname *cn;
	400	size_t len;
	401
	402	/* generic: one-chr names stand for themselves */
	403	assert(startp < endp);
	404	len = endp - startp;
	405	if (len == 1)
	406	return *startp;
	407
	408	NOTE(REG_ULOCALE);
	409
	410	/* search table */
	411	for (cn = cnames; cn->name != NULL; cn++)
	412	{
	413	if (strlen(cn->name) == len &&
	414	char_and_wchar_strncmp(cn->name, startp, len) == 0)
	415	{
	416	break; /* NOTE BREAK OUT */
	417	}
	418	}
	419	if (cn->name != NULL)
	420	return CHR(cn->code);
	421
	422	/* couldn't find it */
423	ERR(REG_ECOLLATE);
424	return 0;
425	}
426
427	/*
428	* range - supply cvec for a range, including legality check
429	*/
430	static struct cvec *
431	range(struct vars * v, /* context */
432	celt a, /* range start */
433	celt b, /* range end, might equal a */
434	int cases) /* case-independent? */
435	{
436	int nchrs;
437	struct cvec *cv;
438	celt c,
439	lc,
440	uc;
441
442	if (a != b && !before(a, b))
443	{
444	ERR(REG_ERANGE);
445	return NULL;
446	}
447
448	if (!cases)
449	{ /* easy version */
450	cv = getcvec(v, 0, 1, 0);
451	NOERRN();
452	addrange(cv, a, b);
453	return cv;
454	}
455
456	/*
457	* When case-independent, it's hard to decide when cvec ranges are
458	* usable, so for now at least, we won't try. We allocate enough
459	* space for two case variants plus a little extra for the two title
460	* case variants.
461	*/
462
463	nchrs = (b - a + 1) * 2 + 4;
464
465	cv = getcvec(v, nchrs, 0, 0);
466	NOERRN();
467
468	for (c = a; c <= b; c++)
469	{
470	addchr(cv, c);
471	lc = wx_tolower((chr) c);
472	if (c != lc)
473	addchr(cv, lc);
474	uc = wx_toupper((chr) c);
475	if (c != uc)
476	addchr(cv, uc);
477	}
478
479	return cv;
480	}
481
482	/*
483	* before - is celt x before celt y, for purposes of range legality?
484	*/
485	static int /* predicate */
486	before(celt x, celt y)
487	{
488	/* trivial because no MCCEs */
489	if (x < y)
490	return 1;
491	return 0;
492	}
493
494	/*
495	* eclass - supply cvec for an equivalence class
496	* Must include case counterparts on request.
497	*/
498	static struct cvec *
499	eclass(struct vars * v, /* context */
500	celt c, /* Collating element representing the
501	* equivalence class. */
502	int cases) /* all cases? */
503	{
504	struct cvec *cv;
505
506	/* crude fake equivalence class for testing */
507	if ((v->cflags & REG_FAKE) && c == 'x')
508	{
509	cv = getcvec(v, 4, 0, 0);
510	addchr(cv, (chr) 'x');
511	addchr(cv, (chr) 'y');
512	if (cases)
513	{
514	addchr(cv, (chr) 'X');
515	addchr(cv, (chr) 'Y');
516	}
517	return cv;
518	}
519
520	/* otherwise, none */
521	if (cases)
522	return allcases(v, c);
523	cv = getcvec(v, 1, 0, 0);
524	assert(cv != NULL);
525	addchr(cv, (chr) c);
526	return cv;
527	}
528
529	/*
530	* cclass - supply cvec for a character class
531	*
532	* Must include case counterparts on request.
533	*/
534	static struct cvec *
535	cclass(struct vars * v, /* context */
536	chr startp, / where the name starts */
537	chr endp, / just past the end of the name */
538	int cases) /* case-independent? */
539	{
540	size_t len;
541	struct cvec *cv = NULL;
542	char **namePtr;
543	int i,
544	index;
545
546	/*
547	* The following arrays define the valid character class names.
548	*/
549
550	static char *classNames[] = {
551	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
552	"lower", "print", "punct", "space", "upper", "xdigit", NULL
553	};
554
555	enum classes
556	{
557	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
558	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
559	};
560
561	/*
562	* Map the name to the corresponding enumerated value.
563	*/
564	len = endp - startp;
565	index = -1;
566	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
567	{
568	if (strlen(*namePtr) == len &&
569	char_and_wchar_strncmp(*namePtr, startp, len) == 0)
570	{
571	index = i;
572	break;
573	}
574	}
575	if (index == -1)
576	{
577	ERR(REG_ECTYPE);
578	return NULL;
579	}
580
581	/*
582	* Remap lower and upper to alpha if the match is case insensitive.
583	*/
584
585	if (cases &&
586	((enum classes) index == CC_LOWER \|\|
587	(enum classes) index == CC_UPPER))
588	index = (int) CC_ALPHA;
589
590	/*
591	* Now compute the character class contents.
592	*
593	* For the moment, assume that only char codes < 256 can be in these
594	* classes.
595	*/
596
597	switch ((enum classes) index)
598	{
599	case CC_PRINT:
600	case CC_ALNUM:
601	cv = getcvec(v, UCHAR_MAX, 1, 0);
602	if (cv)
603	{
604	for (i = 0; i <= UCHAR_MAX; i++)
605	{
606	if (wx_isalpha((chr) i))
607	addchr(cv, (chr) i);
608	}
609	addrange(cv, (chr) '0', (chr) '9');
610	}
611	break;
612	case CC_ALPHA:
613	cv = getcvec(v, UCHAR_MAX, 0, 0);
614	if (cv)
615	{
616	for (i = 0; i <= UCHAR_MAX; i++)
617	{
618	if (wx_isalpha((chr) i))
619	addchr(cv, (chr) i);
620	}
621	}
622	break;
623	case CC_ASCII:
624	cv = getcvec(v, 0, 1, 0);
625	if (cv)
626	addrange(cv, 0, 0x7f);
627	break;
628	case CC_BLANK:
629	cv = getcvec(v, 2, 0, 0);
630	addchr(cv, '\t');
631	addchr(cv, ' ');
632	break;
633	case CC_CNTRL:
634	cv = getcvec(v, 0, 2, 0);
635	addrange(cv, 0x0, 0x1f);
636	addrange(cv, 0x7f, 0x9f);
637	break;
638	case CC_DIGIT:
639	cv = getcvec(v, 0, 1, 0);
640	if (cv)
641	addrange(cv, (chr) '0', (chr) '9');
642	break;
643	case CC_PUNCT:
644	cv = getcvec(v, UCHAR_MAX, 0, 0);
645	if (cv)
646	{
647	for (i = 0; i <= UCHAR_MAX; i++)
648	{
649	if (wx_ispunct((chr) i))
650	addchr(cv, (chr) i);
651	}
652	}
653	break;
654	case CC_XDIGIT:
655	cv = getcvec(v, 0, 3, 0);
656	if (cv)
657	{
658	addrange(cv, '0', '9');
659	addrange(cv, 'a', 'f');
660	addrange(cv, 'A', 'F');
661	}
662	break;
663	case CC_SPACE:
664	cv = getcvec(v, UCHAR_MAX, 0, 0);
665	if (cv)
666	{
667	for (i = 0; i <= UCHAR_MAX; i++)
668	{
669	if (wx_isspace((chr) i))
670	addchr(cv, (chr) i);
671	}
672	}
673	break;
674	case CC_LOWER:
675	cv = getcvec(v, UCHAR_MAX, 0, 0);
676	if (cv)
677	{
678	for (i = 0; i <= UCHAR_MAX; i++)
679	{
680	if (wx_islower((chr) i))
681	addchr(cv, (chr) i);
682	}
683	}
684	break;
685	case CC_UPPER:
686	cv = getcvec(v, UCHAR_MAX, 0, 0);
687	if (cv)
688	{
689	for (i = 0; i <= UCHAR_MAX; i++)
690	{
691	if (wx_isupper((chr) i))
692	addchr(cv, (chr) i);
693	}
694	}
695	break;
696	case CC_GRAPH:
697	cv = getcvec(v, UCHAR_MAX, 0, 0);
698	if (cv)
699	{
700	for (i = 0; i <= UCHAR_MAX; i++)
701	{
702	if (wx_isgraph((chr) i))
703	addchr(cv, (chr) i);
704	}
705	}
706	break;
707	}
708	if (cv == NULL)
709	ERR(REG_ESPACE);
710	return cv;
711	}
712
713	/*
714	* allcases - supply cvec for all case counterparts of a chr (including itself)
715	*
716	* This is a shortcut, preferably an efficient one, for simple characters;
717	* messy cases are done via range().
718	*/
719	static struct cvec *
720	allcases(struct vars * v, /* context */
721	chr pc) /* character to get case equivs of */
722	{
723	struct cvec *cv;
724	chr c = (chr) pc;
725	chr lc,
726	uc;
727
728	lc = wx_tolower((chr) c);
729	uc = wx_toupper((chr) c);
730
731	cv = getcvec(v, 2, 0, 0);
732	addchr(cv, lc);
733	if (lc != uc)
734	addchr(cv, uc);
735	return cv;
736	}
737
738	/*
739	* cmp - chr-substring compare
740	*
741	* Backrefs need this. It should preferably be efficient.
742	* Note that it does not need to report anything except equal/unequal.
743	* Note also that the length is exact, and the comparison should not
744	* stop at embedded NULs!
745	*/
746	static int /* 0 for equal, nonzero for unequal */
747	cmp(const chr x, const chr y, /* strings to compare */
748	size_t len) /* exact length of comparison */
749	{
750	return memcmp(VS(x), VS(y), len * sizeof(chr));
751	}
752
753	/*
754	* casecmp - case-independent chr-substring compare
755	*
756	* REG_ICASE backrefs need this. It should preferably be efficient.
757	* Note that it does not need to report anything except equal/unequal.
758	* Note also that the length is exact, and the comparison should not
759	* stop at embedded NULs!
760	*/
761	static int /* 0 for equal, nonzero for unequal */
762	casecmp(const chr x, const chr y, /* strings to compare */
763	size_t len) /* exact length of comparison */
764	{
765	for (; len > 0; len--, x++, y++)
766	{
767	if ((x != y) && (wx_tolower(x) != wx_tolower(y)))
768	return 1;
769	}
770	return 0;
771	}