Got rid wx-license
[wxWidgets.git] / src / regex / regc_locale.c
1 /*
2 * regc_locale.c --
3 *
4 * This file contains locale-specific regexp routines.
5 * This file is #included by regcomp.c.
6 *
7 * Copyright (c) 1998 by Scriptics Corporation.
8 *
9 * This software is copyrighted by the Regents of the University of
10 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 * Corporation and other parties. The following terms apply to all files
12 * associated with the software unless explicitly disclaimed in
13 * individual files.
14 *
15 * The authors hereby grant permission to use, copy, modify, distribute,
16 * and license this software and its documentation for any purpose, provided
17 * that existing copyright notices are retained in all copies and that this
18 * notice is included verbatim in any distributions. No written agreement,
19 * license, or royalty fee is required for any of the authorized uses.
20 * Modifications to this software may be copyrighted by their authors
21 * and need not follow the licensing terms described here, provided that
22 * the new terms are clearly indicated on the first page of each file where
23 * they apply.
24 *
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 * MODIFICATIONS.
37 *
38 * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 * U.S. government, the Government shall have only "Restricted Rights"
40 * in the software and related documentation as defined in the Federal
41 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 * are acquiring the software on behalf of the Department of Defense, the
43 * software shall be classified as "Commercial Computer Software" and the
44 * Government shall have only "Restricted Rights" as defined in Clause
45 * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 * authors grant the U.S. Government and others acting in its behalf
47 * permission to use and distribute the software in accordance with the
48 * terms specified in this license.
49 *
50 * $Header$
51 */
52
53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
54 {
55 while(*cp++ == (const char)*wp++ && --nNum){}
56
57 return nNum;
58 }
59
/*
 * ASCII character-name table
 *
 * Maps collating-element names to the ASCII character each one denotes.
 * Several characters have more than one name.  The list ends with an
 * entry whose name is NULL.  The names are string literals and therefore
 * must never be written through, hence the const-qualified pointer.
 */

static struct cname
{
    const char *name;           /* element name, NULL in the last entry */
    char        code;           /* character the name stands for */
}   cnames[] =

{
    {"NUL", '\0'},
    {"SOH", '\001'},
    {"STX", '\002'},
    {"ETX", '\003'},
    {"EOT", '\004'},
    {"ENQ", '\005'},
    {"ACK", '\006'},
    {"BEL", '\007'},
    {"alert", '\007'},
    {"BS", '\010'},
    {"backspace", '\b'},
    {"HT", '\011'},
    {"tab", '\t'},
    {"LF", '\012'},
    {"newline", '\n'},
    {"VT", '\013'},
    {"vertical-tab", '\v'},
    {"FF", '\014'},
    {"form-feed", '\f'},
    {"CR", '\015'},
    {"carriage-return", '\r'},
    {"SO", '\016'},
    {"SI", '\017'},
    {"DLE", '\020'},
    {"DC1", '\021'},
    {"DC2", '\022'},
    {"DC3", '\023'},
    {"DC4", '\024'},
    {"NAK", '\025'},
    {"SYN", '\026'},
    {"ETB", '\027'},
    {"CAN", '\030'},
    {"EM", '\031'},
    {"SUB", '\032'},
    {"ESC", '\033'},
    {"IS4", '\034'},
    {"FS", '\034'},
    {"IS3", '\035'},
    {"GS", '\035'},
    {"IS2", '\036'},
    {"RS", '\036'},
    {"IS1", '\037'},
    {"US", '\037'},
    {"space", ' '},
    {"exclamation-mark", '!'},
    {"quotation-mark", '"'},
    {"number-sign", '#'},
    {"dollar-sign", '$'},
    {"percent-sign", '%'},
    {"ampersand", '&'},
    {"apostrophe", '\''},
    {"left-parenthesis", '('},
    {"right-parenthesis", ')'},
    {"asterisk", '*'},
    {"plus-sign", '+'},
    {"comma", ','},
    {"hyphen", '-'},
    {"hyphen-minus", '-'},
    {"period", '.'},
    {"full-stop", '.'},
    {"slash", '/'},
    {"solidus", '/'},
    {"zero", '0'},
    {"one", '1'},
    {"two", '2'},
    {"three", '3'},
    {"four", '4'},
    {"five", '5'},
    {"six", '6'},
    {"seven", '7'},
    {"eight", '8'},
    {"nine", '9'},
    {"colon", ':'},
    {"semicolon", ';'},
    {"less-than-sign", '<'},
    {"equals-sign", '='},
    {"greater-than-sign", '>'},
    {"question-mark", '?'},
    {"commercial-at", '@'},
    {"left-square-bracket", '['},
    {"backslash", '\\'},
    {"reverse-solidus", '\\'},
    {"right-square-bracket", ']'},
    {"circumflex", '^'},
    {"circumflex-accent", '^'},
    {"underscore", '_'},
    {"low-line", '_'},
    {"grave-accent", '`'},
    {"left-brace", '{'},
    {"left-curly-bracket", '{'},
    {"vertical-line", '|'},
    {"right-brace", '}'},
    {"right-curly-bracket", '}'},
    {"tilde", '~'},
    {"DEL", '\177'},
    {NULL, 0}
};
358
359
/*
 * nmcces - how many distinct MCCEs are there?
 *
 * The context argument is not consulted: no multi-character collating
 * elements are defined at the moment, so the count is always zero.
 */
static int
nmcces(struct vars * v)
{
    enum
    {
        MCCE_COUNT = 0
    };

    (void) v;
    return MCCE_COUNT;
}
371
/*
 * nleaders - how many chrs can be first chrs of MCCEs?
 *
 * The context argument is not consulted; the answer is always zero.
 */
static int
nleaders(struct vars * v)
{
    enum
    {
        LEADER_COUNT = 0
    };

    (void) v;
    return LEADER_COUNT;
}
380
/*
 * allmcces - return a cvec with all the MCCEs of the locale
 *
 * The caller-supplied cvec is passed through clearcvec() and whatever
 * that returns is handed back; no elements are added to it here.
 */
static struct cvec *
allmcces(struct vars * v,       /* context (not consulted) */
         struct cvec * cv)      /* this is supposed to have enough room */
{
    struct cvec *cleared;

    (void) v;
    cleared = clearcvec(cv);
    return cleared;
}
390
391 /*
392 * element - map collating-element name to celt
393 */
394 static celt
395 element(struct vars * v, /* context */
396 chr *startp, /* points to start of name */
397 chr *endp) /* points just past end of name */
398 {
399 struct cname *cn;
400 size_t len;
401
402 /* generic: one-chr names stand for themselves */
403 assert(startp < endp);
404 len = endp - startp;
405 if (len == 1)
406 return *startp;
407
408 NOTE(REG_ULOCALE);
409
410 /* search table */
411 for (cn = cnames; cn->name != NULL; cn++)
412 {
413 if (strlen(cn->name) == len &&
414 char_and_wchar_strncmp(cn->name, startp, len) == 0)
415 {
416 break; /* NOTE BREAK OUT */
417 }
418 }
419 if (cn->name != NULL)
420 return CHR(cn->code);
421
422 /* couldn't find it */
423 ERR(REG_ECOLLATE);
424 return 0;
425 }
426
427 /*
428 * range - supply cvec for a range, including legality check
429 */
430 static struct cvec *
431 range(struct vars * v, /* context */
432 celt a, /* range start */
433 celt b, /* range end, might equal a */
434 int cases) /* case-independent? */
435 {
436 int nchrs;
437 struct cvec *cv;
438 celt c,
439 lc,
440 uc;
441
442 if (a != b && !before(a, b))
443 {
444 ERR(REG_ERANGE);
445 return NULL;
446 }
447
448 if (!cases)
449 { /* easy version */
450 cv = getcvec(v, 0, 1, 0);
451 NOERRN();
452 addrange(cv, a, b);
453 return cv;
454 }
455
456 /*
457 * When case-independent, it's hard to decide when cvec ranges are
458 * usable, so for now at least, we won't try. We allocate enough
459 * space for two case variants plus a little extra for the two title
460 * case variants.
461 */
462
463 nchrs = (b - a + 1) * 2 + 4;
464
465 cv = getcvec(v, nchrs, 0, 0);
466 NOERRN();
467
468 for (c = a; c <= b; c++)
469 {
470 addchr(cv, c);
471 lc = wx_tolower((chr) c);
472 if (c != lc)
473 addchr(cv, lc);
474 uc = wx_toupper((chr) c);
475 if (c != uc)
476 addchr(cv, uc);
477 }
478
479 return cv;
480 }
481
482 /*
483 * before - is celt x before celt y, for purposes of range legality?
484 */
485 static int /* predicate */
486 before(celt x, celt y)
487 {
488 /* trivial because no MCCEs */
489 if (x < y)
490 return 1;
491 return 0;
492 }
493
494 /*
495 * eclass - supply cvec for an equivalence class
496 * Must include case counterparts on request.
497 */
498 static struct cvec *
499 eclass(struct vars * v, /* context */
500 celt c, /* Collating element representing the
501 * equivalence class. */
502 int cases) /* all cases? */
503 {
504 struct cvec *cv;
505
506 /* crude fake equivalence class for testing */
507 if ((v->cflags & REG_FAKE) && c == 'x')
508 {
509 cv = getcvec(v, 4, 0, 0);
510 addchr(cv, (chr) 'x');
511 addchr(cv, (chr) 'y');
512 if (cases)
513 {
514 addchr(cv, (chr) 'X');
515 addchr(cv, (chr) 'Y');
516 }
517 return cv;
518 }
519
520 /* otherwise, none */
521 if (cases)
522 return allcases(v, c);
523 cv = getcvec(v, 1, 0, 0);
524 assert(cv != NULL);
525 addchr(cv, (chr) c);
526 return cv;
527 }
528
529 /*
530 * cclass - supply cvec for a character class
531 *
532 * Must include case counterparts on request.
533 */
534 static struct cvec *
535 cclass(struct vars * v, /* context */
536 chr *startp, /* where the name starts */
537 chr *endp, /* just past the end of the name */
538 int cases) /* case-independent? */
539 {
540 size_t len;
541 struct cvec *cv = NULL;
542 char **namePtr;
543 int i,
544 index;
545
546 /*
547 * The following arrays define the valid character class names.
548 */
549
550 static char *classNames[] = {
551 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
552 "lower", "print", "punct", "space", "upper", "xdigit", NULL
553 };
554
555 enum classes
556 {
557 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
558 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
559 };
560
561 /*
562 * Map the name to the corresponding enumerated value.
563 */
564 len = endp - startp;
565 index = -1;
566 for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
567 {
568 if (strlen(*namePtr) == len &&
569 char_and_wchar_strncmp(*namePtr, startp, len) == 0)
570 {
571 index = i;
572 break;
573 }
574 }
575 if (index == -1)
576 {
577 ERR(REG_ECTYPE);
578 return NULL;
579 }
580
581 /*
582 * Remap lower and upper to alpha if the match is case insensitive.
583 */
584
585 if (cases &&
586 ((enum classes) index == CC_LOWER ||
587 (enum classes) index == CC_UPPER))
588 index = (int) CC_ALPHA;
589
590 /*
591 * Now compute the character class contents.
592 *
593 * For the moment, assume that only char codes < 256 can be in these
594 * classes.
595 */
596
597 switch ((enum classes) index)
598 {
599 case CC_PRINT:
600 case CC_ALNUM:
601 cv = getcvec(v, UCHAR_MAX, 1, 0);
602 if (cv)
603 {
604 for (i = 0; i <= UCHAR_MAX; i++)
605 {
606 if (wx_isalpha((chr) i))
607 addchr(cv, (chr) i);
608 }
609 addrange(cv, (chr) '0', (chr) '9');
610 }
611 break;
612 case CC_ALPHA:
613 cv = getcvec(v, UCHAR_MAX, 0, 0);
614 if (cv)
615 {
616 for (i = 0; i <= UCHAR_MAX; i++)
617 {
618 if (wx_isalpha((chr) i))
619 addchr(cv, (chr) i);
620 }
621 }
622 break;
623 case CC_ASCII:
624 cv = getcvec(v, 0, 1, 0);
625 if (cv)
626 addrange(cv, 0, 0x7f);
627 break;
628 case CC_BLANK:
629 cv = getcvec(v, 2, 0, 0);
630 addchr(cv, '\t');
631 addchr(cv, ' ');
632 break;
633 case CC_CNTRL:
634 cv = getcvec(v, 0, 2, 0);
635 addrange(cv, 0x0, 0x1f);
636 addrange(cv, 0x7f, 0x9f);
637 break;
638 case CC_DIGIT:
639 cv = getcvec(v, 0, 1, 0);
640 if (cv)
641 addrange(cv, (chr) '0', (chr) '9');
642 break;
643 case CC_PUNCT:
644 cv = getcvec(v, UCHAR_MAX, 0, 0);
645 if (cv)
646 {
647 for (i = 0; i <= UCHAR_MAX; i++)
648 {
649 if (wx_ispunct((chr) i))
650 addchr(cv, (chr) i);
651 }
652 }
653 break;
654 case CC_XDIGIT:
655 cv = getcvec(v, 0, 3, 0);
656 if (cv)
657 {
658 addrange(cv, '0', '9');
659 addrange(cv, 'a', 'f');
660 addrange(cv, 'A', 'F');
661 }
662 break;
663 case CC_SPACE:
664 cv = getcvec(v, UCHAR_MAX, 0, 0);
665 if (cv)
666 {
667 for (i = 0; i <= UCHAR_MAX; i++)
668 {
669 if (wx_isspace((chr) i))
670 addchr(cv, (chr) i);
671 }
672 }
673 break;
674 case CC_LOWER:
675 cv = getcvec(v, UCHAR_MAX, 0, 0);
676 if (cv)
677 {
678 for (i = 0; i <= UCHAR_MAX; i++)
679 {
680 if (wx_islower((chr) i))
681 addchr(cv, (chr) i);
682 }
683 }
684 break;
685 case CC_UPPER:
686 cv = getcvec(v, UCHAR_MAX, 0, 0);
687 if (cv)
688 {
689 for (i = 0; i <= UCHAR_MAX; i++)
690 {
691 if (wx_isupper((chr) i))
692 addchr(cv, (chr) i);
693 }
694 }
695 break;
696 case CC_GRAPH:
697 cv = getcvec(v, UCHAR_MAX, 0, 0);
698 if (cv)
699 {
700 for (i = 0; i <= UCHAR_MAX; i++)
701 {
702 if (wx_isgraph((chr) i))
703 addchr(cv, (chr) i);
704 }
705 }
706 break;
707 }
708 if (cv == NULL)
709 ERR(REG_ESPACE);
710 return cv;
711 }
712
713 /*
714 * allcases - supply cvec for all case counterparts of a chr (including itself)
715 *
716 * This is a shortcut, preferably an efficient one, for simple characters;
717 * messy cases are done via range().
718 */
719 static struct cvec *
720 allcases(struct vars * v, /* context */
721 chr pc) /* character to get case equivs of */
722 {
723 struct cvec *cv;
724 chr c = (chr) pc;
725 chr lc,
726 uc;
727
728 lc = wx_tolower((chr) c);
729 uc = wx_toupper((chr) c);
730
731 cv = getcvec(v, 2, 0, 0);
732 addchr(cv, lc);
733 if (lc != uc)
734 addchr(cv, uc);
735 return cv;
736 }
737
738 /*
739 * cmp - chr-substring compare
740 *
741 * Backrefs need this. It should preferably be efficient.
742 * Note that it does not need to report anything except equal/unequal.
743 * Note also that the length is exact, and the comparison should not
744 * stop at embedded NULs!
745 */
746 static int /* 0 for equal, nonzero for unequal */
747 cmp(const chr *x, const chr *y, /* strings to compare */
748 size_t len) /* exact length of comparison */
749 {
750 return memcmp(VS(x), VS(y), len * sizeof(chr));
751 }
752
753 /*
754 * casecmp - case-independent chr-substring compare
755 *
756 * REG_ICASE backrefs need this. It should preferably be efficient.
757 * Note that it does not need to report anything except equal/unequal.
758 * Note also that the length is exact, and the comparison should not
759 * stop at embedded NULs!
760 */
761 static int /* 0 for equal, nonzero for unequal */
762 casecmp(const chr *x, const chr *y, /* strings to compare */
763 size_t len) /* exact length of comparison */
764 {
765 for (; len > 0; len--, x++, y++)
766 {
767 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
768 return 1;
769 }
770 return 0;
771 }