]> git.saurik.com Git - wxWidgets.git/blob - src/regex/regc_locale.c
Cleaned up regex.cpp
[wxWidgets.git] / src / regex / regc_locale.c
1 /*
2 * regc_locale.c --
3 *
4 * This file contains locale-specific regexp routines.
5 * This file is #included by regcomp.c.
6 *
7 * Copyright (c) 1998 by Scriptics Corporation.
8 *
9 * This software is copyrighted by the Regents of the University of
10 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 * Corporation and other parties. The following terms apply to all files
12 * associated with the software unless explicitly disclaimed in
13 * individual files.
14 *
15 * The authors hereby grant permission to use, copy, modify, distribute,
16 * and license this software and its documentation for any purpose, provided
17 * that existing copyright notices are retained in all copies and that this
18 * notice is included verbatim in any distributions. No written agreement,
19 * license, or royalty fee is required for any of the authorized uses.
20 * Modifications to this software may be copyrighted by their authors
21 * and need not follow the licensing terms described here, provided that
22 * the new terms are clearly indicated on the first page of each file where
23 * they apply.
24 *
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 * MODIFICATIONS.
37 *
38 * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 * U.S. government, the Government shall have only "Restricted Rights"
40 * in the software and related documentation as defined in the Federal
41 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 * are acquiring the software on behalf of the Department of Defense, the
43 * software shall be classified as "Commercial Computer Software" and the
44 * Government shall have only "Restricted Rights" as defined in Clause
45 * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 * authors grant the U.S. Government and others acting in its behalf
47 * permission to use and distribute the software in accordance with the
48 * terms specified in this license.
49 *
50 * $Header$
51 */
52
53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
54 {
55 while(*cp++ == (const char)*wp++ && --nNum){}
56 return nNum;
57 }
58
59 int wx_isdigit(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIsdigit((unsigned char) c));}
60 int wx_isalpha(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIsalpha((unsigned char) c));}
61 int wx_isalnum(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIsalnum((unsigned char) c));}
62 int wx_isupper(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIsupper((unsigned char) c));}
63 int wx_islower(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIslower((unsigned char) c));}
64 int wx_isgraph(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIsgraph((unsigned char) c));}
65 int wx_ispunct(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIspunct((unsigned char) c));}
66 int wx_isspace(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX && wxIsspace((unsigned char) c));}
67
68 wx_wchar wx_toupper(wx_wchar c)
69 {
70 if (c >= 0 && c <= UCHAR_MAX)
71 return wxToupper((unsigned char) c);
72 return c;
73
74 }
75
76 wx_wchar wx_tolower(wx_wchar c)
77 {
78 if (c >= 0 && c <= UCHAR_MAX)
79 return wxTolower((unsigned char) c);
80 return c;
81 }
82
83 int wx_strlen(const wx_wchar* szString)
84 {
85 /*
86 Generic -- note that some clib functions also test for eol character '^Z'
87
88 int nLength = 0;
89 for (; *(szString + nLength) != '\0'; nLength++);
90 return nLength;
91 */
92 return szString == NULL ? 0 : wxStrlen_(szString);
93 }
/*
 * ASCII character-name table.
 *
 * Maps collating-element names to the single character they stand for.
 * Several characters have more than one name.  The table is terminated
 * by an entry whose name is NULL.  The names are string literals, so
 * they are held through a pointer to const.
 */
static struct cname
{
    const char *name;           /* element name, NULL in the terminator */
    char        code;           /* character the name stands for */
}   cnames[] =
{
    {"NUL", '\0'},
    {"SOH", '\001'},
    {"STX", '\002'},
    {"ETX", '\003'},
    {"EOT", '\004'},
    {"ENQ", '\005'},
    {"ACK", '\006'},
    {"BEL", '\007'},
    {"alert", '\007'},
    {"BS", '\010'},
    {"backspace", '\b'},
    {"HT", '\011'},
    {"tab", '\t'},
    {"LF", '\012'},
    {"newline", '\n'},
    {"VT", '\013'},
    {"vertical-tab", '\v'},
    {"FF", '\014'},
    {"form-feed", '\f'},
    {"CR", '\015'},
    {"carriage-return", '\r'},
    {"SO", '\016'},
    {"SI", '\017'},
    {"DLE", '\020'},
    {"DC1", '\021'},
    {"DC2", '\022'},
    {"DC3", '\023'},
    {"DC4", '\024'},
    {"NAK", '\025'},
    {"SYN", '\026'},
    {"ETB", '\027'},
    {"CAN", '\030'},
    {"EM", '\031'},
    {"SUB", '\032'},
    {"ESC", '\033'},
    {"IS4", '\034'},
    {"FS", '\034'},
    {"IS3", '\035'},
    {"GS", '\035'},
    {"IS2", '\036'},
    {"RS", '\036'},
    {"IS1", '\037'},
    {"US", '\037'},
    {"space", ' '},
    {"exclamation-mark", '!'},
    {"quotation-mark", '"'},
    {"number-sign", '#'},
    {"dollar-sign", '$'},
    {"percent-sign", '%'},
    {"ampersand", '&'},
    {"apostrophe", '\''},
    {"left-parenthesis", '('},
    {"right-parenthesis", ')'},
    {"asterisk", '*'},
    {"plus-sign", '+'},
    {"comma", ','},
    {"hyphen", '-'},
    {"hyphen-minus", '-'},
    {"period", '.'},
    {"full-stop", '.'},
    {"slash", '/'},
    {"solidus", '/'},
    {"zero", '0'},
    {"one", '1'},
    {"two", '2'},
    {"three", '3'},
    {"four", '4'},
    {"five", '5'},
    {"six", '6'},
    {"seven", '7'},
    {"eight", '8'},
    {"nine", '9'},
    {"colon", ':'},
    {"semicolon", ';'},
    {"less-than-sign", '<'},
    {"equals-sign", '='},
    {"greater-than-sign", '>'},
    {"question-mark", '?'},
    {"commercial-at", '@'},
    {"left-square-bracket", '['},
    {"backslash", '\\'},
    {"reverse-solidus", '\\'},
    {"right-square-bracket", ']'},
    {"circumflex", '^'},
    {"circumflex-accent", '^'},
    {"underscore", '_'},
    {"low-line", '_'},
    {"grave-accent", '`'},
    {"left-brace", '{'},
    {"left-curly-bracket", '{'},
    {"vertical-line", '|'},
    {"right-brace", '}'},
    {"right-curly-bracket", '}'},
    {"tilde", '~'},
    {"DEL", '\177'},
    {NULL, 0}
};
392
393
/*
 * nmcces - number of distinct multi-character collating elements (MCCEs)
 *
 * None are defined at the moment, so the answer is always zero and the
 * context argument is not consulted.
 */
static int
nmcces(struct vars *v)
{
    (void) v;
    return 0;
}
405
/*
 * nleaders - number of chrs that can be the first chr of an MCCE
 *
 * Always zero; the context argument is not consulted.
 */
static int
nleaders(struct vars *v)
{
    (void) v;
    return 0;
}
414
/*
 * allmcces - produce a cvec holding every MCCE of the locale
 *
 * The caller-supplied cvec is run through clearcvec() and that result is
 * returned; nothing is added to it here.
 */
static struct cvec *
allmcces(struct vars *v,        /* context */
         struct cvec *cv)       /* this is supposed to have enough room */
{
    struct cvec *emptied = clearcvec(cv);

    return emptied;
}
424
425 /*
426 * element - map collating-element name to celt
427 */
428 static celt
429 element(struct vars * v, /* context */
430 chr *startp, /* points to start of name */
431 chr *endp) /* points just past end of name */
432 {
433 struct cname *cn;
434 size_t len;
435
436 /* generic: one-chr names stand for themselves */
437 assert(startp < endp);
438 len = endp - startp;
439 if (len == 1)
440 return *startp;
441
442 NOTE(REG_ULOCALE);
443
444 /* search table */
445 for (cn = cnames; cn->name != NULL; cn++)
446 {
447 if (strlen(cn->name) == len &&
448 char_and_wchar_strncmp(cn->name, startp, len) == 0)
449 {
450 break; /* NOTE BREAK OUT */
451 }
452 }
453 if (cn->name != NULL)
454 return CHR(cn->code);
455
456 /* couldn't find it */
457 ERR(REG_ECOLLATE);
458 return 0;
459 }
460
461 /*
462 * range - supply cvec for a range, including legality check
463 */
464 static struct cvec *
465 range(struct vars * v, /* context */
466 celt a, /* range start */
467 celt b, /* range end, might equal a */
468 int cases) /* case-independent? */
469 {
470 int nchrs;
471 struct cvec *cv;
472 celt c,
473 lc,
474 uc;
475
476 if (a != b && !before(a, b))
477 {
478 ERR(REG_ERANGE);
479 return NULL;
480 }
481
482 if (!cases)
483 { /* easy version */
484 cv = getcvec(v, 0, 1, 0);
485 NOERRN();
486 addrange(cv, a, b);
487 return cv;
488 }
489
490 /*
491 * When case-independent, it's hard to decide when cvec ranges are
492 * usable, so for now at least, we won't try. We allocate enough
493 * space for two case variants plus a little extra for the two title
494 * case variants.
495 */
496
497 nchrs = (b - a + 1) * 2 + 4;
498
499 cv = getcvec(v, nchrs, 0, 0);
500 NOERRN();
501
502 for (c = a; c <= b; c++)
503 {
504 addchr(cv, c);
505 lc = wx_tolower((chr) c);
506 if (c != lc)
507 addchr(cv, lc);
508 uc = wx_toupper((chr) c);
509 if (c != uc)
510 addchr(cv, uc);
511 }
512
513 return cv;
514 }
515
516 /*
517 * before - is celt x before celt y, for purposes of range legality?
518 */
519 static int /* predicate */
520 before(celt x, celt y)
521 {
522 /* trivial because no MCCEs */
523 if (x < y)
524 return 1;
525 return 0;
526 }
527
528 /*
529 * eclass - supply cvec for an equivalence class
530 * Must include case counterparts on request.
531 */
532 static struct cvec *
533 eclass(struct vars * v, /* context */
534 celt c, /* Collating element representing the
535 * equivalence class. */
536 int cases) /* all cases? */
537 {
538 struct cvec *cv;
539
540 /* crude fake equivalence class for testing */
541 if ((v->cflags & REG_FAKE) && c == 'x')
542 {
543 cv = getcvec(v, 4, 0, 0);
544 addchr(cv, (chr) 'x');
545 addchr(cv, (chr) 'y');
546 if (cases)
547 {
548 addchr(cv, (chr) 'X');
549 addchr(cv, (chr) 'Y');
550 }
551 return cv;
552 }
553
554 /* otherwise, none */
555 if (cases)
556 return allcases(v, c);
557 cv = getcvec(v, 1, 0, 0);
558 assert(cv != NULL);
559 addchr(cv, (chr) c);
560 return cv;
561 }
562
563 /*
564 * cclass - supply cvec for a character class
565 *
566 * Must include case counterparts on request.
567 */
568 static struct cvec *
569 cclass(struct vars * v, /* context */
570 chr *startp, /* where the name starts */
571 chr *endp, /* just past the end of the name */
572 int cases) /* case-independent? */
573 {
574 size_t len;
575 struct cvec *cv = NULL;
576 char **namePtr;
577 int i,
578 index;
579
580 /*
581 * The following arrays define the valid character class names.
582 */
583
584 static char *classNames[] = {
585 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
586 "lower", "print", "punct", "space", "upper", "xdigit", NULL
587 };
588
589 enum classes
590 {
591 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
592 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
593 };
594
595 /*
596 * Map the name to the corresponding enumerated value.
597 */
598 len = endp - startp;
599 index = -1;
600 for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
601 {
602 if (strlen(*namePtr) == len &&
603 char_and_wchar_strncmp(*namePtr, startp, len) == 0)
604 {
605 index = i;
606 break;
607 }
608 }
609 if (index == -1)
610 {
611 ERR(REG_ECTYPE);
612 return NULL;
613 }
614
615 /*
616 * Remap lower and upper to alpha if the match is case insensitive.
617 */
618
619 if (cases &&
620 ((enum classes) index == CC_LOWER ||
621 (enum classes) index == CC_UPPER))
622 index = (int) CC_ALPHA;
623
624 /*
625 * Now compute the character class contents.
626 *
627 * For the moment, assume that only char codes < 256 can be in these
628 * classes.
629 */
630
631 switch ((enum classes) index)
632 {
633 case CC_PRINT:
634 case CC_ALNUM:
635 cv = getcvec(v, UCHAR_MAX, 1, 0);
636 if (cv)
637 {
638 for (i = 0; i <= UCHAR_MAX; i++)
639 {
640 if (wx_isalpha((chr) i))
641 addchr(cv, (chr) i);
642 }
643 addrange(cv, (chr) '0', (chr) '9');
644 }
645 break;
646 case CC_ALPHA:
647 cv = getcvec(v, UCHAR_MAX, 0, 0);
648 if (cv)
649 {
650 for (i = 0; i <= UCHAR_MAX; i++)
651 {
652 if (wx_isalpha((chr) i))
653 addchr(cv, (chr) i);
654 }
655 }
656 break;
657 case CC_ASCII:
658 cv = getcvec(v, 0, 1, 0);
659 if (cv)
660 addrange(cv, 0, 0x7f);
661 break;
662 case CC_BLANK:
663 cv = getcvec(v, 2, 0, 0);
664 addchr(cv, '\t');
665 addchr(cv, ' ');
666 break;
667 case CC_CNTRL:
668 cv = getcvec(v, 0, 2, 0);
669 addrange(cv, 0x0, 0x1f);
670 addrange(cv, 0x7f, 0x9f);
671 break;
672 case CC_DIGIT:
673 cv = getcvec(v, 0, 1, 0);
674 if (cv)
675 addrange(cv, (chr) '0', (chr) '9');
676 break;
677 case CC_PUNCT:
678 cv = getcvec(v, UCHAR_MAX, 0, 0);
679 if (cv)
680 {
681 for (i = 0; i <= UCHAR_MAX; i++)
682 {
683 if (wx_ispunct((chr) i))
684 addchr(cv, (chr) i);
685 }
686 }
687 break;
688 case CC_XDIGIT:
689 cv = getcvec(v, 0, 3, 0);
690 if (cv)
691 {
692 addrange(cv, '0', '9');
693 addrange(cv, 'a', 'f');
694 addrange(cv, 'A', 'F');
695 }
696 break;
697 case CC_SPACE:
698 cv = getcvec(v, UCHAR_MAX, 0, 0);
699 if (cv)
700 {
701 for (i = 0; i <= UCHAR_MAX; i++)
702 {
703 if (wx_isspace((chr) i))
704 addchr(cv, (chr) i);
705 }
706 }
707 break;
708 case CC_LOWER:
709 cv = getcvec(v, UCHAR_MAX, 0, 0);
710 if (cv)
711 {
712 for (i = 0; i <= UCHAR_MAX; i++)
713 {
714 if (wx_islower((chr) i))
715 addchr(cv, (chr) i);
716 }
717 }
718 break;
719 case CC_UPPER:
720 cv = getcvec(v, UCHAR_MAX, 0, 0);
721 if (cv)
722 {
723 for (i = 0; i <= UCHAR_MAX; i++)
724 {
725 if (wx_isupper((chr) i))
726 addchr(cv, (chr) i);
727 }
728 }
729 break;
730 case CC_GRAPH:
731 cv = getcvec(v, UCHAR_MAX, 0, 0);
732 if (cv)
733 {
734 for (i = 0; i <= UCHAR_MAX; i++)
735 {
736 if (wx_isgraph((chr) i))
737 addchr(cv, (chr) i);
738 }
739 }
740 break;
741 }
742 if (cv == NULL)
743 ERR(REG_ESPACE);
744 return cv;
745 }
746
747 /*
748 * allcases - supply cvec for all case counterparts of a chr (including itself)
749 *
750 * This is a shortcut, preferably an efficient one, for simple characters;
751 * messy cases are done via range().
752 */
753 static struct cvec *
754 allcases(struct vars * v, /* context */
755 chr pc) /* character to get case equivs of */
756 {
757 struct cvec *cv;
758 chr c = (chr) pc;
759 chr lc,
760 uc;
761
762 lc = wx_tolower((chr) c);
763 uc = wx_toupper((chr) c);
764
765 cv = getcvec(v, 2, 0, 0);
766 addchr(cv, lc);
767 if (lc != uc)
768 addchr(cv, uc);
769 return cv;
770 }
771
772 /*
773 * cmp - chr-substring compare
774 *
775 * Backrefs need this. It should preferably be efficient.
776 * Note that it does not need to report anything except equal/unequal.
777 * Note also that the length is exact, and the comparison should not
778 * stop at embedded NULs!
779 */
780 static int /* 0 for equal, nonzero for unequal */
781 cmp(const chr *x, const chr *y, /* strings to compare */
782 size_t len) /* exact length of comparison */
783 {
784 return memcmp(VS(x), VS(y), len * sizeof(chr));
785 }
786
787 /*
788 * casecmp - case-independent chr-substring compare
789 *
790 * REG_ICASE backrefs need this. It should preferably be efficient.
791 * Note that it does not need to report anything except equal/unequal.
792 * Note also that the length is exact, and the comparison should not
793 * stop at embedded NULs!
794 */
795 static int /* 0 for equal, nonzero for unequal */
796 casecmp(const chr *x, const chr *y, /* strings to compare */
797 size_t len) /* exact length of comparison */
798 {
799 for (; len > 0; len--, x++, y++)
800 {
801 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
802 return 1;
803 }
804 return 0;
805 }