]> git.saurik.com Git - wxWidgets.git/blob - src/regex/regc_locale.c
updated wxMBConv docs slightly; added brief docs for UTF16/32 conversions
[wxWidgets.git] / src / regex / regc_locale.c
1 /*
2 * regc_locale.c --
3 *
4 * This file contains locale-specific regexp routines.
5 * This file is #included by regcomp.c.
6 *
7 * Copyright (c) 1998 by Scriptics Corporation.
8 *
9 * This software is copyrighted by the Regents of the University of
10 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 * Corporation and other parties. The following terms apply to all files
12 * associated with the software unless explicitly disclaimed in
13 * individual files.
14 *
15 * The authors hereby grant permission to use, copy, modify, distribute,
16 * and license this software and its documentation for any purpose, provided
17 * that existing copyright notices are retained in all copies and that this
18 * notice is included verbatim in any distributions. No written agreement,
19 * license, or royalty fee is required for any of the authorized uses.
20 * Modifications to this software may be copyrighted by their authors
21 * and need not follow the licensing terms described here, provided that
22 * the new terms are clearly indicated on the first page of each file where
23 * they apply.
24 *
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 * MODIFICATIONS.
37 *
38 * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 * U.S. government, the Government shall have only "Restricted Rights"
40 * in the software and related documentation as defined in the Federal
41 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 * are acquiring the software on behalf of the Department of Defense, the
43 * software shall be classified as "Commercial Computer Software" and the
44 * Government shall have only "Restricted Rights" as defined in Clause
45 * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 * authors grant the U.S. Government and others acting in its behalf
47 * permission to use and distribute the software in accordance with the
48 * terms specified in this license.
49 *
50 * $Header$
51 */
52
53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
54 {
55 while(*cp++ == (const char)*wp++ && --nNum){}
56
57 return nNum;
58 }
59
/*
 * ASCII character-name table
 *
 * Maps symbolic names to the single character each one denotes.  Several
 * characters have more than one name.  The list ends with a sentinel entry
 * whose name is NULL.
 */
static struct cname
{
    char *name;                 /* symbolic name, NULL in the sentinel */
    char code;                  /* the character the name stands for */
} cnames[] =
{
    /* control characters */
    {"NUL", '\0'},
    {"SOH", '\001'},
    {"STX", '\002'},
    {"ETX", '\003'},
    {"EOT", '\004'},
    {"ENQ", '\005'},
    {"ACK", '\006'},
    {"BEL", '\007'},
    {"alert", '\007'},
    {"BS", '\010'},
    {"backspace", '\b'},
    {"HT", '\011'},
    {"tab", '\t'},
    {"LF", '\012'},
    {"newline", '\n'},
    {"VT", '\013'},
    {"vertical-tab", '\v'},
    {"FF", '\014'},
    {"form-feed", '\f'},
    {"CR", '\015'},
    {"carriage-return", '\r'},
    {"SO", '\016'},
    {"SI", '\017'},
    {"DLE", '\020'},
    {"DC1", '\021'},
    {"DC2", '\022'},
    {"DC3", '\023'},
    {"DC4", '\024'},
    {"NAK", '\025'},
    {"SYN", '\026'},
    {"ETB", '\027'},
    {"CAN", '\030'},
    {"EM", '\031'},
    {"SUB", '\032'},
    {"ESC", '\033'},
    {"IS4", '\034'},
    {"FS", '\034'},
    {"IS3", '\035'},
    {"GS", '\035'},
    {"IS2", '\036'},
    {"RS", '\036'},
    {"IS1", '\037'},
    {"US", '\037'},

    /* space and punctuation below the digits */
    {"space", ' '},
    {"exclamation-mark", '!'},
    {"quotation-mark", '"'},
    {"number-sign", '#'},
    {"dollar-sign", '$'},
    {"percent-sign", '%'},
    {"ampersand", '&'},
    {"apostrophe", '\''},
    {"left-parenthesis", '('},
    {"right-parenthesis", ')'},
    {"asterisk", '*'},
    {"plus-sign", '+'},
    {"comma", ','},
    {"hyphen", '-'},
    {"hyphen-minus", '-'},
    {"period", '.'},
    {"full-stop", '.'},
    {"slash", '/'},
    {"solidus", '/'},

    /* digits */
    {"zero", '0'},
    {"one", '1'},
    {"two", '2'},
    {"three", '3'},
    {"four", '4'},
    {"five", '5'},
    {"six", '6'},
    {"seven", '7'},
    {"eight", '8'},
    {"nine", '9'},

    /* remaining punctuation */
    {"colon", ':'},
    {"semicolon", ';'},
    {"less-than-sign", '<'},
    {"equals-sign", '='},
    {"greater-than-sign", '>'},
    {"question-mark", '?'},
    {"commercial-at", '@'},
    {"left-square-bracket", '['},
    {"backslash", '\\'},
    {"reverse-solidus", '\\'},
    {"right-square-bracket", ']'},
    {"circumflex", '^'},
    {"circumflex-accent", '^'},
    {"underscore", '_'},
    {"low-line", '_'},
    {"grave-accent", '`'},
    {"left-brace", '{'},
    {"left-curly-bracket", '{'},
    {"vertical-line", '|'},
    {"right-brace", '}'},
    {"right-curly-bracket", '}'},
    {"tilde", '~'},
    {"DEL", '\177'},

    /* sentinel */
    {NULL, 0}
};
358
/*
 * ctype wrappers that guard against characters outside the unsigned char
 * range: such characters are never classified and never case-mapped
 */
362 static int
363 wx_isdigit(wx_wchar c)
364 {
365 return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
366 }
367
368 static int
369 wx_isalpha(wx_wchar c)
370 {
371 return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
372 }
373
374 static int
375 wx_isalnum(wx_wchar c)
376 {
377 return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
378 }
379
380 static int
381 wx_isupper(wx_wchar c)
382 {
383 return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
384 }
385
386 static int
387 wx_islower(wx_wchar c)
388 {
389 return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
390 }
391
392 static int
393 wx_isgraph(wx_wchar c)
394 {
395 return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
396 }
397
398 static int
399 wx_ispunct(wx_wchar c)
400 {
401 return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
402 }
403
404 static int
405 wx_isspace(wx_wchar c)
406 {
407 return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
408 }
409
410 static wx_wchar
411 wx_toupper(wx_wchar c)
412 {
413 if (c >= 0 && c <= UCHAR_MAX)
414 return toupper((unsigned char) c);
415 return c;
416 }
417
418 static wx_wchar
419 wx_tolower(wx_wchar c)
420 {
421 if (c >= 0 && c <= UCHAR_MAX)
422 return tolower((unsigned char) c);
423 return c;
424 }
425
426
/*
 * nmcces - number of distinct multi-character collating elements (MCCEs)
 *
 * None are defined at the moment, so the answer is always 0 and the
 * context argument is not consulted.
 */
static int
nmcces(struct vars * v)
{
    (void) v;                   /* unused */
    return 0;
}
438
/*
 * nleaders - number of chrs that can be the first chr of an MCCE
 *
 * Always 0; the context argument is not consulted.
 */
static int
nleaders(struct vars * v)
{
    (void) v;                   /* unused */
    return 0;
}
447
/*
 * allmcces - return a cvec with all the MCCEs of the locale
 *
 * Nothing is added to the vector: the result is simply whatever
 * clearcvec() returns for the caller-supplied cvec.
 */
static struct cvec *
allmcces(struct vars * v,       /* context (unused) */
         struct cvec * cv)      /* this is supposed to have enough room */
{
    struct cvec *cleared = clearcvec(cv);

    (void) v;
    return cleared;
}
457
458 /*
459 * element - map collating-element name to celt
460 */
461 static celt
462 element(struct vars * v, /* context */
463 chr *startp, /* points to start of name */
464 chr *endp) /* points just past end of name */
465 {
466 struct cname *cn;
467 size_t len;
468
469 /* generic: one-chr names stand for themselves */
470 assert(startp < endp);
471 len = endp - startp;
472 if (len == 1)
473 return *startp;
474
475 NOTE(REG_ULOCALE);
476
477 /* search table */
478 for (cn = cnames; cn->name != NULL; cn++)
479 {
480 if (strlen(cn->name) == len &&
481 char_and_wchar_strncmp(cn->name, startp, len) == 0)
482 {
483 break; /* NOTE BREAK OUT */
484 }
485 }
486 if (cn->name != NULL)
487 return CHR(cn->code);
488
489 /* couldn't find it */
490 ERR(REG_ECOLLATE);
491 return 0;
492 }
493
494 /*
495 * range - supply cvec for a range, including legality check
496 */
497 static struct cvec *
498 range(struct vars * v, /* context */
499 celt a, /* range start */
500 celt b, /* range end, might equal a */
501 int cases) /* case-independent? */
502 {
503 int nchrs;
504 struct cvec *cv;
505 celt c,
506 lc,
507 uc;
508
509 if (a != b && !before(a, b))
510 {
511 ERR(REG_ERANGE);
512 return NULL;
513 }
514
515 if (!cases)
516 { /* easy version */
517 cv = getcvec(v, 0, 1, 0);
518 NOERRN();
519 addrange(cv, a, b);
520 return cv;
521 }
522
523 /*
524 * When case-independent, it's hard to decide when cvec ranges are
525 * usable, so for now at least, we won't try. We allocate enough
526 * space for two case variants plus a little extra for the two title
527 * case variants.
528 */
529
530 nchrs = (b - a + 1) * 2 + 4;
531
532 cv = getcvec(v, nchrs, 0, 0);
533 NOERRN();
534
535 for (c = a; c <= b; c++)
536 {
537 addchr(cv, c);
538 lc = wx_tolower((chr) c);
539 if (c != lc)
540 addchr(cv, lc);
541 uc = wx_toupper((chr) c);
542 if (c != uc)
543 addchr(cv, uc);
544 }
545
546 return cv;
547 }
548
549 /*
550 * before - is celt x before celt y, for purposes of range legality?
551 */
552 static int /* predicate */
553 before(celt x, celt y)
554 {
555 /* trivial because no MCCEs */
556 if (x < y)
557 return 1;
558 return 0;
559 }
560
561 /*
562 * eclass - supply cvec for an equivalence class
563 * Must include case counterparts on request.
564 */
565 static struct cvec *
566 eclass(struct vars * v, /* context */
567 celt c, /* Collating element representing the
568 * equivalence class. */
569 int cases) /* all cases? */
570 {
571 struct cvec *cv;
572
573 /* crude fake equivalence class for testing */
574 if ((v->cflags & REG_FAKE) && c == 'x')
575 {
576 cv = getcvec(v, 4, 0, 0);
577 addchr(cv, (chr) 'x');
578 addchr(cv, (chr) 'y');
579 if (cases)
580 {
581 addchr(cv, (chr) 'X');
582 addchr(cv, (chr) 'Y');
583 }
584 return cv;
585 }
586
587 /* otherwise, none */
588 if (cases)
589 return allcases(v, c);
590 cv = getcvec(v, 1, 0, 0);
591 assert(cv != NULL);
592 addchr(cv, (chr) c);
593 return cv;
594 }
595
596 /*
597 * cclass - supply cvec for a character class
598 *
599 * Must include case counterparts on request.
600 */
601 static struct cvec *
602 cclass(struct vars * v, /* context */
603 chr *startp, /* where the name starts */
604 chr *endp, /* just past the end of the name */
605 int cases) /* case-independent? */
606 {
607 size_t len;
608 struct cvec *cv = NULL;
609 char **namePtr;
610 int i,
611 index;
612
613 /*
614 * The following arrays define the valid character class names.
615 */
616
617 static char *classNames[] = {
618 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
619 "lower", "print", "punct", "space", "upper", "xdigit", NULL
620 };
621
622 enum classes
623 {
624 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
625 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
626 };
627
628 /*
629 * Map the name to the corresponding enumerated value.
630 */
631 len = endp - startp;
632 index = -1;
633 for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
634 {
635 if (strlen(*namePtr) == len &&
636 char_and_wchar_strncmp(*namePtr, startp, len) == 0)
637 {
638 index = i;
639 break;
640 }
641 }
642 if (index == -1)
643 {
644 ERR(REG_ECTYPE);
645 return NULL;
646 }
647
648 /*
649 * Remap lower and upper to alpha if the match is case insensitive.
650 */
651
652 if (cases &&
653 ((enum classes) index == CC_LOWER ||
654 (enum classes) index == CC_UPPER))
655 index = (int) CC_ALPHA;
656
657 /*
658 * Now compute the character class contents.
659 *
660 * For the moment, assume that only char codes < 256 can be in these
661 * classes.
662 */
663
664 switch ((enum classes) index)
665 {
666 case CC_PRINT:
667 case CC_ALNUM:
668 cv = getcvec(v, UCHAR_MAX, 1, 0);
669 if (cv)
670 {
671 for (i = 0; i <= UCHAR_MAX; i++)
672 {
673 if (wx_isalpha((chr) i))
674 addchr(cv, (chr) i);
675 }
676 addrange(cv, (chr) '0', (chr) '9');
677 }
678 break;
679 case CC_ALPHA:
680 cv = getcvec(v, UCHAR_MAX, 0, 0);
681 if (cv)
682 {
683 for (i = 0; i <= UCHAR_MAX; i++)
684 {
685 if (wx_isalpha((chr) i))
686 addchr(cv, (chr) i);
687 }
688 }
689 break;
690 case CC_ASCII:
691 cv = getcvec(v, 0, 1, 0);
692 if (cv)
693 addrange(cv, 0, 0x7f);
694 break;
695 case CC_BLANK:
696 cv = getcvec(v, 2, 0, 0);
697 addchr(cv, '\t');
698 addchr(cv, ' ');
699 break;
700 case CC_CNTRL:
701 cv = getcvec(v, 0, 2, 0);
702 addrange(cv, 0x0, 0x1f);
703 addrange(cv, 0x7f, 0x9f);
704 break;
705 case CC_DIGIT:
706 cv = getcvec(v, 0, 1, 0);
707 if (cv)
708 addrange(cv, (chr) '0', (chr) '9');
709 break;
710 case CC_PUNCT:
711 cv = getcvec(v, UCHAR_MAX, 0, 0);
712 if (cv)
713 {
714 for (i = 0; i <= UCHAR_MAX; i++)
715 {
716 if (wx_ispunct((chr) i))
717 addchr(cv, (chr) i);
718 }
719 }
720 break;
721 case CC_XDIGIT:
722 cv = getcvec(v, 0, 3, 0);
723 if (cv)
724 {
725 addrange(cv, '0', '9');
726 addrange(cv, 'a', 'f');
727 addrange(cv, 'A', 'F');
728 }
729 break;
730 case CC_SPACE:
731 cv = getcvec(v, UCHAR_MAX, 0, 0);
732 if (cv)
733 {
734 for (i = 0; i <= UCHAR_MAX; i++)
735 {
736 if (wx_isspace((chr) i))
737 addchr(cv, (chr) i);
738 }
739 }
740 break;
741 case CC_LOWER:
742 cv = getcvec(v, UCHAR_MAX, 0, 0);
743 if (cv)
744 {
745 for (i = 0; i <= UCHAR_MAX; i++)
746 {
747 if (wx_islower((chr) i))
748 addchr(cv, (chr) i);
749 }
750 }
751 break;
752 case CC_UPPER:
753 cv = getcvec(v, UCHAR_MAX, 0, 0);
754 if (cv)
755 {
756 for (i = 0; i <= UCHAR_MAX; i++)
757 {
758 if (wx_isupper((chr) i))
759 addchr(cv, (chr) i);
760 }
761 }
762 break;
763 case CC_GRAPH:
764 cv = getcvec(v, UCHAR_MAX, 0, 0);
765 if (cv)
766 {
767 for (i = 0; i <= UCHAR_MAX; i++)
768 {
769 if (wx_isgraph((chr) i))
770 addchr(cv, (chr) i);
771 }
772 }
773 break;
774 }
775 if (cv == NULL)
776 ERR(REG_ESPACE);
777 return cv;
778 }
779
780 /*
781 * allcases - supply cvec for all case counterparts of a chr (including itself)
782 *
783 * This is a shortcut, preferably an efficient one, for simple characters;
784 * messy cases are done via range().
785 */
786 static struct cvec *
787 allcases(struct vars * v, /* context */
788 chr pc) /* character to get case equivs of */
789 {
790 struct cvec *cv;
791 chr c = (chr) pc;
792 chr lc,
793 uc;
794
795 lc = wx_tolower((chr) c);
796 uc = wx_toupper((chr) c);
797
798 cv = getcvec(v, 2, 0, 0);
799 addchr(cv, lc);
800 if (lc != uc)
801 addchr(cv, uc);
802 return cv;
803 }
804
805 /*
806 * cmp - chr-substring compare
807 *
808 * Backrefs need this. It should preferably be efficient.
809 * Note that it does not need to report anything except equal/unequal.
810 * Note also that the length is exact, and the comparison should not
811 * stop at embedded NULs!
812 */
813 static int /* 0 for equal, nonzero for unequal */
814 cmp(const chr *x, const chr *y, /* strings to compare */
815 size_t len) /* exact length of comparison */
816 {
817 return memcmp(VS(x), VS(y), len * sizeof(chr));
818 }
819
820 /*
821 * casecmp - case-independent chr-substring compare
822 *
823 * REG_ICASE backrefs need this. It should preferably be efficient.
824 * Note that it does not need to report anything except equal/unequal.
825 * Note also that the length is exact, and the comparison should not
826 * stop at embedded NULs!
827 */
828 static int /* 0 for equal, nonzero for unequal */
829 casecmp(const chr *x, const chr *y, /* strings to compare */
830 size_t len) /* exact length of comparison */
831 {
832 for (; len > 0; len--, x++, y++)
833 {
834 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
835 return 1;
836 }
837 return 0;
838 }