]> git.saurik.com Git - wxWidgets.git/blob - src/regex/regc_locale.c
unicode savvy conversions
[wxWidgets.git] / src / regex / regc_locale.c
1 /*
2 * regc_locale.c --
3 *
4 * This file contains locale-specific regexp routines.
5 * This file is #included by regcomp.c.
6 *
7 * Copyright (c) 1998 by Scriptics Corporation.
8 *
9 * This software is copyrighted by the Regents of the University of
10 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 * Corporation and other parties. The following terms apply to all files
12 * associated with the software unless explicitly disclaimed in
13 * individual files.
14 *
15 * The authors hereby grant permission to use, copy, modify, distribute,
16 * and license this software and its documentation for any purpose, provided
17 * that existing copyright notices are retained in all copies and that this
18 * notice is included verbatim in any distributions. No written agreement,
19 * license, or royalty fee is required for any of the authorized uses.
20 * Modifications to this software may be copyrighted by their authors
21 * and need not follow the licensing terms described here, provided that
22 * the new terms are clearly indicated on the first page of each file where
23 * they apply.
24 *
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 * MODIFICATIONS.
37 *
38 * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 * U.S. government, the Government shall have only "Restricted Rights"
40 * in the software and related documentation as defined in the Federal
41 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 * are acquiring the software on behalf of the Department of Defense, the
43 * software shall be classified as "Commercial Computer Software" and the
44 * Government shall have only "Restricted Rights" as defined in Clause
45 * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 * authors grant the U.S. Government and others acting in its behalf
47 * permission to use and distribute the software in accordance with the
48 * terms specified in this license.
49 *
50 * $Header$
51 */
52
53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
54 {
55 while(*cp++ == (const char)*wp++ && --nNum){}
56 return nNum;
57 }
58
59 int wx_isdigit(wx_wchar c) {return wxIsdigit(c);}
60 int wx_isalpha(wx_wchar c) {return wxIsalpha(c);}
61 int wx_isalnum(wx_wchar c) {return wxIsalnum(c);}
62 int wx_isupper(wx_wchar c) {return wxIsupper(c);}
63 int wx_islower(wx_wchar c) {return wxIslower(c);}
64 int wx_isgraph(wx_wchar c) {return wxIsgraph(c);}
65 int wx_ispunct(wx_wchar c) {return wxIspunct(c);}
66 int wx_isspace(wx_wchar c) {return wxIsspace(c);}
67
68 wx_wchar wx_toupper(wx_wchar c)
69 {
70 return wxToupper(c);
71 }
72
73 wx_wchar wx_tolower(wx_wchar c)
74 {
75 return wxTolower(c);
76 }
77
78 int wx_strlen(const wx_wchar* szString)
79 {
80 /*
81 Generic -- note that some clib functions also test for eol character '^Z'
82
83 int nLength = 0;
84 for (; *(szString + nLength) != '\0'; nLength++);
85 return nLength;
86 */
87 return szString == NULL ? 0 : wxStrlen_(szString);
88 }
/*
 * ASCII character-name table.
 *
 * Pairs each symbolic collating-element name with its character code.
 * Some codes are listed under more than one name.  The entry whose name
 * is NULL marks the end of the table.
 */
static struct cname
{
    char *name;
    char code;
} cnames[] =
{
    {"NUL", '\0'},
    {"SOH", '\001'},
    {"STX", '\002'},
    {"ETX", '\003'},
    {"EOT", '\004'},
    {"ENQ", '\005'},
    {"ACK", '\006'},
    {"BEL", '\007'},
    {"alert", '\007'},
    {"BS", '\010'},
    {"backspace", '\b'},
    {"HT", '\011'},
    {"tab", '\t'},
    {"LF", '\012'},
    {"newline", '\n'},
    {"VT", '\013'},
    {"vertical-tab", '\v'},
    {"FF", '\014'},
    {"form-feed", '\f'},
    {"CR", '\015'},
    {"carriage-return", '\r'},
    {"SO", '\016'},
    {"SI", '\017'},
    {"DLE", '\020'},
    {"DC1", '\021'},
    {"DC2", '\022'},
    {"DC3", '\023'},
    {"DC4", '\024'},
    {"NAK", '\025'},
    {"SYN", '\026'},
    {"ETB", '\027'},
    {"CAN", '\030'},
    {"EM", '\031'},
    {"SUB", '\032'},
    {"ESC", '\033'},
    {"IS4", '\034'},
    {"FS", '\034'},
    {"IS3", '\035'},
    {"GS", '\035'},
    {"IS2", '\036'},
    {"RS", '\036'},
    {"IS1", '\037'},
    {"US", '\037'},
    {"space", ' '},
    {"exclamation-mark", '!'},
    {"quotation-mark", '"'},
    {"number-sign", '#'},
    {"dollar-sign", '$'},
    {"percent-sign", '%'},
    {"ampersand", '&'},
    {"apostrophe", '\''},
    {"left-parenthesis", '('},
    {"right-parenthesis", ')'},
    {"asterisk", '*'},
    {"plus-sign", '+'},
    {"comma", ','},
    {"hyphen", '-'},
    {"hyphen-minus", '-'},
    {"period", '.'},
    {"full-stop", '.'},
    {"slash", '/'},
    {"solidus", '/'},
    {"zero", '0'},
    {"one", '1'},
    {"two", '2'},
    {"three", '3'},
    {"four", '4'},
    {"five", '5'},
    {"six", '6'},
    {"seven", '7'},
    {"eight", '8'},
    {"nine", '9'},
    {"colon", ':'},
    {"semicolon", ';'},
    {"less-than-sign", '<'},
    {"equals-sign", '='},
    {"greater-than-sign", '>'},
    {"question-mark", '?'},
    {"commercial-at", '@'},
    {"left-square-bracket", '['},
    {"backslash", '\\'},
    {"reverse-solidus", '\\'},
    {"right-square-bracket", ']'},
    {"circumflex", '^'},
    {"circumflex-accent", '^'},
    {"underscore", '_'},
    {"low-line", '_'},
    {"grave-accent", '`'},
    {"left-brace", '{'},
    {"left-curly-bracket", '{'},
    {"vertical-line", '|'},
    {"right-brace", '}'},
    {"right-curly-bracket", '}'},
    {"tilde", '~'},
    {"DEL", '\177'},
    {NULL, 0}
};
387
388
/*
 * nmcces - report how many distinct MCCEs (multi-character collating
 * elements) exist
 *
 * None are defined at the moment, so the result is always zero and the
 * context argument is not consulted.
 */
static int
nmcces(struct vars * v)
{
    (void) v;
    return 0;
}
400
/*
 * nleaders - report how many chrs can be the first chr of an MCCE
 *
 * Always zero here; the context argument is not consulted.
 */
static int
nleaders(struct vars * v)
{
    (void) v;
    return 0;
}
409
/*
 * allmcces - return a cvec holding all the MCCEs of the locale
 *
 * The caller-supplied cvec (which is supposed to have enough room) is
 * passed to clearcvec() and that call's result is returned; the context
 * argument is not consulted.
 */
static struct cvec *
allmcces(struct vars * v,       /* context */
         struct cvec * cv)      /* destination, supposed to be big enough */
{
    struct cvec *emptied = clearcvec(cv);

    (void) v;
    return emptied;
}
419
420 /*
421 * element - map collating-element name to celt
422 */
423 static celt
424 element(struct vars * v, /* context */
425 chr *startp, /* points to start of name */
426 chr *endp) /* points just past end of name */
427 {
428 struct cname *cn;
429 size_t len;
430
431 /* generic: one-chr names stand for themselves */
432 assert(startp < endp);
433 len = endp - startp;
434 if (len == 1)
435 return *startp;
436
437 NOTE(REG_ULOCALE);
438
439 /* search table */
440 for (cn = cnames; cn->name != NULL; cn++)
441 {
442 if (strlen(cn->name) == len &&
443 char_and_wchar_strncmp(cn->name, startp, len) == 0)
444 {
445 break; /* NOTE BREAK OUT */
446 }
447 }
448 if (cn->name != NULL)
449 return CHR(cn->code);
450
451 /* couldn't find it */
452 ERR(REG_ECOLLATE);
453 return 0;
454 }
455
456 /*
457 * range - supply cvec for a range, including legality check
458 */
459 static struct cvec *
460 range(struct vars * v, /* context */
461 celt a, /* range start */
462 celt b, /* range end, might equal a */
463 int cases) /* case-independent? */
464 {
465 int nchrs;
466 struct cvec *cv;
467 celt c,
468 lc,
469 uc;
470
471 if (a != b && !before(a, b))
472 {
473 ERR(REG_ERANGE);
474 return NULL;
475 }
476
477 if (!cases)
478 { /* easy version */
479 cv = getcvec(v, 0, 1, 0);
480 NOERRN();
481 addrange(cv, a, b);
482 return cv;
483 }
484
485 /*
486 * When case-independent, it's hard to decide when cvec ranges are
487 * usable, so for now at least, we won't try. We allocate enough
488 * space for two case variants plus a little extra for the two title
489 * case variants.
490 */
491
492 nchrs = (b - a + 1) * 2 + 4;
493
494 cv = getcvec(v, nchrs, 0, 0);
495 NOERRN();
496
497 for (c = a; c <= b; c++)
498 {
499 addchr(cv, c);
500 lc = wx_tolower((chr) c);
501 if (c != lc)
502 addchr(cv, lc);
503 uc = wx_toupper((chr) c);
504 if (c != uc)
505 addchr(cv, uc);
506 }
507
508 return cv;
509 }
510
511 /*
512 * before - is celt x before celt y, for purposes of range legality?
513 */
514 static int /* predicate */
515 before(celt x, celt y)
516 {
517 /* trivial because no MCCEs */
518 if (x < y)
519 return 1;
520 return 0;
521 }
522
523 /*
524 * eclass - supply cvec for an equivalence class
525 * Must include case counterparts on request.
526 */
527 static struct cvec *
528 eclass(struct vars * v, /* context */
529 celt c, /* Collating element representing the
530 * equivalence class. */
531 int cases) /* all cases? */
532 {
533 struct cvec *cv;
534
535 /* crude fake equivalence class for testing */
536 if ((v->cflags & REG_FAKE) && c == 'x')
537 {
538 cv = getcvec(v, 4, 0, 0);
539 addchr(cv, (chr) 'x');
540 addchr(cv, (chr) 'y');
541 if (cases)
542 {
543 addchr(cv, (chr) 'X');
544 addchr(cv, (chr) 'Y');
545 }
546 return cv;
547 }
548
549 /* otherwise, none */
550 if (cases)
551 return allcases(v, c);
552 cv = getcvec(v, 1, 0, 0);
553 assert(cv != NULL);
554 addchr(cv, (chr) c);
555 return cv;
556 }
557
558 /*
559 * cclass - supply cvec for a character class
560 *
561 * Must include case counterparts on request.
562 */
563 static struct cvec *
564 cclass(struct vars * v, /* context */
565 chr *startp, /* where the name starts */
566 chr *endp, /* just past the end of the name */
567 int cases) /* case-independent? */
568 {
569 size_t len;
570 struct cvec *cv = NULL;
571 char **namePtr;
572 int i,
573 index;
574
575 /*
576 * The following arrays define the valid character class names.
577 */
578
579 static char *classNames[] = {
580 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
581 "lower", "print", "punct", "space", "upper", "xdigit", NULL
582 };
583
584 enum classes
585 {
586 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
587 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
588 };
589
590 /*
591 * Map the name to the corresponding enumerated value.
592 */
593 len = endp - startp;
594 index = -1;
595 for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
596 {
597 if (strlen(*namePtr) == len &&
598 char_and_wchar_strncmp(*namePtr, startp, len) == 0)
599 {
600 index = i;
601 break;
602 }
603 }
604 if (index == -1)
605 {
606 ERR(REG_ECTYPE);
607 return NULL;
608 }
609
610 /*
611 * Remap lower and upper to alpha if the match is case insensitive.
612 */
613
614 if (cases &&
615 ((enum classes) index == CC_LOWER ||
616 (enum classes) index == CC_UPPER))
617 index = (int) CC_ALPHA;
618
619 /*
620 * Now compute the character class contents.
621 *
622 * For the moment, assume that only char codes < 256 can be in these
623 * classes.
624 */
625
626 switch ((enum classes) index)
627 {
628 case CC_PRINT:
629 case CC_ALNUM:
630 cv = getcvec(v, UCHAR_MAX, 1, 0);
631 if (cv)
632 {
633 for (i = 0; i <= UCHAR_MAX; i++)
634 {
635 if (wx_isalpha((chr) i))
636 addchr(cv, (chr) i);
637 }
638 addrange(cv, (chr) '0', (chr) '9');
639 }
640 break;
641 case CC_ALPHA:
642 cv = getcvec(v, UCHAR_MAX, 0, 0);
643 if (cv)
644 {
645 for (i = 0; i <= UCHAR_MAX; i++)
646 {
647 if (wx_isalpha((chr) i))
648 addchr(cv, (chr) i);
649 }
650 }
651 break;
652 case CC_ASCII:
653 cv = getcvec(v, 0, 1, 0);
654 if (cv)
655 addrange(cv, 0, 0x7f);
656 break;
657 case CC_BLANK:
658 cv = getcvec(v, 2, 0, 0);
659 addchr(cv, '\t');
660 addchr(cv, ' ');
661 break;
662 case CC_CNTRL:
663 cv = getcvec(v, 0, 2, 0);
664 addrange(cv, 0x0, 0x1f);
665 addrange(cv, 0x7f, 0x9f);
666 break;
667 case CC_DIGIT:
668 cv = getcvec(v, 0, 1, 0);
669 if (cv)
670 addrange(cv, (chr) '0', (chr) '9');
671 break;
672 case CC_PUNCT:
673 cv = getcvec(v, UCHAR_MAX, 0, 0);
674 if (cv)
675 {
676 for (i = 0; i <= UCHAR_MAX; i++)
677 {
678 if (wx_ispunct((chr) i))
679 addchr(cv, (chr) i);
680 }
681 }
682 break;
683 case CC_XDIGIT:
684 cv = getcvec(v, 0, 3, 0);
685 if (cv)
686 {
687 addrange(cv, '0', '9');
688 addrange(cv, 'a', 'f');
689 addrange(cv, 'A', 'F');
690 }
691 break;
692 case CC_SPACE:
693 cv = getcvec(v, UCHAR_MAX, 0, 0);
694 if (cv)
695 {
696 for (i = 0; i <= UCHAR_MAX; i++)
697 {
698 if (wx_isspace((chr) i))
699 addchr(cv, (chr) i);
700 }
701 }
702 break;
703 case CC_LOWER:
704 cv = getcvec(v, UCHAR_MAX, 0, 0);
705 if (cv)
706 {
707 for (i = 0; i <= UCHAR_MAX; i++)
708 {
709 if (wx_islower((chr) i))
710 addchr(cv, (chr) i);
711 }
712 }
713 break;
714 case CC_UPPER:
715 cv = getcvec(v, UCHAR_MAX, 0, 0);
716 if (cv)
717 {
718 for (i = 0; i <= UCHAR_MAX; i++)
719 {
720 if (wx_isupper((chr) i))
721 addchr(cv, (chr) i);
722 }
723 }
724 break;
725 case CC_GRAPH:
726 cv = getcvec(v, UCHAR_MAX, 0, 0);
727 if (cv)
728 {
729 for (i = 0; i <= UCHAR_MAX; i++)
730 {
731 if (wx_isgraph((chr) i))
732 addchr(cv, (chr) i);
733 }
734 }
735 break;
736 }
737 if (cv == NULL)
738 ERR(REG_ESPACE);
739 return cv;
740 }
741
742 /*
743 * allcases - supply cvec for all case counterparts of a chr (including itself)
744 *
745 * This is a shortcut, preferably an efficient one, for simple characters;
746 * messy cases are done via range().
747 */
748 static struct cvec *
749 allcases(struct vars * v, /* context */
750 chr pc) /* character to get case equivs of */
751 {
752 struct cvec *cv;
753 chr c = (chr) pc;
754 chr lc,
755 uc;
756
757 lc = wx_tolower((chr) c);
758 uc = wx_toupper((chr) c);
759
760 cv = getcvec(v, 2, 0, 0);
761 addchr(cv, lc);
762 if (lc != uc)
763 addchr(cv, uc);
764 return cv;
765 }
766
767 /*
768 * cmp - chr-substring compare
769 *
770 * Backrefs need this. It should preferably be efficient.
771 * Note that it does not need to report anything except equal/unequal.
772 * Note also that the length is exact, and the comparison should not
773 * stop at embedded NULs!
774 */
775 static int /* 0 for equal, nonzero for unequal */
776 cmp(const chr *x, const chr *y, /* strings to compare */
777 size_t len) /* exact length of comparison */
778 {
779 return memcmp(VS(x), VS(y), len * sizeof(chr));
780 }
781
782 /*
783 * casecmp - case-independent chr-substring compare
784 *
785 * REG_ICASE backrefs need this. It should preferably be efficient.
786 * Note that it does not need to report anything except equal/unequal.
787 * Note also that the length is exact, and the comparison should not
788 * stop at embedded NULs!
789 */
790 static int /* 0 for equal, nonzero for unequal */
791 casecmp(const chr *x, const chr *y, /* strings to compare */
792 size_t len) /* exact length of comparison */
793 {
794 for (; len > 0; len--, x++, y++)
795 {
796 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
797 return 1;
798 }
799 return 0;
800 }