]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regc_locale.c
4 * This file contains locale-specific regexp routines.
5 * This file is #included by regcomp.c.
7 * Copyright (c) 1998 by Scriptics Corporation.
9 * This software is copyrighted by the Regents of the University of
10 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 * Corporation and other parties. The following terms apply to all files
12 * associated with the software unless explicitly disclaimed in
15 * The authors hereby grant permission to use, copy, modify, distribute,
16 * and license this software and its documentation for any purpose, provided
17 * that existing copyright notices are retained in all copies and that this
18 * notice is included verbatim in any distributions. No written agreement,
19 * license, or royalty fee is required for any of the authorized uses.
20 * Modifications to this software may be copyrighted by their authors
21 * and need not follow the licensing terms described here, provided that
22 * the new terms are clearly indicated on the first page of each file where
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
38 * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 * U.S. government, the Government shall have only "Restricted Rights"
40 * in the software and related documentation as defined in the Federal
41 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 * are acquiring the software on behalf of the Department of Defense, the
43 * software shall be classified as "Commercial Computer Software" and the
44 * Government shall have only "Restricted Rights" as defined in Clause
45 * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 * authors grant the U.S. Government and others acting in its behalf
47 * permission to use and distribute the software in accordance with the
48 * terms specified in this license.
53 int char_and_wchar_strncmp (const char* cp
, const wx_wchar
* wp
, size_t nNum
)
55 while(*cp
++ == (const char)*wp
++ && --nNum
){}
59 int wx_isdigit(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIsdigit((unsigned char) c
));}
60 int wx_isalpha(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIsalpha((unsigned char) c
));}
61 int wx_isalnum(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIsalnum((unsigned char) c
));}
62 int wx_isupper(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIsupper((unsigned char) c
));}
63 int wx_islower(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIslower((unsigned char) c
));}
64 int wx_isgraph(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIsgraph((unsigned char) c
));}
65 int wx_ispunct(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIspunct((unsigned char) c
));}
66 int wx_isspace(wx_wchar c
) {return (c
>= 0 && c
<= UCHAR_MAX
&& wxIsspace((unsigned char) c
));}
68 wx_wchar
wx_toupper(wx_wchar c
)
70 if (c
>= 0 && c
<= UCHAR_MAX
)
71 return wxToupper((unsigned char) c
);
76 wx_wchar
wx_tolower(wx_wchar c
)
78 if (c
>= 0 && c
<= UCHAR_MAX
)
79 return wxTolower((unsigned char) c
);
83 int wx_strlen(const wx_wchar
* szString
)
86 Generic -- note that some clib functions also test for eol character '^Z'
89 for (; *(szString + nLength) != '\0'; nLength++);
92 return szString
== NULL
? 0 : wxStrlen_(szString
);
94 /* ASCII character-name table */
164 "carriage-return", '\r'
236 "exclamation-mark", '!'
239 "quotation-mark", '"'
257 "left-parenthesis", '('
260 "right-parenthesis", ')'
326 "less-than-sign", '<'
332 "greater-than-sign", '>'
341 "left-square-bracket", '['
347 "reverse-solidus", '\\'
350 "right-square-bracket", ']'
356 "circumflex-accent", '^'
371 "left-curly-bracket", '{'
380 "right-curly-bracket", '}'
395 * nmcces - how many distinct MCCEs are there?
398 nmcces(struct vars
* v
)
401 * No multi-character collating elements defined at the moment.
407 * nleaders - how many chrs can be first chrs of MCCEs?
410 nleaders(struct vars
* v
)
416 * allmcces - return a cvec with all the MCCEs of the locale
419 allmcces(struct vars
* v
, /* context */
420 struct cvec
* cv
) /* this is supposed to have enough room */
422 return clearcvec(cv
);
426 * element - map collating-element name to celt
429 element(struct vars
* v
, /* context */
430 chr
*startp
, /* points to start of name */
431 chr
*endp
) /* points just past end of name */
436 /* generic: one-chr names stand for themselves */
437 assert(startp
< endp
);
445 for (cn
= cnames
; cn
->name
!= NULL
; cn
++)
447 if (strlen(cn
->name
) == len
&&
448 char_and_wchar_strncmp(cn
->name
, startp
, len
) == 0)
450 break; /* NOTE BREAK OUT */
453 if (cn
->name
!= NULL
)
454 return CHR(cn
->code
);
456 /* couldn't find it */
462 * range - supply cvec for a range, including legality check
465 range(struct vars
* v
, /* context */
466 celt a
, /* range start */
467 celt b
, /* range end, might equal a */
468 int cases
) /* case-independent? */
476 if (a
!= b
&& !before(a
, b
))
484 cv
= getcvec(v
, 0, 1, 0);
491 * When case-independent, it's hard to decide when cvec ranges are
492 * usable, so for now at least, we won't try. We allocate enough
493 * space for two case variants plus a little extra for the two title
497 nchrs
= (b
- a
+ 1) * 2 + 4;
499 cv
= getcvec(v
, nchrs
, 0, 0);
502 for (c
= a
; c
<= b
; c
++)
505 lc
= wx_tolower((chr
) c
);
508 uc
= wx_toupper((chr
) c
);
517 * before - is celt x before celt y, for purposes of range legality?
519 static int /* predicate */
520 before(celt x
, celt y
)
522 /* trivial because no MCCEs */
529 * eclass - supply cvec for an equivalence class
530 * Must include case counterparts on request.
533 eclass(struct vars
* v
, /* context */
534 celt c
, /* Collating element representing the
535 * equivalence class. */
536 int cases
) /* all cases? */
540 /* crude fake equivalence class for testing */
541 if ((v
->cflags
& REG_FAKE
) && c
== 'x')
543 cv
= getcvec(v
, 4, 0, 0);
544 addchr(cv
, (chr
) 'x');
545 addchr(cv
, (chr
) 'y');
548 addchr(cv
, (chr
) 'X');
549 addchr(cv
, (chr
) 'Y');
554 /* otherwise, none */
556 return allcases(v
, c
);
557 cv
= getcvec(v
, 1, 0, 0);
564 * cclass - supply cvec for a character class
566 * Must include case counterparts on request.
569 cclass(struct vars
* v
, /* context */
570 chr
*startp
, /* where the name starts */
571 chr
*endp
, /* just past the end of the name */
572 int cases
) /* case-independent? */
575 struct cvec
*cv
= NULL
;
581 * The following arrays define the valid character class names.
584 static char *classNames
[] = {
585 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
586 "lower", "print", "punct", "space", "upper", "xdigit", NULL
591 CC_ALNUM
, CC_ALPHA
, CC_ASCII
, CC_BLANK
, CC_CNTRL
, CC_DIGIT
, CC_GRAPH
,
592 CC_LOWER
, CC_PRINT
, CC_PUNCT
, CC_SPACE
, CC_UPPER
, CC_XDIGIT
596 * Map the name to the corresponding enumerated value.
600 for (namePtr
= classNames
, i
= 0; *namePtr
!= NULL
; namePtr
++, i
++)
602 if (strlen(*namePtr
) == len
&&
603 char_and_wchar_strncmp(*namePtr
, startp
, len
) == 0)
616 * Remap lower and upper to alpha if the match is case insensitive.
620 ((enum classes
) index
== CC_LOWER
||
621 (enum classes
) index
== CC_UPPER
))
622 index
= (int) CC_ALPHA
;
625 * Now compute the character class contents.
627 * For the moment, assume that only char codes < 256 can be in these
631 switch ((enum classes
) index
)
635 cv
= getcvec(v
, UCHAR_MAX
, 1, 0);
638 for (i
= 0; i
<= UCHAR_MAX
; i
++)
640 if (wx_isalpha((chr
) i
))
643 addrange(cv
, (chr
) '0', (chr
) '9');
647 cv
= getcvec(v
, UCHAR_MAX
, 0, 0);
650 for (i
= 0; i
<= UCHAR_MAX
; i
++)
652 if (wx_isalpha((chr
) i
))
658 cv
= getcvec(v
, 0, 1, 0);
660 addrange(cv
, 0, 0x7f);
663 cv
= getcvec(v
, 2, 0, 0);
668 cv
= getcvec(v
, 0, 2, 0);
669 addrange(cv
, 0x0, 0x1f);
670 addrange(cv
, 0x7f, 0x9f);
673 cv
= getcvec(v
, 0, 1, 0);
675 addrange(cv
, (chr
) '0', (chr
) '9');
678 cv
= getcvec(v
, UCHAR_MAX
, 0, 0);
681 for (i
= 0; i
<= UCHAR_MAX
; i
++)
683 if (wx_ispunct((chr
) i
))
689 cv
= getcvec(v
, 0, 3, 0);
692 addrange(cv
, '0', '9');
693 addrange(cv
, 'a', 'f');
694 addrange(cv
, 'A', 'F');
698 cv
= getcvec(v
, UCHAR_MAX
, 0, 0);
701 for (i
= 0; i
<= UCHAR_MAX
; i
++)
703 if (wx_isspace((chr
) i
))
709 cv
= getcvec(v
, UCHAR_MAX
, 0, 0);
712 for (i
= 0; i
<= UCHAR_MAX
; i
++)
714 if (wx_islower((chr
) i
))
720 cv
= getcvec(v
, UCHAR_MAX
, 0, 0);
723 for (i
= 0; i
<= UCHAR_MAX
; i
++)
725 if (wx_isupper((chr
) i
))
731 cv
= getcvec(v
, UCHAR_MAX
, 0, 0);
734 for (i
= 0; i
<= UCHAR_MAX
; i
++)
736 if (wx_isgraph((chr
) i
))
748 * allcases - supply cvec for all case counterparts of a chr (including itself)
750 * This is a shortcut, preferably an efficient one, for simple characters;
751 * messy cases are done via range().
754 allcases(struct vars
* v
, /* context */
755 chr pc
) /* character to get case equivs of */
762 lc
= wx_tolower((chr
) c
);
763 uc
= wx_toupper((chr
) c
);
765 cv
= getcvec(v
, 2, 0, 0);
773 * cmp - chr-substring compare
775 * Backrefs need this. It should preferably be efficient.
776 * Note that it does not need to report anything except equal/unequal.
777 * Note also that the length is exact, and the comparison should not
778 * stop at embedded NULs!
780 static int /* 0 for equal, nonzero for unequal */
781 cmp(const chr
*x
, const chr
*y
, /* strings to compare */
782 size_t len
) /* exact length of comparison */
784 return memcmp(VS(x
), VS(y
), len
* sizeof(chr
));
788 * casecmp - case-independent chr-substring compare
790 * REG_ICASE backrefs need this. It should preferably be efficient.
791 * Note that it does not need to report anything except equal/unequal.
792 * Note also that the length is exact, and the comparison should not
793 * stop at embedded NULs!
795 static int /* 0 for equal, nonzero for unequal */
796 casecmp(const chr
*x
, const chr
*y
, /* strings to compare */
797 size_t len
) /* exact length of comparison */
799 for (; len
> 0; len
--, x
++, y
++)
801 if ((*x
!= *y
) && (wx_tolower(*x
) != wx_tolower(*y
)))