]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regc_locale.c
   4  *      This file contains locale-specific regexp routines. 
   5  *      This file is #included by regcomp.c. 
   7  * Copyright (c) 1998 by Scriptics Corporation. 
   9  * This software is copyrighted by the Regents of the University of 
  10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState 
  11  * Corporation and other parties.  The following terms apply to all files 
  12  * associated with the software unless explicitly disclaimed in 
  15  * The authors hereby grant permission to use, copy, modify, distribute, 
  16  * and license this software and its documentation for any purpose, provided 
  17  * that existing copyright notices are retained in all copies and that this 
  18  * notice is included verbatim in any distributions. No written agreement, 
  19  * license, or royalty fee is required for any of the authorized uses. 
  20  * Modifications to this software may be copyrighted by their authors 
  21  * and need not follow the licensing terms described here, provided that 
  22  * the new terms are clearly indicated on the first page of each file where 
  25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY 
  26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 
  27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY 
  28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE 
  29  * POSSIBILITY OF SUCH DAMAGE. 
  31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, 
  32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, 
  33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.      THIS SOFTWARE 
  34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE 
  35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR 
  38  * GOVERNMENT USE: If you are acquiring this software on behalf of the 
  39  * U.S. government, the Government shall have only "Restricted Rights" 
  40  * in the software and related documentation as defined in the Federal 
  41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you 
  42  * are acquiring the software on behalf of the Department of Defense, the 
  43  * software shall be classified as "Commercial Computer Software" and the 
  44  * Government shall have only "Restricted Rights" as defined in Clause 
  45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the 
  46  * authors grant the U.S. Government and others acting in its behalf 
  47  * permission to use and distribute the software in accordance with the 
  48  * terms specified in this license. 
  53 int char_and_wchar_strncmp (const char* cp
, const wx_wchar
* wp
, size_t nNum
) 
  55         while(*cp
++ == (const char)*wp
++ && --nNum
){} 
  60 /* ASCII character-name table */ 
 130                 "carriage-return", '\r' 
 202                 "exclamation-mark", '!' 
 205                 "quotation-mark", '"' 
 223                 "left-parenthesis", '(' 
 226                 "right-parenthesis", ')' 
 292                 "less-than-sign", '<' 
 298                 "greater-than-sign", '>' 
 307                 "left-square-bracket", '[' 
 313                 "reverse-solidus", '\\' 
 316                 "right-square-bracket", ']' 
 322                 "circumflex-accent", '^' 
 337                 "left-curly-bracket", '{' 
 346                 "right-curly-bracket", '}' 
 360  * some ctype functions with non-ascii-char guard 
 363 wx_isdigit(wx_wchar c
) 
 365         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& isdigit((unsigned char) c
)); 
 369 wx_isalpha(wx_wchar c
) 
 371         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& isalpha((unsigned char) c
)); 
 375 wx_isalnum(wx_wchar c
) 
 377         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& isalnum((unsigned char) c
)); 
 381 wx_isupper(wx_wchar c
) 
 383         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& isupper((unsigned char) c
)); 
 387 wx_islower(wx_wchar c
) 
 389         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& islower((unsigned char) c
)); 
 393 wx_isgraph(wx_wchar c
) 
 395         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& isgraph((unsigned char) c
)); 
 399 wx_ispunct(wx_wchar c
) 
 401         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& ispunct((unsigned char) c
)); 
 405 wx_isspace(wx_wchar c
) 
 407         return (c 
>= 0 && c 
<= UCHAR_MAX 
&& isspace((unsigned char) c
)); 
 411 wx_toupper(wx_wchar c
) 
 413         if (c 
>= 0 && c 
<= UCHAR_MAX
) 
 414                 return toupper((unsigned char) c
); 
 419 wx_tolower(wx_wchar c
) 
 421         if (c 
>= 0 && c 
<= UCHAR_MAX
) 
 422                 return tolower((unsigned char) c
); 
 428  * nmcces - how many distinct MCCEs are there? 
 431 nmcces(struct vars 
* v
) 
 434          * No multi-character collating elements defined at the moment. 
 440  * nleaders - how many chrs can be first chrs of MCCEs? 
 443 nleaders(struct vars 
* v
) 
 449  * allmcces - return a cvec with all the MCCEs of the locale 
 452 allmcces(struct vars 
* v
,               /* context */ 
 453                  struct cvec 
* cv
)              /* this is supposed to have enough room */ 
 455         return clearcvec(cv
); 
 459  * element - map collating-element name to celt 
 462 element(struct vars 
* v
,                /* context */ 
 463                 chr 
*startp
,                    /* points to start of name */ 
 464                 chr 
*endp
)                              /* points just past end of name */ 
 469         /* generic:  one-chr names stand for themselves */ 
 470         assert(startp 
< endp
); 
 478         for (cn 
= cnames
; cn
->name 
!= NULL
; cn
++) 
 480                 if (strlen(cn
->name
) == len 
&& 
 481                         char_and_wchar_strncmp(cn
->name
, startp
, len
) == 0) 
 483                         break;                          /* NOTE BREAK OUT */ 
 486         if (cn
->name 
!= NULL
) 
 487                 return CHR(cn
->code
); 
 489         /* couldn't find it */ 
 495  * range - supply cvec for a range, including legality check 
 498 range(struct vars 
* v
,                  /* context */ 
 499           celt a
,                                       /* range start */ 
 500           celt b
,                                       /* range end, might equal a */ 
 501           int cases
)                            /* case-independent? */ 
 509         if (a 
!= b 
&& !before(a
, b
)) 
 517                 cv 
= getcvec(v
, 0, 1, 0); 
 524          * When case-independent, it's hard to decide when cvec ranges are 
 525          * usable, so for now at least, we won't try.  We allocate enough 
 526          * space for two case variants plus a little extra for the two title 
 530         nchrs 
= (b 
- a 
+ 1) * 2 + 4; 
 532         cv 
= getcvec(v
, nchrs
, 0, 0); 
 535         for (c 
= a
; c 
<= b
; c
++) 
 538                 lc 
= wx_tolower((chr
) c
); 
 541                 uc 
= wx_toupper((chr
) c
); 
 550  * before - is celt x before celt y, for purposes of range legality? 
 552 static int                                              /* predicate */ 
 553 before(celt x
, celt y
) 
 555         /* trivial because no MCCEs */ 
 562  * eclass - supply cvec for an equivalence class 
 563  * Must include case counterparts on request. 
 566 eclass(struct vars 
* v
,                 /* context */ 
 567            celt c
,                                      /* Collating element representing the 
 568                                                                  * equivalence class. */ 
 569            int cases
)                           /* all cases? */ 
 573         /* crude fake equivalence class for testing */ 
 574         if ((v
->cflags 
& REG_FAKE
) && c 
== 'x') 
 576                 cv 
= getcvec(v
, 4, 0, 0); 
 577                 addchr(cv
, (chr
) 'x'); 
 578                 addchr(cv
, (chr
) 'y'); 
 581                         addchr(cv
, (chr
) 'X'); 
 582                         addchr(cv
, (chr
) 'Y'); 
 587         /* otherwise, none */ 
 589                 return allcases(v
, c
); 
 590         cv 
= getcvec(v
, 1, 0, 0); 
 597  * cclass - supply cvec for a character class 
 599  * Must include case counterparts on request. 
 602 cclass(struct vars 
* v
,                 /* context */ 
 603            chr 
*startp
,                         /* where the name starts */ 
 604            chr 
*endp
,                           /* just past the end of the name */ 
 605            int cases
)                           /* case-independent? */ 
 608         struct cvec 
*cv 
= NULL
; 
 614          * The following arrays define the valid character class names. 
 617         static char *classNames
[] = { 
 618                 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", 
 619                 "lower", "print", "punct", "space", "upper", "xdigit", NULL
 
 624                 CC_ALNUM
, CC_ALPHA
, CC_ASCII
, CC_BLANK
, CC_CNTRL
, CC_DIGIT
, CC_GRAPH
, 
 625                 CC_LOWER
, CC_PRINT
, CC_PUNCT
, CC_SPACE
, CC_UPPER
, CC_XDIGIT
 
 629          * Map the name to the corresponding enumerated value. 
 633         for (namePtr 
= classNames
, i 
= 0; *namePtr 
!= NULL
; namePtr
++, i
++) 
 635                 if (strlen(*namePtr
) == len 
&& 
 636                         char_and_wchar_strncmp(*namePtr
, startp
, len
) == 0) 
 649          * Remap lower and upper to alpha if the match is case insensitive. 
 653                 ((enum classes
) index 
== CC_LOWER 
|| 
 654                  (enum classes
) index 
== CC_UPPER
)) 
 655                 index 
= (int) CC_ALPHA
; 
 658          * Now compute the character class contents. 
 660          * For the moment, assume that only char codes < 256 can be in these 
 664         switch ((enum classes
) index
) 
 668                         cv 
= getcvec(v
, UCHAR_MAX
, 1, 0); 
 671                                 for (i 
= 0; i 
<= UCHAR_MAX
; i
++) 
 673                                         if (wx_isalpha((chr
) i
)) 
 676                                 addrange(cv
, (chr
) '0', (chr
) '9'); 
 680                         cv 
= getcvec(v
, UCHAR_MAX
, 0, 0); 
 683                                 for (i 
= 0; i 
<= UCHAR_MAX
; i
++) 
 685                                         if (wx_isalpha((chr
) i
)) 
 691                         cv 
= getcvec(v
, 0, 1, 0); 
 693                                 addrange(cv
, 0, 0x7f); 
 696                         cv 
= getcvec(v
, 2, 0, 0); 
 701                         cv 
= getcvec(v
, 0, 2, 0); 
 702                         addrange(cv
, 0x0, 0x1f); 
 703                         addrange(cv
, 0x7f, 0x9f); 
 706                         cv 
= getcvec(v
, 0, 1, 0); 
 708                                 addrange(cv
, (chr
) '0', (chr
) '9'); 
 711                         cv 
= getcvec(v
, UCHAR_MAX
, 0, 0); 
 714                                 for (i 
= 0; i 
<= UCHAR_MAX
; i
++) 
 716                                         if (wx_ispunct((chr
) i
)) 
 722                         cv 
= getcvec(v
, 0, 3, 0); 
 725                                 addrange(cv
, '0', '9'); 
 726                                 addrange(cv
, 'a', 'f'); 
 727                                 addrange(cv
, 'A', 'F'); 
 731                         cv 
= getcvec(v
, UCHAR_MAX
, 0, 0); 
 734                                 for (i 
= 0; i 
<= UCHAR_MAX
; i
++) 
 736                                         if (wx_isspace((chr
) i
)) 
 742                         cv 
= getcvec(v
, UCHAR_MAX
, 0, 0); 
 745                                 for (i 
= 0; i 
<= UCHAR_MAX
; i
++) 
 747                                         if (wx_islower((chr
) i
)) 
 753                         cv 
= getcvec(v
, UCHAR_MAX
, 0, 0); 
 756                                 for (i 
= 0; i 
<= UCHAR_MAX
; i
++) 
 758                                         if (wx_isupper((chr
) i
)) 
 764                         cv 
= getcvec(v
, UCHAR_MAX
, 0, 0); 
 767                                 for (i 
= 0; i 
<= UCHAR_MAX
; i
++) 
 769                                         if (wx_isgraph((chr
) i
)) 
 781  * allcases - supply cvec for all case counterparts of a chr (including itself) 
 783  * This is a shortcut, preferably an efficient one, for simple characters; 
 784  * messy cases are done via range(). 
 787 allcases(struct vars 
* v
,               /* context */ 
 788                  chr pc
)                                /* character to get case equivs of */ 
 795         lc 
= wx_tolower((chr
) c
); 
 796         uc 
= wx_toupper((chr
) c
); 
 798         cv 
= getcvec(v
, 2, 0, 0); 
 806  * cmp - chr-substring compare 
 808  * Backrefs need this.  It should preferably be efficient. 
 809  * Note that it does not need to report anything except equal/unequal. 
 810  * Note also that the length is exact, and the comparison should not 
 811  * stop at embedded NULs! 
 813 static int                                              /* 0 for equal, nonzero for unequal */ 
 814 cmp(const chr 
*x
, const chr 
*y
, /* strings to compare */ 
 815         size_t len
)                                     /* exact length of comparison */ 
 817         return memcmp(VS(x
), VS(y
), len 
* sizeof(chr
)); 
 821  * casecmp - case-independent chr-substring compare 
 823  * REG_ICASE backrefs need this.  It should preferably be efficient. 
 824  * Note that it does not need to report anything except equal/unequal. 
 825  * Note also that the length is exact, and the comparison should not 
 826  * stop at embedded NULs! 
 828 static int                                              /* 0 for equal, nonzero for unequal */ 
 829 casecmp(const chr 
*x
, const chr 
*y
,             /* strings to compare */ 
 830                 size_t len
)                             /* exact length of comparison */ 
 832         for (; len 
> 0; len
--, x
++, y
++) 
 834                 if ((*x 
!= *y
) && (wx_tolower(*x
) != wx_tolower(*y
)))