src/regex/regc_locale.c

   1 /*
   2  * regc_locale.c --
   3  *
   4  *      This file contains locale-specific regexp routines.
   5  *      This file is #included by regcomp.c.
   6  *
   7  * Copyright (c) 1998 by Scriptics Corporation.
   8  *
   9  * This software is copyrighted by the Regents of the University of
  10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
  11  * Corporation and other parties.  The following terms apply to all files
  12  * associated with the software unless explicitly disclaimed in
  13  * individual files.
  14  *
  15  * The authors hereby grant permission to use, copy, modify, distribute,
  16  * and license this software and its documentation for any purpose, provided
  17  * that existing copyright notices are retained in all copies and that this
  18  * notice is included verbatim in any distributions. No written agreement,
  19  * license, or royalty fee is required for any of the authorized uses.
  20  * Modifications to this software may be copyrighted by their authors
  21  * and need not follow the licensing terms described here, provided that
  22  * the new terms are clearly indicated on the first page of each file where
  23  * they apply.
  24  *
  25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
  26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
  28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
  32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.      THIS SOFTWARE
  34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
  35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
  36  * MODIFICATIONS.
  37  *
  38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
  39  * U.S. government, the Government shall have only "Restricted Rights"
  40  * in the software and related documentation as defined in the Federal
  41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
  42  * are acquiring the software on behalf of the Department of Defense, the
  43  * software shall be classified as "Commercial Computer Software" and the
  44  * Government shall have only "Restricted Rights" as defined in Clause
  45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
  46  * authors grant the U.S. Government and others acting in its behalf
  47  * permission to use and distribute the software in accordance with the
  48  * terms specified in this license.
  49  *
  50  * $Header$
  51  */
  52
  53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
  54 {
  55         while(*cp++ == (const char)*wp++ && --nNum){}
  56         return nNum;
  57 }
  58
  59 int wx_isdigit(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIsdigit((unsigned char) c));}
  60 int wx_isalpha(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIsalpha((unsigned char) c));}
  61 int wx_isalnum(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIsalnum((unsigned char) c));}
  62 int wx_isupper(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIsupper((unsigned char) c));}
  63 int wx_islower(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIslower((unsigned char) c));}
  64 int wx_isgraph(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIsgraph((unsigned char) c));}
  65 int wx_ispunct(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIspunct((unsigned char) c));}
  66 int wx_isspace(wx_wchar c) {return (c >= 0 && c <= UCHAR_MAX &&  wxIsspace((unsigned char) c));}
  67
  68 wx_wchar wx_toupper(wx_wchar c)
  69 {
  70         if (c >= 0 && c <= UCHAR_MAX)
  71                 return wxToupper((unsigned char) c);
  72         return c;
  73
  74 }
  75
  76 wx_wchar wx_tolower(wx_wchar c)
  77 {
  78         if (c >= 0 && c <= UCHAR_MAX)
  79                 return wxTolower((unsigned char) c);
  80         return c;
  81 }
  82
  83 int wx_strlen(const wx_wchar* szString)
  84 {
  85     /*
  86     Generic -- note that some clib functions also test for eol character '^Z'
  87
  88         int     nLength = 0;
  89         for (; *(szString + nLength) != '\0'; nLength++);
  90         return nLength;
  91     */
  92     return szString == NULL ? 0 : wxStrlen_(szString);
  93 }
  94 /* ASCII character-name table */
  95
  96 static struct cname
  97 {
  98         char       *name;
  99         char            code;
 100 }       cnames[] =
 101
 102 {
 103         {
 104                 "NUL", '\0'
 105         },
 106         {
 107                 "SOH", '\001'
 108         },
 109         {
 110                 "STX", '\002'
 111         },
 112         {
 113                 "ETX", '\003'
 114         },
 115         {
 116                 "EOT", '\004'
 117         },
 118         {
 119                 "ENQ", '\005'
 120         },
 121         {
 122                 "ACK", '\006'
 123         },
 124         {
 125                 "BEL", '\007'
 126         },
 127         {
 128                 "alert", '\007'
 129         },
 130         {
 131                 "BS", '\010'
 132         },
 133         {
 134                 "backspace", '\b'
 135         },
 136         {
 137                 "HT", '\011'
 138         },
 139         {
 140                 "tab", '\t'
 141         },
 142         {
 143                 "LF", '\012'
 144         },
 145         {
 146                 "newline", '\n'
 147         },
 148         {
 149                 "VT", '\013'
 150         },
 151         {
 152                 "vertical-tab", '\v'
 153         },
 154         {
 155                 "FF", '\014'
 156         },
 157         {
 158                 "form-feed", '\f'
 159         },
 160         {
 161                 "CR", '\015'
 162         },
 163         {
 164                 "carriage-return", '\r'
 165         },
 166         {
 167                 "SO", '\016'
 168         },
 169         {
 170                 "SI", '\017'
 171         },
 172         {
 173                 "DLE", '\020'
 174         },
 175         {
 176                 "DC1", '\021'
 177         },
 178         {
 179                 "DC2", '\022'
 180         },
 181         {
 182                 "DC3", '\023'
 183         },
 184         {
 185                 "DC4", '\024'
 186         },
 187         {
 188                 "NAK", '\025'
 189         },
 190         {
 191                 "SYN", '\026'
 192         },
 193         {
 194                 "ETB", '\027'
 195         },
 196         {
 197                 "CAN", '\030'
 198         },
 199         {
 200                 "EM", '\031'
 201         },
 202         {
 203                 "SUB", '\032'
 204         },
 205         {
 206                 "ESC", '\033'
 207         },
 208         {
 209                 "IS4", '\034'
 210         },
 211         {
 212                 "FS", '\034'
 213         },
 214         {
 215                 "IS3", '\035'
 216         },
 217         {
 218                 "GS", '\035'
 219         },
 220         {
 221                 "IS2", '\036'
 222         },
 223         {
 224                 "RS", '\036'
 225         },
 226         {
 227                 "IS1", '\037'
 228         },
 229         {
 230                 "US", '\037'
 231         },
 232         {
 233                 "space", ' '
 234         },
 235         {
 236                 "exclamation-mark", '!'
 237         },
 238         {
 239                 "quotation-mark", '"'
 240         },
 241         {
 242                 "number-sign", '#'
 243         },
 244         {
 245                 "dollar-sign", '$'
 246         },
 247         {
 248                 "percent-sign", '%'
 249         },
 250         {
 251                 "ampersand", '&'
 252         },
 253         {
 254                 "apostrophe", '\''
 255         },
 256         {
 257                 "left-parenthesis", '('
 258         },
 259         {
 260                 "right-parenthesis", ')'
 261         },
 262         {
 263                 "asterisk", '*'
 264         },
 265         {
 266                 "plus-sign", '+'
 267         },
 268         {
 269                 "comma", ','
 270         },
 271         {
 272                 "hyphen", '-'
 273         },
 274         {
 275                 "hyphen-minus", '-'
 276         },
 277         {
 278                 "period", '.'
 279         },
 280         {
 281                 "full-stop", '.'
 282         },
 283         {
 284                 "slash", '/'
 285         },
 286         {
 287                 "solidus", '/'
 288         },
 289         {
 290                 "zero", '0'
 291         },
 292         {
 293                 "one", '1'
 294         },
 295         {
 296                 "two", '2'
 297         },
 298         {
 299                 "three", '3'
 300         },
 301         {
 302                 "four", '4'
 303         },
 304         {
 305                 "five", '5'
 306         },
 307         {
 308                 "six", '6'
 309         },
 310         {
 311                 "seven", '7'
 312         },
 313         {
 314                 "eight", '8'
 315         },
 316         {
 317                 "nine", '9'
 318         },
 319         {
 320                 "colon", ':'
 321         },
 322         {
 323                 "semicolon", ';'
 324         },
 325         {
 326                 "less-than-sign", '<'
 327         },
 328         {
 329                 "equals-sign", '='
 330         },
 331         {
 332                 "greater-than-sign", '>'
 333         },
 334         {
 335                 "question-mark", '?'
 336         },
 337         {
 338                 "commercial-at", '@'
 339         },
 340         {
 341                 "left-square-bracket", '['
 342         },
 343         {
 344                 "backslash", '\\'
 345         },
 346         {
 347                 "reverse-solidus", '\\'
 348         },
 349         {
 350                 "right-square-bracket", ']'
 351         },
 352         {
 353                 "circumflex", '^'
 354         },
 355         {
 356                 "circumflex-accent", '^'
 357         },
 358         {
 359                 "underscore", '_'
 360         },
 361         {
 362                 "low-line", '_'
 363         },
 364         {
 365                 "grave-accent", '`'
 366         },
 367         {
 368                 "left-brace", '{'
 369         },
 370         {
 371                 "left-curly-bracket", '{'
 372         },
 373         {
 374                 "vertical-line", '|'
 375         },
 376         {
 377                 "right-brace", '}'
 378         },
 379         {
 380                 "right-curly-bracket", '}'
 381         },
 382         {
 383                 "tilde", '~'
 384         },
 385         {
 386                 "DEL", '\177'
 387         },
 388         {
 389                 NULL, 0
 390         }
 391 };
 392
 393
 394 /*
 395  * nmcces - how many distinct MCCEs are there?
 396  */
 397 static int
 398 nmcces(struct vars * v)
 399 {
 400         /*
 401          * No multi-character collating elements defined at the moment.
 402          */
 403         return 0;
 404 }
 405
 406 /*
 407  * nleaders - how many chrs can be first chrs of MCCEs?
 408  */
 409 static int
 410 nleaders(struct vars * v)
 411 {
 412         return 0;
 413 }
 414
 415 /*
 416  * allmcces - return a cvec with all the MCCEs of the locale
 417  */
 418 static struct cvec *
 419 allmcces(struct vars * v,               /* context */
 420                  struct cvec * cv)              /* this is supposed to have enough room */
 421 {
 422         return clearcvec(cv);
 423 }
 424
 425 /*
 426  * element - map collating-element name to celt
 427  */
 428 static celt
 429 element(struct vars * v,                /* context */
 430                 chr *startp,                    /* points to start of name */
 431                 chr *endp)                              /* points just past end of name */
 432 {
 433         struct cname *cn;
 434         size_t          len;
 435
 436         /* generic:  one-chr names stand for themselves */
 437         assert(startp < endp);
 438         len = endp - startp;
 439         if (len == 1)
 440                 return *startp;
 441
 442         NOTE(REG_ULOCALE);
 443
 444         /* search table */
 445         for (cn = cnames; cn->name != NULL; cn++)
 446         {
 447                 if (strlen(cn->name) == len &&
 448                         char_and_wchar_strncmp(cn->name, startp, len) == 0)
 449                 {
 450                         break;                          /* NOTE BREAK OUT */
 451                 }
 452         }
 453         if (cn->name != NULL)
 454                 return CHR(cn->code);
 455
 456         /* couldn't find it */
 457         ERR(REG_ECOLLATE);
 458         return 0;
 459 }
 460
 461 /*
 462  * range - supply cvec for a range, including legality check
 463  */
 464 static struct cvec *
 465 range(struct vars * v,                  /* context */
 466           celt a,                                       /* range start */
 467           celt b,                                       /* range end, might equal a */
 468           int cases)                            /* case-independent? */
 469 {
 470         int                     nchrs;
 471         struct cvec *cv;
 472         celt            c,
 473                                 lc,
 474                                 uc;
 475
 476         if (a != b && !before(a, b))
 477         {
 478                 ERR(REG_ERANGE);
 479                 return NULL;
 480         }
 481
 482         if (!cases)
 483         {                                                       /* easy version */
 484                 cv = getcvec(v, 0, 1, 0);
 485                 NOERRN();
 486                 addrange(cv, a, b);
 487                 return cv;
 488         }
 489
 490         /*
 491          * When case-independent, it's hard to decide when cvec ranges are
 492          * usable, so for now at least, we won't try.  We allocate enough
 493          * space for two case variants plus a little extra for the two title
 494          * case variants.
 495          */
 496
 497         nchrs = (b - a + 1) * 2 + 4;
 498
 499         cv = getcvec(v, nchrs, 0, 0);
 500         NOERRN();
 501
 502         for (c = a; c <= b; c++)
 503         {
 504                 addchr(cv, c);
 505                 lc = wx_tolower((chr) c);
 506                 if (c != lc)
 507                         addchr(cv, lc);
 508                 uc = wx_toupper((chr) c);
 509                 if (c != uc)
 510                         addchr(cv, uc);
 511         }
 512
 513         return cv;
 514 }
 515
 516 /*
 517  * before - is celt x before celt y, for purposes of range legality?
 518  */
 519 static int                                              /* predicate */
 520 before(celt x, celt y)
 521 {
 522         /* trivial because no MCCEs */
 523         if (x < y)
 524                 return 1;
 525         return 0;
 526 }
 527
 528 /*
 529  * eclass - supply cvec for an equivalence class
 530  * Must include case counterparts on request.
 531  */
 532 static struct cvec *
 533 eclass(struct vars * v,                 /* context */
 534            celt c,                                      /* Collating element representing the
 535                                                                  * equivalence class. */
 536            int cases)                           /* all cases? */
 537 {
 538         struct cvec *cv;
 539
 540         /* crude fake equivalence class for testing */
 541         if ((v->cflags & REG_FAKE) && c == 'x')
 542         {
 543                 cv = getcvec(v, 4, 0, 0);
 544                 addchr(cv, (chr) 'x');
 545                 addchr(cv, (chr) 'y');
 546                 if (cases)
 547                 {
 548                         addchr(cv, (chr) 'X');
 549                         addchr(cv, (chr) 'Y');
 550                 }
 551                 return cv;
 552         }
 553
 554         /* otherwise, none */
 555         if (cases)
 556                 return allcases(v, c);
 557         cv = getcvec(v, 1, 0, 0);
 558         assert(cv != NULL);
 559         addchr(cv, (chr) c);
 560         return cv;
 561 }
 562
 563 /*
 564  * cclass - supply cvec for a character class
 565  *
 566  * Must include case counterparts on request.
 567  */
 568 static struct cvec *
 569 cclass(struct vars * v,                 /* context */
 570            chr *startp,                         /* where the name starts */
 571            chr *endp,                           /* just past the end of the name */
 572            int cases)                           /* case-independent? */
 573 {
 574         size_t          len;
 575         struct cvec *cv = NULL;
 576         char      **namePtr;
 577         int                     i,
 578                                 index;
 579
 580         /*
 581          * The following arrays define the valid character class names.
 582          */
 583
 584         static char *classNames[] = {
 585                 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
 586                 "lower", "print", "punct", "space", "upper", "xdigit", NULL
 587         };
 588
 589         enum classes
 590         {
 591                 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
 592                 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
 593         };
 594
 595         /*
 596          * Map the name to the corresponding enumerated value.
 597          */
 598         len = endp - startp;
 599         index = -1;
 600         for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
 601         {
 602                 if (strlen(*namePtr) == len &&
 603                         char_and_wchar_strncmp(*namePtr, startp, len) == 0)
 604                 {
 605                         index = i;
 606                         break;
 607                 }
 608         }
 609         if (index == -1)
 610         {
 611                 ERR(REG_ECTYPE);
 612                 return NULL;
 613         }
 614
 615         /*
 616          * Remap lower and upper to alpha if the match is case insensitive.
 617          */
 618
 619         if (cases &&
 620                 ((enum classes) index == CC_LOWER ||
 621                  (enum classes) index == CC_UPPER))
 622                 index = (int) CC_ALPHA;
 623
 624         /*
 625          * Now compute the character class contents.
 626          *
 627          * For the moment, assume that only char codes < 256 can be in these
 628          * classes.
 629          */
 630
 631         switch ((enum classes) index)
 632         {
 633                 case CC_PRINT:
 634                 case CC_ALNUM:
 635                         cv = getcvec(v, UCHAR_MAX, 1, 0);
 636                         if (cv)
 637                         {
 638                                 for (i = 0; i <= UCHAR_MAX; i++)
 639                                 {
 640                                         if (wx_isalpha((chr) i))
 641                                                 addchr(cv, (chr) i);
 642                                 }
 643                                 addrange(cv, (chr) '0', (chr) '9');
 644                         }
 645                         break;
 646                 case CC_ALPHA:
 647                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 648                         if (cv)
 649                         {
 650                                 for (i = 0; i <= UCHAR_MAX; i++)
 651                                 {
 652                                         if (wx_isalpha((chr) i))
 653                                                 addchr(cv, (chr) i);
 654                                 }
 655                         }
 656                         break;
 657                 case CC_ASCII:
 658                         cv = getcvec(v, 0, 1, 0);
 659                         if (cv)
 660                                 addrange(cv, 0, 0x7f);
 661                         break;
 662                 case CC_BLANK:
 663                         cv = getcvec(v, 2, 0, 0);
 664                         addchr(cv, '\t');
 665                         addchr(cv, ' ');
 666                         break;
 667                 case CC_CNTRL:
 668                         cv = getcvec(v, 0, 2, 0);
 669                         addrange(cv, 0x0, 0x1f);
 670                         addrange(cv, 0x7f, 0x9f);
 671                         break;
 672                 case CC_DIGIT:
 673                         cv = getcvec(v, 0, 1, 0);
 674                         if (cv)
 675                                 addrange(cv, (chr) '0', (chr) '9');
 676                         break;
 677                 case CC_PUNCT:
 678                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 679                         if (cv)
 680                         {
 681                                 for (i = 0; i <= UCHAR_MAX; i++)
 682                                 {
 683                                         if (wx_ispunct((chr) i))
 684                                                 addchr(cv, (chr) i);
 685                                 }
 686                         }
 687                         break;
 688                 case CC_XDIGIT:
 689                         cv = getcvec(v, 0, 3, 0);
 690                         if (cv)
 691                         {
 692                                 addrange(cv, '0', '9');
 693                                 addrange(cv, 'a', 'f');
 694                                 addrange(cv, 'A', 'F');
 695                         }
 696                         break;
 697                 case CC_SPACE:
 698                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 699                         if (cv)
 700                         {
 701                                 for (i = 0; i <= UCHAR_MAX; i++)
 702                                 {
 703                                         if (wx_isspace((chr) i))
 704                                                 addchr(cv, (chr) i);
 705                                 }
 706                         }
 707                         break;
 708                 case CC_LOWER:
 709                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 710                         if (cv)
 711                         {
 712                                 for (i = 0; i <= UCHAR_MAX; i++)
 713                                 {
 714                                         if (wx_islower((chr) i))
 715                                                 addchr(cv, (chr) i);
 716                                 }
 717                         }
 718                         break;
 719                 case CC_UPPER:
 720                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 721                         if (cv)
 722                         {
 723                                 for (i = 0; i <= UCHAR_MAX; i++)
 724                                 {
 725                                         if (wx_isupper((chr) i))
 726                                                 addchr(cv, (chr) i);
 727                                 }
 728                         }
 729                         break;
 730                 case CC_GRAPH:
 731                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 732                         if (cv)
 733                         {
 734                                 for (i = 0; i <= UCHAR_MAX; i++)
 735                                 {
 736                                         if (wx_isgraph((chr) i))
 737                                                 addchr(cv, (chr) i);
 738                                 }
 739                         }
 740                         break;
 741         }
 742         if (cv == NULL)
 743                 ERR(REG_ESPACE);
 744         return cv;
 745 }
 746
 747 /*
 748  * allcases - supply cvec for all case counterparts of a chr (including itself)
 749  *
 750  * This is a shortcut, preferably an efficient one, for simple characters;
 751  * messy cases are done via range().
 752  */
 753 static struct cvec *
 754 allcases(struct vars * v,               /* context */
 755                  chr pc)                                /* character to get case equivs of */
 756 {
 757         struct cvec *cv;
 758         chr                     c = (chr) pc;
 759         chr                     lc,
 760                                 uc;
 761
 762         lc = wx_tolower((chr) c);
 763         uc = wx_toupper((chr) c);
 764
 765         cv = getcvec(v, 2, 0, 0);
 766         addchr(cv, lc);
 767         if (lc != uc)
 768                 addchr(cv, uc);
 769         return cv;
 770 }
 771
 772 /*
 773  * cmp - chr-substring compare
 774  *
 775  * Backrefs need this.  It should preferably be efficient.
 776  * Note that it does not need to report anything except equal/unequal.
 777  * Note also that the length is exact, and the comparison should not
 778  * stop at embedded NULs!
 779  */
 780 static int                                              /* 0 for equal, nonzero for unequal */
 781 cmp(const chr *x, const chr *y, /* strings to compare */
 782         size_t len)                                     /* exact length of comparison */
 783 {
 784         return memcmp(VS(x), VS(y), len * sizeof(chr));
 785 }
 786
 787 /*
 788  * casecmp - case-independent chr-substring compare
 789  *
 790  * REG_ICASE backrefs need this.  It should preferably be efficient.
 791  * Note that it does not need to report anything except equal/unequal.
 792  * Note also that the length is exact, and the comparison should not
 793  * stop at embedded NULs!
 794  */
 795 static int                                              /* 0 for equal, nonzero for unequal */
 796 casecmp(const chr *x, const chr *y,             /* strings to compare */
 797                 size_t len)                             /* exact length of comparison */
 798 {
 799         for (; len > 0; len--, x++, y++)
 800         {
 801                 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
 802                         return 1;
 803         }
 804         return 0;
 805 }