src/regex/regc_locale.c

   1 /*
   2  * regc_locale.c --
   3  *
   4  *      This file contains locale-specific regexp routines.
   5  *      This file is #included by regcomp.c.
   6  *
   7  * Copyright (c) 1998 by Scriptics Corporation.
   8  *
   9  * This software is copyrighted by the Regents of the University of
  10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
  11  * Corporation and other parties.  The following terms apply to all files
  12  * associated with the software unless explicitly disclaimed in
  13  * individual files.
  14  *
  15  * The authors hereby grant permission to use, copy, modify, distribute,
  16  * and license this software and its documentation for any purpose, provided
  17  * that existing copyright notices are retained in all copies and that this
  18  * notice is included verbatim in any distributions. No written agreement,
  19  * license, or royalty fee is required for any of the authorized uses.
  20  * Modifications to this software may be copyrighted by their authors
  21  * and need not follow the licensing terms described here, provided that
  22  * the new terms are clearly indicated on the first page of each file where
  23  * they apply.
  24  *
  25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
  26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
  28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
  32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.      THIS SOFTWARE
  34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
  35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
  36  * MODIFICATIONS.
  37  *
  38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
  39  * U.S. government, the Government shall have only "Restricted Rights"
  40  * in the software and related documentation as defined in the Federal
  41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
  42  * are acquiring the software on behalf of the Department of Defense, the
  43  * software shall be classified as "Commercial Computer Software" and the
  44  * Government shall have only "Restricted Rights" as defined in Clause
  45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
  46  * authors grant the U.S. Government and others acting in its behalf
  47  * permission to use and distribute the software in accordance with the
  48  * terms specified in this license.
  49  *
  50  * $Header$
  51  */
  52
  53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
  54 {
  55         while(*cp++ == (const char)*wp++ && --nNum){}
  56         return nNum;
  57 }
  58
  59 int wx_isdigit(wx_wchar c) {return wxIsdigit(c);}
  60 int wx_isalpha(wx_wchar c) {return wxIsalpha(c);}
  61 int wx_isalnum(wx_wchar c) {return wxIsalnum(c);}
  62 int wx_isupper(wx_wchar c) {return wxIsupper(c);}
  63 int wx_islower(wx_wchar c) {return wxIslower(c);}
  64 int wx_isgraph(wx_wchar c) {return wxIsgraph(c);}
  65 int wx_ispunct(wx_wchar c) {return wxIspunct(c);}
  66 int wx_isspace(wx_wchar c) {return wxIsspace(c);}
  67
  68 wx_wchar wx_toupper(wx_wchar c)
  69 {
  70     return wxToupper(c);
  71 }
  72
  73 wx_wchar wx_tolower(wx_wchar c)
  74 {
  75     return wxTolower(c);
  76 }
  77
  78 int wx_strlen(const wx_wchar* szString)
  79 {
  80     /*
  81     Generic -- note that some clib functions also test for eol character '^Z'
  82
  83         int     nLength = 0;
  84         for (; *(szString + nLength) != '\0'; nLength++);
  85         return nLength;
  86     */
  87     return szString == NULL ? 0 : wxStrlen_(szString);
  88 }
  89 /* ASCII character-name table */
  90
  91 static struct cname
  92 {
  93         char       *name;
  94         char            code;
  95 }       cnames[] =
  96
  97 {
  98         {
  99                 "NUL", '\0'
 100         },
 101         {
 102                 "SOH", '\001'
 103         },
 104         {
 105                 "STX", '\002'
 106         },
 107         {
 108                 "ETX", '\003'
 109         },
 110         {
 111                 "EOT", '\004'
 112         },
 113         {
 114                 "ENQ", '\005'
 115         },
 116         {
 117                 "ACK", '\006'
 118         },
 119         {
 120                 "BEL", '\007'
 121         },
 122         {
 123                 "alert", '\007'
 124         },
 125         {
 126                 "BS", '\010'
 127         },
 128         {
 129                 "backspace", '\b'
 130         },
 131         {
 132                 "HT", '\011'
 133         },
 134         {
 135                 "tab", '\t'
 136         },
 137         {
 138                 "LF", '\012'
 139         },
 140         {
 141                 "newline", '\n'
 142         },
 143         {
 144                 "VT", '\013'
 145         },
 146         {
 147                 "vertical-tab", '\v'
 148         },
 149         {
 150                 "FF", '\014'
 151         },
 152         {
 153                 "form-feed", '\f'
 154         },
 155         {
 156                 "CR", '\015'
 157         },
 158         {
 159                 "carriage-return", '\r'
 160         },
 161         {
 162                 "SO", '\016'
 163         },
 164         {
 165                 "SI", '\017'
 166         },
 167         {
 168                 "DLE", '\020'
 169         },
 170         {
 171                 "DC1", '\021'
 172         },
 173         {
 174                 "DC2", '\022'
 175         },
 176         {
 177                 "DC3", '\023'
 178         },
 179         {
 180                 "DC4", '\024'
 181         },
 182         {
 183                 "NAK", '\025'
 184         },
 185         {
 186                 "SYN", '\026'
 187         },
 188         {
 189                 "ETB", '\027'
 190         },
 191         {
 192                 "CAN", '\030'
 193         },
 194         {
 195                 "EM", '\031'
 196         },
 197         {
 198                 "SUB", '\032'
 199         },
 200         {
 201                 "ESC", '\033'
 202         },
 203         {
 204                 "IS4", '\034'
 205         },
 206         {
 207                 "FS", '\034'
 208         },
 209         {
 210                 "IS3", '\035'
 211         },
 212         {
 213                 "GS", '\035'
 214         },
 215         {
 216                 "IS2", '\036'
 217         },
 218         {
 219                 "RS", '\036'
 220         },
 221         {
 222                 "IS1", '\037'
 223         },
 224         {
 225                 "US", '\037'
 226         },
 227         {
 228                 "space", ' '
 229         },
 230         {
 231                 "exclamation-mark", '!'
 232         },
 233         {
 234                 "quotation-mark", '"'
 235         },
 236         {
 237                 "number-sign", '#'
 238         },
 239         {
 240                 "dollar-sign", '$'
 241         },
 242         {
 243                 "percent-sign", '%'
 244         },
 245         {
 246                 "ampersand", '&'
 247         },
 248         {
 249                 "apostrophe", '\''
 250         },
 251         {
 252                 "left-parenthesis", '('
 253         },
 254         {
 255                 "right-parenthesis", ')'
 256         },
 257         {
 258                 "asterisk", '*'
 259         },
 260         {
 261                 "plus-sign", '+'
 262         },
 263         {
 264                 "comma", ','
 265         },
 266         {
 267                 "hyphen", '-'
 268         },
 269         {
 270                 "hyphen-minus", '-'
 271         },
 272         {
 273                 "period", '.'
 274         },
 275         {
 276                 "full-stop", '.'
 277         },
 278         {
 279                 "slash", '/'
 280         },
 281         {
 282                 "solidus", '/'
 283         },
 284         {
 285                 "zero", '0'
 286         },
 287         {
 288                 "one", '1'
 289         },
 290         {
 291                 "two", '2'
 292         },
 293         {
 294                 "three", '3'
 295         },
 296         {
 297                 "four", '4'
 298         },
 299         {
 300                 "five", '5'
 301         },
 302         {
 303                 "six", '6'
 304         },
 305         {
 306                 "seven", '7'
 307         },
 308         {
 309                 "eight", '8'
 310         },
 311         {
 312                 "nine", '9'
 313         },
 314         {
 315                 "colon", ':'
 316         },
 317         {
 318                 "semicolon", ';'
 319         },
 320         {
 321                 "less-than-sign", '<'
 322         },
 323         {
 324                 "equals-sign", '='
 325         },
 326         {
 327                 "greater-than-sign", '>'
 328         },
 329         {
 330                 "question-mark", '?'
 331         },
 332         {
 333                 "commercial-at", '@'
 334         },
 335         {
 336                 "left-square-bracket", '['
 337         },
 338         {
 339                 "backslash", '\\'
 340         },
 341         {
 342                 "reverse-solidus", '\\'
 343         },
 344         {
 345                 "right-square-bracket", ']'
 346         },
 347         {
 348                 "circumflex", '^'
 349         },
 350         {
 351                 "circumflex-accent", '^'
 352         },
 353         {
 354                 "underscore", '_'
 355         },
 356         {
 357                 "low-line", '_'
 358         },
 359         {
 360                 "grave-accent", '`'
 361         },
 362         {
 363                 "left-brace", '{'
 364         },
 365         {
 366                 "left-curly-bracket", '{'
 367         },
 368         {
 369                 "vertical-line", '|'
 370         },
 371         {
 372                 "right-brace", '}'
 373         },
 374         {
 375                 "right-curly-bracket", '}'
 376         },
 377         {
 378                 "tilde", '~'
 379         },
 380         {
 381                 "DEL", '\177'
 382         },
 383         {
 384                 NULL, 0
 385         }
 386 };
 387
 388
 389 /*
 390  * nmcces - how many distinct MCCEs are there?
 391  */
 392 static int
 393 nmcces(struct vars * v)
 394 {
 395         /*
 396          * No multi-character collating elements defined at the moment.
 397          */
 398         return 0;
 399 }
 400
 401 /*
 402  * nleaders - how many chrs can be first chrs of MCCEs?
 403  */
 404 static int
 405 nleaders(struct vars * v)
 406 {
 407         return 0;
 408 }
 409
 410 /*
 411  * allmcces - return a cvec with all the MCCEs of the locale
 412  */
 413 static struct cvec *
 414 allmcces(struct vars * v,               /* context */
 415                  struct cvec * cv)              /* this is supposed to have enough room */
 416 {
 417         return clearcvec(cv);
 418 }
 419
 420 /*
 421  * element - map collating-element name to celt
 422  */
 423 static celt
 424 element(struct vars * v,                /* context */
 425                 chr *startp,                    /* points to start of name */
 426                 chr *endp)                              /* points just past end of name */
 427 {
 428         struct cname *cn;
 429         size_t          len;
 430
 431         /* generic:  one-chr names stand for themselves */
 432         assert(startp < endp);
 433         len = endp - startp;
 434         if (len == 1)
 435                 return *startp;
 436
 437         NOTE(REG_ULOCALE);
 438
 439         /* search table */
 440         for (cn = cnames; cn->name != NULL; cn++)
 441         {
 442                 if (strlen(cn->name) == len &&
 443                         char_and_wchar_strncmp(cn->name, startp, len) == 0)
 444                 {
 445                         break;                          /* NOTE BREAK OUT */
 446                 }
 447         }
 448         if (cn->name != NULL)
 449                 return CHR(cn->code);
 450
 451         /* couldn't find it */
 452         ERR(REG_ECOLLATE);
 453         return 0;
 454 }
 455
 456 /*
 457  * range - supply cvec for a range, including legality check
 458  */
 459 static struct cvec *
 460 range(struct vars * v,                  /* context */
 461           celt a,                                       /* range start */
 462           celt b,                                       /* range end, might equal a */
 463           int cases)                            /* case-independent? */
 464 {
 465         int                     nchrs;
 466         struct cvec *cv;
 467         celt            c,
 468                                 lc,
 469                                 uc;
 470
 471         if (a != b && !before(a, b))
 472         {
 473                 ERR(REG_ERANGE);
 474                 return NULL;
 475         }
 476
 477         if (!cases)
 478         {                                                       /* easy version */
 479                 cv = getcvec(v, 0, 1, 0);
 480                 NOERRN();
 481                 addrange(cv, a, b);
 482                 return cv;
 483         }
 484
 485         /*
 486          * When case-independent, it's hard to decide when cvec ranges are
 487          * usable, so for now at least, we won't try.  We allocate enough
 488          * space for two case variants plus a little extra for the two title
 489          * case variants.
 490          */
 491
 492         nchrs = (b - a + 1) * 2 + 4;
 493
 494         cv = getcvec(v, nchrs, 0, 0);
 495         NOERRN();
 496
 497         for (c = a; c <= b; c++)
 498         {
 499                 addchr(cv, c);
 500                 lc = wx_tolower((chr) c);
 501                 if (c != lc)
 502                         addchr(cv, lc);
 503                 uc = wx_toupper((chr) c);
 504                 if (c != uc)
 505                         addchr(cv, uc);
 506         }
 507
 508         return cv;
 509 }
 510
 511 /*
 512  * before - is celt x before celt y, for purposes of range legality?
 513  */
 514 static int                                              /* predicate */
 515 before(celt x, celt y)
 516 {
 517         /* trivial because no MCCEs */
 518         if (x < y)
 519                 return 1;
 520         return 0;
 521 }
 522
 523 /*
 524  * eclass - supply cvec for an equivalence class
 525  * Must include case counterparts on request.
 526  */
 527 static struct cvec *
 528 eclass(struct vars * v,                 /* context */
 529            celt c,                                      /* Collating element representing the
 530                                                                  * equivalence class. */
 531            int cases)                           /* all cases? */
 532 {
 533         struct cvec *cv;
 534
 535         /* crude fake equivalence class for testing */
 536         if ((v->cflags & REG_FAKE) && c == 'x')
 537         {
 538                 cv = getcvec(v, 4, 0, 0);
 539                 addchr(cv, (chr) 'x');
 540                 addchr(cv, (chr) 'y');
 541                 if (cases)
 542                 {
 543                         addchr(cv, (chr) 'X');
 544                         addchr(cv, (chr) 'Y');
 545                 }
 546                 return cv;
 547         }
 548
 549         /* otherwise, none */
 550         if (cases)
 551                 return allcases(v, c);
 552         cv = getcvec(v, 1, 0, 0);
 553         assert(cv != NULL);
 554         addchr(cv, (chr) c);
 555         return cv;
 556 }
 557
 558 /*
 559  * cclass - supply cvec for a character class
 560  *
 561  * Must include case counterparts on request.
 562  */
 563 static struct cvec *
 564 cclass(struct vars * v,                 /* context */
 565            chr *startp,                         /* where the name starts */
 566            chr *endp,                           /* just past the end of the name */
 567            int cases)                           /* case-independent? */
 568 {
 569         size_t          len;
 570         struct cvec *cv = NULL;
 571         char      **namePtr;
 572         int                     i,
 573                                 index;
 574
 575         /*
 576          * The following arrays define the valid character class names.
 577          */
 578
 579         static char *classNames[] = {
 580                 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
 581                 "lower", "print", "punct", "space", "upper", "xdigit", NULL
 582         };
 583
 584         enum classes
 585         {
 586                 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
 587                 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
 588         };
 589
 590         /*
 591          * Map the name to the corresponding enumerated value.
 592          */
 593         len = endp - startp;
 594         index = -1;
 595         for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
 596         {
 597                 if (strlen(*namePtr) == len &&
 598                         char_and_wchar_strncmp(*namePtr, startp, len) == 0)
 599                 {
 600                         index = i;
 601                         break;
 602                 }
 603         }
 604         if (index == -1)
 605         {
 606                 ERR(REG_ECTYPE);
 607                 return NULL;
 608         }
 609
 610         /*
 611          * Remap lower and upper to alpha if the match is case insensitive.
 612          */
 613
 614         if (cases &&
 615                 ((enum classes) index == CC_LOWER ||
 616                  (enum classes) index == CC_UPPER))
 617                 index = (int) CC_ALPHA;
 618
 619         /*
 620          * Now compute the character class contents.
 621          *
 622          * For the moment, assume that only char codes < 256 can be in these
 623          * classes.
 624          */
 625
 626         switch ((enum classes) index)
 627         {
 628                 case CC_PRINT:
 629                 case CC_ALNUM:
 630                         cv = getcvec(v, UCHAR_MAX, 1, 0);
 631                         if (cv)
 632                         {
 633                                 for (i = 0; i <= UCHAR_MAX; i++)
 634                                 {
 635                                         if (wx_isalpha((chr) i))
 636                                                 addchr(cv, (chr) i);
 637                                 }
 638                                 addrange(cv, (chr) '0', (chr) '9');
 639                         }
 640                         break;
 641                 case CC_ALPHA:
 642                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 643                         if (cv)
 644                         {
 645                                 for (i = 0; i <= UCHAR_MAX; i++)
 646                                 {
 647                                         if (wx_isalpha((chr) i))
 648                                                 addchr(cv, (chr) i);
 649                                 }
 650                         }
 651                         break;
 652                 case CC_ASCII:
 653                         cv = getcvec(v, 0, 1, 0);
 654                         if (cv)
 655                                 addrange(cv, 0, 0x7f);
 656                         break;
 657                 case CC_BLANK:
 658                         cv = getcvec(v, 2, 0, 0);
 659                         addchr(cv, '\t');
 660                         addchr(cv, ' ');
 661                         break;
 662                 case CC_CNTRL:
 663                         cv = getcvec(v, 0, 2, 0);
 664                         addrange(cv, 0x0, 0x1f);
 665                         addrange(cv, 0x7f, 0x9f);
 666                         break;
 667                 case CC_DIGIT:
 668                         cv = getcvec(v, 0, 1, 0);
 669                         if (cv)
 670                                 addrange(cv, (chr) '0', (chr) '9');
 671                         break;
 672                 case CC_PUNCT:
 673                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 674                         if (cv)
 675                         {
 676                                 for (i = 0; i <= UCHAR_MAX; i++)
 677                                 {
 678                                         if (wx_ispunct((chr) i))
 679                                                 addchr(cv, (chr) i);
 680                                 }
 681                         }
 682                         break;
 683                 case CC_XDIGIT:
 684                         cv = getcvec(v, 0, 3, 0);
 685                         if (cv)
 686                         {
 687                                 addrange(cv, '0', '9');
 688                                 addrange(cv, 'a', 'f');
 689                                 addrange(cv, 'A', 'F');
 690                         }
 691                         break;
 692                 case CC_SPACE:
 693                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 694                         if (cv)
 695                         {
 696                                 for (i = 0; i <= UCHAR_MAX; i++)
 697                                 {
 698                                         if (wx_isspace((chr) i))
 699                                                 addchr(cv, (chr) i);
 700                                 }
 701                         }
 702                         break;
 703                 case CC_LOWER:
 704                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 705                         if (cv)
 706                         {
 707                                 for (i = 0; i <= UCHAR_MAX; i++)
 708                                 {
 709                                         if (wx_islower((chr) i))
 710                                                 addchr(cv, (chr) i);
 711                                 }
 712                         }
 713                         break;
 714                 case CC_UPPER:
 715                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 716                         if (cv)
 717                         {
 718                                 for (i = 0; i <= UCHAR_MAX; i++)
 719                                 {
 720                                         if (wx_isupper((chr) i))
 721                                                 addchr(cv, (chr) i);
 722                                 }
 723                         }
 724                         break;
 725                 case CC_GRAPH:
 726                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 727                         if (cv)
 728                         {
 729                                 for (i = 0; i <= UCHAR_MAX; i++)
 730                                 {
 731                                         if (wx_isgraph((chr) i))
 732                                                 addchr(cv, (chr) i);
 733                                 }
 734                         }
 735                         break;
 736         }
 737         if (cv == NULL)
 738                 ERR(REG_ESPACE);
 739         return cv;
 740 }
 741
 742 /*
 743  * allcases - supply cvec for all case counterparts of a chr (including itself)
 744  *
 745  * This is a shortcut, preferably an efficient one, for simple characters;
 746  * messy cases are done via range().
 747  */
 748 static struct cvec *
 749 allcases(struct vars * v,               /* context */
 750                  chr pc)                                /* character to get case equivs of */
 751 {
 752         struct cvec *cv;
 753         chr                     c = (chr) pc;
 754         chr                     lc,
 755                                 uc;
 756
 757         lc = wx_tolower((chr) c);
 758         uc = wx_toupper((chr) c);
 759
 760         cv = getcvec(v, 2, 0, 0);
 761         addchr(cv, lc);
 762         if (lc != uc)
 763                 addchr(cv, uc);
 764         return cv;
 765 }
 766
 767 /*
 768  * cmp - chr-substring compare
 769  *
 770  * Backrefs need this.  It should preferably be efficient.
 771  * Note that it does not need to report anything except equal/unequal.
 772  * Note also that the length is exact, and the comparison should not
 773  * stop at embedded NULs!
 774  */
 775 static int                                              /* 0 for equal, nonzero for unequal */
 776 cmp(const chr *x, const chr *y, /* strings to compare */
 777         size_t len)                                     /* exact length of comparison */
 778 {
 779         return memcmp(VS(x), VS(y), len * sizeof(chr));
 780 }
 781
 782 /*
 783  * casecmp - case-independent chr-substring compare
 784  *
 785  * REG_ICASE backrefs need this.  It should preferably be efficient.
 786  * Note that it does not need to report anything except equal/unequal.
 787  * Note also that the length is exact, and the comparison should not
 788  * stop at embedded NULs!
 789  */
 790 static int                                              /* 0 for equal, nonzero for unequal */
 791 casecmp(const chr *x, const chr *y,             /* strings to compare */
 792                 size_t len)                             /* exact length of comparison */
 793 {
 794         for (; len > 0; len--, x++, y++)
 795         {
 796                 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
 797                         return 1;
 798         }
 799         return 0;
 800 }